| skipped 63 lines |
parser.add_argument("--output", default='output_$SEARCH_$DATE.txt', type=str,
                    help="Output File (default: output_$SEARCH_$DATE.txt), where $SEARCH is replaced by the first "
                         "chars of the search string and $DATE is replaced by the datetime")
# NOTE: type=bool is broken with argparse: bool("False") is True, so any
# non-empty value (even "False") would enable the flag. Parse common truthy
# strings explicitly instead; "--continuous_write True" keeps working.
parser.add_argument("--continuous_write", type=lambda v: str(v).lower() in ("true", "1", "yes"),
                    default=False,
                    help="Write progressively to output file (default: False)")
parser.add_argument("search", type=str, help="The search string or phrase")
parser.add_argument("--limit", type=int, default=0, help="Set a max number of pages per engine to load")
parser.add_argument("--barmode", type=str, default="fixed", help="Can be 'fixed' (default) or 'unknown'")
| skipped 4 lines |
# Route all HTTP and HTTPS traffic through the SOCKS5 proxy; the "socks5h"
# scheme makes DNS resolution happen on the proxy side as well.
proxies = {'http': 'socks5h://{}'.format(args.proxy), 'https': 'socks5h://{}'.format(args.proxy)}
# Layout string for the tqdm progress bars shown while scraping each engine.
tqdm_bar_format = "{desc}: {percentage:3.0f}% |{bar}| {n_fmt:3s} / {total_fmt:3s} [{elapsed:5s} < {remaining:5s}]"
# Accumulated scrape results: engine name -> list of {"name": ..., "link": ...} dicts.
result = {}
# Output filename template; the $SEARCH/$DATE placeholders are substituted later in scrape().
filename = args.output
77 | 80 | | |
78 | 81 | | |
79 | 82 | | def random_headers(): |
| skipped 703 lines |
783 | 786 | | |
784 | 787 | | def link_finder(engine_str, data_obj): |
785 | 788 | | global result |
| 789 | + | global filename |
786 | 790 | | name = "" |
787 | 791 | | link = "" |
| 792 | + | f = None |
| 793 | + | |
| 794 | + | if args.continuous_write: |
| 795 | + | f = open(filename, "a") |
788 | 796 | | |
789 | 797 | | def append_link(): |
790 | 798 | | result[engine_str].append({"name": name, "link": link}) |
| 799 | + | if args.continuous_write and f.writable(): |
| 800 | + | f.write("\"{}\",\"{}\",\"{}\"\n".format(engine_str, name, link)) |
791 | 801 | | |
792 | 802 | | if engine_str not in result: |
793 | 803 | | result[engine_str] = [] |
| skipped 35 lines |
829 | 839 | | |
830 | 840 | | if engine_str == "evosearch": |
831 | 841 | | if data_obj.find('div', attrs={"id": "results"}) is not None: |
832 | | - | count = 0 |
833 | 842 | | for div in data_obj.find('div', attrs={"id": "results"}).find_all('div', attrs={"class": "odrow"}): |
834 | 843 | | name = clear(div.find('div', attrs={"class": "title"}).find('a').get_text()) |
835 | 844 | | link = clear(div.find('div', attrs={"class": "title"}).find('a')['href'] |
836 | 845 | | .replace("./include/click_counter.php?url=", "") |
837 | 846 | | .replace("&query={}".format(args.search), "")) |
838 | | - | count += 1 |
839 | 847 | | append_link() |
840 | 848 | | |
841 | 849 | | if engine_str == "grams": |
| skipped 7 lines |
849 | 857 | | append_link() |
850 | 858 | | |
851 | 859 | | if engine_str == "haystack": |
852 | | - | if data_obj.find('div', attrs={"class": "result"}) is None: |
853 | | - | return -1 |
854 | | - | for div in data_obj.find_all('div', attrs={"class": "result"}): |
855 | | - | if div.find('a') is not None and div.find('i') is not None: |
856 | | - | name = clear(div.find('a').get_text()) |
857 | | - | link = clear(div.find('i').get_text()) |
858 | | - | append_link() |
| 860 | + | if data_obj.find('div', attrs={"class": "result"}) is not None: |
| 861 | + | for div in data_obj.find_all('div', attrs={"class": "result"}): |
| 862 | + | if div.find('a') is not None and div.find('i') is not None: |
| 863 | + | name = clear(div.find('a').get_text()) |
| 864 | + | link = clear(div.find('i').get_text()) |
| 865 | + | append_link() |
859 | 866 | | |
860 | 867 | | if engine_str == "multivac": |
861 | 868 | | for i in data_obj.find_all('dl'): |
| skipped 4 lines |
866 | 873 | | link = clear(link_tag['href']) |
867 | 874 | | append_link() |
868 | 875 | | else: |
869 | | - | return -1 |
| 876 | + | break |
870 | 877 | | |
871 | 878 | | if engine_str == "notevil": |
872 | 879 | | ''' As for OnionLand, we could use the span instead of the href to get a beautiful link |
| skipped 16 lines |
889 | 896 | | append_link() |
890 | 897 | | |
891 | 898 | | if engine_str == "onionland": |
892 | | - | if data_obj.find('div', attrs={"class": "row no-result-row"}): |
893 | | - | return -1 |
894 | | - | for i in data_obj.find_all('div', attrs={"class": "result-block"}): |
895 | | - | if not str(clear(i.find('div', attrs={'class': "title"}).find('a')['href'])).startswith("/ads"): |
896 | | - | name = clear(i.find('div', attrs={'class': "title"}).get_text()) |
897 | | - | link = clear(i.find('div', attrs={'class': "link"}).get_text()) |
898 | | - | append_link() |
| 899 | + | if not data_obj.find('div', attrs={"class": "row no-result-row"}): |
| 900 | + | for i in data_obj.find_all('div', attrs={"class": "result-block"}): |
| 901 | + | if not str(clear(i.find('div', attrs={'class': "title"}).find('a')['href'])).startswith("/ads"): |
| 902 | + | name = clear(i.find('div', attrs={'class': "title"}).get_text()) |
| 903 | + | link = clear(i.find('div', attrs={'class': "link"}).get_text()) |
| 904 | + | append_link() |
899 | 905 | | |
900 | 906 | | if engine_str == "onionsearchengine": |
901 | 907 | | for i in data_obj.find_all('table'): |
| skipped 67 lines |
969 | 975 | | link = n.find('a')['href'] |
970 | 976 | | append_link() |
971 | 977 | | |
| 978 | + | if args.continuous_write and not f.closed: |
| 979 | + | f.close() |
| 980 | + | |
| 981 | + | if len(result[engine_str]) <= 0: |
| 982 | + | return -1 |
| 983 | + | |
972 | 984 | | return 1 |
973 | 985 | | |
974 | 986 | | |
| skipped 6 lines |
981 | 993 | | print("Error: unable to connect") |
982 | 994 | | |
983 | 995 | | |
def write_to_file(filename, results, engine):
    """Write every result collected for *engine* to *filename*.

    Each result is emitted as one CSV-style line: "engine","name","link".
    The file is truncated/overwritten on each call.

    :param filename: path of the output file
    :param results: dict mapping engine name -> list of {"name", "link"} dicts
    :param engine: key into *results* selecting which engine's links to dump
    """
    # "with" guarantees the handle is closed even if a write raises,
    # unlike the previous explicit open()/close() pair.
    with open(filename, "w+") as f:
        for i in results[engine]:
            f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))
| 1001 | + | |
| 1002 | + | |
984 | 1003 | | def scrape(): |
985 | 1004 | | global result |
| 1005 | + | global filename |
986 | 1006 | | |
987 | 1007 | | start_time = datetime.now() |
988 | 1008 | | |
989 | 1009 | | # Building the filename |
990 | | - | filename = args.output |
991 | 1010 | | filename = str(filename).replace("$DATE", start_time.strftime("%Y%m%d%H%M%S")) |
992 | 1011 | | search = str(args.search).replace(" ", "") |
993 | 1012 | | if len(search) > 10: |
| skipped 15 lines |
1009 | 1028 | | |
1010 | 1029 | | stop_time = datetime.now() |
1011 | 1030 | | |
| 1031 | + | if not args.continuous_write: |
| 1032 | + | f = open(filename, "w+") |
| 1033 | + | for engine in result.keys(): |
| 1034 | + | for i in result[engine]: |
| 1035 | + | f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"])) |
| 1036 | + | f.close() |
| 1037 | + | |
1012 | 1038 | | total = 0 |
1013 | 1039 | | print("\nReport:") |
1014 | 1040 | | print(" Execution time: %s seconds" % (stop_time - start_time)) |
1015 | | - | |
1016 | | - | f = open(filename, "w+") |
1017 | 1041 | | for engine in result.keys(): |
1018 | 1042 | | print(" {}: {}".format(engine, str(len(result[engine])))) |
1019 | 1043 | | total += len(result[engine]) |
1020 | | - | for i in result[engine]: |
1021 | | - | f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"])) |
1022 | | - | |
1023 | | - | f.close() |
1024 | 1044 | | print(" Total: {} links written to {}".format(str(total), filename)) |
1025 | 1045 | | |
1026 | 1046 | | |
| skipped 3 lines |