| skipped 5 lines |
6 | 6 | | from datetime import datetime |
7 | 7 | | from functools import reduce |
8 | 8 | | from random import choice |
9 | | - | |
10 | 9 | | from multiprocessing import Pool, cpu_count, current_process, freeze_support |
11 | 10 | | from tqdm import tqdm |
12 | 11 | | |
| skipped 5 lines |
18 | 17 | | from bs4 import BeautifulSoup |
19 | 18 | | from urllib3.exceptions import ProtocolError |
20 | 19 | | |
21 | | - | import engines |
| 20 | + | ENGINES = { |
| 21 | + | "ahmia": "http://msydqstlz2kzerdg.onion", |
| 22 | + | "darksearchio": "http://darksearch.io", |
| 23 | + | "onionland": "http://3bbad7fauom4d6sgppalyqddsqbf5u5p56b5k5uk2zxsy3d6ey2jobad.onion", |
| 24 | + | "notevil": "http://hss3uro2hsxfogfq.onion", |
| 25 | + | "darksearchenginer": "http://7pwy57iklvt6lyhe.onion", |
| 26 | + | "phobos": "http://phobosxilamwcg75xt22id7aywkzol6q6rfl2flipcqoc4e4ahima5id.onion", |
| 27 | + | "onionsearchserver": "http://oss7wrm7xvoub77o.onion", |
| 28 | + | "torgle": "http://submarhglcl66nz6.onion",
| 29 | + | "torgle1": "http://torgle5fj664v7pf.onion", |
| 30 | + | "onionsearchengine": "http://onionf4j3fwqpeo5.onion", |
| 31 | + | "tordex": "http://tordex7iie7z2wcg.onion", |
| 32 | + | "tor66": "http://tor66sezptuu2nta.onion", |
| 33 | + | "tormax": "http://tormaxunodsbvtgo.onion", |
| 34 | + | "haystack": "http://haystakvxad7wbk5.onion", |
| 35 | + | "multivac": "http://multivacigqzqqon.onion", |
| 36 | + | "evosearch": "http://evo7no6twwwrm63c.onion", |
| 37 | + | "deeplink": "http://deeplinkdeatbml7.onion", |
| 38 | + | } |
22 | 39 | | |
23 | 40 | | desktop_agents = [ |
24 | 41 | | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', |
| skipped 12 lines |
37 | 54 | | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0' |
38 | 55 | | ] |
39 | 56 | | |
40 | | - | supported_engines = engines.ENGINES |
| 57 | + | supported_engines = ENGINES |
41 | 58 | | |
42 | 59 | | available_csv_fields = [ |
43 | 60 | | "engine", |
| skipped 78 lines |
122 | 139 | | return results |
123 | 140 | | |
124 | 141 | | |
125 | | - | def torch(searchstr): |
126 | | - | results = [] |
127 | | - | torch_url = supported_engines['torch'] + "/4a1f6b371c/search.cgi?cmd=Search!&np={}&q={}" |
128 | | - | results_per_page = 10 |
129 | | - | max_nb_page = 100 |
130 | | - | if args.limit != 0: |
131 | | - | max_nb_page = args.limit |
132 | | - | |
133 | | - | with requests.Session() as s: |
134 | | - | s.proxies = proxies |
135 | | - | s.headers = random_headers() |
136 | | - | |
137 | | - | req = s.get(torch_url.format(0, quote(searchstr))) |
138 | | - | soup = BeautifulSoup(req.text, 'html5lib') |
139 | | - | |
140 | | - | page_number = 1 |
141 | | - | for i in soup.find("table", attrs={"width": "100%"}).find_all("small"): |
142 | | - | if i.get_text() is not None and "of" in i.get_text(): |
143 | | - | page_number = math.ceil(float(clear(i.get_text().split("-")[1].split("of")[1])) / results_per_page) |
144 | | - | if page_number > max_nb_page: |
145 | | - | page_number = max_nb_page |
146 | | - | |
147 | | - | pos = get_proc_pos() |
148 | | - | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("TORCH", pos), position=pos) as progress_bar: |
149 | | - | |
150 | | - | results = link_finder("torch", soup) |
151 | | - | progress_bar.update() |
152 | | - | |
153 | | - | # Usually range is 2 to n+1, but TORCH behaves differently |
154 | | - | for n in range(1, page_number): |
155 | | - | req = s.get(torch_url.format(n, quote(searchstr))) |
156 | | - | soup = BeautifulSoup(req.text, 'html5lib') |
157 | | - | results = results + link_finder("torch", soup) |
158 | | - | progress_bar.update() |
159 | | - | |
160 | | - | return results |
161 | | - | |
162 | | - | |
163 | | - | def torch1(searchstr): |
164 | | - | results = [] |
165 | | - | torch1_url = supported_engines['torch1'] + "/search?q={}&cmd=Search!" |
166 | | - | |
167 | | - | pos = get_proc_pos() |
168 | | - | with tqdm(total=1, initial=0, desc=get_tqdm_desc("TORCH 1", pos), position=pos) as progress_bar: |
169 | | - | response = requests.get(torch1_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
170 | | - | soup = BeautifulSoup(response.text, 'html5lib') |
171 | | - | results = link_finder("torch1", soup) |
172 | | - | progress_bar.update() |
173 | | - | |
174 | | - | return results |
175 | 142 | | |
176 | 143 | | |
177 | 144 | | def darksearchio(searchstr): |
| skipped 120 lines |
298 | 265 | | return results |
299 | 266 | | |
300 | 267 | | |
301 | | - | def visitor(searchstr): |
302 | | - | results = [] |
303 | | - | visitor_url = supported_engines['visitor'] + "/search/?q={}&page={}" |
304 | | - | max_nb_page = 30 |
305 | | - | if args.limit != 0: |
306 | | - | max_nb_page = args.limit |
307 | | - | |
308 | | - | pos = get_proc_pos() |
309 | | - | with tqdm(total=max_nb_page, initial=0, desc=get_tqdm_desc("VisiTOR", pos), position=pos) as progress_bar: |
310 | | - | continue_processing = True |
311 | | - | page_to_request = 1 |
312 | | - | |
313 | | - | with requests.Session() as s: |
314 | | - | s.proxies = proxies |
315 | | - | s.headers = random_headers() |
316 | | - | |
317 | | - | while continue_processing: |
318 | | - | resp = s.get(visitor_url.format(quote(searchstr), page_to_request)) |
319 | | - | soup = BeautifulSoup(resp.text, 'html5lib') |
320 | | - | results = results + link_finder("visitor", soup) |
321 | | - | progress_bar.update() |
322 | | - | |
323 | | - | next_page = soup.find('a', text="Next »") |
324 | | - | if next_page is None or page_to_request >= max_nb_page: |
325 | | - | continue_processing = False |
326 | | - | |
327 | | - | page_to_request += 1 |
328 | | - | |
329 | | - | return results |
330 | 268 | | |
331 | 269 | | |
332 | 270 | | def darksearchenginer(searchstr): |
| skipped 118 lines |
451 | 389 | | return results |
452 | 390 | | |
453 | 391 | | |
454 | | - | def grams(searchstr): |
455 | | - | results = [] |
456 | | - | # No multi pages handling as it is very hard to get many results on this engine |
457 | | - | grams_url1 = supported_engines['grams'] |
458 | | - | grams_url2 = supported_engines['grams'] + "/results" |
459 | | - | |
460 | | - | with requests.Session() as s: |
461 | | - | s.proxies = proxies |
462 | | - | s.headers = random_headers() |
463 | | - | |
464 | | - | resp = s.get(grams_url1) |
465 | | - | soup = BeautifulSoup(resp.text, 'html5lib') |
466 | | - | token = soup.find('input', attrs={'name': '_token'})['value'] |
467 | | - | |
468 | | - | pos = get_proc_pos() |
469 | | - | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Grams", pos), position=pos) as progress_bar: |
470 | | - | resp = s.post(grams_url2, data={"req": searchstr, "_token": token}) |
471 | | - | soup = BeautifulSoup(resp.text, 'html5lib') |
472 | | - | results = link_finder("grams", soup) |
473 | | - | progress_bar.update() |
474 | | - | |
475 | | - | return results |
476 | | - | |
477 | | - | |
478 | | - | def candle(searchstr): |
479 | | - | results = [] |
480 | | - | candle_url = supported_engines['candle'] + "/?q={}" |
481 | | - | |
482 | | - | pos = get_proc_pos() |
483 | | - | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Candle", pos), position=pos) as progress_bar: |
484 | | - | response = requests.get(candle_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
485 | | - | soup = BeautifulSoup(response.text, 'html5lib') |
486 | | - | results = link_finder("candle", soup) |
487 | | - | progress_bar.update() |
488 | | - | |
489 | | - | return results |
490 | | - | |
491 | | - | |
492 | | - | def torsearchengine(searchstr): |
493 | | - | results = [] |
494 | | - | torsearchengine_url = supported_engines['torsearchengine'] + "/search/move/?q={}&pn={}&num=10&sdh=&" |
495 | | - | max_nb_page = 100 |
496 | | - | if args.limit != 0: |
497 | | - | max_nb_page = args.limit |
498 | | - | |
499 | | - | with requests.Session() as s: |
500 | | - | s.proxies = proxies |
501 | | - | s.headers = random_headers() |
502 | | - | |
503 | | - | resp = s.get(torsearchengine_url.format(quote(searchstr), 1)) |
504 | | - | soup = BeautifulSoup(resp.text, 'html5lib') |
505 | | - | |
506 | | - | page_number = 1 |
507 | | - | for i in soup.find_all('div', attrs={"id": "subheader"}): |
508 | | - | if i.get_text() is not None and "of" in i.get_text(): |
509 | | - | total_results = int(i.find('p').find_all('b')[2].get_text().replace(",", "")) |
510 | | - | results_per_page = 10 |
511 | | - | page_number = math.ceil(total_results / results_per_page) |
512 | | - | if page_number > max_nb_page: |
513 | | - | page_number = max_nb_page |
514 | | - | |
515 | | - | pos = get_proc_pos() |
516 | | - | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Tor Search Engine", pos), position=pos) \ |
517 | | - | as progress_bar: |
518 | | - | |
519 | | - | results = link_finder("torsearchengine", soup) |
520 | | - | progress_bar.update() |
521 | | - | |
522 | | - | for n in range(2, page_number + 1): |
523 | | - | resp = s.get(torsearchengine_url.format(quote(searchstr), n)) |
524 | | - | soup = BeautifulSoup(resp.text, 'html5lib') |
525 | | - | results = results + link_finder("torsearchengine", soup) |
526 | | - | progress_bar.update() |
527 | | - | |
528 | | - | return results |
529 | | - | |
530 | 392 | | |
531 | 393 | | def torgle(searchstr): |
532 | 394 | | results = [] |
| skipped 127 lines |
660 | 522 | | |
661 | 523 | | def tormax(searchstr): |
662 | 524 | | results = [] |
663 | | - | tormax_url = supported_engines['tormax'] + "/tormax/search?q={}" |
| 525 | + | tormax_url = supported_engines['tormax'] + "/search?q={}" |
664 | 526 | | |
665 | 527 | | pos = get_proc_pos() |
666 | 528 | | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Tormax", pos), position=pos) as progress_bar: |
| skipped 124 lines |
791 | 653 | | return results |
792 | 654 | | |
793 | 655 | | |
794 | | - | def oneirun(searchstr): |
795 | | - | results = [] |
796 | | - | oneirun_url = supported_engines['oneirun'] + "/Home/IndexEn" |
797 | | - | |
798 | | - | with requests.Session() as s: |
799 | | - | s.proxies = proxies |
800 | | - | s.headers = random_headers() |
801 | | - | |
802 | | - | resp = s.get(oneirun_url) |
803 | | - | soup = BeautifulSoup(resp.text, 'html5lib') |
804 | | - | token = soup.find('input', attrs={"name": "__RequestVerificationToken"})['value'] |
805 | | - | |
806 | | - | pos = get_proc_pos() |
807 | | - | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Oneirun", pos), position=pos) as progress_bar: |
808 | | - | response = s.post(oneirun_url.format(quote(searchstr)), data={ |
809 | | - | "searchString": searchstr, |
810 | | - | "__RequestVerificationToken": token |
811 | | - | }) |
812 | | - | soup = BeautifulSoup(response.text, 'html5lib') |
813 | | - | results = link_finder("oneirun", soup) |
814 | | - | progress_bar.update() |
815 | | - | |
816 | | - | return results |
817 | | - | |
818 | 656 | | |
819 | 657 | | def deeplink(searchstr): |
820 | 658 | | results = [] |
| skipped 15 lines |
836 | 674 | | return results |
837 | 675 | | |
838 | 676 | | |
839 | | - | def torsearchengine1(searchstr): |
840 | | - | results = [] |
841 | | - | torsearchengine1_url1 = supported_engines['torsearchengine1'] |
842 | | - | torsearchengine1_url2 = supported_engines['torsearchengine1'] + "/index.php" |
843 | | - | |
844 | | - | with requests.Session() as s: |
845 | | - | s.proxies = proxies |
846 | | - | s.headers = random_headers() |
847 | | - | s.get(torsearchengine1_url1) |
848 | | - | |
849 | | - | pos = get_proc_pos() |
850 | | - | with tqdm(total=1, initial=0, desc=get_tqdm_desc("TOR Search Engine 1", pos), position=pos) as progress_bar: |
851 | | - | response = s.post(torsearchengine1_url2, {'search': searchstr, 'search2': ''}) |
852 | | - | soup = BeautifulSoup(response.text, 'html5lib') |
853 | | - | results = link_finder("torsearchengine1", soup) |
854 | | - | progress_bar.update() |
855 | | - | |
856 | | - | return results |
857 | | - | |
858 | 677 | | |
859 | 678 | | def torgle1(searchstr): |
860 | 679 | | results = [] |
| skipped 35 lines |
896 | 715 | | return results |
897 | 716 | | |
898 | 717 | | |
899 | | - | def grams1(searchstr): |
900 | | - | results = [] |
901 | | - | grams1_url = supported_engines['grams1'] + "/results/index.php?page={}&searchstr={}" |
902 | | - | results_per_page = 25 |
903 | | - | max_nb_page = 30 |
904 | | - | if args.limit != 0: |
905 | | - | max_nb_page = args.limit |
906 | | - | |
907 | | - | with requests.Session() as s: |
908 | | - | s.proxies = proxies |
909 | | - | s.headers = random_headers() |
910 | | - | |
911 | | - | resp = s.get(grams1_url.format(1, quote(searchstr))) |
912 | | - | soup = BeautifulSoup(resp.text, 'html5lib') |
913 | | - | |
914 | | - | page_number = 1 |
915 | | - | pages = soup.find_all('div', attrs={"class": "result-text"}) |
916 | | - | if pages is not None: |
917 | | - | res_re = re.match(r"About ([0-9]+) result(.*)", clear(pages[0].get_text())) |
918 | | - | total_results = int(res_re.group(1)) |
919 | | - | page_number = math.ceil(total_results / results_per_page) |
920 | | - | if page_number > max_nb_page: |
921 | | - | page_number = max_nb_page |
922 | | - | |
923 | | - | pos = get_proc_pos() |
924 | | - | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Grams 1", pos), position=pos) as progress_bar: |
925 | | - | results = link_finder("grams1", soup) |
926 | | - | progress_bar.update() |
927 | | - | |
928 | | - | for n in range(2, page_number + 1): |
929 | | - | resp = s.get(grams1_url.format(n, quote(searchstr))) |
930 | | - | soup = BeautifulSoup(resp.text, 'html5lib') |
931 | | - | results = results + link_finder("grams1", soup) |
932 | | - | progress_bar.update() |
933 | | - | |
934 | | - | return results |
935 | 718 | | |
936 | 719 | | |
937 | 720 | | def get_domain_from_url(link): |
| skipped 47 lines |
985 | 768 | | link = r.find('a')['href'].split('redirect_url=')[1] |
986 | 769 | | add_link() |
987 | 770 | | |
988 | | - | if engine_str == "candle": |
989 | | - | for r in data_obj.select("body h2 a"): |
990 | | - | if str(r['href']).startswith("http"): |
991 | | - | name = clear(r.get_text()) |
992 | | - | link = clear(r['href']) |
993 | | - | add_link() |
994 | | - | |
995 | 771 | | if engine_str == "darksearchenginer": |
996 | 772 | | for r in data_obj.select('.table-responsive a'): |
997 | 773 | | name = clear(r.get_text()) |
| skipped 20 lines |
1018 | 794 | | link = get_parameter(r['href'], 'url') |
1019 | 795 | | add_link() |
1020 | 796 | | |
1021 | | - | if engine_str == "grams": |
1022 | | - | for i in data_obj.find_all("div", attrs={"class": "media-body"}): |
1023 | | - | if not i.find('span'): |
1024 | | - | for r in i.select(".searchlinks a"): |
1025 | | - | name = clear(r.get_text()) |
1026 | | - | link = clear(r['href']) |
1027 | | - | add_link() |
1028 | 797 | | |
1029 | | - | if engine_str == "grams1": |
1030 | | - | for r in data_obj.select(".searchlinks a"): |
1031 | | - | name = clear(r.get_text()) |
1032 | | - | link = clear(r['href']) |
1033 | | - | add_link() |
1034 | 798 | | |
1035 | 799 | | if engine_str == "haystack": |
1036 | 800 | | for r in data_obj.select(".result b a"): |
| skipped 11 lines |
1048 | 812 | | break |
1049 | 813 | | |
1050 | 814 | | if engine_str == "notevil": |
1051 | | - | for r in data_obj.select('#content > div > p > a:not([target])'): |
| 815 | + | for r in data_obj.find_all("p"): |
| 816 | + | r = r.find("a")
1052 | 817 | | name = clear(r.get_text()) |
1053 | | - | link = get_parameter(r['href'], 'url') |
| 818 | + | link = unquote(r["href"]).split('./r2d.php?url=')[1].split('&')[0] |
1054 | 819 | | add_link() |
1055 | 820 | | |
1056 | | - | if engine_str == "oneirun": |
1057 | | - | for td in data_obj.find_all('td', attrs={"style": "vertical-align: top;"}): |
1058 | | - | name = clear(td.find('h5').get_text()) |
1059 | | - | link = clear(td.find('a')['href']) |
1060 | | - | add_link() |
1061 | 821 | | |
1062 | 822 | | if engine_str == "onionland": |
1063 | 823 | | for r in data_obj.select('.result-block .title a'): |
| skipped 27 lines |
1091 | 851 | | link = clear(i.find('a')['href']) |
1092 | 852 | | add_link() |
1093 | 853 | | |
1094 | | - | if engine_str == "torch": |
1095 | | - | for r in data_obj.select("dl > dt > a"): |
1096 | | - | name = clear(r.get_text()) |
1097 | | - | link = clear(r['href']) |
1098 | | - | add_link() |
1099 | | - | |
1100 | | - | if engine_str == "torch1": |
1101 | | - | for r in data_obj.select("dl > dt > a"): |
1102 | | - | name = clear(r.get_text()) |
1103 | | - | link = clear(r['href']) |
1104 | | - | add_link() |
1105 | 854 | | |
1106 | 855 | | if engine_str == "tordex": |
1107 | 856 | | for r in data_obj.select('.container h5 a'): |
| skipped 17 lines |
1125 | 874 | | add_link() |
1126 | 875 | | |
1127 | 876 | | if engine_str == "tormax": |
1128 | | - | for r in data_obj.select("#search-results article a.title"): |
1129 | | - | name = clear(r.get_text()) |
1130 | | - | link = clear(r.find_next_sibling('div', {'class': 'url'}).get_text()) |
| 877 | + | for r in data_obj.find_all("section",attrs={"id":"search-results"})[0].find_all("article"): |
| 878 | + | name = clear(r.find('a',attrs={"class":"title"}).get_text()) |
| 879 | + | link = clear(r.find('div',attrs={"class":"url"}).get_text()) |
1131 | 880 | | add_link() |
1132 | 881 | | |
1133 | | - | if engine_str == "torsearchengine": |
1134 | | - | for i in data_obj.find_all('h3', attrs={'class': 'title text-truncate'}): |
1135 | | - | name = clear(i.find('a').get_text()) |
1136 | | - | link = i.find('a')['data-uri'] |
1137 | | - | add_link() |
1138 | 882 | | |
1139 | | - | if engine_str == "torsearchengine1": |
1140 | | - | for r in data_obj.find_all('span', {'style': 'font-size:1.2em;font-weight:bold;color:#1a0dab'}): |
1141 | | - | name = clear(r.get_text()) |
1142 | | - | link = r.find_next_sibling('a')['href'] |
1143 | | - | add_link() |
1144 | | - | |
1145 | | - | if engine_str == "visitor": |
1146 | | - | for r in data_obj.select(".hs_site h3 a"): |
1147 | | - | name = clear(r.get_text()) |
1148 | | - | link = clear(r['href']) |
1149 | | - | add_link() |
1150 | 883 | | |
1151 | 884 | | if args.continuous_write and not csv_file.closed: |
1152 | 885 | | csv_file.close() |
| skipped 7 lines |
1160 | 893 | | ret = [] |
1161 | 894 | | try: |
1162 | 895 | | ret = globals()[method_name](argument) |
1163 | | - | except ConnectionError: |
1164 | | - | print("Error: unable to connect") |
1165 | | - | except OSError: |
1166 | | - | print("Error: unable to connect") |
1167 | | - | except ProtocolError: |
| 896 | + | except Exception:
1168 | 897 | | print("Error: unable to connect") |
1169 | 898 | | return ret |
1170 | 899 | | |
| skipped 59 lines |
1230 | 959 | | total += n |
1231 | 960 | | print(" Total: {} links written to {}".format(str(total), filename)) |
1232 | 961 | | |
1233 | | - | |
1234 | | - | if __name__ == "__main__": |
1235 | | - | scrape() |
1236 | | - | |