| skipped 4 lines |
5 | 5 | | import time |
6 | 6 | | from datetime import datetime |
7 | 7 | | from random import choice |
| 8 | + | from tqdm import tqdm |
8 | 9 | | |
9 | 10 | | import requests |
| 11 | + | import urllib.parse as urlparse |
| 12 | + | from urllib.parse import parse_qs |
| 13 | + | from urllib.parse import quote |
| 14 | + | from urllib.parse import unquote |
10 | 15 | | from bs4 import BeautifulSoup |
11 | | - | from tqdm import tqdm |
| 16 | + | |
| 17 | + | import engines |
12 | 18 | | |
13 | 19 | | desktop_agents = [ |
14 | 20 | | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', |
| skipped 12 lines |
27 | 33 | | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0' |
28 | 34 | | ] |
29 | 35 | | |
30 | | - | supported_engines = [ |
31 | | - | "ahmia", |
32 | | - | "torch", |
33 | | - | "darksearchio", |
34 | | - | "onionland", |
35 | | - | "notevil", |
36 | | - | "visitor", |
37 | | - | "darksearchenginer", |
38 | | - | "phobos", |
39 | | - | "onionsearchserver", |
40 | | - | "grams", |
41 | | - | "candle", |
42 | | - | "torsearchengine", |
43 | | - | "torgle", |
44 | | - | "onionsearchengine", |
45 | | - | "tordex", |
46 | | - | "tor66", |
47 | | - | "tormax", |
48 | | - | "haystack", |
49 | | - | "multivac", |
50 | | - | "evosearch", |
51 | | - | "oneirun", |
52 | | - | "deeplink", |
53 | | - | ] |
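|  | + | # engines.ENGINES maps each engine name to its onion base URL (engine list moved to engines.py) |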
| 36 | + | supported_engines = engines.ENGINES |
54 | 37 | | |
55 | 38 | | available_csv_fields = [ |
56 | 39 | | "engine", |
57 | 40 | | "name", |
58 | 41 | | "link", |
59 | 42 | | "domain" |
60 | | - | # Todo: add description, but needs modify scraping (link_finder func) for all the engines |
61 | 43 | | ] |
62 | 44 | | |
63 | 45 | | |
| skipped 3 lines |
67 | 49 | | epilog += " {}".format(f) |
68 | 50 | | epilog += "\n" |
69 | 51 | | epilog += "Supported engines: \n\t" |
70 | | - | for e in supported_engines: |
| 52 | + | for e in supported_engines.keys(): |
71 | 53 | | epilog += " {}".format(e) |
72 | 54 | | return epilog |
73 | 55 | | |
| skipped 15 lines |
89 | 71 | | nargs="*") |
90 | 72 | | parser.add_argument("--field_delimiter", type=str, default=",", help='Delimiter for the CSV fields') |
91 | 73 | | |
| 74 | + | |
92 | 75 | | args = parser.parse_args() |
93 | 76 | | proxies = {'http': 'socks5h://{}'.format(args.proxy), 'https': 'socks5h://{}'.format(args.proxy)} |
94 | 77 | | tqdm_bar_format = "{desc}: {percentage:3.0f}% |{bar}| {n_fmt:3s} / {total_fmt:3s} [{elapsed:5s} < {remaining:5s}]" |
| skipped 14 lines |
109 | 92 | | return str |
110 | 93 | | |
111 | 94 | | |
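|  | + | # Extract the value of a single query-string parameter from a URL. Hypothetical example: |
|  | + | # get_parameter("http://example.onion/r.php?url=http%3A%2F%2Ftarget.onion", "url") == "http://target.onion" |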
| 95 | + | def get_parameter(url, parameter_name): |
| 96 | + | parsed = urlparse.urlparse(url) |
| 97 | + | return parse_qs(parsed.query)[parameter_name][0] |
| 98 | + | |
| 99 | + | |
112 | 100 | | def ahmia(searchstr): |
113 | | - | ahmia_url = "http://msydqstlz2kzerdg.onion/search/?q={}" |
| 101 | + | ahmia_url = supported_engines['ahmia'] + "/search/?q={}" |
114 | 102 | | |
115 | 103 | | with tqdm(total=1, initial=0, desc="%20s" % "Ahmia", unit="req", ascii=False, ncols=120, |
116 | 104 | | bar_format=tqdm_bar_format) as progress_bar: |
117 | | - | response = requests.get(ahmia_url.format(searchstr), proxies=proxies, headers=random_headers()) |
118 | | - | soup = BeautifulSoup(response.text, 'html.parser') |
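|  | + | # quote() percent-encodes the search string; html5lib tolerates the malformed HTML these engines return better than html.parser |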
| 105 | + | response = requests.get(ahmia_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
| 106 | + | soup = BeautifulSoup(response.text, 'html5lib') |
119 | 107 | | link_finder("ahmia", soup) |
120 | 108 | | progress_bar.update() |
121 | 109 | | progress_bar.close() |
122 | 110 | | |
123 | 111 | | |
124 | 112 | | def torch(searchstr): |
125 | | - | torch_url = "http://xmh57jrzrnw6insl.onion/4a1f6b371c/search.cgi?cmd=Search!&np={}&q={}" |
| 113 | + | torch_url = supported_engines['torch'] + "/4a1f6b371c/search.cgi?cmd=Search!&np={}&q={}" |
126 | 114 | | results_per_page = 10 |
127 | 115 | | max_nb_page = 100 |
128 | 116 | | if args.limit != 0: |
| skipped 3 lines |
132 | 120 | | s.proxies = proxies |
133 | 121 | | s.headers = random_headers() |
134 | 122 | | |
135 | | - | req = s.get(torch_url.format(0, searchstr)) |
136 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 123 | + | req = s.get(torch_url.format(0, quote(searchstr))) |
| 124 | + | soup = BeautifulSoup(req.text, 'html5lib') |
137 | 125 | | |
138 | 126 | | page_number = 1 |
139 | 127 | | for i in soup.find("table", attrs={"width": "100%"}).find_all("small"): |
| skipped 10 lines |
150 | 138 | | |
151 | 139 |  | # Page ranges usually run from 2 to n+1, but TORCH numbers its pages from 1 |
152 | 140 | | for n in range(1, page_number): |
153 | | - | req = s.get(torch_url.format(n, searchstr)) |
154 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 141 | + | req = s.get(torch_url.format(n, quote(searchstr))) |
| 142 | + | soup = BeautifulSoup(req.text, 'html5lib') |
155 | 143 | | link_finder("torch", soup) |
156 | 144 | | progress_bar.update() |
157 | 145 | | |
| skipped 3 lines |
161 | 149 | | def darksearchio(searchstr): |
162 | 150 | | global result |
163 | 151 | | result['darksearchio'] = [] |
164 | | - | darksearchio_url = "http://darksearch.io/api/search?query={}&page={}" |
| 152 | + | darksearchio_url = supported_engines['darksearchio'] + "/api/search?query={}&page={}" |
165 | 153 | | max_nb_page = 30 |
166 | 154 | | if args.limit != 0: |
167 | 155 | | max_nb_page = args.limit |
| skipped 1 lines |
169 | 157 | | with requests.Session() as s: |
170 | 158 | | s.proxies = proxies |
171 | 159 | | s.headers = random_headers() |
172 | | - | resp = s.get(darksearchio_url.format(searchstr, 1)) |
| 160 | + | resp = s.get(darksearchio_url.format(quote(searchstr), 1)) |
173 | 161 | | |
174 | 162 | | page_number = 1 |
175 | 163 | | if resp.status_code == 200: |
| skipped 12 lines |
188 | 176 | | progress_bar.update() |
189 | 177 | | |
190 | 178 | | for n in range(2, page_number + 1): |
191 | | - | resp = s.get(darksearchio_url.format(searchstr, n)) |
| 179 | + | resp = s.get(darksearchio_url.format(quote(searchstr), n)) |
192 | 180 | | if resp.status_code == 200: |
193 | 181 | | resp = resp.json() |
194 | 182 | | link_finder("darksearchio", resp['data']) |
| skipped 6 lines |
201 | 189 | | |
202 | 190 | | |
203 | 191 | | def onionland(searchstr): |
204 | | - | onionlandv3_url = "http://3bbad7fauom4d6sgppalyqddsqbf5u5p56b5k5uk2zxsy3d6ey2jobad.onion/search?q={}&page={}" |
| 192 | + | onionlandv3_url = supported_engines['onionland'] + "/search?q={}&page={}" |
205 | 193 | | max_nb_page = 100 |
206 | 194 | | if args.limit != 0: |
207 | 195 | | max_nb_page = args.limit |
| skipped 2 lines |
210 | 198 | | s.proxies = proxies |
211 | 199 | | s.headers = random_headers() |
212 | 200 | | |
213 | | - | resp = s.get(onionlandv3_url.format(searchstr, 1)) |
214 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 201 | + | resp = s.get(onionlandv3_url.format(quote(searchstr), 1)) |
| 202 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
215 | 203 | | |
216 | 204 | | page_number = 1 |
217 | 205 | | for i in soup.find_all('div', attrs={"class": "search-status"}): |
| skipped 17 lines |
235 | 223 | | progress_bar.update() |
236 | 224 | | |
237 | 225 | | for n in range(2, page_number + 1): |
238 | | - | resp = s.get(onionlandv3_url.format(searchstr, n)) |
239 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 226 | + | resp = s.get(onionlandv3_url.format(quote(searchstr), n)) |
| 227 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
240 | 228 | | ret = link_finder("onionland", soup) |
241 | 229 | | if ret < 0: |
242 | 230 | | break |
| skipped 3 lines |
246 | 234 | | |
247 | 235 | | |
248 | 236 | | def notevil(searchstr): |
249 | | - | notevil_url1 = "http://hss3uro2hsxfogfq.onion/index.php?q={}" |
250 | | - | notevil_url2 = "http://hss3uro2hsxfogfq.onion/index.php?q={}&hostLimit=20&start={}&numRows={}&template=0" |
| 237 | + | notevil_url1 = supported_engines['notevil'] + "/index.php?q={}" |
| 238 | + | notevil_url2 = supported_engines['notevil'] + "/index.php?q={}&hostLimit=20&start={}&numRows={}&template=0" |
251 | 239 | | max_nb_page = 20 |
252 | 240 | | if args.limit != 0: |
253 | 241 | | max_nb_page = args.limit |
254 | 242 | | |
255 | 243 |  | # Do not use requests.Session() here (in our experience it returns fewer results) |
256 | | - | req = requests.get(notevil_url1.format(searchstr), proxies=proxies, headers=random_headers()) |
257 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 244 | + | req = requests.get(notevil_url1.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
| 245 | + | soup = BeautifulSoup(req.text, 'html5lib') |
258 | 246 | | |
259 | 247 | | page_number = 1 |
260 | 248 | | last_div = soup.find("div", attrs={"style": "text-align:center"}).find("div", attrs={"style": "text-align:center"}) |
| skipped 12 lines |
273 | 261 | | |
274 | 262 | | for n in range(2, page_number + 1): |
275 | 263 | | start = (int(n - 1) * num_rows) |
276 | | - | req = requests.get(notevil_url2.format(searchstr, start, num_rows), |
| 264 | + | req = requests.get(notevil_url2.format(quote(searchstr), start, num_rows), |
277 | 265 | | proxies=proxies, |
278 | 266 | | headers=random_headers()) |
279 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 267 | + | soup = BeautifulSoup(req.text, 'html5lib') |
280 | 268 | | link_finder("notevil", soup) |
281 | 269 | | progress_bar.update() |
282 | 270 | | time.sleep(1) |
| skipped 2 lines |
285 | 273 | | |
286 | 274 | | |
287 | 275 | | def visitor(searchstr): |
288 | | - | visitor_url = "http://visitorfi5kl7q7i.onion/search/?q={}&page={}" |
| 276 | + | visitor_url = supported_engines['visitor'] + "/search/?q={}&page={}" |
289 | 277 | | max_nb_page = 30 |
290 | 278 | | if args.limit != 0: |
291 | 279 | | max_nb_page = args.limit |
| skipped 12 lines |
304 | 292 | | s.headers = random_headers() |
305 | 293 | | |
306 | 294 | | while continue_processing: |
307 | | - | resp = s.get(visitor_url.format(searchstr, page_to_request)) |
308 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 295 | + | resp = s.get(visitor_url.format(quote(searchstr), page_to_request)) |
| 296 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
309 | 297 | | link_finder("visitor", soup) |
310 | 298 | | progress_bar.update() |
311 | 299 | | |
| skipped 7 lines |
319 | 307 | | |
320 | 308 | | |
321 | 309 | | def darksearchenginer(searchstr): |
322 | | - | darksearchenginer_url = "http://7pwy57iklvt6lyhe.onion/" |
| 310 | + | darksearchenginer_url = supported_engines['darksearchenginer'] |
323 | 311 | | max_nb_page = 20 |
324 | 312 | | if args.limit != 0: |
325 | 313 | | max_nb_page = args.limit |
| skipped 5 lines |
331 | 319 | | |
332 | 320 |  | # Note that this search engine is very likely to time out |
333 | 321 | | resp = s.post(darksearchenginer_url, data={"search[keyword]": searchstr, "page": page_number}) |
334 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 322 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
335 | 323 | | |
336 | 324 | | pages_input = soup.find_all("input", attrs={"name": "page"}) |
337 | 325 | | for i in pages_input: |
| skipped 9 lines |
347 | 335 | | |
348 | 336 | | for n in range(2, page_number + 1): |
349 | 337 | | resp = s.post(darksearchenginer_url, data={"search[keyword]": searchstr, "page": str(n)}) |
350 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 338 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
351 | 339 | | link_finder("darksearchenginer", soup) |
352 | 340 | | progress_bar.update() |
353 | 341 | | |
| skipped 1 lines |
355 | 343 | | |
356 | 344 | | |
357 | 345 | | def phobos(searchstr): |
358 | | - | phobos_url = "http://phobosxilamwcg75xt22id7aywkzol6q6rfl2flipcqoc4e4ahima5id.onion/search?query={}&p={}" |
| 346 | + | phobos_url = supported_engines['phobos'] + "/search?query={}&p={}" |
359 | 347 | | max_nb_page = 100 |
360 | 348 | | if args.limit != 0: |
361 | 349 | | max_nb_page = args.limit |
| skipped 2 lines |
364 | 352 | | s.proxies = proxies |
365 | 353 | | s.headers = random_headers() |
366 | 354 | | |
367 | | - | resp = s.get(phobos_url.format(searchstr, 1), proxies=proxies, headers=random_headers()) |
368 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 355 | + | resp = s.get(phobos_url.format(quote(searchstr), 1), proxies=proxies, headers=random_headers()) |
| 356 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
369 | 357 | | |
370 | 358 | | page_number = 1 |
371 | 359 | | pages = soup.find("div", attrs={"class": "pages"}).find_all('a') |
| skipped 10 lines |
382 | 370 | | progress_bar.update() |
383 | 371 | | |
384 | 372 | | for n in range(2, page_number + 1): |
385 | | - | resp = s.get(phobos_url.format(searchstr, n), proxies=proxies, headers=random_headers()) |
386 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 373 | + | resp = s.get(phobos_url.format(quote(searchstr), n), proxies=proxies, headers=random_headers()) |
| 374 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
387 | 375 | | link_finder("phobos", soup) |
388 | 376 | | progress_bar.update() |
389 | 377 | | |
| skipped 1 lines |
391 | 379 | | |
392 | 380 | | |
393 | 381 | | def onionsearchserver(searchstr): |
394 | | - | onionsearchserver_url1 = "http://oss7wrm7xvoub77o.onion/oss/" |
| 382 | + | onionsearchserver_url1 = supported_engines['onionsearchserver'] + "/oss/" |
395 | 383 | | onionsearchserver_url2 = None |
396 | 384 | | results_per_page = 10 |
397 | 385 | | max_nb_page = 100 |
| skipped 5 lines |
403 | 391 | | s.headers = random_headers() |
404 | 392 | | |
405 | 393 | | resp = s.get(onionsearchserver_url1) |
406 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 394 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
407 | 395 | | for i in soup.find_all('iframe', attrs={"style": "display:none;"}): |
408 | 396 | | onionsearchserver_url2 = i['src'] + "{}&page={}" |
409 | 397 | | |
410 | 398 | | if onionsearchserver_url2 is None: |
411 | 399 | | return -1 |
412 | 400 | | |
413 | | - | resp = s.get(onionsearchserver_url2.format(searchstr, 1)) |
414 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 401 | + | resp = s.get(onionsearchserver_url2.format(quote(searchstr), 1)) |
| 402 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
415 | 403 | | |
416 | 404 | | page_number = 1 |
417 | 405 | | pages = soup.find_all("div", attrs={"class": "osscmnrdr ossnumfound"}) |
| skipped 10 lines |
428 | 416 | | progress_bar.update() |
429 | 417 | | |
430 | 418 | | for n in range(2, page_number + 1): |
431 | | - | resp = s.get(onionsearchserver_url2.format(searchstr, n)) |
432 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 419 | + | resp = s.get(onionsearchserver_url2.format(quote(searchstr), n)) |
| 420 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
433 | 421 | | link_finder("onionsearchserver", soup) |
434 | 422 | | progress_bar.update() |
435 | 423 | | |
| skipped 2 lines |
438 | 426 | | |
439 | 427 | | def grams(searchstr): |
440 | 428 |  | # No multi-page handling, as it is very hard to get many results from this engine |
441 | | - | grams_url1 = "http://grams7enqfy4nieo.onion/" |
442 | | - | grams_url2 = "http://grams7enqfy4nieo.onion/results" |
| 429 | + | grams_url1 = supported_engines['grams'] |
| 430 | + | grams_url2 = supported_engines['grams'] + "/results" |
443 | 431 | | |
444 | 432 | | with requests.Session() as s: |
445 | 433 | | s.proxies = proxies |
446 | 434 | | s.headers = random_headers() |
447 | 435 | | |
448 | 436 | | resp = s.get(grams_url1) |
449 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 437 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
450 | 438 | | token = soup.find('input', attrs={'name': '_token'})['value'] |
451 | 439 | | |
452 | 440 | | with tqdm(total=1, initial=0, desc="%20s" % "Grams", unit="req", ascii=False, ncols=120, |
453 | 441 | | bar_format=tqdm_bar_format) as progress_bar: |
454 | 442 | | resp = s.post(grams_url2, data={"req": searchstr, "_token": token}) |
455 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 443 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
456 | 444 | | link_finder("grams", soup) |
457 | 445 | | progress_bar.update() |
458 | 446 | | progress_bar.close() |
459 | 447 | | |
460 | 448 | | |
461 | 449 | | def candle(searchstr): |
462 | | - | candle_url = "http://gjobjn7ievumcq6z.onion/?q={}" |
| 450 | + | candle_url = supported_engines['candle'] + "/?q={}" |
463 | 451 | | |
464 | 452 | | with tqdm(total=1, initial=0, desc="%20s" % "Candle", unit="req", ascii=False, ncols=120, |
465 | 453 | | bar_format=tqdm_bar_format) as progress_bar: |
466 | | - | response = requests.get(candle_url.format(searchstr), proxies=proxies, headers=random_headers()) |
467 | | - | soup = BeautifulSoup(response.text, 'html.parser') |
| 454 | + | response = requests.get(candle_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
| 455 | + | soup = BeautifulSoup(response.text, 'html5lib') |
468 | 456 | | link_finder("candle", soup) |
469 | 457 | | progress_bar.update() |
470 | 458 | | progress_bar.close() |
471 | 459 | | |
472 | 460 | | |
473 | 461 | | def torsearchengine(searchstr): |
474 | | - | torsearchengine_url = "http://searchcoaupi3csb.onion/search/move/?q={}&pn={}&num=10&sdh=&" |
| 462 | + | torsearchengine_url = supported_engines['torsearchengine'] + "/search/move/?q={}&pn={}&num=10&sdh=&" |
475 | 463 | | max_nb_page = 100 |
476 | 464 | | if args.limit != 0: |
477 | 465 | | max_nb_page = args.limit |
| skipped 2 lines |
480 | 468 | | s.proxies = proxies |
481 | 469 | | s.headers = random_headers() |
482 | 470 | | |
483 | | - | resp = s.get(torsearchengine_url.format(searchstr, 1)) |
484 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 471 | + | resp = s.get(torsearchengine_url.format(quote(searchstr), 1)) |
| 472 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
485 | 473 | | |
486 | 474 | | page_number = 1 |
487 | 475 | | for i in soup.find_all('div', attrs={"id": "subheader"}): |
| skipped 11 lines |
499 | 487 | | progress_bar.update() |
500 | 488 | | |
501 | 489 | | for n in range(2, page_number + 1): |
502 | | - | resp = s.get(torsearchengine_url.format(searchstr, n)) |
503 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 490 | + | resp = s.get(torsearchengine_url.format(quote(searchstr), n)) |
| 491 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
504 | 492 | | ret = link_finder("torsearchengine", soup) |
505 | 493 | | progress_bar.update() |
506 | 494 | | |
| skipped 1 lines |
508 | 496 | | |
509 | 497 | | |
510 | 498 | | def torgle(searchstr): |
511 | | - | torgle_url = "http://torglejzid2cyoqt.onion/search.php?term={}" |
| 499 | + | torgle_url = supported_engines['torgle'] + "/search.php?term={}" |
512 | 500 | | |
513 | 501 | | with tqdm(total=1, initial=0, desc="%20s" % "Torgle", unit="req", ascii=False, ncols=120, |
514 | 502 | | bar_format=tqdm_bar_format) as progress_bar: |
515 | | - | response = requests.get(torgle_url.format(searchstr), proxies=proxies, headers=random_headers()) |
516 | | - | soup = BeautifulSoup(response.text, 'html.parser') |
| 503 | + | response = requests.get(torgle_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
| 504 | + | soup = BeautifulSoup(response.text, 'html5lib') |
517 | 505 | | link_finder("torgle", soup) |
518 | 506 | | progress_bar.update() |
519 | 507 | | progress_bar.close() |
520 | 508 | | |
521 | 509 | | |
522 | 510 | | def onionsearchengine(searchstr): |
523 | | - | onionsearchengine_url = "http://onionf4j3fwqpeo5.onion/search.php?search={}&submit=Search&page={}" |
| 511 | + | onionsearchengine_url = supported_engines['onionsearchengine'] + "/search.php?search={}&submit=Search&page={}" |
| 512 | + | # The same engine is also reachable at http://5u56fjmxu63xcmbk.onion/search.php?search={}&submit=Search&page={} |
524 | 513 | | max_nb_page = 100 |
525 | 514 | | if args.limit != 0: |
526 | 515 | | max_nb_page = args.limit |
| skipped 2 lines |
529 | 518 | | s.proxies = proxies |
530 | 519 | | s.headers = random_headers() |
531 | 520 | | |
532 | | - | resp = s.get(onionsearchengine_url.format(searchstr, 1)) |
533 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 521 | + | resp = s.get(onionsearchengine_url.format(quote(searchstr), 1)) |
| 522 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
534 | 523 | | |
535 | 524 | | page_number = 1 |
536 | 525 | | approx_re = re.search(r"\s([0-9]+)\sresult[s]?\sfound\s!.*", clear(soup.find('body').get_text())) |
| skipped 11 lines |
548 | 537 | | progress_bar.update() |
549 | 538 | | |
550 | 539 | | for n in range(2, page_number + 1): |
551 | | - | resp = s.get(onionsearchengine_url.format(searchstr, n)) |
552 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 540 | + | resp = s.get(onionsearchengine_url.format(quote(searchstr), n)) |
| 541 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
553 | 542 | | link_finder("onionsearchengine", soup) |
554 | 543 | | progress_bar.update() |
555 | 544 | | |
| skipped 1 lines |
557 | 546 | | |
558 | 547 | | |
559 | 548 | | def tordex(searchstr): |
560 | | - | tordex_url = "http://tordex7iie7z2wcg.onion/search?query={}&page={}" |
| 549 | + | tordex_url = supported_engines['tordex'] + "/search?query={}&page={}" |
561 | 550 | | max_nb_page = 100 |
562 | 551 | | if args.limit != 0: |
563 | 552 | | max_nb_page = args.limit |
| skipped 2 lines |
566 | 555 | | s.proxies = proxies |
567 | 556 | | s.headers = random_headers() |
568 | 557 | | |
569 | | - | resp = s.get(tordex_url.format(searchstr, 1)) |
570 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 558 | + | resp = s.get(tordex_url.format(quote(searchstr), 1)) |
| 559 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
571 | 560 | | |
572 | 561 | | page_number = 1 |
573 | 562 | | pages = soup.find_all("li", attrs={"class": "page-item"}) |
| skipped 11 lines |
585 | 574 | | progress_bar.update() |
586 | 575 | | |
587 | 576 | | for n in range(2, page_number + 1): |
588 | | - | resp = s.get(tordex_url.format(searchstr, n)) |
589 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 577 | + | resp = s.get(tordex_url.format(quote(searchstr), n)) |
| 578 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
590 | 579 | | link_finder("tordex", soup) |
591 | 580 | | progress_bar.update() |
592 | 581 | | |
| skipped 1 lines |
594 | 583 | | |
595 | 584 | | |
596 | 585 | | def tor66(searchstr): |
597 | | - | tor66_url = "http://tor66sezptuu2nta.onion/search?q={}&sorttype=rel&page={}" |
| 586 | + | tor66_url = supported_engines['tor66'] + "/search?q={}&sorttype=rel&page={}" |
598 | 587 | | max_nb_page = 30 |
599 | 588 | | if args.limit != 0: |
600 | 589 | | max_nb_page = args.limit |
| skipped 2 lines |
603 | 592 | | s.proxies = proxies |
604 | 593 | | s.headers = random_headers() |
605 | 594 | | |
606 | | - | resp = s.get(tor66_url.format(searchstr, 1)) |
607 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 595 | + | resp = s.get(tor66_url.format(quote(searchstr), 1)) |
| 596 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
608 | 597 | | |
609 | 598 | | page_number = 1 |
610 | 599 | | approx_re = re.search(r"\.Onion\ssites\sfound\s:\s([0-9]+)", |
| skipped 12 lines |
623 | 612 | | progress_bar.update() |
624 | 613 | | |
625 | 614 | | for n in range(2, page_number + 1): |
626 | | - | resp = s.get(tor66_url.format(searchstr, n)) |
627 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 615 | + | resp = s.get(tor66_url.format(quote(searchstr), n)) |
| 616 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
628 | 617 | | link_finder("tor66", soup) |
629 | 618 | | progress_bar.update() |
630 | 619 | | |
| skipped 1 lines |
632 | 621 | | |
633 | 622 | | |
634 | 623 | | def tormax(searchstr): |
635 | | - | tormax_url = "http://tormaxunodsbvtgo.onion/tormax/search?q={}" |
| 624 | + | tormax_url = supported_engines['tormax'] + "/tormax/search?q={}" |
636 | 625 | | |
637 | 626 | | with tqdm(total=1, initial=0, desc="%20s" % "Tormax", unit="req", ascii=False, ncols=120, |
638 | 627 | | bar_format=tqdm_bar_format) as progress_bar: |
639 | | - | response = requests.get(tormax_url.format(searchstr), proxies=proxies, headers=random_headers()) |
640 | | - | soup = BeautifulSoup(response.text, 'html.parser') |
| 628 | + | response = requests.get(tormax_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
| 629 | + | soup = BeautifulSoup(response.text, 'html5lib') |
641 | 630 | | link_finder("tormax", soup) |
642 | 631 | | progress_bar.update() |
643 | 632 | | progress_bar.close() |
644 | 633 | | |
645 | 634 | | |
646 | 635 | | def haystack(searchstr): |
647 | | - | haystack_url = "http://haystakvxad7wbk5.onion/?q={}&offset={}" |
| 636 | + | haystack_url = supported_engines['haystack'] + "/?q={}&offset={}" |
648 | 637 |  | # At the 52nd page, it times out 100% of the time |
649 | 638 | | max_nb_page = 50 |
650 | 639 | | if args.limit != 0: |
| skipped 4 lines |
655 | 644 | | s.proxies = proxies |
656 | 645 | | s.headers = random_headers() |
657 | 646 | | |
658 | | - | req = s.get(haystack_url.format(searchstr, 0)) |
659 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 647 | + | req = s.get(haystack_url.format(quote(searchstr), 0)) |
| 648 | + | soup = BeautifulSoup(req.text, 'html5lib') |
660 | 649 | | |
661 | 650 | | bar_max = None |
662 | 651 | | if args.barmode == "fixed": |
| skipped 11 lines |
674 | 663 | | it = 1 |
675 | 664 | | while continue_processing: |
676 | 665 | | offset = int(it * offset_coeff) |
677 | | - | req = s.get(haystack_url.format(searchstr, offset)) |
678 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 666 | + | req = s.get(haystack_url.format(quote(searchstr), offset)) |
| 667 | + | soup = BeautifulSoup(req.text, 'html5lib') |
679 | 668 | | ret = link_finder("haystack", soup) |
680 | 669 | | progress_bar.update() |
681 | 670 | | it += 1 |
| skipped 2 lines |
684 | 673 | | |
685 | 674 | | |
686 | 675 | | def multivac(searchstr): |
687 | | - | multivac_url = "http://multivacigqzqqon.onion/?q={}&page={}" |
| 676 | + | multivac_url = supported_engines['multivac'] + "/?q={}&page={}" |
688 | 677 | | max_nb_page = 10 |
689 | 678 | | if args.limit != 0: |
690 | 679 | | max_nb_page = args.limit |
| skipped 3 lines |
694 | 683 | | s.headers = random_headers() |
695 | 684 | | |
696 | 685 | | page_to_request = 1 |
697 | | - | req = s.get(multivac_url.format(searchstr, page_to_request)) |
698 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 686 | + | req = s.get(multivac_url.format(quote(searchstr), page_to_request)) |
| 687 | + | soup = BeautifulSoup(req.text, 'html5lib') |
699 | 688 | | |
700 | 689 | | bar_max = None |
701 | 690 | | if args.barmode == "fixed": |
| skipped 10 lines |
712 | 701 | | |
713 | 702 | | while continue_processing: |
714 | 703 | | page_to_request += 1 |
715 | | - | req = s.get(multivac_url.format(searchstr, page_to_request)) |
716 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 704 | + | req = s.get(multivac_url.format(quote(searchstr), page_to_request)) |
| 705 | + | soup = BeautifulSoup(req.text, 'html5lib') |
717 | 706 | | ret = link_finder("multivac", soup) |
718 | 707 | | progress_bar.update() |
719 | 708 | | |
| skipped 2 lines |
722 | 711 | | |
723 | 712 | | |
724 | 713 | | def evosearch(searchstr): |
725 | | - | evosearch_url = "http://evo7no6twwwrm63c.onion/evo/search.php?" \ |
| 714 | + | evosearch_url = supported_engines['evosearch'] + "/evo/search.php?" \ |
726 | 715 | | "query={}&" \ |
727 | 716 | | "start={}&" \ |
728 | 717 | | "search=1&type=and&mark=bold+text&" \ |
| skipped 7 lines |
736 | 725 | | s.proxies = proxies |
737 | 726 | | s.headers = random_headers() |
738 | 727 | | |
739 | | - | req = s.get(evosearch_url.format(searchstr, 1, results_per_page)) |
740 | | - | soup = BeautifulSoup(req.text, 'html.parser') |
| 728 | + | req = s.get(evosearch_url.format(quote(searchstr), 1, results_per_page)) |
| 729 | + | soup = BeautifulSoup(req.text, 'html5lib') |
741 | 730 | | |
742 | 731 | | page_number = 1 |
743 | 732 | | i = soup.find("p", attrs={"class": "cntr"}) |
744 | 733 | | if i is not None: |
745 | 734 | | if i.get_text() is not None and "of" in i.get_text(): |
746 | 735 | | nb_res = float(clear(str.split(i.get_text().split("-")[1].split("of")[1])[0])) |
747 | | - | # The results page loads in two times, it is hard not to lose the second part |
748 | | - | page_number = math.ceil(nb_res / (results_per_page / 2)) |
| 736 | + | page_number = math.ceil(nb_res / results_per_page) |
749 | 737 | | if page_number > max_nb_page: |
750 | 738 | | page_number = max_nb_page |
751 | 739 | | |
| skipped 4 lines |
756 | 744 | | progress_bar.update() |
757 | 745 | | |
758 | 746 | | for n in range(2, page_number + 1): |
759 | | - | resp = s.get(evosearch_url.format(searchstr, n, results_per_page)) |
760 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 747 | + | resp = s.get(evosearch_url.format(quote(searchstr), n, results_per_page)) |
| 748 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
761 | 749 | | link_finder("evosearch", soup) |
762 | 750 | | progress_bar.update() |
763 | 751 | | |
| skipped 1 lines |
765 | 753 | | |
766 | 754 | | |
767 | 755 | | def oneirun(searchstr): |
768 | | - | oneirun_url = "http://oneirunda366dmfm.onion/Home/IndexEn" |
| 756 | + | oneirun_url = supported_engines['oneirun'] + "/Home/IndexEn" |
769 | 757 | | |
770 | 758 | | with requests.Session() as s: |
771 | 759 | | s.proxies = proxies |
772 | 760 | | s.headers = random_headers() |
773 | 761 | | |
774 | 762 | | resp = s.get(oneirun_url) |
775 | | - | soup = BeautifulSoup(resp.text, 'html.parser') |
| 763 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
776 | 764 | | token = soup.find('input', attrs={"name": "__RequestVerificationToken"})['value'] |
777 | 765 | | |
778 | 766 | | with tqdm(total=1, initial=0, desc="%20s" % "Oneirun", unit="req", ascii=False, ncols=120, |
779 | 767 | | bar_format=tqdm_bar_format) as progress_bar: |
780 | | - | response = s.post(oneirun_url.format(searchstr), |
| 768 | + | response = s.post(oneirun_url, |
781 | 769 | | data={"searchString": searchstr, "__RequestVerificationToken": token}) |
782 | | - | soup = BeautifulSoup(response.text, 'html.parser') |
| 770 | + | soup = BeautifulSoup(response.text, 'html5lib') |
783 | 771 | | link_finder("oneirun", soup) |
784 | 772 | | progress_bar.update() |
785 | 773 | | progress_bar.close() |
786 | 774 | | |
787 | 775 | | |
788 | 776 | | def deeplink(searchstr): |
789 | | - | deeplink_url1 = "http://deeplinkdeatbml7.onion/index.php" |
790 | | - | deeplink_url2 = "http://deeplinkdeatbml7.onion/?search={}&type=verified" |
| 777 | + | deeplink_url1 = supported_engines['deeplink'] + "/index.php" |
| 778 | + | deeplink_url2 = supported_engines['deeplink'] + "/?search={}&type=verified" |
791 | 779 | | |
792 | 780 | | with requests.Session() as s: |
793 | 781 | | s.proxies = proxies |
| skipped 2 lines |
796 | 784 | | |
797 | 785 | | with tqdm(total=1, initial=0, desc="%20s" % "DeepLink", unit="req", ascii=False, ncols=120, |
798 | 786 | | bar_format=tqdm_bar_format) as progress_bar: |
799 | | - | response = s.get(deeplink_url2.format(searchstr)) |
800 | | - | soup = BeautifulSoup(response.text, 'html.parser') |
| 787 | + | response = s.get(deeplink_url2.format(quote(searchstr))) |
| 788 | + | soup = BeautifulSoup(response.text, 'html5lib') |
801 | 789 | | link_finder("deeplink", soup) |
802 | 790 | | progress_bar.update() |
803 | 791 | | progress_bar.close() |
804 | 792 | | |
805 | 793 | | |
| 794 | + | def torsearchengine1(searchstr): |
| 795 | + | torsearchengine1_url1 = supported_engines['torsearchengine1'] |
| 796 | + | torsearchengine1_url2 = supported_engines['torsearchengine1'] + "/index.php" |
| 797 | + | |
| 798 | + | with requests.Session() as s: |
| 799 | + | s.proxies = proxies |
| 800 | + | s.headers = random_headers() |
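|  | + | # Initial GET appears to be needed to obtain session cookies before posting the search form |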
| 801 | + | s.get(torsearchengine1_url1) |
| 802 | + | |
| 803 | + | with tqdm(total=1, initial=0, desc="%20s" % "TOR Search Engine", unit="req", ascii=False, ncols=120, |
| 804 | + | bar_format=tqdm_bar_format) as progress_bar: |
| 805 | + | response = s.post(torsearchengine1_url2, {'search': searchstr, 'search2': ''}) |
| 806 | + | soup = BeautifulSoup(response.text, 'html5lib') |
| 807 | + | link_finder("torsearchengine1", soup) |
| 808 | + | progress_bar.update() |
| 809 | + | progress_bar.close() |
| 810 | + | |
| 811 | + | |
| 812 | + | def torgle1(searchstr): |
| 813 | + | torgle1_url = supported_engines['torgle1'] + "/torgle/index-frame.php?query={}&search=1&engine-ver=2&isframe=0{}" |
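|  | + | # The second placeholder receives the "&start={}" pagination parameter (empty on the first page) |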
| 814 | + | results_per_page = 10 |
| 815 | + | max_nb_page = 30 |
| 816 | + | if args.limit != 0: |
| 817 | + | max_nb_page = args.limit |
| 818 | + | |
| 819 | + | with requests.Session() as s: |
| 820 | + | s.proxies = proxies |
| 821 | + | s.headers = random_headers() |
| 822 | + | |
| 823 | + | resp = s.get(torgle1_url.format(quote(searchstr), "")) |
| 824 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
| 825 | + | |
| 826 | + | page_number = 1 |
| 827 | + | i = soup.find('div', attrs={"id": "result_report"}) |
| 828 | + | if i is not None: |
| 829 | + | if i.get_text() is not None and "of" in i.get_text(): |
| 830 | + | res_re = re.match(r".*of\s([0-9]+)\s.*", clear(i.get_text())) |
| 831 | + | total_results = int(res_re.group(1)) |
| 832 | + | page_number = math.ceil(total_results / results_per_page) |
| 833 | + | if page_number > max_nb_page: |
| 834 | + | page_number = max_nb_page |
| 835 | + | |
| 836 | + | with tqdm(total=page_number, initial=0, desc="%20s" % "Torgle", unit="req", ascii=False, ncols=120, |
| 837 | + | bar_format=tqdm_bar_format) as progress_bar: |
| 838 | + | |
| 839 | + | link_finder("torgle1", soup) |
| 840 | + | progress_bar.update() |
| 841 | + | |
| 842 | + | for n in range(2, page_number + 1): |
| 843 | + | start_page_param = "&start={}".format(n) |
| 844 | + | resp = s.get(torgle1_url.format(quote(searchstr), start_page_param)) |
| 845 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
| 846 | + | link_finder("torgle1", soup) |
| 847 | + | progress_bar.update() |
| 848 | + | |
| 849 | + | progress_bar.close() |
| 850 | + | |
| 851 | + | |
| 852 | + | def grams1(searchstr): |
| 853 | + | grams1_url = supported_engines['grams1'] + "/results/index.php?page={}&searchstr={}" |
| 854 | + | results_per_page = 25 |
| 855 | + | max_nb_page = 30 |
| 856 | + | if args.limit != 0: |
| 857 | + | max_nb_page = args.limit |
| 858 | + | |
| 859 | + | with requests.Session() as s: |
| 860 | + | s.proxies = proxies |
| 861 | + | s.headers = random_headers() |
| 862 | + | |
| 863 | + | resp = s.get(grams1_url.format(1, quote(searchstr))) |
| 864 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
| 865 | + | |
| 866 | + | page_number = 1 |
| 867 | + | pages = soup.find_all('div', attrs={"class": "result-text"}) |
| 868 | + | if pages:  # find_all() returns a list, so check truthiness to guard against no matches |
| 869 | + | res_re = re.match(r"About ([0-9]+) result(.*)", clear(pages[0].get_text())) |
| 870 | + | total_results = int(res_re.group(1)) |
| 871 | + | page_number = math.ceil(total_results / results_per_page) |
| 872 | + | if page_number > max_nb_page: |
| 873 | + | page_number = max_nb_page |
| 874 | + | |
| 875 | + | with tqdm(total=page_number, initial=0, desc="%20s" % "Grams", unit="req", ascii=False, ncols=120, |
| 876 | + | bar_format=tqdm_bar_format) as progress_bar: |
| 877 | + | |
| 878 | + | link_finder("grams1", soup) |
| 879 | + | progress_bar.update() |
| 880 | + | |
| 881 | + | for n in range(2, page_number + 1): |
| 882 | + | resp = s.get(grams1_url.format(n, quote(searchstr))) |
| 883 | + | soup = BeautifulSoup(resp.text, 'html5lib') |
| 884 | + | link_finder("grams1", soup) |
| 885 | + | progress_bar.update() |
| 886 | + | |
| 887 | + | progress_bar.close() |
| 888 | + | |
| 889 | + | |
806 | 890 | | def get_domain_from_url(link): |
807 | 891 | | fqdn_re = r"^[a-z][a-z0-9+\-.]*://([a-z0-9\-._~%!$&'()*+,;=]+@)?([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])" |
808 | 892 | | domain_re = re.match(fqdn_re, link) |
| skipped 35 lines |
844 | 928 | | def append_link(): |
845 | 929 | | nonlocal has_result |
846 | 930 | | has_result = True |
847 | | - | |
848 | 931 | | result[engine_str].append({"name": name, "link": link}) |
849 | 932 | | |
850 | 933 | | if args.continuous_write and csv_file.writable(): |
851 | | - | csv_writer = csv.writer(csv_file, delimiter=field_delim, quoting=csv.QUOTE_NONNUMERIC) |
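|  | + | # QUOTE_ALL quotes every field, so delimiters appearing inside result names cannot break the CSV |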
| 934 | + | csv_writer = csv.writer(csv_file, delimiter=field_delim, quoting=csv.QUOTE_ALL) |
852 | 935 | | fields = {"engine": engine_str, "name": name, "link": link} |
853 | 936 | | write_to_csv(csv_writer, fields) |
854 | 937 | | |
| skipped 1 lines |
856 | 939 | | result[engine_str] = [] |
857 | 940 | | |
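|  | + | # The scrapers below favor CSS selectors via .select() over chained find()/find_all() calls |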
858 | 941 | | if engine_str == "ahmia": |
859 | | - | for i in data_obj.find_all('li', attrs={'class': 'result'}): |
860 | | - | i = i.find('h4') |
861 | | - | name = clear(i.get_text()) |
862 | | - | link = i.find('a')['href'].replace("/search/search/redirect?search_term={}&redirect_url=" |
863 | | - | .format(args.search), "") |
| 942 | + | for r in data_obj.select('li.result h4'): |
| 943 | + | name = clear(r.get_text()) |
| 944 | + | link = r.find('a')['href'].split('redirect_url=')[1] |
864 | 945 | | append_link() |
865 | 946 | | |
866 | 947 | | if engine_str == "candle": |
867 | | - | html_page = data_obj.find('html') |
868 | | - | if html_page: |
869 | | - | for i in data_obj.find('html').find_all('a'): |
870 | | - | if str(i['href']).startswith("http"): |
871 | | - | name = clear(i.get_text()) |
872 | | - | link = clear(i['href']) |
873 | | - | append_link() |
| 948 | + | for r in data_obj.select("body h2 a"): |
| 949 | + | if str(r['href']).startswith("http"): |
| 950 | + | name = clear(r.get_text()) |
| 951 | + | link = clear(r['href']) |
| 952 | + | append_link() |
874 | 953 | | |
875 | 954 | | if engine_str == "darksearchenginer": |
876 | | - | for i in data_obj.find('div', attrs={"class": "table-responsive"}).find_all('a'): |
877 | | - | name = clear(i.get_text()) |
878 | | - | link = clear(i['href']) |
| 955 | + | for r in data_obj.select('.table-responsive a'): |
| 956 | + | name = clear(r.get_text()) |
| 957 | + | link = clear(r['href']) |
879 | 958 | | append_link() |
880 | 959 | | |
881 | 960 | | if engine_str == "darksearchio": |
| skipped 11 lines |
893 | 972 | | append_link() |
894 | 973 | | |
895 | 974 | | if engine_str == "evosearch": |
896 | | - | if data_obj.find('div', attrs={"id": "results"}) is not None: |
897 | | - | for div in data_obj.find('div', attrs={"id": "results"}).find_all('div', attrs={"class": "odrow"}): |
898 | | - | name = clear(div.find('div', attrs={"class": "title"}).find('a').get_text()) |
899 | | - | link = clear(div.find('div', attrs={"class": "title"}).find('a')['href'] |
900 | | - | .replace("./include/click_counter.php?url=", "") |
901 | | - | .replace("&query={}".format(args.search), "")) |
902 | | - | append_link() |
| 975 | + | for r in data_obj.select("#results .title a"): |
| 976 | + | name = clear(r.get_text()) |
| 977 | + | link = get_parameter(r['href'], 'url') |
| 978 | + | append_link() |
903 | 979 | | |
904 | 980 | | if engine_str == "grams": |
905 | 981 | | for i in data_obj.find_all("div", attrs={"class": "media-body"}): |
906 | 982 | | if not i.find('span'): |
907 | | - | for j in i.find_all('a'): |
908 | | - | if str(j.get_text()).startswith("http"): |
909 | | - | link = j.get_text() |
910 | | - | else: |
911 | | - | name = j.get_text() |
912 | | - | append_link() |
| 983 | + | for r in i.select(".searchlinks a"): |
| 984 | + | name = clear(r.get_text()) |
| 985 | + | link = clear(r['href']) |
| 986 | + | append_link() |
| 987 | + | |
| 988 | + | if engine_str == "grams1": |
| 989 | + | for r in data_obj.select(".searchlinks a"): |
| 990 | + | name = clear(r.get_text()) |
| 991 | + | link = clear(r['href']) |
| 992 | + | append_link() |
913 | 993 | | |
914 | 994 | | if engine_str == "haystack": |
915 | | - | if data_obj.find('div', attrs={"class": "result"}) is not None: |
916 | | - | for div in data_obj.find_all('div', attrs={"class": "result"}): |
917 | | - | if div.find('a') is not None and div.find('i') is not None: |
918 | | - | name = clear(div.find('a').get_text()) |
919 | | - | link = clear(div.find('i').get_text()) |
920 | | - | append_link() |
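|  | + | # Haystack links point at a redirector; the target address is carried in its "url" parameter |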
| 995 | + | for r in data_obj.select(".result b a"): |
| 996 | + | name = clear(r.get_text()) |
| 997 | + | link = get_parameter(r['href'], 'url') |
| 998 | + | append_link() |
921 | 999 | | |
922 | 1000 | | if engine_str == "multivac": |
923 | | - | for i in data_obj.find_all('dl'): |
924 | | - | link_tag = i.find('a') |
925 | | - | if link_tag: |
926 | | - | if link_tag['href'] != "": |
927 | | - | name = clear(link_tag.get_text()) |
928 | | - | link = clear(link_tag['href']) |
929 | | - | append_link() |
930 | | - | else: |
931 | | - | break |
| 1001 | + | for r in data_obj.select("dl dt a"): |
| 1002 | + | if r['href'] != "": |
| 1003 | + | name = clear(r.get_text()) |
| 1004 | + | link = clear(r['href']) |
| 1005 | + | append_link() |
| 1006 | + | else: |
| 1007 | + | break |
932 | 1008 | | |
933 | 1009 | | if engine_str == "notevil": |
934 | | - | ''' As for OnionLand, we could use the span instead of the href to get a beautiful link |
935 | | - | However some useful links are shown under the "li" tag, |
936 | | - | and there we would not be able to have a sanitized version |
937 | | - | Thus, the best is to implement a generic sanitize function. ''' |
938 | | - | for i in data_obj.find_all('p'): |
939 | | - | name = clear(i.find('a').get_text()) |
940 | | - | link = i.find('a')['href'].replace("./r2d.php?url=", "") |
941 | | - | append_link() |
942 | | - | for i in data_obj.find_all('li'): |
943 | | - | name = clear(i.find('a').get_text()) |
944 | | - | link = i.find('a')['href'].replace("./r2d.php?url=", "") |
| 1010 | + | for r in data_obj.select('#content > div > p > a:not([target])'): |
| 1011 | + | name = clear(r.get_text()) |
| 1012 | + | link = get_parameter(r['href'], 'url') |
945 | 1013 | | append_link() |
946 | 1014 | | |
947 | 1015 | | if engine_str == "oneirun": |
| skipped 3 lines |
951 | 1019 | | append_link() |
952 | 1020 | | |
953 | 1021 | | if engine_str == "onionland": |
954 | | - | if not data_obj.find('div', attrs={"class": "row no-result-row"}): |
955 | | - | for i in data_obj.find_all('div', attrs={"class": "result-block"}): |
956 | | - | if not str(clear(i.find('div', attrs={'class': "title"}).find('a')['href'])).startswith("/ads"): |
957 | | - | name = clear(i.find('div', attrs={'class': "title"}).get_text()) |
958 | | - | link = clear(i.find('div', attrs={'class': "link"}).get_text()) |
959 | | - | append_link() |
| 1022 | + | for r in data_obj.select('.result-block .title a'): |
| 1023 | + | if not r['href'].startswith('/ads/'): |
| 1024 | + | name = clear(r.get_text()) |
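|  | + | # The "l" redirect parameter appears to be double URL-encoded, hence the two unquote() calls |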
| 1025 | + | link = unquote(unquote(get_parameter(r['href'], 'l'))) |
| 1026 | + | append_link() |
960 | 1027 | | |
961 | 1028 | | if engine_str == "onionsearchengine": |
962 | | - | for i in data_obj.find_all('table'): |
963 | | - | for j in i.find_all('a'): |
964 | | - | if str(j['href']).startswith("url.php?u=") and not str(j.get_text()).startswith("http://"): |
965 | | - | name = clear(j.get_text()) |
966 | | - | link = clear(str(j['href']).replace("url.php?u=", "")) |
967 | | - | append_link() |
| 1029 | + | for r in data_obj.select("table a b"): |
| 1030 | + | name = clear(r.get_text()) |
| 1031 | + | link = get_parameter(r.parent['href'], 'u') |
| 1032 | + | append_link() |
968 | 1033 | | |
969 | 1034 | | if engine_str == "onionsearchserver": |
970 | | - | for i in data_obj.find_all('div', attrs={"class": "osscmnrdr ossfieldrdr1"}): |
971 | | - | name = clear(i.find('a').get_text()) |
972 | | - | link = clear(i.find('a')['href']) |
| 1035 | + | for r in data_obj.select('.osscmnrdr.ossfieldrdr1 a'): |
| 1036 | + | name = clear(r.get_text()) |
| 1037 | + | link = clear(r['href']) |
973 | 1038 | | append_link() |
974 | 1039 | | |
975 | 1040 | | if engine_str == "phobos": |
976 | | - | links = data_obj.find('div', attrs={"class": "serp"}).find_all('a', attrs={"class": "titles"}) |
977 | | - | for i in links: |
978 | | - | name = clear(i.get_text()) |
979 | | - | link = clear(i['href']) |
| 1041 | + | for r in data_obj.select('.serp .titles'): |
| 1042 | + | name = clear(r.get_text()) |
| 1043 | + | link = clear(r['href']) |
980 | 1044 | | append_link() |
981 | 1045 | | |
982 | 1046 | | if engine_str == "tor66": |
| skipped 4 lines |
987 | 1051 | | append_link() |
988 | 1052 | | |
989 | 1053 | | if engine_str == "torch": |
990 | | - | for i in data_obj.find_all('dl'): |
991 | | - | name = clear(i.find('a').get_text()) |
992 | | - | link = i.find('a')['href'] |
| 1054 | + | for r in data_obj.select("dl > dt > a"): |
| 1055 | + | name = clear(r.get_text()) |
| 1056 | + | link = clear(r['href']) |
993 | 1057 | | append_link() |
994 | 1058 | | |
995 | 1059 | | if engine_str == "tordex": |
996 | | - | for i in data_obj.find_all('div', attrs={"class": "result mb-3"}): |
997 | | - | a_link = i.find('h5').find('a') |
998 | | - | name = clear(a_link.get_text()) |
999 | | - | link = clear(a_link['href']) |
| 1060 | + | for r in data_obj.select('.container h5 a'): |
| 1061 | + | name = clear(r.get_text()) |
| 1062 | + | link = clear(r['href']) |
1000 | 1063 | | append_link() |
1001 | 1064 | | |
1002 | 1065 | | if engine_str == "torgle": |
| skipped 5 lines |
1008 | 1071 | | name = clear(j.get_text()) |
1009 | 1072 | | append_link() |
1010 | 1073 | | |
| 1074 | + | if engine_str == "torgle1": |
| 1075 | + | for r in data_obj.select("#results a.title"): |
| 1076 | + | name = clear(r.get_text()) |
| 1077 | + | link = clear(r['href']) |
| 1078 | + | append_link() |
| 1079 | + | |
1011 | 1080 | | if engine_str == "tormax": |
1012 | | - | for i in data_obj.find_all('article'): |
1013 | | - | if i.find('a') is not None and i.find('div') is not None: |
1014 | | - | link = clear(i.find('div', attrs={"class": "url"}).get_text()) |
1015 | | - | name = clear(i.find('a', attrs={"class": "title"}).get_text()) |
1016 | | - | append_link() |
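|  | + | # The result URL sits in a sibling <div class="url"> next to each title link |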
| 1081 | + | for r in data_obj.select("#search-results article a.title"): |
| 1082 | + | name = clear(r.get_text()) |
| 1083 | + | link = clear(r.find_next_sibling('div', {'class': 'url'}).get_text()) |
| 1084 | + | append_link() |
1017 | 1085 | | |
1018 | 1086 | | if engine_str == "torsearchengine": |
1019 | 1087 | | for i in data_obj.find_all('h3', attrs={'class': 'title text-truncate'}): |
| skipped 1 lines |
1021 | 1089 | | link = i.find('a')['data-uri'] |
1022 | 1090 | | append_link() |
1023 | 1091 | | |
| 1092 | + | if engine_str == "torsearchengine1": |
| 1093 | + | for r in data_obj.find_all('span', {'style': 'font-size:1.2em;font-weight:bold;color:#1a0dab'}): |
| 1094 | + | name = clear(r.get_text()) |
| 1095 | + | link = r.find_next_sibling('a')['href'] |
| 1096 | + | append_link() |
| 1097 | + | |
1024 | 1098 | | if engine_str == "visitor": |
1025 | | - | li_tags = data_obj.find_all('li', attrs={'class': 'hs_site'}) |
1026 | | - | for i in li_tags: |
1027 | | - | h3tags = i.find_all('h3') |
1028 | | - | for n in h3tags: |
1029 | | - | name = clear(n.find('a').get_text()) |
1030 | | - | link = n.find('a')['href'] |
1031 | | - | append_link() |
| 1099 | + | for r in data_obj.select(".hs_site h3 a"): |
| 1100 | + | name = clear(r.get_text()) |
| 1101 | + | link = clear(r['href']) |
| 1102 | + | append_link() |
1032 | 1103 | | |
1033 | 1104 | | if args.continuous_write and not csv_file.closed: |
1034 | 1105 | | csv_file.close() |
| skipped 35 lines |
1070 | 1141 | | except KeyError: |
1071 | 1142 | | print("Error: search engine {} not in the list of supported engines".format(e)) |
1072 | 1143 | | else: |
1073 | | - | for e in supported_engines: |
| 1144 | + | for e in supported_engines.keys(): |
1074 | 1145 | | if not (args.exclude and len(args.exclude) > 0 and e in args.exclude[0]): |
1075 | 1146 | | call_func_as_str(e, args.search) |
1076 | 1147 | | |
| skipped 1 lines |
1078 | 1149 | | |
1079 | 1150 | | if not args.continuous_write: |
1080 | 1151 | | with open(filename, 'w', newline='') as csv_file: |
1081 | | - | csv_writer = csv.writer(csv_file, delimiter=field_delim, quoting=csv.QUOTE_NONNUMERIC) |
| 1152 | + | csv_writer = csv.writer(csv_file, delimiter=field_delim, quoting=csv.QUOTE_ALL) |
1082 | 1153 | | for engine in result.keys(): |
1083 | 1154 | | for i in result[engine]: |
1084 | 1155 | | i['engine'] = engine |
| skipped 14 lines |