| skipped 3 lines |
4 | 4 | | import re |
5 | 5 | | import time |
6 | 6 | | from datetime import datetime |
| 7 | + | from functools import reduce |
7 | 8 | | from random import choice |
| 9 | + | |
| 10 | + | from multiprocessing import Pool, cpu_count, current_process, freeze_support |
8 | 11 | | from tqdm import tqdm |
9 | 12 | | |
10 | 13 | | import requests |
| skipped 2 lines |
13 | 16 | | from urllib.parse import quote |
14 | 17 | | from urllib.parse import unquote |
15 | 18 | | from bs4 import BeautifulSoup |
| 19 | + | from urllib3.exceptions import ProtocolError |
16 | 20 | | |
17 | 21 | | import engines |
18 | 22 | | |
| skipped 44 lines |
63 | 67 | | help="Write progressively to output file (default: False)") |
64 | 68 | | parser.add_argument("search", type=str, help="The search string or phrase") |
65 | 69 | | parser.add_argument("--limit", type=int, default=0, help="Set a max number of pages per engine to load") |
66 | | - | parser.add_argument("--barmode", type=str, default="fixed", help="Can be 'fixed' (default) or 'unknown'") |
67 | 70 | | parser.add_argument("--engines", type=str, action='append', help='Engines to request (default: full list)', nargs="*") |
68 | 71 | | parser.add_argument("--exclude", type=str, action='append', help='Engines to exclude (default: none)', nargs="*") |
69 | 72 | | parser.add_argument("--fields", type=str, action='append', |
70 | 73 | | help='Fields to output to csv file (default: engine name link), available fields are shown below', |
71 | 74 | | nargs="*") |
72 | 75 | | parser.add_argument("--field_delimiter", type=str, default=",", help='Delimiter for the CSV fields') |
73 | | - | |
| 76 | + | parser.add_argument("--mp_units", type=int, default=(cpu_count() - 1), |
| 77 | + |                     help="Number of processing units (default: number of CPU cores minus 1)") |
74 | 78 | | |
75 | 79 | | args = parser.parse_args() |
76 | 80 | | proxies = {'http': 'socks5h://{}'.format(args.proxy), 'https': 'socks5h://{}'.format(args.proxy)} |
77 | | - | tqdm_bar_format = "{desc}: {percentage:3.0f}% |{bar}| {n_fmt:3s} / {total_fmt:3s} [{elapsed:5s} < {remaining:5s}]" |
78 | | - | result = {} |
79 | 81 | | filename = args.output |
80 | 82 | | field_delim = "," |
81 | 83 | | if args.field_delimiter and len(args.field_delimiter) == 1: |
82 | 84 | | field_delim = args.field_delimiter |
83 | 85 | | |
| 86 | + | |
84 | 87 | | def random_headers(): |
85 | 88 | | return {'User-Agent': choice(desktop_agents), |
86 | 89 | | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'} |
| skipped 10 lines |
97 | 100 | | return parse_qs(parsed.query)[parameter_name][0] |
98 | 101 | | |
99 | 102 | | |
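| | + | # Each Pool worker carries a 1-based identity tuple; (identity - 1) is used as the |
| | + | # tqdm "position" so every engine's bar draws on its own terminal line. Note that |
| | + | # _identity is a private multiprocessing attribute, so this relies on CPython internals. |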
| 103 | + | def get_proc_pos(): |
| 104 | + | return current_process()._identity[0] - 1 |
| 105 | + | |
| 106 | + | |
| 107 | + | def get_tqdm_desc(e_name, pos): |
| 108 | + | return "%20s (#%d)" % (e_name, pos) |
| 109 | + | |
| 110 | + | |
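| | + | # Every engine function below follows the same pattern: instead of filling a shared |
| | + | # global dict, it builds and returns its own list of results so that it can safely |
| | + | # run inside a separate worker process. |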
100 | 111 | | def ahmia(searchstr): |
| 112 | + | results = [] |
101 | 113 | | ahmia_url = supported_engines['ahmia'] + "/search/?q={}" |
102 | 114 | | |
103 | | - | with tqdm(total=1, initial=0, desc="%20s" % "Ahmia", unit="req", ascii=False, ncols=120, |
104 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 115 | + | pos = get_proc_pos() |
| 116 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Ahmia", pos), position=pos) as progress_bar: |
105 | 117 | | response = requests.get(ahmia_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
106 | 118 | | soup = BeautifulSoup(response.text, 'html5lib') |
107 | | - | link_finder("ahmia", soup) |
| 119 | + | results = link_finder("ahmia", soup) |
108 | 120 | | progress_bar.update() |
109 | | - | progress_bar.close() |
| 121 | + | |
| 122 | + | return results |
110 | 123 | | |
111 | 124 | | |
112 | 125 | | def torch(searchstr): |
| 126 | + | results = [] |
113 | 127 | | torch_url = supported_engines['torch'] + "/4a1f6b371c/search.cgi?cmd=Search!&np={}&q={}" |
114 | 128 | | results_per_page = 10 |
115 | 129 | | max_nb_page = 100 |
| skipped 14 lines |
130 | 144 | | if page_number > max_nb_page: |
131 | 145 | | page_number = max_nb_page |
132 | 146 | | |
133 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "TORCH", unit="req", ascii=False, ncols=120, |
134 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 147 | + | pos = get_proc_pos() |
| 148 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("TORCH", pos), position=pos) as progress_bar: |
135 | 149 | | |
136 | | - | link_finder("torch", soup) |
| 150 | + | results = link_finder("torch", soup) |
137 | 151 | | progress_bar.update() |
138 | 152 | | |
139 | 153 | | # Usually range is 2 to n+1, but TORCH behaves differently |
140 | 154 | | for n in range(1, page_number): |
141 | 155 | | req = s.get(torch_url.format(n, quote(searchstr))) |
142 | 156 | | soup = BeautifulSoup(req.text, 'html5lib') |
143 | | - | link_finder("torch", soup) |
| 157 | + | results = results + link_finder("torch", soup) |
144 | 158 | | progress_bar.update() |
145 | 159 | | |
146 | | - | progress_bar.close() |
| 160 | + | return results |
147 | 161 | | |
148 | 162 | | |
149 | 163 | | def torch1(searchstr): |
| 164 | + | results = [] |
150 | 165 | | torch1_url = supported_engines['torch1'] + "/search?q={}&cmd=Search!" |
151 | 166 | | |
152 | | - | with tqdm(total=1, initial=0, desc="%20s" % "TORCH", unit="req", ascii=False, ncols=120, |
153 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 167 | + | pos = get_proc_pos() |
| 168 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("TORCH 1", pos), position=pos) as progress_bar: |
154 | 169 | | response = requests.get(torch1_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
155 | 170 | | soup = BeautifulSoup(response.text, 'html5lib') |
156 | | - | link_finder("torch1", soup) |
| 171 | + | results = link_finder("torch1", soup) |
157 | 172 | | progress_bar.update() |
158 | | - | progress_bar.close() |
| 173 | + | |
| 174 | + | return results |
159 | 175 | | |
160 | 176 | | |
161 | 177 | | def darksearchio(searchstr): |
162 | | - | global result |
163 | | - | result['darksearchio'] = [] |
| 178 | + | results = [] |
164 | 179 | | darksearchio_url = supported_engines['darksearchio'] + "/api/search?query={}&page={}" |
165 | 180 | | max_nb_page = 30 |
166 | 181 | | if args.limit != 0: |
| skipped 14 lines |
181 | 196 | | else: |
182 | | - | return |
| 197 | + | return results |
183 | 198 | | |
184 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "DarkSearch (.io)", unit="req", ascii=False, ncols=120, |
185 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 199 | + | pos = get_proc_pos() |
| 200 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("DarkSearch (.io)", pos), position=pos) \ |
| 201 | + | as progress_bar: |
186 | 202 | | |
187 | | - | link_finder("darksearchio", resp['data']) |
| 203 | + | results = link_finder("darksearchio", resp['data']) |
188 | 204 | | progress_bar.update() |
189 | 205 | | |
190 | 206 | | for n in range(2, page_number + 1): |
191 | 207 | | resp = s.get(darksearchio_url.format(quote(searchstr), n)) |
192 | 208 | | if resp.status_code == 200: |
193 | 209 | | resp = resp.json() |
194 | | - | link_finder("darksearchio", resp['data']) |
| 210 | + | results = results + link_finder("darksearchio", resp['data']) |
195 | 211 | | progress_bar.update() |
196 | 212 | | else: |
197 | 213 | | # Current page results will be lost but we will try to continue after a short sleep |
198 | 214 | | time.sleep(1) |
199 | 215 | | |
200 | | - | progress_bar.close() |
| 216 | + | return results |
201 | 217 | | |
202 | 218 | | |
203 | 219 | | def onionland(searchstr): |
| 220 | + | results = [] |
204 | 221 | | onionlandv3_url = supported_engines['onionland'] + "/search?q={}&page={}" |
205 | 222 | | max_nb_page = 100 |
206 | 223 | | if args.limit != 0: |
| skipped 17 lines |
224 | 241 | | if page_number > max_nb_page: |
225 | 242 | | page_number = max_nb_page |
226 | 243 | | |
227 | | - | bar_max = None |
228 | | - | if args.barmode == "fixed": |
229 | | - | bar_max = max_nb_page |
230 | | - | |
231 | | - | with tqdm(total=bar_max, initial=0, desc="%20s" % "OnionLand", unit="req", ascii=False, ncols=120, |
232 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 244 | + | pos = get_proc_pos() |
| 245 | + | with tqdm(total=max_nb_page, initial=0, desc=get_tqdm_desc("OnionLand", pos), position=pos) as progress_bar: |
233 | 246 | | |
234 | | - | link_finder("onionland", soup) |
| 247 | + | results = link_finder("onionland", soup) |
235 | 248 | | progress_bar.update() |
236 | 249 | | |
237 | 250 | | for n in range(2, page_number + 1): |
238 | 251 | | resp = s.get(onionlandv3_url.format(quote(searchstr), n)) |
239 | 252 | | soup = BeautifulSoup(resp.text, 'html5lib') |
240 | 253 | | ret = link_finder("onionland", soup) |
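| | + | # An empty page means we have run past the last page of results, so stop early |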
241 | | - | if ret < 0: |
| 254 | + | if len(ret) == 0: |
242 | 255 | | break |
| 256 | + | results = results + ret |
243 | 257 | | progress_bar.update() |
244 | 258 | | |
245 | | - | progress_bar.close() |
| 259 | + | return results |
246 | 260 | | |
247 | 261 | | |
248 | 262 | | def notevil(searchstr): |
| 263 | + | results = [] |
249 | 264 | | notevil_url1 = supported_engines['notevil'] + "/index.php?q={}" |
250 | 265 | | notevil_url2 = supported_engines['notevil'] + "/index.php?q={}&hostLimit=20&start={}&numRows={}&template=0" |
251 | 266 | | max_nb_page = 20 |
| skipped 12 lines |
264 | 279 | | if page_number > max_nb_page: |
265 | 280 | | page_number = max_nb_page |
266 | 281 | | |
267 | | - | num_rows = 20 |
268 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "not Evil", unit="req", ascii=False, ncols=120, |
269 | | - | bar_format=tqdm_bar_format) as progress_bar: |
270 | | - | |
271 | | - | link_finder("notevil", soup) |
| 282 | + | pos = get_proc_pos() |
| 283 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("not Evil", pos), position=pos) as progress_bar: |
| 284 | + | num_rows = 20 |
| 285 | + | results = link_finder("notevil", soup) |
272 | 286 | | progress_bar.update() |
273 | 287 | | |
274 | 288 | | for n in range(2, page_number + 1): |
| skipped 2 lines |
277 | 291 | | proxies=proxies, |
278 | 292 | | headers=random_headers()) |
279 | 293 | | soup = BeautifulSoup(req.text, 'html5lib') |
280 | | - | link_finder("notevil", soup) |
| 294 | + | results = results + link_finder("notevil", soup) |
281 | 295 | | progress_bar.update() |
282 | 296 | | time.sleep(1) |
283 | 297 | | |
284 | | - | progress_bar.close() |
| 298 | + | return results |
285 | 299 | | |
286 | 300 | | |
287 | 301 | | def visitor(searchstr): |
| 302 | + | results = [] |
288 | 303 | | visitor_url = supported_engines['visitor'] + "/search/?q={}&page={}" |
289 | 304 | | max_nb_page = 30 |
290 | 305 | | if args.limit != 0: |
291 | 306 | | max_nb_page = args.limit |
292 | 307 | | |
293 | | - | bar_max = None |
294 | | - | if args.barmode == "fixed": |
295 | | - | bar_max = max_nb_page |
296 | | - | with tqdm(total=bar_max, initial=0, desc="%20s" % "VisiTOR", unit="req", ascii=False, ncols=120, |
297 | | - | bar_format=tqdm_bar_format) as progress_bar: |
298 | | - | |
| 308 | + | pos = get_proc_pos() |
| 309 | + | with tqdm(total=max_nb_page, initial=0, desc=get_tqdm_desc("VisiTOR", pos), position=pos) as progress_bar: |
299 | 310 | | continue_processing = True |
300 | 311 | | page_to_request = 1 |
301 | 312 | | |
| skipped 4 lines |
306 | 317 | | while continue_processing: |
307 | 318 | | resp = s.get(visitor_url.format(quote(searchstr), page_to_request)) |
308 | 319 | | soup = BeautifulSoup(resp.text, 'html5lib') |
309 | | - | link_finder("visitor", soup) |
| 320 | + | results = results + link_finder("visitor", soup) |
310 | 321 | | progress_bar.update() |
311 | 322 | | |
312 | 323 | | next_page = soup.find('a', text="Next »") |
| skipped 2 lines |
315 | 326 | | |
316 | 327 | | page_to_request += 1 |
317 | 328 | | |
318 | | - | progress_bar.close() |
| 329 | + | return results |
319 | 330 | | |
320 | 331 | | |
321 | 332 | | def darksearchenginer(searchstr): |
| 333 | + | results = [] |
322 | 334 | | darksearchenginer_url = supported_engines['darksearchenginer'] |
323 | 335 | | max_nb_page = 20 |
324 | 336 | | if args.limit != 0: |
| skipped 14 lines |
339 | 351 | | if page_number > max_nb_page: |
340 | 352 | | page_number = max_nb_page |
341 | 353 | | |
342 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Dark Search Enginer", unit="req", ascii=False, ncols=120, |
343 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 354 | + | pos = get_proc_pos() |
| 355 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Dark Search Enginer", pos), position=pos) \ |
| 356 | + | as progress_bar: |
344 | 357 | | |
345 | | - | link_finder("darksearchenginer", soup) |
| 358 | + | results = link_finder("darksearchenginer", soup) |
346 | 359 | | progress_bar.update() |
347 | 360 | | |
348 | 361 | | for n in range(2, page_number + 1): |
349 | 362 | | resp = s.post(darksearchenginer_url, data={"search[keyword]": searchstr, "page": str(n)}) |
350 | 363 | | soup = BeautifulSoup(resp.text, 'html5lib') |
351 | | - | link_finder("darksearchenginer", soup) |
| 364 | + | results = results + link_finder("darksearchenginer", soup) |
352 | 365 | | progress_bar.update() |
353 | 366 | | |
354 | | - | progress_bar.close() |
| 367 | + | return results |
355 | 368 | | |
356 | 369 | | |
357 | 370 | | def phobos(searchstr): |
| 371 | + | results = [] |
358 | 372 | | phobos_url = supported_engines['phobos'] + "/search?query={}&p={}" |
359 | 373 | | max_nb_page = 100 |
360 | 374 | | if args.limit != 0: |
| skipped 14 lines |
375 | 389 | | if page_number > max_nb_page: |
376 | 390 | | page_number = max_nb_page |
377 | 391 | | |
378 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Phobos", unit="req", ascii=False, ncols=120, |
379 | | - | bar_format=tqdm_bar_format) as progress_bar: |
380 | | - | |
381 | | - | link_finder("phobos", soup) |
| 392 | + | pos = get_proc_pos() |
| 393 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Phobos", pos), position=pos) as progress_bar: |
| 394 | + | results = link_finder("phobos", soup) |
382 | 395 | | progress_bar.update() |
383 | 396 | | |
384 | 397 | | for n in range(2, page_number + 1): |
385 | 398 | | resp = s.get(phobos_url.format(quote(searchstr), n), proxies=proxies, headers=random_headers()) |
386 | 399 | | soup = BeautifulSoup(resp.text, 'html5lib') |
387 | | - | link_finder("phobos", soup) |
| 400 | + | results = results + link_finder("phobos", soup) |
388 | 401 | | progress_bar.update() |
389 | 402 | | |
390 | | - | progress_bar.close() |
| 403 | + | return results |
391 | 404 | | |
392 | 405 | | |
393 | 406 | | def onionsearchserver(searchstr): |
| 407 | + | results = [] |
394 | 408 | | onionsearchserver_url1 = supported_engines['onionsearchserver'] + "/oss/" |
395 | 409 | | onionsearchserver_url2 = None |
396 | 410 | | results_per_page = 10 |
| skipped 11 lines |
408 | 422 | | onionsearchserver_url2 = i['src'] + "{}&page={}" |
409 | 423 | | |
410 | 424 | | if onionsearchserver_url2 is None: |
411 | | - | return -1 |
| 425 | + | return results |
412 | 426 | | |
413 | 427 | | resp = s.get(onionsearchserver_url2.format(quote(searchstr), 1)) |
414 | 428 | | soup = BeautifulSoup(resp.text, 'html5lib') |
| skipped 6 lines |
421 | 435 | | if page_number > max_nb_page: |
422 | 436 | | page_number = max_nb_page |
423 | 437 | | |
424 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Onion Search Server", unit="req", ascii=False, ncols=120, |
425 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 438 | + | pos = get_proc_pos() |
| 439 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Onion Search Server", pos), position=pos) \ |
| 440 | + | as progress_bar: |
426 | 441 | | |
427 | | - | link_finder("onionsearchserver", soup) |
| 442 | + | results = link_finder("onionsearchserver", soup) |
428 | 443 | | progress_bar.update() |
429 | 444 | | |
430 | 445 | | for n in range(2, page_number + 1): |
431 | 446 | | resp = s.get(onionsearchserver_url2.format(quote(searchstr), n)) |
432 | 447 | | soup = BeautifulSoup(resp.text, 'html5lib') |
433 | | - | link_finder("onionsearchserver", soup) |
| 448 | + | results = results + link_finder("onionsearchserver", soup) |
434 | 449 | | progress_bar.update() |
435 | 450 | | |
436 | | - | progress_bar.close() |
| 451 | + | return results |
437 | 452 | | |
438 | 453 | | |
439 | 454 | | def grams(searchstr): |
| 455 | + | results = [] |
440 | 456 | | # No multi-page handling, as it is very hard to get many results from this engine |
441 | 457 | | grams_url1 = supported_engines['grams'] |
442 | 458 | | grams_url2 = supported_engines['grams'] + "/results" |
| skipped 6 lines |
449 | 465 | | soup = BeautifulSoup(resp.text, 'html5lib') |
450 | 466 | | token = soup.find('input', attrs={'name': '_token'})['value'] |
451 | 467 | | |
452 | | - | with tqdm(total=1, initial=0, desc="%20s" % "Grams", unit="req", ascii=False, ncols=120, |
453 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 468 | + | pos = get_proc_pos() |
| 469 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Grams", pos), position=pos) as progress_bar: |
454 | 470 | | resp = s.post(grams_url2, data={"req": searchstr, "_token": token}) |
455 | 471 | | soup = BeautifulSoup(resp.text, 'html5lib') |
456 | | - | link_finder("grams", soup) |
| 472 | + | results = link_finder("grams", soup) |
457 | 473 | | progress_bar.update() |
458 | | - | progress_bar.close() |
| 474 | + | |
| 475 | + | return results |
459 | 476 | | |
460 | 477 | | |
461 | 478 | | def candle(searchstr): |
| 479 | + | results = [] |
462 | 480 | | candle_url = supported_engines['candle'] + "/?q={}" |
463 | 481 | | |
464 | | - | with tqdm(total=1, initial=0, desc="%20s" % "Candle", unit="req", ascii=False, ncols=120, |
465 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 482 | + | pos = get_proc_pos() |
| 483 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Candle", pos), position=pos) as progress_bar: |
466 | 484 | | response = requests.get(candle_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
467 | 485 | | soup = BeautifulSoup(response.text, 'html5lib') |
468 | | - | link_finder("candle", soup) |
| 486 | + | results = link_finder("candle", soup) |
469 | 487 | | progress_bar.update() |
470 | | - | progress_bar.close() |
| 488 | + | |
| 489 | + | return results |
471 | 490 | | |
472 | 491 | | |
473 | 492 | | def torsearchengine(searchstr): |
| 493 | + | results = [] |
474 | 494 | | torsearchengine_url = supported_engines['torsearchengine'] + "/search/move/?q={}&pn={}&num=10&sdh=&" |
475 | 495 | | max_nb_page = 100 |
476 | 496 | | if args.limit != 0: |
| skipped 15 lines |
492 | 512 | | if page_number > max_nb_page: |
493 | 513 | | page_number = max_nb_page |
494 | 514 | | |
495 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Tor Search Engine", unit="req", ascii=False, ncols=120, |
496 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 515 | + | pos = get_proc_pos() |
| 516 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Tor Search Engine", pos), position=pos) \ |
| 517 | + | as progress_bar: |
497 | 518 | | |
498 | | - | link_finder("torsearchengine", soup) |
| 519 | + | results = link_finder("torsearchengine", soup) |
499 | 520 | | progress_bar.update() |
500 | 521 | | |
501 | 522 | | for n in range(2, page_number + 1): |
502 | 523 | | resp = s.get(torsearchengine_url.format(quote(searchstr), n)) |
503 | 524 | | soup = BeautifulSoup(resp.text, 'html5lib') |
504 | | - | ret = link_finder("torsearchengine", soup) |
| 525 | + | results = results + link_finder("torsearchengine", soup) |
505 | 526 | | progress_bar.update() |
506 | 527 | | |
507 | | - | progress_bar.close() |
| 528 | + | return results |
508 | 529 | | |
509 | 530 | | |
510 | 531 | | def torgle(searchstr): |
| 532 | + | results = [] |
511 | 533 | | torgle_url = supported_engines['torgle'] + "/search.php?term={}" |
512 | 534 | | |
513 | | - | with tqdm(total=1, initial=0, desc="%20s" % "Torgle", unit="req", ascii=False, ncols=120, |
514 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 535 | + | pos = get_proc_pos() |
| 536 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Torgle", pos), position=pos) as progress_bar: |
515 | 537 | | response = requests.get(torgle_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
516 | 538 | | soup = BeautifulSoup(response.text, 'html5lib') |
517 | | - | link_finder("torgle", soup) |
| 539 | + | results = link_finder("torgle", soup) |
518 | 540 | | progress_bar.update() |
519 | | - | progress_bar.close() |
| 541 | + | |
| 542 | + | return results |
520 | 543 | | |
521 | 544 | | |
522 | 545 | | def onionsearchengine(searchstr): |
| 546 | + | results = [] |
523 | 547 | | onionsearchengine_url = supported_engines['onionsearchengine'] + "/search.php?search={}&submit=Search&page={}" |
524 | 548 | | # same as onionsearchengine_url = "http://5u56fjmxu63xcmbk.onion/search.php?search={}&submit=Search&page={}" |
525 | 549 | | max_nb_page = 100 |
| skipped 16 lines |
542 | 566 | | if page_number > max_nb_page: |
543 | 567 | | page_number = max_nb_page |
544 | 568 | | |
545 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Onion Search Engine", unit="req", ascii=False, ncols=120, |
546 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 569 | + | pos = get_proc_pos() |
| 570 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Onion Search Engine", pos), position=pos) \ |
| 571 | + | as progress_bar: |
547 | 572 | | |
548 | | - | link_finder("onionsearchengine", soup) |
| 573 | + | results = link_finder("onionsearchengine", soup) |
549 | 574 | | progress_bar.update() |
550 | 575 | | |
551 | 576 | | for n in range(2, page_number + 1): |
552 | 577 | | resp = s.get(onionsearchengine_url.format(quote(searchstr), n)) |
553 | 578 | | soup = BeautifulSoup(resp.text, 'html5lib') |
554 | | - | link_finder("onionsearchengine", soup) |
| 579 | + | results = results + link_finder("onionsearchengine", soup) |
555 | 580 | | progress_bar.update() |
556 | 581 | | |
557 | | - | progress_bar.close() |
| 582 | + | return results |
558 | 583 | | |
559 | 584 | | |
560 | 585 | | def tordex(searchstr): |
| 586 | + | results = [] |
561 | 587 | | tordex_url = supported_engines['tordex'] + "/search?query={}&page={}" |
562 | 588 | | max_nb_page = 100 |
563 | 589 | | if args.limit != 0: |
| skipped 15 lines |
579 | 605 | | if page_number > max_nb_page: |
580 | 606 | | page_number = max_nb_page |
581 | 607 | | |
582 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Tordex", unit="req", ascii=False, ncols=120, |
583 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 608 | + | pos = get_proc_pos() |
| 609 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Tordex", pos), position=pos) as progress_bar: |
584 | 610 | | |
585 | | - | link_finder("tordex", soup) |
| 611 | + | results = link_finder("tordex", soup) |
586 | 612 | | progress_bar.update() |
587 | 613 | | |
588 | 614 | | for n in range(2, page_number + 1): |
589 | 615 | | resp = s.get(tordex_url.format(quote(searchstr), n)) |
590 | 616 | | soup = BeautifulSoup(resp.text, 'html5lib') |
591 | | - | link_finder("tordex", soup) |
| 617 | + | results = results + link_finder("tordex", soup) |
592 | 618 | | progress_bar.update() |
593 | 619 | | |
594 | | - | progress_bar.close() |
| 620 | + | return results |
595 | 621 | | |
596 | 622 | | |
597 | 623 | | def tor66(searchstr): |
| 624 | + | results = [] |
598 | 625 | | tor66_url = supported_engines['tor66'] + "/search?q={}&sorttype=rel&page={}" |
599 | 626 | | max_nb_page = 30 |
600 | 627 | | if args.limit != 0: |
| skipped 7 lines |
608 | 635 | | soup = BeautifulSoup(resp.text, 'html5lib') |
609 | 636 | | |
610 | 637 | | page_number = 1 |
611 | | - | approx_re = re.search(r"\.Onion\ssites\sfound\s:\s([0-9]+)", |
612 | | - | resp.text) |
| 638 | + | approx_re = re.search(r"\.Onion\ssites\sfound\s:\s([0-9]+)", resp.text) |
613 | 639 | | if approx_re is not None: |
614 | 640 | | nb_res = int(approx_re.group(1)) |
615 | 641 | | results_per_page = 20 |
| skipped 1 lines |
617 | 643 | | if page_number > max_nb_page: |
618 | 644 | | page_number = max_nb_page |
619 | 645 | | |
620 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Tor66", unit="req", ascii=False, ncols=120, |
621 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 646 | + | pos = get_proc_pos() |
| 647 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Tor66", pos), position=pos) as progress_bar: |
622 | 648 | | |
623 | | - | link_finder("tor66", soup) |
| 649 | + | results = link_finder("tor66", soup) |
624 | 650 | | progress_bar.update() |
625 | 651 | | |
626 | 652 | | for n in range(2, page_number + 1): |
627 | 653 | | resp = s.get(tor66_url.format(quote(searchstr), n)) |
628 | 654 | | soup = BeautifulSoup(resp.text, 'html5lib') |
629 | | - | link_finder("tor66", soup) |
| 655 | + | results = results + link_finder("tor66", soup) |
630 | 656 | | progress_bar.update() |
631 | 657 | | |
632 | | - | progress_bar.close() |
| 658 | + | return results |
633 | 659 | | |
634 | 660 | | |
635 | 661 | | def tormax(searchstr): |
| 662 | + | results = [] |
636 | 663 | | tormax_url = supported_engines['tormax'] + "/tormax/search?q={}" |
637 | 664 | | |
638 | | - | with tqdm(total=1, initial=0, desc="%20s" % "Tormax", unit="req", ascii=False, ncols=120, |
639 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 665 | + | pos = get_proc_pos() |
| 666 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Tormax", pos), position=pos) as progress_bar: |
640 | 667 | | response = requests.get(tormax_url.format(quote(searchstr)), proxies=proxies, headers=random_headers()) |
641 | 668 | | soup = BeautifulSoup(response.text, 'html5lib') |
642 | | - | link_finder("tormax", soup) |
| 669 | + | results = link_finder("tormax", soup) |
643 | 670 | | progress_bar.update() |
644 | | - | progress_bar.close() |
| 671 | + | |
| 672 | + | return results |
645 | 673 | | |
646 | 674 | | |
647 | 675 | | def haystack(searchstr): |
| 676 | + | results = [] |
648 | 677 | | haystack_url = supported_engines['haystack'] + "/?q={}&offset={}" |
649 | 678 | | # At the 52nd page, it times out 100% of the time |
650 | 679 | | max_nb_page = 50 |
| skipped 8 lines |
659 | 688 | | req = s.get(haystack_url.format(quote(searchstr), 0)) |
660 | 689 | | soup = BeautifulSoup(req.text, 'html5lib') |
661 | 690 | | |
662 | | - | bar_max = None |
663 | | - | if args.barmode == "fixed": |
664 | | - | bar_max = max_nb_page |
665 | | - | with tqdm(total=bar_max, initial=0, desc="%20s" % "Haystack", unit="req", ascii=False, ncols=120, |
666 | | - | bar_format=tqdm_bar_format) as progress_bar: |
667 | | - | |
| 691 | + | pos = get_proc_pos() |
| 692 | + | with tqdm(total=max_nb_page, initial=0, desc=get_tqdm_desc("Haystack", pos), position=pos) as progress_bar: |
668 | 693 | | continue_processing = True |
669 | | - | |
670 | 694 | | ret = link_finder("haystack", soup) |
671 | | - | if ret < 0: |
| 695 | + | results = results + ret |
| 696 | + | progress_bar.update() |
| 697 | + | if len(ret) == 0: |
672 | 698 | | continue_processing = False |
673 | | - | progress_bar.update() |
674 | 699 | | |
675 | 700 | | it = 1 |
676 | 701 | | while continue_processing: |
| skipped 1 lines |
678 | 703 | | req = s.get(haystack_url.format(quote(searchstr), offset)) |
679 | 704 | | soup = BeautifulSoup(req.text, 'html5lib') |
680 | 705 | | ret = link_finder("haystack", soup) |
| 706 | + | results = results + ret |
681 | 707 | | progress_bar.update() |
682 | 708 | | it += 1 |
683 | | - | if it >= max_nb_page or ret < 0: |
| 709 | + | if it >= max_nb_page or len(ret) == 0: |
684 | 710 | | continue_processing = False |
| 711 | + | |
| 712 | + | return results |
685 | 713 | | |
686 | 714 | | |
687 | 715 | | def multivac(searchstr): |
| 716 | + | results = [] |
688 | 717 | | multivac_url = supported_engines['multivac'] + "/?q={}&page={}" |
689 | 718 | | max_nb_page = 10 |
690 | 719 | | if args.limit != 0: |
| skipped 7 lines |
698 | 727 | | req = s.get(multivac_url.format(quote(searchstr), page_to_request)) |
699 | 728 | | soup = BeautifulSoup(req.text, 'html5lib') |
700 | 729 | | |
701 | | - | bar_max = None |
702 | | - | if args.barmode == "fixed": |
703 | | - | bar_max = max_nb_page |
704 | | - | with tqdm(total=bar_max, initial=0, desc="%20s" % "Multivac", unit="req", ascii=False, ncols=120, |
705 | | - | bar_format=tqdm_bar_format) as progress_bar: |
706 | | - | |
| 730 | + | pos = get_proc_pos() |
| 731 | + | with tqdm(total=max_nb_page, initial=0, desc=get_tqdm_desc("Multivac", pos), position=pos) as progress_bar: |
707 | 732 | | continue_processing = True |
708 | | - | |
709 | 733 | | ret = link_finder("multivac", soup) |
710 | | - | if ret < 0 or page_to_request >= max_nb_page: |
| 734 | + | results = results + ret |
| 735 | + | progress_bar.update() |
| 736 | + | if len(ret) == 0 or page_to_request >= max_nb_page: |
711 | 737 | | continue_processing = False |
712 | | - | progress_bar.update() |
713 | 738 | | |
714 | 739 | | while continue_processing: |
715 | 740 | | page_to_request += 1 |
716 | 741 | | req = s.get(multivac_url.format(quote(searchstr), page_to_request)) |
717 | 742 | | soup = BeautifulSoup(req.text, 'html5lib') |
718 | 743 | | ret = link_finder("multivac", soup) |
| 744 | + | results = results + ret |
719 | 745 | | progress_bar.update() |
720 | | - | |
721 | | - | if page_to_request >= max_nb_page or ret < 0: |
| 746 | + | if len(ret) == 0 or page_to_request >= max_nb_page: |
722 | 747 | | continue_processing = False |
723 | 748 | | |
| 749 | + | return results |
| 750 | + | |
724 | 751 | | |
725 | 752 | | def evosearch(searchstr): |
| 753 | + | results = [] |
726 | 754 | | evosearch_url = supported_engines['evosearch'] + "/evo/search.php?" \ |
727 | | - | "query={}&" \ |
728 | | - | "start={}&" \ |
729 | | - | "search=1&type=and&mark=bold+text&" \ |
730 | | - | "results={}" |
| 755 | + | "query={}&" \ |
| 756 | + | "start={}&" \ |
| 757 | + | "search=1&type=and&mark=bold+text&" \ |
| 758 | + | "results={}" |
731 | 759 | | results_per_page = 50 |
732 | 760 | | max_nb_page = 30 |
733 | 761 | | if args.limit != 0: |
| skipped 15 lines |
749 | 777 | | if page_number > max_nb_page: |
750 | 778 | | page_number = max_nb_page |
751 | 779 | | |
752 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Evo Search", unit="req", ascii=False, ncols=120, |
753 | | - | bar_format=tqdm_bar_format) as progress_bar: |
754 | | - | |
755 | | - | link_finder("evosearch", soup) |
| 780 | + | pos = get_proc_pos() |
| 781 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Evo Search", pos), position=pos) as progress_bar: |
| 782 | + | results = link_finder("evosearch", soup) |
756 | 783 | | progress_bar.update() |
757 | 784 | | |
758 | 785 | | for n in range(2, page_number + 1): |
759 | 786 | | resp = s.get(evosearch_url.format(quote(searchstr), n, results_per_page)) |
760 | 787 | | soup = BeautifulSoup(resp.text, 'html5lib') |
761 | | - | link_finder("evosearch", soup) |
| 788 | + | results = results + link_finder("evosearch", soup) |
762 | 789 | | progress_bar.update() |
763 | 790 | | |
764 | | - | progress_bar.close() |
| 791 | + | return results |
765 | 792 | | |
766 | 793 | | |
767 | 794 | | def oneirun(searchstr): |
| 795 | + | results = [] |
768 | 796 | | oneirun_url = supported_engines['oneirun'] + "/Home/IndexEn" |
769 | 797 | | |
770 | 798 | | with requests.Session() as s: |
| skipped 4 lines |
775 | 803 | | soup = BeautifulSoup(resp.text, 'html5lib') |
776 | 804 | | token = soup.find('input', attrs={"name": "__RequestVerificationToken"})['value'] |
777 | 805 | | |
778 | | - | with tqdm(total=1, initial=0, desc="%20s" % "Oneirun", unit="req", ascii=False, ncols=120, |
779 | | - | bar_format=tqdm_bar_format) as progress_bar: |
780 | | - | response = s.post(oneirun_url.format(quote(searchstr)), |
781 | | - | data={"searchString": searchstr, "__RequestVerificationToken": token}) |
| 806 | + | pos = get_proc_pos() |
| 807 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("Oneirun", pos), position=pos) as progress_bar: |
| 808 | + | # oneirun_url has no {} placeholder, so the previous .format(quote(searchstr)) was a no-op |
| 809 | + | response = s.post(oneirun_url, data={ |
| 809 | + | "searchString": searchstr, |
| 810 | + | "__RequestVerificationToken": token |
| 811 | + | }) |
782 | 812 | | soup = BeautifulSoup(response.text, 'html5lib') |
783 | | - | link_finder("oneirun", soup) |
| 813 | + | results = link_finder("oneirun", soup) |
784 | 814 | | progress_bar.update() |
785 | | - | progress_bar.close() |
| 815 | + | |
| 816 | + | return results |
786 | 817 | | |
787 | 818 | | |
788 | 819 | | def deeplink(searchstr): |
| 820 | + | results = [] |
789 | 821 | | deeplink_url1 = supported_engines['deeplink'] + "/index.php" |
790 | 822 | | deeplink_url2 = supported_engines['deeplink'] + "/?search={}&type=verified" |
791 | 823 | | |
792 | 824 | | with requests.Session() as s: |
793 | 825 | | s.proxies = proxies |
794 | 826 | | s.headers = random_headers() |
795 | | - | resp = s.get(deeplink_url1) |
| 827 | + | s.get(deeplink_url1) |
796 | 828 | | |
797 | | - | with tqdm(total=1, initial=0, desc="%20s" % "DeepLink", unit="req", ascii=False, ncols=120, |
798 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 829 | + | pos = get_proc_pos() |
| 830 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("DeepLink", pos), position=pos) as progress_bar: |
799 | 831 | | response = s.get(deeplink_url2.format(quote(searchstr))) |
800 | 832 | | soup = BeautifulSoup(response.text, 'html5lib') |
801 | | - | link_finder("deeplink", soup) |
| 833 | + | results = link_finder("deeplink", soup) |
802 | 834 | | progress_bar.update() |
803 | | - | progress_bar.close() |
| 835 | + | |
| 836 | + | return results |
804 | 837 | | |
805 | 838 | | |
806 | 839 | | def torsearchengine1(searchstr): |
| 840 | + | results = [] |
807 | 841 | | torsearchengine1_url1 = supported_engines['torsearchengine1'] |
808 | 842 | | torsearchengine1_url2 = supported_engines['torsearchengine1'] + "/index.php" |
809 | 843 | | |
| skipped 2 lines |
812 | 846 | | s.headers = random_headers() |
813 | 847 | | s.get(torsearchengine1_url1) |
814 | 848 | | |
815 | | - | with tqdm(total=1, initial=0, desc="%20s" % "TOR Search Engine", unit="req", ascii=False, ncols=120, |
816 | | - | bar_format=tqdm_bar_format) as progress_bar: |
| 849 | + | pos = get_proc_pos() |
| 850 | + | with tqdm(total=1, initial=0, desc=get_tqdm_desc("TOR Search Engine 1", pos), position=pos) as progress_bar: |
817 | 851 | | response = s.post(torsearchengine1_url2, {'search': searchstr, 'search2': ''}) |
818 | 852 | | soup = BeautifulSoup(response.text, 'html5lib') |
819 | | - | link_finder("torsearchengine1", soup) |
| 853 | + | results = link_finder("torsearchengine1", soup) |
820 | 854 | | progress_bar.update() |
821 | | - | progress_bar.close() |
| 855 | + | |
| 856 | + | return results |
822 | 857 | | |
823 | 858 | | |
824 | 859 | | def torgle1(searchstr): |
| 860 | + | results = [] |
825 | 861 | | torgle1_url = supported_engines['torgle1'] + "/torgle/index-frame.php?query={}&search=1&engine-ver=2&isframe=0{}" |
826 | 862 | | results_per_page = 10 |
827 | 863 | | max_nb_page = 30 |
| skipped 17 lines |
845 | 881 | | if page_number > max_nb_page: |
846 | 882 | | page_number = max_nb_page |
847 | 883 | | |
848 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Torgle", unit="req", ascii=False, ncols=120, |
849 | | - | bar_format=tqdm_bar_format) as progress_bar: |
850 | | - | |
851 | | - | link_finder("torgle1", soup) |
| 884 | + | pos = get_proc_pos() |
| 885 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Torgle 1", pos), position=pos) as progress_bar: |
| 886 | + | results = link_finder("torgle1", soup) |
852 | 887 | | progress_bar.update() |
853 | 888 | | |
854 | 889 | | for n in range(2, page_number + 1): |
855 | 890 | | start_page_param = "&start={}".format(n) |
856 | 891 | | resp = s.get(torgle1_url.format(quote(searchstr), start_page_param)) |
857 | 892 | | soup = BeautifulSoup(resp.text, 'html5lib') |
858 | | - | link_finder("torgle1", soup) |
| 893 | + | results = results + link_finder("torgle1", soup) |
859 | 894 | | progress_bar.update() |
860 | 895 | | |
861 | | - | progress_bar.close() |
| 896 | + | return results |
862 | 897 | | |
863 | 898 | | |
864 | 899 | | def grams1(searchstr): |
| 900 | + | results = [] |
865 | 901 | | grams1_url = supported_engines['grams1'] + "/results/index.php?page={}&searchstr={}" |
866 | 902 | | results_per_page = 25 |
867 | 903 | | max_nb_page = 30 |
| skipped 16 lines |
884 | 920 | | if page_number > max_nb_page: |
885 | 921 | | page_number = max_nb_page |
886 | 922 | | |
887 | | - | with tqdm(total=page_number, initial=0, desc="%20s" % "Grams", unit="req", ascii=False, ncols=120, |
888 | | - | bar_format=tqdm_bar_format) as progress_bar: |
889 | | - | |
890 | | - | link_finder("grams1", soup) |
| 923 | + | pos = get_proc_pos() |
| 924 | + | with tqdm(total=page_number, initial=0, desc=get_tqdm_desc("Grams 1", pos), position=pos) as progress_bar: |
| 925 | + | results = link_finder("grams1", soup) |
891 | 926 | | progress_bar.update() |
892 | 927 | | |
893 | 928 | | for n in range(2, page_number + 1): |
894 | 929 | | resp = s.get(grams1_url.format(n, quote(searchstr))) |
895 | 930 | | soup = BeautifulSoup(resp.text, 'html5lib') |
896 | | - | link_finder("grams1", soup) |
| 931 | + | results = results + link_finder("grams1", soup) |
897 | 932 | | progress_bar.update() |
898 | 933 | | |
899 | | - | progress_bar.close() |
| 934 | + | return results |
900 | 935 | | |
901 | 936 | | |
902 | 937 | | def get_domain_from_url(link): |
| skipped 24 lines |
927 | 962 | | |
928 | 963 | | |
929 | 964 | | def link_finder(engine_str, data_obj): |
930 | | - | global result |
931 | 965 | | global filename |
932 | 966 | | name = "" |
933 | 967 | | link = "" |
934 | 968 | | csv_file = None |
935 | | - | has_result = False |
| 969 | + | found_links = [] |
936 | 970 | | |
937 | 971 | | if args.continuous_write: |
938 | 972 | | csv_file = open(filename, 'a', newline='') |
939 | 973 | | |
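| | + | # Tag every hit with its engine name so the parent process can aggregate |
| | + | # per-engine statistics and, with --continuous_write, stream rows to the CSV |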
940 | | - | def append_link(): |
941 | | - | nonlocal has_result |
942 | | - | has_result = True |
943 | | - | result[engine_str].append({"name": name, "link": link}) |
| 974 | + | def add_link(): |
| 975 | + | found_links.append({"engine": engine_str, "name": name, "link": link}) |
944 | 976 | | |
945 | 977 | | if args.continuous_write and csv_file.writable(): |
946 | 978 | | csv_writer = csv.writer(csv_file, delimiter=field_delim, quoting=csv.QUOTE_ALL) |
947 | 979 | | fields = {"engine": engine_str, "name": name, "link": link} |
948 | 980 | | write_to_csv(csv_writer, fields) |
949 | 981 | | |
950 | | - | if engine_str not in result: |
951 | | - | result[engine_str] = [] |
952 | | - | |
953 | 982 | | if engine_str == "ahmia": |
954 | 983 | | for r in data_obj.select('li.result h4'): |
955 | 984 | | name = clear(r.get_text()) |
956 | 985 | | link = r.find('a')['href'].split('redirect_url=')[1] |
957 | | - | append_link() |
| 986 | + | add_link() |
958 | 987 | | |
959 | 988 | | if engine_str == "candle": |
960 | 989 | | for r in data_obj.select("body h2 a"): |
961 | 990 | | if str(r['href']).startswith("http"): |
962 | 991 | | name = clear(r.get_text()) |
963 | 992 | | link = clear(r['href']) |
964 | | - | append_link() |
| 993 | + | add_link() |
965 | 994 | | |
966 | 995 | | if engine_str == "darksearchenginer": |
967 | 996 | | for r in data_obj.select('.table-responsive a'): |
968 | 997 | | name = clear(r.get_text()) |
969 | 998 | | link = clear(r['href']) |
970 | | - | append_link() |
| 999 | + | add_link() |
971 | 1000 | | |
972 | 1001 | | if engine_str == "darksearchio": |
973 | 1002 | | for r in data_obj: |
974 | 1003 | | name = clear(r["title"]) |
975 | 1004 | | link = clear(r["link"]) |
976 | | - | append_link() |
| 1005 | + | add_link() |
977 | 1006 | | |
978 | 1007 | | if engine_str == "deeplink": |
979 | 1008 | | for tr in data_obj.find_all('tr'): |
| skipped 1 lines |
981 | 1010 | | if cels is not None and len(cels) == 4: |
982 | 1011 | | name = clear(cels[1].get_text()) |
983 | 1012 | | link = clear(cels[0].find('a')['href']) |
984 | | - | append_link() |
| 1013 | + | add_link() |
985 | 1014 | | |
986 | 1015 | | if engine_str == "evosearch": |
987 | 1016 | | for r in data_obj.select("#results .title a"): |
988 | 1017 | | name = clear(r.get_text()) |
989 | 1018 | | link = get_parameter(r['href'], 'url') |
990 | | - | append_link() |
| 1019 | + | add_link() |
991 | 1020 | | |
992 | 1021 | | if engine_str == "grams": |
993 | 1022 | | for i in data_obj.find_all("div", attrs={"class": "media-body"}): |
| skipped 1 lines |
995 | 1024 | | for r in i.select(".searchlinks a"): |
996 | 1025 | | name = clear(r.get_text()) |
997 | 1026 | | link = clear(r['href']) |
998 | | - | append_link() |
| 1027 | + | add_link() |
999 | 1028 | | |
1000 | 1029 | | if engine_str == "grams1": |
1001 | 1030 | | for r in data_obj.select(".searchlinks a"): |
1002 | 1031 | | name = clear(r.get_text()) |
1003 | 1032 | | link = clear(r['href']) |
1004 | | - | append_link() |
| 1033 | + | add_link() |
1005 | 1034 | | |
1006 | 1035 | | if engine_str == "haystack": |
1007 | 1036 | | for r in data_obj.select(".result b a"): |
1008 | 1037 | | name = clear(r.get_text()) |
1009 | 1038 | | link = get_parameter(r['href'], 'url') |
1010 | | - | append_link() |
| 1039 | + | add_link() |
1011 | 1040 | | |
1012 | 1041 | | if engine_str == "multivac": |
1013 | 1042 | | for r in data_obj.select("dl dt a"): |
1014 | 1043 | | if r['href'] != "": |
1015 | 1044 | | name = clear(r.get_text()) |
1016 | 1045 | | link = clear(r['href']) |
1017 | | - | append_link() |
| 1046 | + | add_link() |
1018 | 1047 | | else: |
1019 | 1048 | | break |
1020 | 1049 | | |
| skipped 1 lines |
1022 | 1051 | | for r in data_obj.select('#content > div > p > a:not([target])'): |
1023 | 1052 | | name = clear(r.get_text()) |
1024 | 1053 | | link = get_parameter(r['href'], 'url') |
1025 | | - | append_link() |
| 1054 | + | add_link() |
1026 | 1055 | | |
1027 | 1056 | | if engine_str == "oneirun": |
1028 | 1057 | | for td in data_obj.find_all('td', attrs={"style": "vertical-align: top;"}): |
1029 | 1058 | | name = clear(td.find('h5').get_text()) |
1030 | 1059 | | link = clear(td.find('a')['href']) |
1031 | | - | append_link() |
| 1060 | + | add_link() |
1032 | 1061 | | |
1033 | 1062 | | if engine_str == "onionland": |
1034 | 1063 | | for r in data_obj.select('.result-block .title a'): |
1035 | 1064 | | if not r['href'].startswith('/ads/'): |
1036 | 1065 | | name = clear(r.get_text()) |
1037 | 1066 | | link = unquote(unquote(get_parameter(r['href'], 'l'))) |
1038 | | - | append_link() |
| 1067 | + | add_link() |
1039 | 1068 | | |
1040 | 1069 | | if engine_str == "onionsearchengine": |
1041 | 1070 | | for r in data_obj.select("table a b"): |
1042 | 1071 | | name = clear(r.get_text()) |
1043 | 1072 | | link = get_parameter(r.parent['href'], 'u') |
1044 | | - | append_link() |
| 1073 | + | add_link() |
1045 | 1074 | | |
1046 | 1075 | | if engine_str == "onionsearchserver": |
1047 | 1076 | | for r in data_obj.select('.osscmnrdr.ossfieldrdr1 a'): |
1048 | 1077 | | name = clear(r.get_text()) |
1049 | 1078 | | link = clear(r['href']) |
1050 | | - | append_link() |
| 1079 | + | add_link() |
1051 | 1080 | | |
1052 | 1081 | | if engine_str == "phobos": |
1053 | 1082 | | for r in data_obj.select('.serp .titles'): |
1054 | 1083 | | name = clear(r.get_text()) |
1055 | 1084 | | link = clear(r['href']) |
1056 | | - | append_link() |
| 1085 | + | add_link() |
1057 | 1086 | | |
1058 | 1087 | | if engine_str == "tor66": |
1059 | 1088 | | for i in data_obj.find('hr').find_all_next('b'): |
1060 | 1089 | | if i.find('a'): |
1061 | 1090 | | name = clear(i.find('a').get_text()) |
1062 | 1091 | | link = clear(i.find('a')['href']) |
1063 | | - | append_link() |
| 1092 | + | add_link() |
1064 | 1093 | | |
1065 | 1094 | | if engine_str == "torch": |
1066 | 1095 | | for r in data_obj.select("dl > dt > a"): |
1067 | 1096 | | name = clear(r.get_text()) |
1068 | 1097 | | link = clear(r['href']) |
1069 | | - | append_link() |
| 1098 | + | add_link() |
1070 | 1099 | | |
1071 | 1100 | | if engine_str == "torch1": |
1072 | 1101 | | for r in data_obj.select("dl > dt > a"): |
1073 | 1102 | | name = clear(r.get_text()) |
1074 | 1103 | | link = clear(r['href']) |
1075 | | - | append_link() |
| 1104 | + | add_link() |
1076 | 1105 | | |
1077 | 1106 | | if engine_str == "tordex": |
1078 | 1107 | | for r in data_obj.select('.container h5 a'): |
1079 | 1108 | | name = clear(r.get_text()) |
1080 | 1109 | | link = clear(r['href']) |
1081 | | - | append_link() |
| 1110 | + | add_link() |
1082 | 1111 | | |
1083 | 1112 | | if engine_str == "torgle": |
1084 | 1113 | | for i in data_obj.find_all('ul', attrs={"id": "page"}): |
| skipped 2 lines |
1087 | 1116 | | link = clear(j.get_text()) |
1088 | 1117 | | else: |
1089 | 1118 | | name = clear(j.get_text()) |
1090 | | - | append_link() |
| 1119 | + | add_link() |
1091 | 1120 | | |
1092 | 1121 | | if engine_str == "torgle1": |
1093 | 1122 | | for r in data_obj.select("#results a.title"): |
1094 | 1123 | | name = clear(r.get_text()) |
1095 | 1124 | | link = clear(r['href']) |
1096 | | - | append_link() |
| 1125 | + | add_link() |
1097 | 1126 | | |
1098 | 1127 | | if engine_str == "tormax": |
1099 | 1128 | | for r in data_obj.select("#search-results article a.title"): |
1100 | 1129 | | name = clear(r.get_text()) |
1101 | 1130 | | link = clear(r.find_next_sibling('div', {'class': 'url'}).get_text()) |
1102 | | - | append_link() |
| 1131 | + | add_link() |
1103 | 1132 | | |
1104 | 1133 | | if engine_str == "torsearchengine": |
1105 | 1134 | | for i in data_obj.find_all('h3', attrs={'class': 'title text-truncate'}): |
1106 | 1135 | | name = clear(i.find('a').get_text()) |
1107 | 1136 | | link = i.find('a')['data-uri'] |
1108 | | - | append_link() |
| 1137 | + | add_link() |
1109 | 1138 | | |
1110 | 1139 | | if engine_str == "torsearchengine1": |
1111 | 1140 | | for r in data_obj.find_all('span', {'style': 'font-size:1.2em;font-weight:bold;color:#1a0dab'}): |
1112 | 1141 | | name = clear(r.get_text()) |
1113 | 1142 | | link = r.find_next_sibling('a')['href'] |
1114 | | - | append_link() |
| 1143 | + | add_link() |
1115 | 1144 | | |
1116 | 1145 | | if engine_str == "visitor": |
1117 | 1146 | | for r in data_obj.select(".hs_site h3 a"): |
1118 | 1147 | | name = clear(r.get_text()) |
1119 | 1148 | | link = clear(r['href']) |
1120 | | - | append_link() |
| 1149 | + | add_link() |
1121 | 1150 | | |
1122 | 1151 | | if args.continuous_write and not csv_file.closed: |
1123 | 1152 | | csv_file.close() |
1124 | 1153 | | |
1125 | | - | if not has_result: |
1126 | | - | return -1 |
| 1154 | + | return found_links |
1127 | 1155 | | |
1128 | | - | return 1 |
1129 | 1156 | | |
1130 | | - | |
1131 | | - | def call_func_as_str(function_name, function_arg): |
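| | + | # Pool.map target: Pool.map passes exactly one argument, so the engine name and |
| | + | # the search string travel together as a single "engine:searchstring" value |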
| 1157 | + | def run_method(method_name_and_argument): |
| 1158 | + | # Split on the first ':' only, so a search string that itself contains ':' stays intact |
| 1159 | + | method_name, argument = method_name_and_argument.split(':', 1) |
| 1160 | + | ret = [] |
1132 | 1161 | | try: |
1133 | | - | globals()[function_name](function_arg) |
| 1162 | + | ret = globals()[method_name](argument) |
1134 | 1163 | | except ConnectionError: |
1135 | 1164 | | print("Error: unable to connect") |
1136 | 1165 | | except OSError: |
1137 | 1166 | | print("Error: unable to connect") |
| 1167 | + | except ProtocolError: |
| 1168 | + | print("Error: unable to connect") |
| 1169 | + | return ret |
1138 | 1170 | | |
1139 | 1171 | | |
1140 | 1172 | | def scrape(): |
1141 | | - | global result |
1142 | 1173 | | global filename |
1143 | 1174 | | |
1144 | 1175 | | start_time = datetime.now() |
| skipped 5 lines |
1150 | 1181 | | search = search[0:9] |
1151 | 1182 | | filename = str(filename).replace("$SEARCH", search) |
1152 | 1183 | | |
| 1184 | + | func_args = [] |
| 1185 | + | stats_dict = {} |
1153 | 1186 | | if args.engines and len(args.engines) > 0: |
1154 | | - | engines = args.engines[0] |
1155 | | - | for e in engines: |
| 1187 | + | eng = args.engines[0] |
| 1188 | + | for e in eng: |
1156 | | - | try: |
1157 | | - | if not (args.exclude and len(args.exclude) > 0 and e in args.exclude[0]): |
1158 | | - | call_func_as_str(e, args.search) |
1159 | | - | except KeyError: |
1160 | | - | print("Error: search engine {} not in the list of supported engines".format(e)) |
| 1189 | + | # Building func_args cannot raise KeyError any more, so validate names explicitly |
| 1190 | + | if e not in supported_engines: |
| 1191 | + | print("Error: search engine {} not in the list of supported engines".format(e)) |
| 1192 | + | elif not (args.exclude and len(args.exclude) > 0 and e in args.exclude[0]): |
| 1193 | + | func_args.append("{}:{}".format(e, args.search)) |
| 1194 | + | stats_dict[e] = 0 |
1161 | 1195 | | else: |
1162 | 1196 | | for e in supported_engines.keys(): |
1163 | 1197 | | if not (args.exclude and len(args.exclude) > 0 and e in args.exclude[0]): |
1164 | | - | call_func_as_str(e, args.search) |
| 1198 | + | func_args.append("{}:{}".format(e, args.search)) |
| 1199 | + | stats_dict[e] = 0 |
| 1200 | + | |
| 1201 | + | # Fan the engine requests out across a pool of worker processes |
| 1202 | + | units = max(1, min(cpu_count() - 1, len(func_args)))  # never 0, even on a single-core box |
| 1203 | + | if args.mp_units and args.mp_units > 0: |
| 1204 | + | units = min(args.mp_units, len(func_args)) |
| 1205 | + | print("search.py started with {} processing units...".format(units)) |
| 1206 | + | freeze_support() |
| 1207 | + | |
| 1208 | + | results = [] |
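| | + | # Hand tqdm's write lock to each worker via the Pool initializer so the |
| | + | # concurrent progress bars don't garble each other's terminal output |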
| 1209 | + | with Pool(units, initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),)) as p: |
| 1210 | + | results_map = p.map(run_method, func_args) |
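| | + | # Flatten the per-engine result lists into one flat list of link dicts |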
| 1211 | + | results = reduce(lambda a, b: a + b, [r for r in results_map if r is not None], []) |
1165 | 1212 | | |
1166 | 1213 | | stop_time = datetime.now() |
1167 | 1214 | | |
1168 | 1215 | | if not args.continuous_write: |
1169 | 1216 | | with open(filename, 'w', newline='') as csv_file: |
1170 | 1217 | | csv_writer = csv.writer(csv_file, delimiter=field_delim, quoting=csv.QUOTE_ALL) |
1171 | | - | for engine in result.keys(): |
1172 | | - | for i in result[engine]: |
1173 | | - | i['engine'] = engine |
1174 | | - | write_to_csv(csv_writer, i) |
| 1218 | + | for r in results: |
| 1219 | + | write_to_csv(csv_writer, r) |
1175 | 1220 | | |
1176 | 1221 | | total = 0 |
1177 | 1222 | | print("\nReport:") |
1178 | 1223 | | print(" Execution time: %s seconds" % (stop_time - start_time)) |
1179 | | - | for engine in result.keys(): |
1180 | | - | print(" {}: {}".format(engine, str(len(result[engine])))) |
1181 | | - | total += len(result[engine]) |
| 1224 | + | print(" Results per engine:") |
| 1225 | + | for r in results: |
| 1226 | + | stats_dict[r['engine']] += 1 |
| 1227 | + | for s in stats_dict: |
| 1228 | + | n = stats_dict[s] |
| 1229 | + | print(" {}: {}".format(s, str(n))) |
| 1230 | + | total += n |
1182 | 1231 | | print(" Total: {} links written to {}".format(str(total), filename)) |
1183 | 1232 | | |
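| | + | # Illustrative invocation (assumes a reachable Tor SOCKS proxy, e.g. the default |
| | + | # localhost:9050 used by --proxy): |
| | + | #   python3 search.py --engines ahmia torch --mp_units 4 --output "res_$SEARCH.csv" "computer" |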
1184 | 1233 | | |
| skipped 3 lines |