OnionSearch · Commit 2ba47d76
  • Now possible to write progressively to the output file (instead of everything at the end, which remains the default behavior)

  • Gobarigo committed 4 years ago
    2ba47d76
    1 parent 4951cda0
  • README.md
    skipped 45 lines
    46 46  ## 📈 Usage
    47 47   
    48 48  ```
    49  -usage: search.py [-h] [--proxy PROXY] [--output OUTPUT] [--limit LIMIT]
     49 +usage: search.py [-h] [--proxy PROXY] [--output OUTPUT]
     50 + [--continuous_write CONTINUOUS_WRITE] [--limit LIMIT]
    50 51   [--barmode BARMODE] [--engines [ENGINES [ENGINES ...]]]
    51 52   [--exclude [EXCLUDE [EXCLUDE ...]]]
    52 53   search
    skipped 7 lines
    60 61   --output OUTPUT Output File (default: output_$SEARCH_$DATE.txt), where
    61 62   $SEARCH is replaced by the first chars of the search
    62 63   string and $DATE is replaced by the datetime
     64 + --continuous_write CONTINUOUS_WRITE
     65 + Write progressively to output file (default: False)
    63 66   --limit LIMIT Set a max number of pages per engine to load
    64 67   --barmode BARMODE Can be 'fixed' (default) or 'unknown'
    65 68   --engines [ENGINES [ENGINES ...]]
    66 69   Engines to request (default: full list)
    67 70   --exclude [EXCLUDE [EXCLUDE ...]]
    68 71   Engines to exclude (default: none)
    69  - 
    70 72  [...]
    71 73  ```
    72 74   
    skipped 43 lines
    116 118   
    117 119  In the csv file produced, the name and url strings are sanitized as much as possible, but there might still be some problems.
    118 120   
     121 +Note that you can choose to write progressively to the output file (instead of everything at the end), which
     122 +prevents losing the results if something goes wrong. To do so, use `--continuous_write True`:
     123 +```
     124 +python3 search.py "computer" --continuous_write True
     125 +```
     126 +You can then use the Unix `tail -f` (follow) command to monitor the results as they are written (see the example after this diff).
    119 127   
    120 128  ## 📝 License
    121 129  [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.fr.html)
    skipped 3 lines
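
For example, once a continuous run has started, the output can be followed live. Assuming the default name template produced `output_computer_20200528120000.txt` (a hypothetical file name):

```
tail -f output_computer_20200528120000.txt
```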
  • search.py
    skipped 63 lines
    64 64  parser.add_argument("--output", default='output_$SEARCH_$DATE.txt', type=str,
    65 65   help="Output File (default: output_$SEARCH_$DATE.txt), where $SEARCH is replaced by the first "
    66 66   "chars of the search string and $DATE is replaced by the datetime")
     67 +parser.add_argument("--continuous_write", type=bool, default=False,
     68 + help="Write progressively to output file (default: False)")
    67 69  parser.add_argument("search", type=str, help="The search string or phrase")
    68 70  parser.add_argument("--limit", type=int, default=0, help="Set a max number of pages per engine to load")
    69 71  parser.add_argument("--barmode", type=str, default="fixed", help="Can be 'fixed' (default) or 'unknown'")
    skipped 4 lines
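
A caveat on this hunk: argparse's `type=bool` applies Python's built-in `bool()` to the raw string, so any non-empty value parses as `True` — including `--continuous_write False`. Only `--continuous_write True` (or omitting the flag entirely) behaves as documented. A minimal sketch of a stricter converter, independent of this commit (the `str2bool` name is illustrative):

```python
import argparse

def str2bool(value):
    # Map common textual spellings to booleans instead of relying on bool(),
    # which returns True for every non-empty string (including "False").
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean, got {!r}".format(value))

parser = argparse.ArgumentParser()
parser.add_argument("--continuous_write", type=str2bool, default=False,
                    help="Write progressively to output file (default: False)")

print(parser.parse_args(["--continuous_write", "False"]).continuous_write)  # False
```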
    74 76  proxies = {'http': 'socks5h://{}'.format(args.proxy), 'https': 'socks5h://{}'.format(args.proxy)}
    75 77  tqdm_bar_format = "{desc}: {percentage:3.0f}% |{bar}| {n_fmt:3s} / {total_fmt:3s} [{elapsed:5s} < {remaining:5s}]"
    76 78  result = {}
     79 +filename = args.output
    77 80   
    78 81   
    79 82  def random_headers():
    skipped 703 lines
    783 786   
    784 787  def link_finder(engine_str, data_obj):
    785 788   global result
     789 + global filename
    786 790   name = ""
    787 791   link = ""
     792 + f = None
     793 + 
     794 + if args.continuous_write:
     795 + f = open(filename, "a")
    788 796   
    789 797   def append_link():
    790 798   result[engine_str].append({"name": name, "link": link})
     799 + if args.continuous_write and f.writable():
     800 + f.write("\"{}\",\"{}\",\"{}\"\n".format(engine_str, name, link))
    791 801   
    792 802   if engine_str not in result:
    793 803   result[engine_str] = []
    skipped 35 lines
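
The rows added in `append_link` are quoted by hand with a `"\"{}\",\"{}\",\"{}\"\n"` format string, which produces a malformed row if a scraped name itself contains a double quote; the README already warns that sanitization is best-effort. A sketch of the same row layout using the standard `csv` module, which doubles embedded quotes automatically (a possible hardening, not part of this commit; the `write_row` helper is illustrative):

```python
import csv

def write_row(f, engine_str, name, link):
    # csv.writer doubles embedded quote characters, so a name such as
    # 'The "Hidden" Wiki' still yields a parseable row.
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow([engine_str, name, link])

with open("output.txt", "a", newline="") as f:
    write_row(f, "ahmia", 'The "Hidden" Wiki', "http://example.onion")
```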
    829 839   
    830 840   if engine_str == "evosearch":
    831 841   if data_obj.find('div', attrs={"id": "results"}) is not None:
    832  - count = 0
    833 842   for div in data_obj.find('div', attrs={"id": "results"}).find_all('div', attrs={"class": "odrow"}):
    834 843   name = clear(div.find('div', attrs={"class": "title"}).find('a').get_text())
    835 844   link = clear(div.find('div', attrs={"class": "title"}).find('a')['href']
    836 845   .replace("./include/click_counter.php?url=", "")
    837 846   .replace("&query={}".format(args.search), ""))
    838  - count += 1
    839 847   append_link()
    840 848   
    841 849   if engine_str == "grams":
    skipped 7 lines
    849 857   append_link()
    850 858   
    851 859   if engine_str == "haystack":
    852  - if data_obj.find('div', attrs={"class": "result"}) is None:
    853  - return -1
    854  - for div in data_obj.find_all('div', attrs={"class": "result"}):
    855  - if div.find('a') is not None and div.find('i') is not None:
    856  - name = clear(div.find('a').get_text())
    857  - link = clear(div.find('i').get_text())
    858  - append_link()
     860 + if data_obj.find('div', attrs={"class": "result"}) is not None:
     861 + for div in data_obj.find_all('div', attrs={"class": "result"}):
     862 + if div.find('a') is not None and div.find('i') is not None:
     863 + name = clear(div.find('a').get_text())
     864 + link = clear(div.find('i').get_text())
     865 + append_link()
    859 866   
    860 867   if engine_str == "multivac":
    861 868   for i in data_obj.find_all('dl'):
    skipped 4 lines
    866 873   link = clear(link_tag['href'])
    867 874   append_link()
    868 875   else:
    869  - return -1
     876 + break
    870 877   
    871 878   if engine_str == "notevil":
    872 879   ''' As for OnionLand, we could use the span instead of the href to get a beautiful link
    skipped 16 lines
    889 896   append_link()
    890 897   
    891 898   if engine_str == "onionland":
    892  - if data_obj.find('div', attrs={"class": "row no-result-row"}):
    893  - return -1
    894  - for i in data_obj.find_all('div', attrs={"class": "result-block"}):
    895  - if not str(clear(i.find('div', attrs={'class': "title"}).find('a')['href'])).startswith("/ads"):
    896  - name = clear(i.find('div', attrs={'class': "title"}).get_text())
    897  - link = clear(i.find('div', attrs={'class': "link"}).get_text())
    898  - append_link()
     899 + if not data_obj.find('div', attrs={"class": "row no-result-row"}):
     900 + for i in data_obj.find_all('div', attrs={"class": "result-block"}):
     901 + if not str(clear(i.find('div', attrs={'class': "title"}).find('a')['href'])).startswith("/ads"):
     902 + name = clear(i.find('div', attrs={'class': "title"}).get_text())
     903 + link = clear(i.find('div', attrs={'class': "link"}).get_text())
     904 + append_link()
    899 905   
    900 906   if engine_str == "onionsearchengine":
    901 907   for i in data_obj.find_all('table'):
    skipped 67 lines
    969 975   link = n.find('a')['href']
    970 976   append_link()
    971 977   
     978 + if args.continuous_write and not f.closed:
     979 + f.close()
     980 + 
     981 + if len(result[engine_str]) <= 0:
     982 + return -1
     983 + 
    972 984   return 1
    973 985   
    974 986   
    skipped 6 lines
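
The haystack, multivac, and onionland branches lose their early `return -1` exits because returning before the end of `link_finder` would now skip the `f.close()` in continuous mode and leak the file handle; the `len(result[engine_str]) <= 0` check added at the bottom restores the old return value for the empty case. An alternative shape that keeps early returns safe is a single `with` statement, sketched here with `contextlib.nullcontext` standing in for the file when continuous writing is off (a restructuring idea, not what the commit does; names are illustrative):

```python
import contextlib

def process(continuous_write, filename="results.csv"):
    # nullcontext stands in for the file when nothing should be written, so
    # one with-statement covers both modes and the file is closed even if
    # the parsing logic returns early.
    ctx = open(filename, "a") if continuous_write else contextlib.nullcontext()
    with ctx as f:
        for row in ("a", "b"):
            if f is not None:
                f.write(row + "\n")
            if row == "b":
                return -1  # early exit no longer leaks the file handle
    return 1
```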
    981 993   print("Error: unable to connect")
    982 994   
    983 995   
     996 +def write_to_file(filename, results, engine):
     997 + f = open(filename, "w+")
     998 + for i in results[engine]:
     999 + f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))
     1000 + f.close()
     1001 + 
     1002 + 
    984 1003  def scrape():
    985 1004   global result
     1005 + global filename
    986 1006   
    987 1007   start_time = datetime.now()
    988 1008   
    989 1009   # Building the filename
    990  - filename = args.output
    991 1010   filename = str(filename).replace("$DATE", start_time.strftime("%Y%m%d%H%M%S"))
    992 1011   search = str(args.search).replace(" ", "")
    993 1012   if len(search) > 10:
    skipped 15 lines
    1009 1028   
    1010 1029   stop_time = datetime.now()
    1011 1030   
     1031 + if not args.continuous_write:
     1032 + f = open(filename, "w+")
     1033 + for engine in result.keys():
     1034 + for i in result[engine]:
     1035 + f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))
     1036 + f.close()
     1037 + 
    1012 1038   total = 0
    1013 1039   print("\nReport:")
    1014 1040   print(" Execution time: %s seconds" % (stop_time - start_time))
    1015  - 
    1016  - f = open(filename, "w+")
    1017 1041   for engine in result.keys():
    1018 1042   print(" {}: {}".format(engine, str(len(result[engine]))))
    1019 1043   total += len(result[engine])
    1020  - for i in result[engine]:
    1021  - f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))
    1022  - 
    1023  - f.close()
    1024 1044   print(" Total: {} links written to {}".format(str(total), filename))
    1025 1045   
    1026 1046   
    skipped 3 lines
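
One further observation on the new `write_to_file` helper: it opens the file in `"w+"` mode, which truncates on every call, so invoking it once per engine would keep only the last engine's rows — presumably why the batch path in `scrape()` opens the file once and inlines the loop instead. A sketch of a variant that writes through a caller-provided handle, so both paths could share one row writer (a hypothetical refactor, not part of this commit):

```python
def write_engine_rows(f, results, engine):
    # Writing through a caller-provided handle lets scrape() open the file
    # once in "w+" and reuse the same row format for every engine.
    for i in results[engine]:
        f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))

# Batch mode in scrape(), assuming result/filename as in the commit:
# with open(filename, "w+") as f:
#     for engine in result:
#         write_engine_rows(f, result, engine)
```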