1 | 1 | | import argparse |
| 2 | + | import csv |
2 | 3 | | import math |
3 | 4 | | import re |
4 | 5 | | import time |
| skipped 46 lines |
51 | 52 | | "deeplink", |
52 | 53 | | ] |
53 | 54 | | |
# Column names that the --fields option may select for CSV output.
available_csv_fields = [
    "engine",
    "name",
    "link",
    "domain"
    # TODO: add "description" -- requires modifying the scraping
    # (link_finder func) for all the engines first
]
| 62 | + | |
54 | 63 | | |
def print_epilog(fields=None, engines=None):
    """Build (not print -- name kept for compatibility) the --help epilog.

    Lists the CSV fields selectable via --fields and the supported
    search engines.

    Args:
        fields: iterable of CSV field names; defaults to the module-level
            available_csv_fields list.
        engines: iterable of engine names; defaults to the module-level
            supported_engines list.

    Returns:
        The formatted epilog string handed to argparse.
    """
    if fields is None:
        fields = available_csv_fields
    if engines is None:
        engines = supported_engines
    # str.join avoids the quadratic cost of repeated string concatenation.
    epilog = "Available CSV fields: \n\t"
    epilog += "".join(" {}".format(f) for f in fields)
    epilog += "\n"
    epilog += "Supported engines: \n\t"
    epilog += "".join(" {}".format(e) for e in engines)
    return epilog
60 | 73 | | |
61 | 74 | | |
62 | | - | parser = argparse.ArgumentParser(epilog=print_epilog()) |
| 75 | + | parser = argparse.ArgumentParser(epilog=print_epilog(), formatter_class=argparse.RawTextHelpFormatter) |
# NOTE(review): help text previously advertised 127.0.0.1:9050 while the
# actual default is 'localhost:9050'; keep the two in sync.
parser.add_argument("--proxy", default='localhost:9050', type=str, help="Set Tor proxy (default: localhost:9050)")
64 | 77 | | parser.add_argument("--output", default='output_$SEARCH_$DATE.txt', type=str, |
65 | 78 | | help="Output File (default: output_$SEARCH_$DATE.txt), where $SEARCH is replaced by the first " |
| skipped 5 lines |
parser.add_argument("--barmode", type=str, default="fixed", help="Can be 'fixed' (default) or 'unknown'")
# NOTE: action='append' combined with nargs='*' yields a list of lists
# (e.g. "--engines a b" -> [['a', 'b']]); downstream code reads index [0],
# so only the first occurrence of each option is honoured.
parser.add_argument("--engines", type=str, action='append', help='Engines to request (default: full list)', nargs="*")
parser.add_argument("--exclude", type=str, action='append', help='Engines to exclude (default: none)', nargs="*")
parser.add_argument("--fields", type=str, action='append',
                    help='Fields to output to csv file (default: engine name link), available fields are shown below',
                    nargs="*")
# Must be a single character; longer values are ignored and ',' is used
# (see the field_delim guard after parse_args).
parser.add_argument("--field_delimiter", type=str, default=",", help='Delimiter for the CSV fields')
74 | 91 | | |
args = parser.parse_args()

# requests proxy map; socks5h resolves hostnames through Tor itself.
proxies = {scheme: 'socks5h://{}'.format(args.proxy) for scheme in ('http', 'https')}
tqdm_bar_format = "{desc}: {percentage:3.0f}% |{bar}| {n_fmt:3s} / {total_fmt:3s} [{elapsed:5s} < {remaining:5s}]"
result = {}
filename = args.output

# csv.writer only accepts a one-character delimiter; anything longer (or
# empty) silently falls back to a comma.
field_delim = args.field_delimiter if args.field_delimiter and len(args.field_delimiter) == 1 else ","
81 | 100 | | |
82 | 101 | | def random_headers(): |
83 | 102 | | return {'User-Agent': choice(desktop_agents), |
| skipped 700 lines |
784 | 803 | | progress_bar.close() |
785 | 804 | | |
786 | 805 | | |
def get_domain_from_url(link):
    """Extract the host/domain component from an absolute URL.

    Args:
        link: URL string such as "http://user@example.onion/page".

    Returns:
        The host part of the URL (a reg-name or a bracketed IP-literal),
        or None when the string does not look like an absolute URL.
    """
    # scheme "://" [ userinfo "@" ] host  -- host is capture group 2.
    fqdn_re = r"^[a-z][a-z0-9+\-.]*://([a-z0-9\-._~%!$&'()*+,;=]+@)?([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])"
    # Scheme and host are case-insensitive per RFC 3986; the previous
    # lowercase-only match silently dropped otherwise valid links.
    match = re.match(fqdn_re, link, re.IGNORECASE)
    # Group 2 is mandatory whenever the pattern matches, so no extra
    # lastindex check is needed.
    return match.group(2) if match is not None else None
| 813 | + | |
| 814 | + | |
def write_to_csv(csv_writer, fields, requested=None):
    """Write a single search result as one CSV row.

    Args:
        csv_writer: a csv.writer-like object the row is written to.
        fields: dict holding the result values; expected keys are
            "engine", "name" and "link".
        requested: optional list of field names to emit, in order.  When
            None, the --fields command-line option is consulted; when that
            is absent too, the default columns engine, name, link are
            written.
    """
    if requested is None and args.fields:
        # argparse's action='append' + nargs='*' yields a list of lists;
        # only the first --fields occurrence is used.
        requested = args.fields[0]

    row = []
    if requested is not None:
        for field in requested:
            if field in fields:
                row.append(fields[field])
            if field == "domain":
                # "domain" is not scraped; it is derived from the link.
                row.append(get_domain_from_url(fields['link']))
    else:
        # Default output mode: engine, name, link.
        row = [fields['engine'], fields['name'], fields['link']]
    csv_writer.writerow(row)
| 831 | + | |
| 832 | + | |
787 | 833 | | def link_finder(engine_str, data_obj): |
788 | 834 | | global result |
789 | 835 | | global filename |
790 | 836 | | name = "" |
791 | 837 | | link = "" |
792 | | - | f = None |
| 838 | + | csv_file = None |
| 839 | + | has_result = False |
793 | 840 | | |
794 | 841 | | if args.continuous_write: |
795 | | - | f = open(filename, "a") |
| 842 | + | csv_file = open(filename, 'a', newline='') |
796 | 843 | | |
797 | 844 | | def append_link(): |
| 845 | + | nonlocal has_result |
| 846 | + | has_result = True |
| 847 | + | |
798 | 848 | | result[engine_str].append({"name": name, "link": link}) |
799 | | - | if args.continuous_write and f.writable(): |
800 | | - | f.write("\"{}\",\"{}\",\"{}\"\n".format(engine_str, name, link)) |
| 849 | + | |
| 850 | + | if args.continuous_write and csv_file.writable(): |
| 851 | + | csv_writer = csv.writer(csv_file, delimiter=field_delim, quoting=csv.QUOTE_NONNUMERIC) |
| 852 | + | fields = {"engine": engine_str, "name": name, "link": link} |
| 853 | + | write_to_csv(csv_writer, fields) |
801 | 854 | | |
802 | 855 | | if engine_str not in result: |
803 | 856 | | result[engine_str] = [] |
| skipped 7 lines |
811 | 864 | | append_link() |
812 | 865 | | |
813 | 866 | | if engine_str == "candle": |
814 | | - | for i in data_obj.find('html').find_all('a'): |
815 | | - | if str(i['href']).startswith("http"): |
816 | | - | name = clear(i.get_text()) |
817 | | - | link = clear(i['href']) |
818 | | - | append_link() |
| 867 | + | html_page = data_obj.find('html') |
| 868 | + | if html_page: |
| 869 | + | for i in data_obj.find('html').find_all('a'): |
| 870 | + | if str(i['href']).startswith("http"): |
| 871 | + | name = clear(i.get_text()) |
| 872 | + | link = clear(i['href']) |
| 873 | + | append_link() |
819 | 874 | | |
820 | 875 | | if engine_str == "darksearchenginer": |
821 | 876 | | for i in data_obj.find('div', attrs={"class": "table-responsive"}).find_all('a'): |
| skipped 153 lines |
975 | 1030 | | link = n.find('a')['href'] |
976 | 1031 | | append_link() |
977 | 1032 | | |
978 | | - | if args.continuous_write and not f.closed: |
979 | | - | f.close() |
| 1033 | + | if args.continuous_write and not csv_file.closed: |
| 1034 | + | csv_file.close() |
980 | 1035 | | |
981 | | - | if len(result[engine_str]) <= 0: |
| 1036 | + | if not has_result: |
982 | 1037 | | return -1 |
983 | 1038 | | |
984 | 1039 | | return 1 |
| skipped 6 lines |
991 | 1046 | | print("Error: unable to connect") |
992 | 1047 | | except OSError: |
993 | 1048 | | print("Error: unable to connect") |
994 | | - | |
995 | | - | |
996 | | - | def write_to_file(filename, results, engine): |
997 | | - | f = open(filename, "w+") |
998 | | - | for i in results[engine]: |
999 | | - | f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"])) |
1000 | | - | f.close() |
1001 | 1049 | | |
1002 | 1050 | | |
1003 | 1051 | | def scrape(): |
| skipped 25 lines |
1029 | 1077 | | stop_time = datetime.now() |
1030 | 1078 | | |
1031 | 1079 | | if not args.continuous_write: |
1032 | | - | f = open(filename, "w+") |
1033 | | - | for engine in result.keys(): |
1034 | | - | for i in result[engine]: |
1035 | | - | f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"])) |
1036 | | - | f.close() |
| 1080 | + | with open(filename, 'w', newline='') as csv_file: |
| 1081 | + | csv_writer = csv.writer(csv_file, delimiter=field_delim, quoting=csv.QUOTE_NONNUMERIC) |
| 1082 | + | for engine in result.keys(): |
| 1083 | + | for i in result[engine]: |
| 1084 | + | i['engine'] = engine |
| 1085 | + | write_to_csv(csv_writer, i) |
1037 | 1086 | | |
1038 | 1087 | | total = 0 |
1039 | 1088 | | print("\nReport:") |
| skipped 10 lines |