OnionSearch · Commit 2ba47d76
  • Now possible to write progressively to the output file (instead of everything at the end, which remains the default behavior)

  • Gobarigo committed 4 years ago
    2ba47d76
    1 parent 4951cda0
  • README.md
    skipped 45 lines
    46 46  ## 📈 Usage
    47 47   
    48 48  ```
    49  -usage: search.py [-h] [--proxy PROXY] [--output OUTPUT] [--limit LIMIT]
     49 +usage: search.py [-h] [--proxy PROXY] [--output OUTPUT]
     50 + [--continuous_write CONTINUOUS_WRITE] [--limit LIMIT]
    50 51   [--barmode BARMODE] [--engines [ENGINES [ENGINES ...]]]
    51 52   [--exclude [EXCLUDE [EXCLUDE ...]]]
    52 53   search
    skipped 7 lines
    60 61   --output OUTPUT Output File (default: output_$SEARCH_$DATE.txt), where
    61 62   $SEARCH is replaced by the first chars of the search
    62 63   string and $DATE is replaced by the datetime
     64 + --continuous_write CONTINUOUS_WRITE
     65 + Write progressively to output file (default: False)
    63 66   --limit LIMIT Set a max number of pages per engine to load
    64 67   --barmode BARMODE Can be 'fixed' (default) or 'unknown'
    65 68   --engines [ENGINES [ENGINES ...]]
    66 69   Engines to request (default: full list)
    67 70   --exclude [EXCLUDE [EXCLUDE ...]]
    68 71   Engines to exclude (default: none)
    69  - 
    70 72  [...]
    71 73  ```
    72 74   
    skipped 43 lines
    116 118   
    117 119  In the csv file produced, the name and url strings are sanitized as much as possible, but there might still be some problems.
    118 120   
     121 +Note that you can choose to write progressively to the output file (instead of everything at the end), which
     122 +prevents losing the results if something goes wrong. To do so, use `--continuous_write True`:
     123 +```
     124 +python3 search.py "computer" --continuous_write True
     125 +```
     126 +You can then use the Unix `tail -f` (follow) command to monitor the results as they are written (see the example after this diff).
    119 127   
    120 128  ## 📝 License
    121 129  [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.fr.html)
    skipped 3 lines
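
For example, once a continuous run has started, the output can be followed live. Assuming the default name template produced `output_computer_20200528120000.txt` (a hypothetical file name):

```
tail -f output_computer_20200528120000.txt
```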
  • search.py
    skipped 63 lines
    64 64  parser.add_argument("--output", default='output_$SEARCH_$DATE.txt', type=str,
    65 65   help="Output File (default: output_$SEARCH_$DATE.txt), where $SEARCH is replaced by the first "
    66 66   "chars of the search string and $DATE is replaced by the datetime")
     67 +parser.add_argument("--continuous_write", type=bool, default=False,
     68 + help="Write progressively to output file (default: False)")
    67 69  parser.add_argument("search", type=str, help="The search string or phrase")
    68 70  parser.add_argument("--limit", type=int, default=0, help="Set a max number of pages per engine to load")
    69 71  parser.add_argument("--barmode", type=str, default="fixed", help="Can be 'fixed' (default) or 'unknown'")
    skipped 4 lines
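
A caveat on this hunk: argparse's `type=bool` applies Python's built-in `bool()` to the raw string, so any non-empty value parses as `True` — including `--continuous_write False`. Only `--continuous_write True` (or omitting the flag entirely) behaves as documented. A minimal sketch of a stricter converter, independent of this commit (the `str2bool` name is illustrative):

```python
import argparse

def str2bool(value):
    # Map common textual spellings to booleans instead of relying on bool(),
    # which returns True for every non-empty string (including "False").
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean, got {!r}".format(value))

parser = argparse.ArgumentParser()
parser.add_argument("--continuous_write", type=str2bool, default=False,
                    help="Write progressively to output file (default: False)")

print(parser.parse_args(["--continuous_write", "False"]).continuous_write)  # False
```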
    74 76  proxies = {'http': 'socks5h://{}'.format(args.proxy), 'https': 'socks5h://{}'.format(args.proxy)}
    75 77  tqdm_bar_format = "{desc}: {percentage:3.0f}% |{bar}| {n_fmt:3s} / {total_fmt:3s} [{elapsed:5s} < {remaining:5s}]"
    76 78  result = {}
     79 +filename = args.output
    77 80   
    78 81   
    79 82  def random_headers():
    skipped 703 lines
    783 786   
    784 787  def link_finder(engine_str, data_obj):
    785 788   global result
     789 + global filename
    786 790   name = ""
    787 791   link = ""
     792 + f = None
     793 + 
     794 + if args.continuous_write:
     795 + f = open(filename, "a")
    788 796   
    789 797   def append_link():
    790 798   result[engine_str].append({"name": name, "link": link})
     799 + if args.continuous_write and f.writable():
     800 + f.write("\"{}\",\"{}\",\"{}\"\n".format(engine_str, name, link))
    791 801   
    792 802   if engine_str not in result:
    793 803   result[engine_str] = []
    skipped 35 lines
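
The rows added in `append_link` are quoted by hand with a `"\"{}\",\"{}\",\"{}\"\n"` format string, which produces a malformed row if a scraped name itself contains a double quote; the README already warns that sanitization is best-effort. A sketch of the same row layout using the standard `csv` module, which doubles embedded quotes automatically (a possible hardening, not part of this commit; the `write_row` helper is illustrative):

```python
import csv

def write_row(f, engine_str, name, link):
    # csv.writer doubles embedded quote characters, so a name such as
    # 'The "Hidden" Wiki' still yields a parseable row.
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow([engine_str, name, link])

with open("output.txt", "a", newline="") as f:
    write_row(f, "ahmia", 'The "Hidden" Wiki', "http://example.onion")
```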
    829 839   
    830 840   if engine_str == "evosearch":
    831 841   if data_obj.find('div', attrs={"id": "results"}) is not None:
    832  - count = 0
    833 842   for div in data_obj.find('div', attrs={"id": "results"}).find_all('div', attrs={"class": "odrow"}):
    834 843   name = clear(div.find('div', attrs={"class": "title"}).find('a').get_text())
    835 844   link = clear(div.find('div', attrs={"class": "title"}).find('a')['href']
    836 845   .replace("./include/click_counter.php?url=", "")
    837 846   .replace("&query={}".format(args.search), ""))
    838  - count += 1
    839 847   append_link()
    840 848   
    841 849   if engine_str == "grams":
    skipped 7 lines
    849 857   append_link()
    850 858   
    851 859   if engine_str == "haystack":
    852  - if data_obj.find('div', attrs={"class": "result"}) is None:
    853  - return -1
    854  - for div in data_obj.find_all('div', attrs={"class": "result"}):
    855  - if div.find('a') is not None and div.find('i') is not None:
    856  - name = clear(div.find('a').get_text())
    857  - link = clear(div.find('i').get_text())
    858  - append_link()
     860 + if data_obj.find('div', attrs={"class": "result"}) is not None:
     861 + for div in data_obj.find_all('div', attrs={"class": "result"}):
     862 + if div.find('a') is not None and div.find('i') is not None:
     863 + name = clear(div.find('a').get_text())
     864 + link = clear(div.find('i').get_text())
     865 + append_link()
    859 866   
    860 867   if engine_str == "multivac":
    861 868   for i in data_obj.find_all('dl'):
    skipped 4 lines
    866 873   link = clear(link_tag['href'])
    867 874   append_link()
    868 875   else:
    869  - return -1
     876 + break
    870 877   
    871 878   if engine_str == "notevil":
    872 879   ''' As for OnionLand, we could use the span instead of the href to get a beautiful link
    skipped 16 lines
    889 896   append_link()
    890 897   
    891 898   if engine_str == "onionland":
    892  - if data_obj.find('div', attrs={"class": "row no-result-row"}):
    893  - return -1
    894  - for i in data_obj.find_all('div', attrs={"class": "result-block"}):
    895  - if not str(clear(i.find('div', attrs={'class': "title"}).find('a')['href'])).startswith("/ads"):
    896  - name = clear(i.find('div', attrs={'class': "title"}).get_text())
    897  - link = clear(i.find('div', attrs={'class': "link"}).get_text())
    898  - append_link()
     899 + if not data_obj.find('div', attrs={"class": "row no-result-row"}):
     900 + for i in data_obj.find_all('div', attrs={"class": "result-block"}):
     901 + if not str(clear(i.find('div', attrs={'class': "title"}).find('a')['href'])).startswith("/ads"):
     902 + name = clear(i.find('div', attrs={'class': "title"}).get_text())
     903 + link = clear(i.find('div', attrs={'class': "link"}).get_text())
     904 + append_link()
    899 905   
    900 906   if engine_str == "onionsearchengine":
    901 907   for i in data_obj.find_all('table'):
    skipped 67 lines
    969 975   link = n.find('a')['href']
    970 976   append_link()
    971 977   
     978 + if args.continuous_write and not f.closed:
     979 + f.close()
     980 + 
     981 + if len(result[engine_str]) <= 0:
     982 + return -1
     983 + 
    972 984   return 1
    973 985   
    974 986   
    skipped 6 lines
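
The haystack, multivac, and onionland branches lose their early `return -1` exits because returning before the end of `link_finder` would now skip the `f.close()` in continuous mode and leak the file handle; the `len(result[engine_str]) <= 0` check added at the bottom restores the old return value for the empty case. An alternative shape that keeps early returns safe is a single `with` statement, sketched here with `contextlib.nullcontext` standing in for the file when continuous writing is off (a restructuring idea, not what the commit does; names are illustrative):

```python
import contextlib

def process(continuous_write, filename="results.csv"):
    # nullcontext stands in for the file when nothing should be written, so
    # one with-statement covers both modes and the file is closed even if
    # the parsing logic returns early.
    ctx = open(filename, "a") if continuous_write else contextlib.nullcontext()
    with ctx as f:
        for row in ("a", "b"):
            if f is not None:
                f.write(row + "\n")
            if row == "b":
                return -1  # early exit no longer leaks the file handle
    return 1
```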
    981 993   print("Error: unable to connect")
    982 994   
    983 995   
     996 +def write_to_file(filename, results, engine):
     997 + f = open(filename, "w+")
     998 + for i in results[engine]:
     999 + f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))
     1000 + f.close()
     1001 + 
     1002 + 
    984 1003  def scrape():
    985 1004   global result
     1005 + global filename
    986 1006   
    987 1007   start_time = datetime.now()
    988 1008   
    989 1009   # Building the filename
    990  - filename = args.output
    991 1010   filename = str(filename).replace("$DATE", start_time.strftime("%Y%m%d%H%M%S"))
    992 1011   search = str(args.search).replace(" ", "")
    993 1012   if len(search) > 10:
    skipped 15 lines
    1009 1028   
    1010 1029   stop_time = datetime.now()
    1011 1030   
     1031 + if not args.continuous_write:
     1032 + f = open(filename, "w+")
     1033 + for engine in result.keys():
     1034 + for i in result[engine]:
     1035 + f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))
     1036 + f.close()
     1037 + 
    1012 1038   total = 0
    1013 1039   print("\nReport:")
    1014 1040   print(" Execution time: %s seconds" % (stop_time - start_time))
    1015  - 
    1016  - f = open(filename, "w+")
    1017 1041   for engine in result.keys():
    1018 1042   print(" {}: {}".format(engine, str(len(result[engine]))))
    1019 1043   total += len(result[engine])
    1020  - for i in result[engine]:
    1021  - f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))
    1022  - 
    1023  - f.close()
    1024 1044   print(" Total: {} links written to {}".format(str(total), filename))
    1025 1045   
    1026 1046   
    skipped 3 lines
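
One further observation on the new `write_to_file` helper: it opens the file in `"w+"` mode, which truncates on every call, so invoking it once per engine would keep only the last engine's rows — presumably why the batch path in `scrape()` opens the file once and inlines the loop instead. A sketch of a variant that writes through a caller-provided handle, so both paths could share one row writer (a hypothetical refactor, not part of this commit):

```python
def write_engine_rows(f, results, engine):
    # Writing through a caller-provided handle lets scrape() open the file
    # once in "w+" and reuse the same row format for every engine.
    for i in results[engine]:
        f.write("\"{}\",\"{}\",\"{}\"\n".format(engine, i["name"], i["link"]))

# Batch mode in scrape(), assuming result/filename as in the commit:
# with open(filename, "w+") as f:
#     for engine in result:
#         write_engine_rows(f, result, engine)
```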