STRLCPY/isp-data-pollution

v1.2: Add random search engine choice
Steven Thomas Smith committed 7 years ago

7cc9b6e1

1 parent 0ac5eea4

Revision indexing in progress... (symbol navigation in revisions will be accurate once indexing completes)

Total 1 file

■ ■ ■ ■ ■ ■

isp_data_pollution.py

		skipped 18 lines
19	19		# You should have received a copy of the GNU General Public License
20	20		# along with this program. If not, see <http://www.gnu.org/licenses/>.
21	21
22		-	__version__ = '1.1'
	22	+	__version__ = '1.2'
23	23
24		-	import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, requests, signal, sys, tarfile, time, warnings as warn
	24	+	import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn
25	25		import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
26	26		from selenium import webdriver
27	27		from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
		skipped 23 lines
51	51		max_links_cached = 100000 # Maximum number of links to cache for download
52	52		max_links_per_page = 200 # Maximum number of links to add per page
53	53		max_links_per_domain = 400 # Maximum number of links to add per domain
54		-	search_url = 'http://www.google.com/search' # keep unencrypted for ISP DPI
55	54		wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain'
56	55		timeout = 20
57	56		short_timeout = 3
		skipped 63 lines
121	120		# 1. The bandwidth usage is undoubtedly (much) smaller because gzip encoding is used
122	121		# 2. A lightweight proxy could be used for accurate bandwidth, and header editing
123	122
	123	+	# Safe search options
	124	+	class SafeWebSearch():
	125	+	""" Safe web search class with default Google parameters.
	126	+	Use unencrypted HTTP for ISP DPI.
	127	+	"""
	128	+	def __init__(self,
	129	+	search_url='http://www.google.com/search', # search engine
	130	+	query_parameter='q', # query parameter
	131	+	safe_parameter='safe=active', # query parameter for safe searches
	132	+	css_selector='div.g', # css selector to harvest search results
	133	+	additional_parameters='', # additional parameters required to get results
	134	+	result_extraction=lambda x: x): # function to extract the link
	135	+	self.search_url = search_url
	136	+	self.query_parameter = query_parameter
	137	+	self.safe_parameter = safe_parameter
	138	+	self.css_selector = css_selector
	139	+	self.additional_parameters = additional_parameters
	140	+	self.result_extraction = result_extraction
	141	+
	142	+	SafeGoogle = SafeWebSearch()
	143	+	SafeBing = SafeWebSearch(search_url='http://www.bing.com/search',
	144	+	safe_parameter='adlt=strict',css_selector='li.b_algo')
	145	+	yahoo_search_reprog = re.compile(r'/RU=(.+?)/R[A-Z]=')
	146	+	SafeYahoo = SafeWebSearch(search_url='http://search.yahoo.com/search', query_parameter='p',
	147	+	safe_parameter='adlt=strict',css_selector='div.compTitle',
	148	+	result_extraction=lambda x: yahoo_search_reprog.findall(uprs.parse_qs(x)['_ylu'][0])[0])
	149	+	SafeDuckDuckGo = SafeWebSearch(search_url='http://www.duckduckgo.com/',
	150	+	safe_parameter='kp=1',css_selector='div.result__body')
124	151
125	152		class ISPDataPollution:
126	153		"""
		skipped 23 lines
150	177		max_links_per_page=max_links_per_page,
151	178		max_links_per_domain=max_links_per_domain,
152	179		user_agent=user_agent,
153		-	search_url=search_url,
154	180		blacklist_url=blacklist_url,
155	181		wordsite_url=wordsite_url,
156	182		seed_bias_links=seed_bias_links,
		skipped 5 lines
162	188		self.max_links_per_page = max_links_per_page
163	189		self.max_links_per_domain = max_links_per_domain
164	190		self.user_agent = user_agent
165		-	self.search_url = search_url
166	191		self.blacklist_url = blacklist_url
167	192		self.wordsite_url = wordsite_url
168	193		self.seed_bias_links = seed_bias_links
		skipped 480 lines
649	674		:param query:
650	675		:return:
651	676		"""
652		-	url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query)))
	677	+	self.select_random_search_engine()
	678	+	url = uprs.urlunparse(uprs.urlparse(self.SafeSearch.search_url)._replace(query='{}={}{}&{}'.format(
	679	+	self.SafeSearch.query_parameter,uprs.quote_plus(query),
	680	+	self.SafeSearch.additional_parameters,self.SafeSearch.safe_parameter)))
653	681		if self.verbose: self.print_url(url)
654	682		@self.phantomjs_timeout
655	683		def phantomjs_get(): self.driver.get(url) # selenium driver
		skipped 4 lines
660	688		new_links = self.websearch_links()
661	689		if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
662	690
	691	+	def select_random_search_engine(self):
	692	+	self.SafeSearch = random.choice([SafeGoogle, SafeBing, SafeYahoo, SafeDuckDuckGo])
	693	+	return self.SafeSearch
	694	+
663	695		def websearch_links(self):
664	696		"""
665	697		Webpage format for a popular search engine, <div class="g">.
		skipped 2 lines
668	700		# https://github.com/detro/ghostdriver/issues/169
669	701		@self.phantomjs_short_timeout
670	702		def phantomjs_find_elements_by_css_selector():
671		-	return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g'))
	703	+	return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector(self.SafeSearch.css_selector))
672	704		elements = phantomjs_find_elements_by_css_selector()
673	705		# get links in random order until max. per page
674	706		k = 0
675	707		links = []
676	708		try:
677		-	for div in sorted(elements,key=lambda k: random.random()):
	709	+	for elt in sorted(elements,key=lambda k: random.random()):
678	710		@self.phantomjs_short_timeout
679		-	def phantomjs_find_element_by_tag_name(): return div.find_element_by_tag_name('a')
	711	+	def phantomjs_find_element_by_tag_name(): return elt.find_element_by_tag_name('a')
680	712		a_tag = phantomjs_find_element_by_tag_name()
681	713		@self.phantomjs_short_timeout
682	714		def phantomjs_get_attribute(): return a_tag.get_attribute('href')
683	715		href = phantomjs_get_attribute()
684		-	if href is not None: links.append(href)
	716	+	if href is not None:
	717	+	href = self.SafeSearch.result_extraction(href)
	718	+	links.append(href)
685	719		k += 1
686	720		if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break
687	721		except Exception as e:
		skipped 215 lines

v1.2: Add random search engine choice