STRLCPY/isp-data-pollution

Add phantomjs max memory, before/after progress lines
Steven Thomas Smith committed 7 years ago

45e2b037

1 parent e8b18815

Total 1 files

■ ■ ■ ■ ■ ■

isp_data_pollution.py

		skipped 46 lines
47	47		wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain'
48	48		timeout = 20
49	49		short_timeout = 3
	50	+	phantomjs_rss_limit_mb = 1024 # Default maximum memory limit of phantomjs process (MB)
	51	+	terminal_width = 80 # tty width, standard is 80 chars; add code to adapt later
50	52
51	53		blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz'
52	54		# Usage of the Shalla Blacklists:
		skipped 60 lines
113	115
114	116
115	117		class ISPDataPollution:
116		-	'''Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy
	118	+	"""
	119	+	Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy
117	120
118	121		I pay my ISP a lot for data usage every month. I typically don't use
119	122		all the bandwidth that I pay for. If my ISP is going to sell my
		skipped 12 lines
132	135		The crawler uses the Python requests and lxml.html libraries, is hardcoded
133	136		to download html without javascript processing, will not download
134	137		images, and respects robots.txt, which all provide good security.
135		-	'''
	138	+	"""
136	139
137	140		def __init__(self,gb_per_month=gb_per_month,
138	141		max_links_cached=max_links_cached,
		skipped 4 lines
143	146		blacklist_url=blacklist_url,
144	147		wordsite_url=wordsite_url,
145	148		seed_bias_links=seed_bias_links,
146		-	timeout=timeout,
	149	+	timeout=timeout, diurnal_flag=True,
147	150		quit_driver_every_call=False,
148	151		blacklist=True,verbose=True):
149	152		self.max_links_cached = max_links_cached
		skipped 6 lines
156	159		self.seed_bias_links = seed_bias_links
157	160		self.blacklist = blacklist; self.verbose = verbose
158	161		self.timeout = timeout
	162	+	self.diurnal_flag = diurnal_flag
159	163		self.quit_driver_every_call = quit_driver_every_call
160	164		# self.gb_per_month = gb_per_month # set in parseArgs
161	165		# self.debug = debug # set in parseArgs
		skipped 18 lines
180	184		def parseArgs(self):
181	185		parser = ap.ArgumentParser()
182	186		parser.add_argument('-bw', '--gb_per_month', help="GB per month", type=int, default=gb_per_month)
	187	+	parser.add_argument('-mm', '--maxmemory', help="Maximum memory of phantomjs (MB); 0=>restart every link", type=int, default=phantomjs_rss_limit_mb)
183	188		parser.add_argument('-g', '--debug', help="Debug flag", action='store_true')
184	189		args = parser.parse_args()
185	190		for k in args.__dict__: setattr(self,k,getattr(args,k))
		skipped 2 lines
188	193
189	194		def sanity_check_arguments(self):
190	195		self.gb_per_month = min(2048,max(1,self.gb_per_month)) # min-max bandwidth limits
	196	+	if self.maxmemory == 0: self.quit_driver_every_call = True
	197	+	self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max memory limits (MB)
191	198
192	199		def open_session(self):
193	200		self.quit_session()
		skipped 20 lines
214	221		self.session = driver
215	222
216	223		def quit_session(self,hard_quit=False,pid=None):
217		-	''' close, kill -9, quit, del '''
	224	+	"""
	225	+	close, kill -9, quit, del
	226	+	:param hard_quit:
	227	+	:param pid:
	228	+	:return:
	229	+	"""
218	230		# http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution
219	231		if hasattr(self,'session'):
220	232		if not hard_quit:
		skipped 102 lines
323	335		if self.quit_driver_every_call: self.quit_session()
324	336		while True: # pollute forever, pausing only to meet the bandwidth requirement
325	337		try:
326		-	if self.diurnal_cycle_test():
	338	+	if (not self.diurnal_flag) or self.diurnal_cycle_test():
327	339		self.pollute()
328	340		else:
329	341		time.sleep(self.chi2_mean_std(3.,1.))
		skipped 13 lines
343	355		self.clear_session()
344	356		if self.quit_driver_every_call: self.quit_session()
345	357		url = self.pop_link()
	358	+	if self.verbose: self.print_url(url)
346	359		if self.quit_driver_every_call: self.open_session()
347	360		self.get_url(url)
348	361		self.clear_session()
		skipped 55 lines
404	417		return npr.uniform() < val
405	418
406	419		def chi2_mean_std(self,mean=1.,std=0.1):
407		-	'''
	420	+	"""
408	421		Chi-squared random variable with given mean and standard deviation.
409		-	'''
	422	+	:param mean:
	423	+	:param std:
	424	+	:return:
	425	+	"""
410	426		scale = 2.*mean/std
411	427		nu = mean*scale
412	428		return npr.chisquare(nu)/scale
		skipped 124 lines
537	553		return '.'.join(uprs.urlparse(url).netloc.split('.')[-2:])
538	554
539	555		def get_websearch(self,query):
540		-	'''HTTP GET of a websearch, then add any embedded links.'''
	556	+	"""
	557	+	HTTP GET of a websearch, then add any embedded links.
	558	+	:param query:
	559	+	:return:
	560	+	"""
541	561		url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query)))
	562	+	if self.verbose: self.print_url(url)
542	563		@self.phantomjs_timeout
543	564		def phantomjs_get(): self.session.get(url) # selenium driver
544	565		phantomjs_get()
		skipped 4 lines
549	570		if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
550	571
551	572		def websearch_links(self):
552		-	'''Webpage format for a popular search engine, <div class="g">'''
	573	+	"""
	574	+	Webpage format for a popular search engine, <div class="g">.
	575	+	:return:
	576	+	"""
553	577		# https://github.com/detro/ghostdriver/issues/169
554	578		@self.phantomjs_short_timeout
555	579		def phantomjs_find_elements_by_css_selector():
		skipped 18 lines
574	598		return links
575	599
576	600		def get_url(self,url):
577		-	'''HTTP GET of the url, and add any embedded links.'''
	601	+	"""
	602	+	HTTP GET of the url, and add any embedded links.
	603	+	:param url:
	604	+	:return:
	605	+	"""
578	606		if not self.check_robots(url): return # bail out if robots.txt says to
579	607		@self.phantomjs_timeout
580	608		def phantomjs_get(): self.session.get(url) # selenium driver
		skipped 60 lines
641	669		except Exception as e:
642	670		if self.debug: print('.current_url exception:\n{}'.format(e))
643	671		if self.debug:
644		-	print("'{}': {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy()))
	672	+	print("{}: {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy()))
645	673		elif self.verbose:
646		-	self.print_progress(k,current_url)
	674	+	self.print_progress(current_url,num_links=k)
647	675
648		-	def print_progress(self,num_links,url,terminal_width=80):
649		-	# truncate or fill with white space
650		-	text_suffix = ': +{:d}/{:d} links, H(domain)={:.1f} b'.format(num_links,self.link_count(),self.domain_entropy())
651		-	chars_used = 2 + len(text_suffix)
652		-	if len(url) + chars_used > terminal_width:
653		-	url = url[:terminal_width-chars_used-1] + '…'
654		-	text = "'{}'{}".format(url,text_suffix)
	676	+	def print_url(self,url):
	677	+	if self.debug: print(url + ' …')
	678	+	else: self.print_progress(url)
	679	+
	680	+	def print_progress(self,url,num_links=None):
	681	+	if num_links is not None:
	682	+	text_suffix = ': +{:d}/{:d} links, H(domain)={:.1f} b'.format(num_links,self.link_count(),self.domain_entropy())
	683	+	else:
	684	+	text_suffix = ': {:d} links, H(domain)={:.1f} b …'.format(self.link_count(),self.domain_entropy())
	685	+	self.print_truncated_line(url,text_suffix)
	686	+
	687	+	def print_truncated_line(self,url,text_suffix='',terminal_width=terminal_width):
	688	+	"""
	689	+	Print truncated `url` + `text_suffix` to fill `terminal_width`
	690	+	:param url:
	691	+	:param text_suffix:
	692	+	:param terminal_width:
	693	+	:return:
	694	+	"""
	695	+	chars_used = len(text_suffix)
	696	+	if text_suffix == '…':
	697	+	if len(url) >= terminal_width:
	698	+	url = url[:terminal_width-1] # add '…' below
	699	+	elif len(url) < terminal_width-1:
	700	+	url += ' ' # add an extra space before the ellipsis
	701	+	else:
	702	+	if len(url) + chars_used > terminal_width:
	703	+	url = url[:terminal_width-chars_used-1] + '…'
	704	+	text = "{}{}".format(url,text_suffix) # added white space necessary
655	705		text = text[:min(terminal_width,len(text))] + ' ' * max(0,terminal_width-len(text))
656	706		print(text,end='',flush=True)
657	707		time.sleep(0.01)
		skipped 55 lines
713	763		raise self.TimeoutError('robotparser is taking too long')
714	764
715	765		def check_phantomjs_process(self):
716		-	'''Check if phantomjs is running.'''
	766	+	"""
	767	+	Check if phantomjs is running.
	768	+	:return:
	769	+	"""
717	770		# Check rss and restart if too large, then check existence
718	771		# http://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python
719	772		try:
720	773		if not hasattr(self,'session'): self.open_session()
721	774		pid, rss_mb = self.phantomjs_pid_and_memory()
722		-	if rss_mb > 1024: # 1 GB rss limit
	775	+	if rss_mb > self.phantomjs_rss_limit_mb: # memory limit
723	776		self.quit_session(pid=pid)
724	777		self.open_session()
725	778		pid, _ = self.phantomjs_pid_and_memory()
		skipped 32 lines

Add phantomjs max memory, before/after progress lines