| skipped 45 lines |
46 | 46 | | search_url = 'http://www.google.com/search' # keep unencrypted for ISP DPI |
47 | 47 | | wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain' |
48 | 48 | | timeout = 20 |
| 49 | + | short_timeout = 3 |
49 | 50 | | |
50 | 51 | | blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz' |
51 | 52 | | # Usage of the Shalla Blacklists: |
| skipped 34 lines |
86 | 87 | | |
87 | 88 | | # monkeypatch the read class method in RobotFileParser |
88 | 89 | | # many sites will block access to robots.txt without a standard User-Agent header |
89 | | - | short_timeout = 3 |
90 | 90 | | class RobotFileParserUserAgent(robotparser.RobotFileParser): |
91 | 91 | | |
92 | 92 | | timeout = short_timeout # short-term timeout |
| skipped 321 lines |
414 | 414 | | |
415 | 415 | | def exceeded_bandwidth_tasks(self): |
416 | 416 | | if self.bandwidth_test(): |
417 | | - | # decimate the stack and clear the cookies |
418 | | - | if self.link_count() > int(np.ceil(0.81*self.max_links_cached)): |
419 | | - | for url in self.draw_links(n=int(np.ceil(self.link_count()/10.))): |
420 | | - | self.pop_link() |
| 417 | + | self.decimate_links(total_frac=0.81,decimate_frac=0.1) |
421 | 418 | | time.sleep(120) |
422 | 419 | | |
423 | 420 | | def every_hour_tasks(self): |
| skipped 16 lines |
440 | 437 | | |
441 | 438 | | def every_day_tasks(self): |
442 | 439 | | if int(self.elapsed_time/3600. % 24.) == 23: |
443 | | - | # clear out cookies every day, and seed more links |
| 440 | + | # clear out cookies every day, decimate, and seed more links |
444 | 441 | | if self.twentyfour_hour_trigger: |
445 | 442 | | if hasattr(self,'session'): |
446 | 443 | | self.seed_links() |
| skipped 2 lines |
449 | 446 | | self.open_session() |
450 | 447 | | else: |
451 | 448 | | self.open_session() |
| 449 | + | self.decimate_links(total_frac=0.667, decimate_frac=0.1) |
452 | 450 | | self.seed_links() |
453 | 451 | | if self.quit_driver_every_call: self.quit_session() |
454 | 452 | | self.twentyfour_hour_trigger = False |
| skipped 5 lines |
460 | 458 | | # reset bw stats and (really) decimate the stack every couple of weeks |
461 | 459 | | self.start_time = time.time() |
462 | 460 | | self.data_usage = 0 |
463 | | - | if self.link_count() > int(np.ceil(0.49*self.max_links_cached)): |
464 | | - | for url in self.draw_links(n=int(np.ceil(self.link_count()/3.))): |
465 | | - | self.pop_link(url) |
| 461 | + | self.decimate_links(total_frac=0.49, decimate_frac=0.333) |
| 462 | + | |
| 463 | + | def decimate_links(self, total_frac=0.81, decimate_frac=0.1): # decimate the stack |
| 464 | + | if self.link_count() > int(np.ceil(total_frac * self.max_links_cached)): |
| 465 | + | for url in self.draw_links(n=int(np.ceil(self.link_count() * decimate_frac))): |
| 466 | + | self.remove_link(url) |
466 | 467 | | |
467 | 468 | | def set_user_agent(self): |
468 | 469 | | global user_agent |
| skipped 11 lines |
480 | 481 | | |
481 | 482 | | def draw_links(self,n=1,log_sampling=False): |
482 | 483 | | urls = [] |
483 | | - | domain_count = np.array([(dmn,len(self.domain_links[dmn])) for dmn in self.domain_links]) |
484 | | - | p = np.array([np.float(c) for d,c in domain_count]) |
| 484 | + | domain_array = np.array([dmn for dmn in self.domain_links]) |
| 485 | + | domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])]) |
| 486 | + | p = np.array([float(c) for c in domain_count]) |
485 | 487 | | count_total = p.sum() |
486 | 488 | | if log_sampling: # log-sampling [log(x+1)] to bias lower count domains |
487 | 489 | | p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype) |
| skipped 1 lines |
489 | 491 | | p = p/p.sum() |
490 | 492 | | cnts = npr.multinomial(n, pvals=p) |
491 | 493 | | if n > 1: |
492 | | - | for k in range(len(cnts)): |
493 | | - | domain = domain_count[k][0] |
494 | | - | cnt = min(cnts[k],domain_count[k][1]) |
| 494 | + | for k in range(cnts.shape[0]): |
| 495 | + | domain = domain_array[k] |
| 496 | + | cnt = min(cnts[k],domain_count[k]) |
495 | 497 | | for url in random.sample(self.domain_links[domain],cnt): |
496 | 498 | | urls.append(url) |
497 | 499 | | else: |
| skipped 51 lines |
549 | 551 | | # https://github.com/detro/ghostdriver/issues/169 |
550 | 552 | | @self.phantomjs_short_timeout |
551 | 553 | | def phantomjs_find_elements_by_css_selector(): |
552 | | - | return WebDriverWait(self.session, 3).until(lambda x: x.find_elements_by_css_selector('div.g')) |
| 554 | + | return WebDriverWait(self.session,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g')) |
553 | 555 | | elements = phantomjs_find_elements_by_css_selector() |
554 | 556 | | # get links in random order until max. per page |
555 | 557 | | k = 0 |
| skipped 1 lines |
557 | 559 | | try: |
558 | 560 | | for div in sorted(elements,key=lambda k: random.random()): |
559 | 561 | | @self.phantomjs_short_timeout |
560 | | - | def phantomjs_find_element_by_tag_name(): |
561 | | - | if div.find_element_by_tag_name('a').get_attribute('href') is not None: |
562 | | - | links.append(div.find_element_by_tag_name('a').get_attribute('href')) |
563 | | - | phantomjs_find_element_by_tag_name() |
| 562 | + | def phantomjs_find_element_by_tag_name(): return div.find_element_by_tag_name('a') |
| 563 | + | a_tag = phantomjs_find_element_by_tag_name() |
| 564 | + | @self.phantomjs_short_timeout |
| 565 | + | def phantomjs_get_attribute(): return a_tag.get_attribute('href') |
| 566 | + | href = phantomjs_get_attribute() |
| 567 | + | if href is not None: links.append(href) |
564 | 568 | | k += 1 |
565 | 569 | | if k > self.max_links_per_page: break |
566 | 570 | | except Exception as e: |
567 | | - | if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e)) |
| 571 | + | if self.debug: print('.find_element_by_tag_name.get_attribute() exception:\n{}'.format(e)) |
568 | 572 | | return links |
569 | 573 | | |
570 | 574 | | def get_url(self,url): |
| skipped 22 lines |
593 | 597 | | try: |
594 | 598 | | for a in sorted(elements,key=lambda k: random.random()): |
595 | 599 | | @self.phantomjs_short_timeout |
596 | | - | def phantomjs_get_attribute(): |
597 | | - | if a.get_attribute('href') is not None: |
598 | | - | links.append(a.get_attribute('href')) |
599 | | - | phantomjs_get_attribute() |
| 600 | + | def phantomjs_get_attribute(): return a.get_attribute('href') |
| 601 | + | href = phantomjs_get_attribute() |
| 602 | + | if href is not None: links.append(href) |
600 | 603 | | k += 1 |
601 | 604 | | if k > self.max_links_per_page: break |
602 | | - | except Exception as a: |
| 605 | + | except Exception as e: |
603 | 606 | | if self.debug: print('.get_attribute() exception:\n{}'.format(e)) |
604 | 607 | | return links |
605 | 608 | | |
| skipped 8 lines |
614 | 617 | | rp.read() |
615 | 618 | | result = rp.can_fetch(self.user_agent,url) |
616 | 619 | | del rp # ensure self.close() in urllib |
617 | | - | robots_read() |
| 620 | + | return result |
| 621 | + | result = robots_read() |
618 | 622 | | return result |
619 | 623 | | |
620 | 624 | | def add_url_links(self,links,url=''): |
| skipped 131 lines |