STRLCPY/isp-data-pollution

Improve domain handling with dict of sets
Steven Thomas Smith committed 7 years ago

452c94a2

1 parent 8205f091

Total 1 files

■ ■ ■ ■ ■ ■

isp_data_pollution.py

		skipped 158 lines
159	159		self.fake = Factory.create()
160	160		self.hour_trigger = True
161	161		self.twentyfour_hour_trigger = True
162		-	self.links = set()
163		-	self.link_count = dict()
	162	+	self.domain_links = dict()
164	163		self.start_time = time.time()
165	164		self.data_usage = 0
166	165		self.get_blacklist()
		skipped 156 lines
323	322
def pollute(self):
    '''Fetch one cached link, reseeding the cache first when it holds fewer than 2000 links.

    When self.quit_driver_every_call is set, each phase (seeding, fetching) is
    wrapped in its own open_session()/quit_session() pair; otherwise the
    phantomjs process is checked once up front.
    '''
    if not self.quit_driver_every_call:
        self.check_phantomjs_process()

    # top up the link cache before drawing from it
    if self.link_count() < 2000:
        if self.quit_driver_every_call:
            self.open_session()
        self.seed_links()
        self.clear_session()
        if self.quit_driver_every_call:
            self.quit_session()

    # draw a link and fetch it
    url = self.pop_link()
    if self.quit_driver_every_call:
        self.open_session()
    self.get_url(url)
    self.clear_session()
    if self.quit_driver_every_call:
        self.quit_session()
	335	+
	336	+	def link_count(self):
	337	+	return int(np.array([len(self.domain_links[dmn]) for dmn in self.domain_links]).sum())
336	338
def seed_links(self):
    '''Add the bias links, then, if the cache has room, run one random web search.

    The search phrase is built from words sampled out of self.words. With more
    than one word, half of the time the first two words are wrapped together
    in double quotes. The phrase is handed to self.get_websearch().
    '''
    # bias with non-random seed links
    self.bias_links()
    if self.link_count() < self.max_links_cached:
        num_words = max(1,npr.poisson(1.33)+1) # 1 + Poisson(1.33) words, so always at least one
        # draw every word in a single sample so that no word repeats within the phrase
        terms = random.sample(self.words,num_words)
        if num_words > 1 and npr.uniform() < 0.5:
            # quote the first two words together; slicing avoids the empty
            # trailing piece (and trailing space) when there are exactly two words
            terms = ['"{}"'.format(' '.join(terms[:2]))] + terms[2:]
        word = ' '.join(terms)
        if self.debug: print('Seeding with search for \'{}\'…'.format(word))
        # self.add_url_links(self.websearch(word).content.decode('utf-8'))
        self.get_websearch(word)
	355	+
	356	+	def bias_links(self):
	357	+	for url in self.seed_bias_links: self.add_link(url)
346	358
347	359		def diurnal_cycle_test(self):
348	360		now = dt.datetime.now()
		skipped 19 lines
def exceeded_bandwidth_tasks(self):
    '''When self.bandwidth_test() is true, thin out the link cache and pause.

    If the cache holds more than 81% of self.max_links_cached, about a tenth
    of the links are drawn with draw_links() and removed. The method then
    sleeps for 120 seconds.
    '''
    if self.bandwidth_test():
        # decimate the stack
        if self.link_count() > int(np.ceil(0.81*self.max_links_cached)):
            for url in self.draw_links(n=int(np.ceil(self.link_count()/10.))):
                # remove the link that was actually drawn; pop_link() would
                # ignore it and draw (and only probably remove) another one
                self.remove_link(url)
        time.sleep(120)
375	387
376	388		def every_hour_tasks(self):
		skipped 34 lines
411	423		# reset bw stats and (really) decimate the stack every couple of weeks
412	424		self.start_time = time.time()
413	425		self.data_usage = 0
414		-	if len(self.links) > int(np.ceil(0.49*self.max_links_cached)):
415		-	for url in random.sample(self.links,int(np.ceil(len(self.links)/3.))):
416		-	self.remove_link(url)
	426	+	if self.link_count() > int(np.ceil(0.49*self.max_links_cached)):
	427	+	for url in self.draw_links(n=int(np.ceil(self.link_count()/3.))):
	428	+	self.pop_link(url)
417	429
418	430		def set_user_agent(self):
419	431		global user_agent
		skipped 3 lines
423	435		except Exception as e:
424	436		if self.debug: print('.update() exception:\n{}'.format(e))
425	437
426		-	def remove_link(self):
427		-	url = random.sample(self.links,1)[0]
	438	+	def draw_link(self):
	439	+	return self.draw_links(n=1)[0]
	440	+
	441	+	def draw_links(self,n=1):
	442	+	urls = []
	443	+	domain_count = np.array([(dmn,len(self.domain_links[dmn])) for dmn in self.domain_links])
	444	+	p = np.array([np.float(c) for d,c in domain_count])
	445	+	count_total = p.sum()
	446	+	if count_total > 0:
	447	+	p = p/p.sum()
	448	+	cnts = npr.multinomial(n, pvals=p)
	449	+	if n > 1:
	450	+	for k in range(len(cnts)):
	451	+	domain = domain_count[k][0]
	452	+	cnt = min(cnts[k],domain_count[k][1])
	453	+	for url in random.sample(self.domain_links[domain],cnt):
	454	+	urls.append(url)
	455	+	else:
	456	+	k = int(np.nonzero(cnts)[0])
	457	+	domain = domain_count[k][0]
	458	+	url = random.sample(self.domain_links[domain],1)[0]
	459	+	urls.append(url)
	460	+	return urls
	461	+
	462	+	def pop_link(self):
	463	+	url = self.draw_link()
428	464		if npr.uniform() < 0.95: # 95% 1 GET, ~5% 2 GETs, .2% three GETs
429		-	self.links.remove(url) # pop a random item from the stack
430		-	self.decrement_link_count(url)
	465	+	self.remove_link(url) # pop a random item from the stack
431	466		return url
432	467
def add_link(self,url):
    '''Cache url under its domain; return True if it was added, False otherwise.

    The link is rejected when the cache already holds self.max_links_cached
    links, when its domain already holds self.max_links_per_domain links, or
    when the URL is already cached.
    '''
    result = False
    domain = self.domain_name(url)
    # domain_links is a dict, so look the domain up as a key (getattr would
    # always fall through to its default)
    known = self.domain_links.get(domain,())
    if self.link_count() < self.max_links_cached \
            and len(known) < self.max_links_per_domain \
            and url not in known:
        self.domain_links.setdefault(domain, set()).add(url)
        result = True
        # if self.debug: print('\tAdded link \'{}\'…'.format(url))
    return result
445	479
446		-	def decrement_link_count(self,url,domain=None):
447		-	if domain is None: domain = self.domain_name(url)
448		-	self.link_count.setdefault(domain,0)
449		-	if self.link_count[domain] > 0: self.link_count[domain] -= 1
450		-
451		-	def increment_link_count(self,url,domain=None):
452		-	if domain is None: domain = self.domain_name(url)
453		-	self.link_count.setdefault(domain,0)
454		-	self.link_count[domain] += 1
	480	+	def remove_link(self,url):
	481	+	result = False
	482	+	domain = self.domain_name(url)
	483	+	if url in getattr(self.domain_links,domain,set()):
	484	+	self.domain_links[domain].remove(url)
	485	+	if len(self.domain_links[domain]) == 0:
	486	+	self.domain_links.remove(domain)
	487	+	result = True
	488	+	return result
455	489
def domain_name(self,url):
    '''Return the last two dot-separated labels of the URL's network location.'''
    labels = uprs.urlparse(url).netloc.split('.')
    return '.'.join(labels[-2:])
		skipped 13 lines
471	505		except Exception as e:
472	506		if self.debug: print('.page_source exception:\n{}'.format(e))
473	507		new_links = self.websearch_links()
474		-	if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url)
	508	+	if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
475	509
476	510		def websearch_links(self):
477	511		'''Webpage format for a popular search engine, <div class="g">'''
		skipped 22 lines
500	534		except Exception as e:
501	535		if self.debug: print('.page_source exception:\n{}'.format(e))
502	536		new_links = self.url_links()
503		-	if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url)
	537	+	if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
504	538
505	539		def url_links(self):
506	540		'''Generic webpage link finder format.'''
		skipped 35 lines
542	576		except Exception as e:
543	577		if self.debug: print('.current_url exception:\n{}'.format(e))
544	578		if self.debug:
545		-	print("'{}': {:d} links added, {:d} total".format(current_url,k,len(self.links)))
	579	+	print("'{}': {:d} links added, {:d} total".format(current_url,k,self.link_count()))
546	580		elif self.verbose:
547	581		self.print_progress(k,current_url)
548	582
549	583		def print_progress(self,num_links,url,terminal_width=80):
550	584		# truncate or fill with white space
551		-	text_suffix = ': {:d} links added, {:d} total'.format(num_links,len(self.links))
	585	+	text_suffix = ': {:d} links added, {:d} total'.format(num_links,self.link_count())
552	586		chars_used = 2 + len(text_suffix)
553	587		if len(url) + chars_used > terminal_width:
554	588		url = url[:terminal_width-chars_used-1] + '…'
		skipped 72 lines

Improve domain handling with dict of sets