    isp_data_pollution.py
    skipped 158 lines
159 159           self.fake = Factory.create()
160 160           self.hour_trigger = True
161 161           self.twentyfour_hour_trigger = True
162  -            self.links = set()
163  -            self.link_count = dict()
    162 +         self.domain_links = dict()
164 163           self.start_time = time.time()
165 164           self.data_usage = 0
166 165           self.get_blacklist()
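
Note: this hunk replaces the flat self.links set and the separate self.link_count tally with a single self.domain_links dict that maps a registered domain to the set of URLs cached for it. A minimal standalone sketch of that bookkeeping, assuming a hypothetical add() helper and an illustrative per-domain cap (the class itself uses self.max_links_per_domain):

    # Illustration only, not part of the file: one dict keyed by domain,
    # each value a set of URLs, so the total link count and the per-domain
    # cap both fall out of the same structure.
    domain_links = dict()

    def add(url, domain, max_per_domain=100):            # hypothetical helper
        links = domain_links.setdefault(domain, set())
        if len(links) < max_per_domain:
            links.add(url)

    add('https://example.com/a', 'example.com')
    add('https://example.com/b', 'example.com')
    add('https://example.org/x', 'example.org')

    print(sum(len(s) for s in domain_links.values()))    # 3 links in total
    print(len(domain_links['example.com']))              # 2 links for this domain
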
    skipped 156 lines
323 322   
324 323       def pollute(self):
325 324           if not self.quit_driver_every_call: self.check_phantomjs_process()
326  -            if len(self.links) < 2000:
    325 +         if self.link_count() < 2000:
327 326               if self.quit_driver_every_call: self.open_session()
328 327               self.seed_links()
329 328               self.clear_session()
330 329               if self.quit_driver_every_call: self.quit_session()
331  -            url = self.remove_link()
    330 +         url = self.pop_link()
332 331           if self.quit_driver_every_call: self.open_session()
333 332           self.get_url(url)
334 333           self.clear_session()
335 334           if self.quit_driver_every_call: self.quit_session()
    335 + 
    336 +     def link_count(self):
    337 +         return int(np.array([len(self.domain_links[dmn]) for dmn in self.domain_links]).sum())
336 338   
337 339       def seed_links(self):
338 340           # bias with non-random seed links
339  -            self.links |= set(self.seed_bias_links)
340  -            if len(self.links) < self.max_links_cached:
    341 +         self.bias_links()
    342 +         if self.link_count() < self.max_links_cached:
341 343               num_words = max(1,npr.poisson(1.33)+1) # mean of 1.33 words per search
342  -                word = ' '.join(random.sample(self.words,num_words))
    344 +             if num_words == 1:
    345 +                 word = ' '.join(random.sample(self.words,num_words))
    346 +             else:
    347 +                 if npr.uniform() < 0.5:
    348 +                     word = ' '.join(random.sample(self.words,num_words))
    349 +                 else: # quote the first two words together
    350 +                     word = ' '.join(['"{}"'.format(' '.join(random.sample(self.words, 2))),
    351 +                                          ' '.join(random.sample(self.words, num_words-2))])
343 352               if self.debug: print('Seeding with search for \'{}\'…'.format(word))
344 353               # self.add_url_links(self.websearch(word).content.decode('utf-8'))
345 354               self.get_websearch(word)
    355 + 
    356 +     def bias_links(self):
    357 +         for url in self.seed_bias_links: self.add_link(url)
346 358   
347 359       def diurnal_cycle_test(self):
348 360           now = dt.datetime.now()
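
Note: seed_links() now draws a Poisson-distributed number of search words and, when more than one word is drawn, quotes the first two together as an exact phrase about half the time. A standalone sketch of that branch logic, assuming a small stand-in word list and a hypothetical seed_phrase() helper (the class samples from its much larger self.words dictionary, so running out of words is not a concern there):

    import random
    import numpy.random as npr

    WORDS = ['alpha', 'bravo', 'charlie', 'delta', 'echo',
             'foxtrot', 'golf', 'hotel', 'india', 'juliett']

    def seed_phrase(words=WORDS):                     # hypothetical helper
        num_words = max(1, npr.poisson(1.33) + 1)     # usually 1-4 words
        if num_words == 1 or npr.uniform() < 0.5:
            return ' '.join(random.sample(words, num_words))
        # otherwise quote the first two words together as an exact phrase
        return ' '.join(['"{}"'.format(' '.join(random.sample(words, 2))),
                         ' '.join(random.sample(words, num_words - 2))])

    print(seed_phrase())                              # e.g. '"echo bravo" delta'
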
    skipped 19 lines
368 380       def exceeded_bandwidth_tasks(self):
369 381           if self.bandwidth_test():
370 382               # decimate the stack and clear the cookies
371  -                if len(self.links) > int(np.ceil(0.81*self.max_links_cached)):
372  -                    for url in random.sample(self.links,int(np.ceil(len(self.links)/10.))):
373  -                        self.remove_link(url)
    383 +             if self.link_count() > int(np.ceil(0.81*self.max_links_cached)):
    384 +                 for url in self.draw_links(n=int(np.ceil(self.link_count()/10.))):
    385 +                     self.remove_link(url)
374 386               time.sleep(120)
375 387   
376 388       def every_hour_tasks(self):
    skipped 34 lines
411 423               # reset bw stats and (really) decimate the stack every couple of weeks
412 424               self.start_time = time.time()
413 425               self.data_usage = 0
414  -                if len(self.links) > int(np.ceil(0.49*self.max_links_cached)):
415  -                    for url in random.sample(self.links,int(np.ceil(len(self.links)/3.))):
416  -                        self.remove_link(url)
    426 +             if self.link_count() > int(np.ceil(0.49*self.max_links_cached)):
    427 +                 for url in self.draw_links(n=int(np.ceil(self.link_count()/3.))):
    428 +                     self.remove_link(url)
417 429   
418 430       def set_user_agent(self):
419 431           global user_agent
    skipped 3 lines
423 435           except Exception as e:
424 436               if self.debug: print('.update() exception:\n{}'.format(e))
425 437   
426  -        def remove_link(self):
427  -            url = random.sample(self.links,1)[0]
    438 +     def draw_link(self):
    439 +         return self.draw_links(n=1)[0]
    440 + 
    441 +     def draw_links(self,n=1):
    442 +         urls = []
    443 +         domain_count = [(dmn,len(self.domain_links[dmn])) for dmn in self.domain_links]
    444 +         p = np.array([float(c) for d,c in domain_count])
    445 +         count_total = p.sum()
    446 +         if count_total > 0:
    447 +             p = p/p.sum()
    448 +             cnts = npr.multinomial(n, pvals=p)
    449 +             if n > 1:
    450 +                 for k in range(len(cnts)):
    451 +                     domain = domain_count[k][0]
    452 +                     cnt = min(cnts[k],domain_count[k][1])
    453 +                     for url in random.sample(self.domain_links[domain],cnt):
    454 +                         urls.append(url)
    455 +             else:
    456 +                 k = int(np.nonzero(cnts)[0][0])
    457 +                 domain = domain_count[k][0]
    458 +                 url = random.sample(self.domain_links[domain],1)[0]
    459 +                 urls.append(url)
    460 +         return urls
    461 + 
    462 +     def pop_link(self):
    463 +         url = self.draw_link()
428 464           if npr.uniform() < 0.95: # 95% 1 GET, ~5% 2 GETs, .2% three GETs
429  -                self.links.remove(url) # pop a random item from the stack
430  -                self.decrement_link_count(url)
    465 +             self.remove_link(url) # pop a random item from the stack
431 466           return url
432 467   
433 468       def add_link(self,url):
434 469           result = False
435 470           domain = self.domain_name(url)
436  -            self.link_count.setdefault(domain,0)
437  -            if len(self.links) < self.max_links_cached \
438  -                    and self.link_count[domain] < self.max_links_per_domain \
439  -                    and url not in self.links:
440  -                self.links.add(url)
441  -                self.increment_link_count(url,domain)
    471 +         if self.link_count() < self.max_links_cached \
    472 +                 and len(self.domain_links.get(domain,[])) < self.max_links_per_domain \
    473 +                 and url not in self.domain_links.get(domain,set()):
    474 +             self.domain_links.setdefault(domain, set())
    475 +             self.domain_links[domain].add(url)
442 476               result = True
443 477               # if self.debug: print('\tAdded link \'{}\'…'.format(url))
444 478           return result
445 479   
446  -        def decrement_link_count(self,url,domain=None):
447  -            if domain is None: domain = self.domain_name(url)
448  -            self.link_count.setdefault(domain,0)
449  -            if self.link_count[domain] > 0: self.link_count[domain] -= 1
450  -    
451  -        def increment_link_count(self,url,domain=None):
452  -            if domain is None: domain = self.domain_name(url)
453  -            self.link_count.setdefault(domain,0)
454  -            self.link_count[domain] += 1
    480 +     def remove_link(self,url):
    481 +         result = False
    482 +         domain = self.domain_name(url)
    483 +         if url in self.domain_links.get(domain,set()):
    484 +             self.domain_links[domain].remove(url)
    485 +             if len(self.domain_links[domain]) == 0:
    486 +                 del self.domain_links[domain]
    487 +             result = True
    488 +         return result
455 489   
456 490       def domain_name(self,url):
457 491           return '.'.join(uprs.urlparse(url).netloc.split('.')[-2:])
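
Note: draw_links() above picks domains with probability proportional to how many links each one currently holds (a multinomial draw over the per-domain counts), then samples URLs uniformly within the chosen domains; pop_link() removes the drawn URL 95% of the time, so a link occasionally gets fetched more than once. A simplified standalone sketch with made-up data, not the file's code: it folds the n == 1 and n > 1 branches into one loop and hands random.sample() a sorted list so it also runs on Python 3.11+:

    import random
    import numpy as np
    import numpy.random as npr

    domain_links = {
        'example.com': {'https://example.com/a', 'https://example.com/b', 'https://example.com/c'},
        'example.org': {'https://example.org/x'},
    }

    def draw_links(n=1):                               # hypothetical stand-in
        urls = []
        domain_count = [(dmn, len(domain_links[dmn])) for dmn in domain_links]
        p = np.array([float(c) for d, c in domain_count])
        if p.sum() > 0:
            cnts = npr.multinomial(n, pvals=p/p.sum())
            for k, cnt in enumerate(cnts):
                domain = domain_count[k][0]
                cnt = int(min(cnt, domain_count[k][1]))   # never ask for more than the domain holds
                urls.extend(random.sample(sorted(domain_links[domain]), cnt))
        return urls

    print(draw_links(n=2))   # e.g. ['https://example.com/b', 'https://example.org/x']
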
    skipped 13 lines
471 505           except Exception as e:
472 506               if self.debug: print('.page_source exception:\n{}'.format(e))
473 507           new_links = self.websearch_links()
474  -            if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url)
    508 +         if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
475 509   
476 510       def websearch_links(self):
477 511           '''Webpage format for a popular search engine, <div class="g">'''
    skipped 22 lines
500 534           except Exception as e:
501 535               if self.debug: print('.page_source exception:\n{}'.format(e))
502 536           new_links = self.url_links()
503  -            if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url)
    537 +         if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
504 538   
505 539       def url_links(self):
506 540           '''Generic webpage link finder format.'''
    skipped 35 lines
542 576           except Exception as e:
543 577               if self.debug: print('.current_url exception:\n{}'.format(e))
544 578           if self.debug:
545  -                print("'{}': {:d} links added, {:d} total".format(current_url,k,len(self.links)))
    579 +             print("'{}': {:d} links added, {:d} total".format(current_url,k,self.link_count()))
546 580           elif self.verbose:
547 581               self.print_progress(k,current_url)
548 582   
549 583       def print_progress(self,num_links,url,terminal_width=80):
550 584           # truncate or fill with white space
551  -            text_suffix = ': {:d} links added, {:d} total'.format(num_links,len(self.links))
    585 +         text_suffix = ': {:d} links added, {:d} total'.format(num_links,self.link_count())
552 586           chars_used = 2 + len(text_suffix)
553 587           if len(url) + chars_used > terminal_width:
554 588               url = url[:terminal_width-chars_used-1] + '…'
    skipped 72 lines