    isp_data_pollution.py
    skipped 45 lines
    46 46  search_url = 'http://www.google.com/search' # keep unencrypted for ISP DPI
    47 47  wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain'
    48 48  timeout = 20
     49 +short_timeout = 3
    49 50   
    50 51  blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz'
    51 52  # Usage of the Shalla Blacklists:
    skipped 34 lines
    86 87   
    87 88  # monkeypatch the read class method in RobotFileParser
    88 89  # many sites will block access to robots.txt without a standard User-Agent header
    89  -short_timeout = 3
    90 90  class RobotFileParserUserAgent(robotparser.RobotFileParser):
    91 91   
    92 92      timeout = short_timeout # short-term timeout
    skipped 321 lines
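Note on the hunk above: `short_timeout = 3` moves up into the configuration block so the robots.txt parser and the WebDriver waits further down share one constant, and `RobotFileParserUserAgent` overrides `read()` because many sites refuse a robots.txt fetch that carries Python's default urllib User-Agent. A minimal standalone sketch of that override, assuming a placeholder User-Agent string (the script itself sets its agent elsewhere):

```python
import urllib.error
import urllib.request
import urllib.robotparser

short_timeout = 3  # seconds, matching the constant hoisted into the config block

class RobotFileParserUserAgent(urllib.robotparser.RobotFileParser):
    # fetch robots.txt with a browser-like User-Agent and a short timeout;
    # the UA string below is a placeholder, not the one the script generates
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    timeout = short_timeout

    def read(self):
        try:
            request = urllib.request.Request(self.url,
                          headers={'User-Agent': self.user_agent})
            f = urllib.request.urlopen(request, timeout=self.timeout)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True   # same policy as the stdlib read()
            elif 400 <= err.code < 500:
                self.allow_all = True
        else:
            self.parse(f.read().decode('utf-8').splitlines())

rp = RobotFileParserUserAgent()
rp.set_url('https://www.example.com/robots.txt')
rp.read()
print(rp.can_fetch(rp.user_agent, 'https://www.example.com/'))
```

The 401/403 and other-4xx handling mirrors the stdlib `read()`, so `can_fetch()` behaves the same way on fetch errors.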
414 414   
415 415      def exceeded_bandwidth_tasks(self):
416 416          if self.bandwidth_test():
417  -            # decimate the stack and clear the cookies
418  -            if self.link_count() > int(np.ceil(0.81*self.max_links_cached)):
419  -                for url in self.draw_links(n=int(np.ceil(self.link_count()/10.))):
420  -                    self.pop_link()
 417 +            self.decimate_links(total_frac=0.81, decimate_frac=0.1)
421 418          time.sleep(120)
422 419   
423 420      def every_hour_tasks(self):
    skipped 16 lines
440 437   
441 438      def every_day_tasks(self):
442 439          if int(self.elapsed_time/3600. % 24.) == 23:
443  -            # clear out cookies every day, and seed more links
 440 +            # clear out cookies every day, decimate, and seed more links
444 441              if self.twentyfour_hour_trigger:
445 442                  if hasattr(self,'session'):
446 443                      self.seed_links()
skipped 2 lines
449 446                      self.open_session()
450 447                  else:
451 448                      self.open_session()
 449 +                    self.decimate_links(total_frac=0.667, decimate_frac=0.1)
452 450                      self.seed_links()
453 451                  if self.quit_driver_every_call: self.quit_session()
454 452              self.twentyfour_hour_trigger = False
    skipped 5 lines
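For reference, the guard in `every_day_tasks()` above, `int(self.elapsed_time/3600. % 24.) == 23`, holds for the entire 23rd hour of each elapsed day, which is why the one-shot `twentyfour_hour_trigger` flag is needed to keep the daily work from re-running. A quick check of the arithmetic:

```python
# the condition holds throughout hour 23 of each elapsed day
for hours in (0, 22.5, 23.0, 23.99, 24.0, 47.2):
    elapsed_time = hours * 3600.
    print(hours, int(elapsed_time/3600. % 24.) == 23)
# -> False, False, True, True, False, True
```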
460 458              # reset bw stats and (really) decimate the stack every couple of weeks
461 459              self.start_time = time.time()
462 460              self.data_usage = 0
463  -            if self.link_count() > int(np.ceil(0.49*self.max_links_cached)):
464  -                for url in self.draw_links(n=int(np.ceil(self.link_count()/3.))):
465  -                    self.pop_link(url)
 461 +            self.decimate_links(total_frac=0.49, decimate_frac=0.333)
 462 + 
 463 +    def decimate_links(self, total_frac=0.81, decimate_frac=0.1): # decimate the stack
 464 +        if self.link_count() > int(np.ceil(total_frac * self.max_links_cached)):
 465 +            for url in self.draw_links(n=int(np.ceil(self.link_count() * decimate_frac))):
 466 +                self.remove_link(url)
466 467   
467 468      def set_user_agent(self):
468 469          global user_agent
    skipped 11 lines
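The three decimation sites (bandwidth exceeded, daily, biweekly) now share `decimate_links()`, parameterized by a fullness threshold `total_frac` and a removal fraction `decimate_frac`. A self-contained sketch of that contract, with a hypothetical `LinkCache` class and a uniform draw standing in for the domain-weighted `draw_links()`:

```python
import random
import numpy as np

# hypothetical minimal stand-in for the crawler class; a uniform draw
# replaces the domain-weighted draw_links() for brevity
class LinkCache:
    def __init__(self, max_links_cached=10000):
        self.max_links_cached = max_links_cached
        self.links = set()
    def link_count(self):
        return len(self.links)
    def draw_links(self, n=1):
        return random.sample(sorted(self.links), min(n, len(self.links)))
    def remove_link(self, url):
        self.links.discard(url)
    def decimate_links(self, total_frac=0.81, decimate_frac=0.1):
        # only act when the cache is total_frac full, then drop
        # ceil(decimate_frac * current) randomly drawn links
        if self.link_count() > int(np.ceil(total_frac * self.max_links_cached)):
            for url in self.draw_links(n=int(np.ceil(self.link_count() * decimate_frac))):
                self.remove_link(url)

cache = LinkCache(max_links_cached=1000)
cache.links = {'http://example.com/{}'.format(k) for k in range(900)}
cache.decimate_links()     # 900 > ceil(0.81*1000) = 810, so ceil(90.0) = 90 links drop
print(cache.link_count())  # 810
```

With the defaults, nothing happens until the cache passes 81% of `max_links_cached`; after that each call drops about 10% of the current links.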
480 481   
481 482      def draw_links(self,n=1,log_sampling=False):
482 483          urls = []
483  -        domain_count = np.array([(dmn,len(self.domain_links[dmn])) for dmn in self.domain_links])
484  -        p = np.array([np.float(c) for d,c in domain_count])
 484 +        domain_array = np.array([dmn for dmn in self.domain_links])
 485 +        domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])])
 486 +        p = np.array([float(c) for c in domain_count])
485 487          count_total = p.sum()
486 488          if log_sampling: # log-sampling [log(x+1)] to bias lower count domains
487 489              p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype)
skipped 1 lines
489 491              p = p/p.sum()
490 492              cnts = npr.multinomial(n, pvals=p)
491 493              if n > 1:
492  -                for k in range(len(cnts)):
493  -                    domain = domain_count[k][0]
494  -                    cnt = min(cnts[k],domain_count[k][1])
 494 +                for k in range(cnts.shape[0]):
 495 +                    domain = domain_array[k]
 496 +                    cnt = min(cnts[k],domain_count[k])
495 497                      for url in random.sample(self.domain_links[domain],cnt):
496 498                          urls.append(url)
497 499              else:
    skipped 51 lines
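The `draw_links()` rewrite above fixes a Python 3 type bug: packing `(domain, count)` pairs into one `np.array` upcasts every element to a string dtype, so `domain_count[k][1]` was a string and `min(cnts[k], domain_count[k][1])` compared `int` with `str`, a `TypeError`. Keeping names and integer counts in separate arrays avoids that. A self-contained sketch of the sampling scheme, using hypothetical example domains:

```python
import random
import numpy as np
import numpy.random as npr

# hypothetical example domains; counts are deliberately skewed
domain_links = {
    'example.com': {'http://example.com/{}'.format(k) for k in range(50)},
    'example.org': {'http://example.org/{}'.format(k) for k in range(5)},
    'example.net': {'http://example.net/0'},
}

def draw_links(domain_links, n=1, log_sampling=False):
    urls = []
    domain_array = np.array(list(domain_links))
    domain_count = np.array([len(domain_links[d]) for d in domain_array])
    p = domain_count.astype(float)
    if log_sampling:  # log(x+1) biases the draw toward link-poor domains
        p = np.log1p(p)
    if p.sum() > 0:
        p = p / p.sum()
        cnts = npr.multinomial(n, pvals=p)  # number of draws per domain
        for k in range(cnts.shape[0]):
            # never draw more links than the domain actually has
            cnt = min(int(cnts[k]), int(domain_count[k]))
            urls.extend(random.sample(sorted(domain_links[domain_array[k]]), cnt))
    return urls

print(len(draw_links(domain_links, n=10)))                     # up to 10 URLs
print(len(draw_links(domain_links, n=10, log_sampling=True)))  # flatter across domains
```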
549 551          # https://github.com/detro/ghostdriver/issues/169
550 552          @self.phantomjs_short_timeout
551 553          def phantomjs_find_elements_by_css_selector():
552  -            return WebDriverWait(self.session, 3).until(lambda x: x.find_elements_by_css_selector('div.g'))
 554 +            return WebDriverWait(self.session,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g'))
553 555          elements = phantomjs_find_elements_by_css_selector()
554 556          # get links in random order until max. per page
555 557          k = 0
skipped 1 lines
557 559          try:
558 560              for div in sorted(elements,key=lambda k: random.random()):
559 561                  @self.phantomjs_short_timeout
560  -                def phantomjs_find_element_by_tag_name():
561  -                    if div.find_element_by_tag_name('a').get_attribute('href') is not None:
562  -                        links.append(div.find_element_by_tag_name('a').get_attribute('href'))
563  -                phantomjs_find_element_by_tag_name()
 562 +                def phantomjs_find_element_by_tag_name(): return div.find_element_by_tag_name('a')
 563 +                a_tag = phantomjs_find_element_by_tag_name()
 564 +                @self.phantomjs_short_timeout
 565 +                def phantomjs_get_attribute(): return a_tag.get_attribute('href')
 566 +                href = phantomjs_get_attribute()
 567 +                if href is not None: links.append(href)
564 568                  k += 1
565 569                  if k > self.max_links_per_page: break
566 570          except Exception as e:
567  -            if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e))
 571 +            if self.debug: print('.find_element_by_tag_name().get_attribute() exception:\n{}'.format(e))
568 572          return links
569 573   
570 574      def get_url(self,url):
    skipped 22 lines
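The hunks above and below also unchain `div.find_element_by_tag_name('a').get_attribute('href')` into two separately guarded calls, so a hang in either WebDriver round trip raises inside its own `@self.phantomjs_short_timeout` wrapper instead of stalling the crawl. The decorator's implementation is outside this diff; one plausible sketch, assuming a SIGALRM-based approach (Unix-only, main thread only):

```python
import signal
import time

short_timeout = 3  # seconds, as in the config block above

# hypothetical sketch of a decorator like self.phantomjs_short_timeout;
# SIGALRM-based, so Unix-only and usable only from the main thread
def phantomjs_short_timeout(func):
    def handler(signum, frame):
        raise TimeoutError('{}() timed out after {}s'.format(func.__name__, short_timeout))
    def wrapper(*args, **kwargs):
        old_handler = signal.signal(signal.SIGALRM, handler)
        signal.alarm(short_timeout)
        try:
            return func(*args, **kwargs)
        finally:
            signal.alarm(0)  # cancel the pending alarm
            signal.signal(signal.SIGALRM, old_handler)
    return wrapper

@phantomjs_short_timeout
def slow_webdriver_call():
    time.sleep(10)  # stands in for a hung WebDriver round trip

try:
    slow_webdriver_call()
except TimeoutError as e:
    print(e)  # slow_webdriver_call() timed out after 3s
```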
593 597          try:
594 598              for a in sorted(elements,key=lambda k: random.random()):
595 599                  @self.phantomjs_short_timeout
596  -                def phantomjs_get_attribute():
597  -                    if a.get_attribute('href') is not None:
598  -                        links.append(a.get_attribute('href'))
599  -                phantomjs_get_attribute()
 600 +                def phantomjs_get_attribute(): return a.get_attribute('href')
 601 +                href = phantomjs_get_attribute()
 602 +                if href is not None: links.append(href)
600 603                  k += 1
601 604                  if k > self.max_links_per_page: break
602  -        except Exception as a:
 605 +        except Exception as e:
603 606              if self.debug: print('.get_attribute() exception:\n{}'.format(e))
604 607          return links
    605 608   
    skipped 8 lines
614 617              rp.read()
615 618              result = rp.can_fetch(self.user_agent,url)
616 619              del rp # ensure self.close() in urllib
617  -        robots_read()
 620 +            return result
 621 +        result = robots_read()
618 622          return result
    619 623   
620 624      def add_url_links(self,links,url=''):
    skipped 131 lines
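The robots.txt hunk above is a closure-scoping repair: the old code bound `result` only inside the nested, timeout-wrapped `robots_read()`, so the outer `return result` raised `NameError`. The fix returns the value through the call. Distilled:

```python
def broken():
    def inner():
        result = 42       # local to inner(); invisible to broken()
    inner()
    return result         # NameError: name 'result' is not defined

def fixed():
    def inner():
        result = 42
        return result     # pass the value back through the call
    result = inner()
    return result         # 42
```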