🤬
  • ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 46 lines
    47 47  wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain'
    48 48  timeout = 20
    49 49  short_timeout = 3
     50 +phantomjs_rss_limit_mb = 1024 # Default maximum memory limit of phantomjs process (MB)
     51 +terminal_width = 80 # tty width, standard is 80 chars; add code to adapt later
    50 52   
    51 53  blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz'
    52 54  # Usage of the Shalla Blacklists:
    skipped 60 lines
    113 115   
    114 116   
    115 117  class ISPDataPollution:
    116  - '''Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy
     118 + """
     119 + Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy
    117 120  
    118 121  I pay my ISP a lot for data usage every month. I typically don't use
    119 122  all the bandwidth that I pay for. If my ISP is going to sell my
    skipped 12 lines
    132 135  The crawler uses the Python requests and lxml.html libraries, is hardcoded
    133 136  to download html without javascript processing, will not download
    134 137  images, and respects robots.txt, which all provide good security.
    135  -'''
     138 + """
    136 139   
    137 140   def __init__(self,gb_per_month=gb_per_month,
    138 141   max_links_cached=max_links_cached,
    skipped 4 lines
    143 146   blacklist_url=blacklist_url,
    144 147   wordsite_url=wordsite_url,
    145 148   seed_bias_links=seed_bias_links,
    146  - timeout=timeout,
     149 + timeout=timeout, diurnal_flag=True,
    147 150   quit_driver_every_call=False,
    148 151   blacklist=True,verbose=True):
    149 152   self.max_links_cached = max_links_cached
    skipped 6 lines
    156 159   self.seed_bias_links = seed_bias_links
    157 160   self.blacklist = blacklist; self.verbose = verbose
    158 161   self.timeout = timeout
     162 + self.diurnal_flag = diurnal_flag
    159 163   self.quit_driver_every_call = quit_driver_every_call
    160 164   # self.gb_per_month = gb_per_month # set in parseArgs
    161 165   # self.debug = debug # set in parseArgs
    skipped 18 lines
    180 184   def parseArgs(self):
    181 185   parser = ap.ArgumentParser()
    182 186   parser.add_argument('-bw', '--gb_per_month', help="GB per month", type=int, default=gb_per_month)
     187 + parser.add_argument('-mm', '--maxmemory', help="Maximum memory of phantomjs (MB); 0=>restart every link", type=int, default=phantomjs_rss_limit_mb)
    183 188   parser.add_argument('-g', '--debug', help="Debug flag", action='store_true')
    184 189   args = parser.parse_args()
    185 190   for k in args.__dict__: setattr(self,k,getattr(args,k))
    skipped 2 lines
    188 193   
    189 194   def sanity_check_arguments(self):
    190 195   self.gb_per_month = min(2048,max(1,self.gb_per_month)) # min-max bandwidth limits
     196 + if self.maxmemory == 0: self.quit_driver_every_call = True
     197 + self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max memory limits
    191 198   
    192 199   def open_session(self):
    193 200   self.quit_session()
    skipped 20 lines
    214 221   self.session = driver
    215 222   
    216 223   def quit_session(self,hard_quit=False,pid=None):
    217  - ''' close, kill -9, quit, del '''
     224 + """
     225 + close, kill -9, quit, del
     226 + :param hard_quit:
     227 + :param pid:
     228 + :return:
     229 + """
    218 230   # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution
    219 231   if hasattr(self,'session'):
    220 232   if not hard_quit:
    skipped 102 lines
    323 335   if self.quit_driver_every_call: self.quit_session()
    324 336   while True: # pollute forever, pausing only to meet the bandwidth requirement
    325 337   try:
    326  - if self.diurnal_cycle_test():
     338 + if (not self.diurnal_flag) or self.diurnal_cycle_test():
    327 339   self.pollute()
    328 340   else:
    329 341   time.sleep(self.chi2_mean_std(3.,1.))
    skipped 13 lines
    343 355   self.clear_session()
    344 356   if self.quit_driver_every_call: self.quit_session()
    345 357   url = self.pop_link()
     358 + if self.verbose: self.print_url(url)
    346 359   if self.quit_driver_every_call: self.open_session()
    347 360   self.get_url(url)
    348 361   self.clear_session()
    skipped 55 lines
    404 417   return npr.uniform() < val
    405 418   
    406 419   def chi2_mean_std(self,mean=1.,std=0.1):
    407  - '''
     420 + """
    408 421   Chi-squared random variable with given mean and standard deviation.
    409  - '''
     422 + :param mean:
     423 + :param std:
     424 + :return:
     425 + """
    410 426   scale = 2.*mean/std
    411 427   nu = mean*scale
    412 428   return npr.chisquare(nu)/scale
    skipped 124 lines
    537 553   return '.'.join(uprs.urlparse(url).netloc.split('.')[-2:])
    538 554   
    539 555   def get_websearch(self,query):
    540  - '''HTTP GET of a websearch, then add any embedded links.'''
     556 + """
     557 + HTTP GET of a websearch, then add any embedded links.
     558 + :param query:
     559 + :return:
     560 + """
    541 561   url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query)))
     562 + if self.verbose: self.print_url(url)
    542 563   @self.phantomjs_timeout
    543 564   def phantomjs_get(): self.session.get(url) # selenium driver
    544 565   phantomjs_get()
    skipped 4 lines
    549 570   if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
    550 571   
    551 572   def websearch_links(self):
    552  - '''Webpage format for a popular search engine, <div class="g">'''
     573 + """
     574 + Webpage format for a popular search engine, <div class="g">.
     575 + :return:
     576 + """
    553 577   # https://github.com/detro/ghostdriver/issues/169
    554 578   @self.phantomjs_short_timeout
    555 579   def phantomjs_find_elements_by_css_selector():
    skipped 18 lines
    574 598   return links
    575 599   
    576 600   def get_url(self,url):
    577  - '''HTTP GET of the url, and add any embedded links.'''
     601 + """
     602 + HTTP GET of the url, and add any embedded links.
     603 + :param url:
     604 + :return:
     605 + """
    578 606   if not self.check_robots(url): return # bail out if robots.txt says to
    579 607   @self.phantomjs_timeout
    580 608   def phantomjs_get(): self.session.get(url) # selenium driver
    skipped 60 lines
    641 669   except Exception as e:
    642 670   if self.debug: print('.current_url exception:\n{}'.format(e))
    643 671   if self.debug:
    644  - print("'{}': {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy()))
     672 + print("{}: {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy()))
    645 673   elif self.verbose:
    646  - self.print_progress(k,current_url)
     674 + self.print_progress(current_url,num_links=k)
    647 675   
    648  - def print_progress(self,num_links,url,terminal_width=80):
    649  - # truncate or fill with white space
    650  - text_suffix = ': +{:d}/{:d} links, H(domain)={:.1f} b'.format(num_links,self.link_count(),self.domain_entropy())
    651  - chars_used = 2 + len(text_suffix)
    652  - if len(url) + chars_used > terminal_width:
    653  - url = url[:terminal_width-chars_used-1] + '…'
    654  - text = "'{}'{}".format(url,text_suffix)
     676 + def print_url(self,url):
     677 + if self.debug: print(url + ' …')
     678 + else: self.print_progress(url)
     679 + 
     680 + def print_progress(self,url,num_links=None):
     681 + if num_links is not None:
     682 + text_suffix = ': +{:d}/{:d} links, H(domain)={:.1f} b'.format(num_links,self.link_count(),self.domain_entropy())
     683 + else:
     684 + text_suffix = ': {:d} links, H(domain)={:.1f} b …'.format(self.link_count(),self.domain_entropy())
     685 + self.print_truncated_line(url,text_suffix)
     686 + 
     687 + def print_truncated_line(self,url,text_suffix='',terminal_width=terminal_width):
     688 + """
     689 + Print truncated `url` + `text_suffix` to fill `terminal_width`
     690 + :param url:
     691 + :param text_suffix:
     692 + :param terminal_width:
     693 + :return:
     694 + """
     695 + chars_used = len(text_suffix)
     696 + if text_suffix == '…':
     697 + if len(url) >= terminal_width:
     698 + url = url[:terminal_width-1] # add '…' below
     699 + elif len(url) < terminal_width-1:
     700 + url += ' ' # add an extra space before the ellipsis
     701 + else:
     702 + if len(url) + chars_used > terminal_width:
     703 + url = url[:terminal_width-chars_used-1] + '…'
     704 + text = "{}{}".format(url,text_suffix) # added white space necessary
    655 705   text = text[:min(terminal_width,len(text))] + ' ' * max(0,terminal_width-len(text))
    656 706   print(text,end='',flush=True)
    657 707   time.sleep(0.01)
    skipped 55 lines
    713 763   raise self.TimeoutError('robotparser is taking too long')
    714 764   
    715 765   def check_phantomjs_process(self):
    716  - '''Check if phantomjs is running.'''
     766 + """
     767 + Check if phantomjs is running.
     768 + :return:
     769 + """
    717 770   # Check rss and restart if too large, then check existence
    718 771   # http://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python
    719 772   try:
    720 773   if not hasattr(self,'session'): self.open_session()
    721 774   pid, rss_mb = self.phantomjs_pid_and_memory()
    722  - if rss_mb > 1024: # 1 GB rss limit
     775 + if rss_mb > self.phantomjs_rss_limit_mb: # memory limit
    723 776   self.quit_session(pid=pid)
    724 777   self.open_session()
    725 778   pid, _ = self.phantomjs_pid_and_memory()
    skipped 32 lines
Please wait...
Page is in error, reload to recover