🤬
  • ■ ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 84 lines
    85 85   {r'Mac\s*OS': 3, r'iOS': 6, r'Linux': 1, r'Windows': 1, 'noneoftheabove': 1},
    86 86   'is_pc':
    87 87   {True: 4, False: 6},
     88 + 'is_pc':
     89 + {True: 4, False: 6},
     90 + 'is_touch_capable':
     91 + {True: 6, False: 4},
    88 92   }
     93 +# project to simplex
     94 +for tlf in property_pvals:
     95 + tot = 0.
     96 + for f in property_pvals[tlf]: tot += abs(property_pvals[tlf][f])
     97 + for f in property_pvals[tlf]: property_pvals[tlf][f] = abs(property_pvals[tlf][f])/tot
    89 98   
    90 99  # tell ISP that an iPad is being used
    91 100  user_agent = 'Mozilla/5.0 (iPad; CPU OS 6_1 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B141 Safari/8536.25'
    skipped 270 lines
    362 371   except Exception as e:
    363 372   if self.debug: print('.execute_script() exception:\n{}'.format(e))
    364 373   
    365  - def get_blacklist(self):
     374 + def get_blacklist(self,update_flag=False):
     375 + blacklist_domains = getattr(self,'blacklist_domains',set())
     376 + blacklist_urls = getattr(self,'blacklist_urls',set())
    366 377   self.blacklist_domains = set()
    367 378   self.blacklist_urls = set()
    368 379   try:
    skipped 9 lines
    378 389   if self.verbose: print(e)
    379 390   # Make sure blacklists are not empty
    380 391   if self.blacklist:
    381  - try:
    382  - assert self.blacklist_domains != set() or self.blacklist_urls != set()
     392 + try: # no fully empty collection of blacklists
     393 + assert (self.blacklist_domains != set() or self.blacklist_urls != set()) \
     394 + and (not update_flag or (blacklist_domains != set() or blacklist_urls != set()))
    383 395   except AssertionError as e:
    384 396   print(e)
    385  - print('Empty blacklists! Exiting.')
    386  - sys.exit(1)
     397 + if update_flag:
     398 + self.blacklist_domains = blacklist_domains
     399 + self.blacklist_urls = blacklist_urls
     400 + warn.warn('Blacklists not updated; falling back on previous blacklist download.')
     401 + else:
     402 + print('Empty blacklists! Exiting.')
     403 + sys.exit(1)
    387 404   # ignore problem urls
    388 405   self.blacklist_urls |= { 'about:blank' }
    389 406   
    skipped 221 lines
    611 628   self.start_time = time.time()
    612 629   self.data_usage = 0
    613 630   self.decimate_links(total_frac=0.49, decimate_frac=0.333)
    614  - self.get_blacklist() # reload the latest blacklists
     631 + self.get_blacklist(update_flag=True) # reload the latest blacklists
    615 632   
    616 633   def decimate_links(self, total_frac=0.81, decimate_frac=0.1, log_sampling=False):
    617 634   """ Delete `decimate_frac` of links if the total exceeds `total_frac` of the maximum allowed. """
    skipped 2 lines
    620 637   self.remove_link(url)
    621 638   
    622 639   def set_user_agent(self):
    623  - self.user_agent_draw()
     640 + self.draw_user_agent()
    624 641   try:
    625 642   @self.phantomjs_short_timeout
    626 643   def phantomjs_capabilities_update():
    skipped 2 lines
    629 646   except Exception as e:
    630 647   if self.debug: print('.update() exception:\n{}'.format(e))
    631 648   
    632  - def user_agent_draw(self):
    633  - """Draw a random User-Agent either uniformly (mildly susceptible to ML), or from a distribution. """
     649 + def draw_user_agent(self,max_draws=10000):
     650 + """Draw a random User-Agent either uniformly (mildly susceptible to ML), or from a matched distribution."""
    634 651   global ua_parse_flag, user_agent
    635  - if not ua_parse_flag: #
     652 + if not ua_parse_flag:
    636 653   self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent
    637 654   return
    638 655   # Draw User-Agent from pre-defined property distribution
    639 656   property_pvals = self.property_pvals
    640  - while True:
     657 + k = 0
     658 + while k < max_draws:
    641 659   uap = ua.parse(self.fake.user_agent())
    642 660   # print(uap.ua_string)
    643 661   p_browser = property_pvals['browser']['noneoftheabove']
    644  - for k in property_pvals['browser']:
    645  - if bool(re.findall(k, uap.browser.family, flags=re.IGNORECASE)):
    646  - p_browser = property_pvals['browser'][k]
     662 + for ky in property_pvals['browser']:
     663 + if bool(re.findall(ky, uap.browser.family, flags=re.IGNORECASE)):
     664 + p_browser = property_pvals['browser'][ky]
    647 665   break
    648 666   p_os = property_pvals['os']['noneoftheabove']
    649  - for k in property_pvals['os']:
    650  - if bool(re.findall(k, uap.os.family, flags=re.IGNORECASE)):
    651  - p_os = property_pvals['os'][k]
     667 + for ky in property_pvals['os']:
     668 + if bool(re.findall(ky, uap.os.family, flags=re.IGNORECASE)):
     669 + p_os = property_pvals['os'][ky]
    652 670   break
    653 671   p_pc = property_pvals['is_pc'][uap.is_pc]
     672 + p_touch_capable = property_pvals['is_touch_capable'][uap.is_touch_capable]
    654 673   if npr.uniform() <= p_browser \
    655 674   and npr.uniform() <= p_os \
    656  - and npr.uniform() <= p_pc: break
     675 + and npr.uniform() <= p_pc \
     676 + and npr.uniform() <= p_touch_capable: break
     677 + k += 1
    657 678   self.user_agent = uap.ua_string
     679 + print(self.user_agent,flush=True)
    658 680   
    659 681   def draw_link(self,log_sampling=True):
    660 682   """ Draw a single, random link. """
    skipped 441 lines
Please wait...
Page is in error, reload to recover