| skipped 84 lines |
85 | 85 | | {r'Mac\s*OS': 3, r'iOS': 6, r'Linux': 1, r'Windows': 1, 'noneoftheabove': 1}, |
86 | 86 | | 'is_pc': |
87 | 87 | | {True: 4, False: 6}, |
| 88 | + | 'is_pc': |
| 89 | + | {True: 4, False: 6}, |
| 90 | + | 'is_touch_capable': |
| 91 | + | {True: 6, False: 4}, |
88 | 92 | | } |
# Project each property's weights onto the probability simplex: take absolute
# values and rescale so that every per-property table sums to 1.
for tlf in property_pvals:
    weights = property_pvals[tlf]
    # Accumulate the L1 norm of this table explicitly (float accumulator).
    tot = 0.
    for f in weights:
        tot += abs(weights[f])
    # Overwrite each weight in place with its normalized magnitude.
    for f in weights:
        weights[f] = abs(weights[f]) / tot
89 | 98 | | |
90 | 99 | | # tell ISP that an iPad is being used |
91 | 100 | | user_agent = 'Mozilla/5.0 (iPad; CPU OS 6_1 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B141 Safari/8536.25' |
| skipped 270 lines |
362 | 371 | | except Exception as e: |
363 | 372 | | if self.debug: print('.execute_script() exception:\n{}'.format(e)) |
364 | 373 | | |
365 | | - | def get_blacklist(self): |
| 374 | + | def get_blacklist(self,update_flag=False): |
| 375 | + | blacklist_domains = getattr(self,'blacklist_domains',set()) |
| 376 | + | blacklist_urls = getattr(self,'blacklist_urls',set()) |
366 | 377 | | self.blacklist_domains = set() |
367 | 378 | | self.blacklist_urls = set() |
368 | 379 | | try: |
| skipped 9 lines |
378 | 389 | | if self.verbose: print(e) |
379 | 390 | | # Make sure blacklists are not empty |
380 | 391 | | if self.blacklist: |
381 | | - | try: |
382 | | - | assert self.blacklist_domains != set() or self.blacklist_urls != set() |
| 392 | + | try: # no fully empty collection of blacklists |
| 393 | + | assert (self.blacklist_domains != set() or self.blacklist_urls != set()) \ |
| 394 | + | and (not update_flag or (blacklist_domains != set() or blacklist_urls != set())) |
383 | 395 | | except AssertionError as e: |
384 | 396 | | print(e) |
385 | | - | print('Empty blacklists! Exiting.') |
386 | | - | sys.exit(1) |
| 397 | + | if update_flag: |
| 398 | + | self.blacklist_domains = blacklist_domains |
| 399 | + | self.blacklist_urls = blacklist_urls |
| 400 | + | warn.warn('Blacklists not updated; falling back on previous blacklist download.') |
| 401 | + | else: |
| 402 | + | print('Empty blacklists! Exiting.') |
| 403 | + | sys.exit(1) |
387 | 404 | | # ignore problem urls |
388 | 405 | | self.blacklist_urls |= { 'about:blank' } |
389 | 406 | | |
| skipped 221 lines |
611 | 628 | | self.start_time = time.time() |
612 | 629 | | self.data_usage = 0 |
613 | 630 | | self.decimate_links(total_frac=0.49, decimate_frac=0.333) |
614 | | - | self.get_blacklist() # reload the latest blacklists |
| 631 | + | self.get_blacklist(update_flag=True) # reload the latest blacklists |
615 | 632 | | |
616 | 633 | | def decimate_links(self, total_frac=0.81, decimate_frac=0.1, log_sampling=False): |
617 | 634 | | """ Delete `decimate_frac` of links if the total exceeds `total_frac` of the maximum allowed. """ |
| skipped 2 lines |
620 | 637 | | self.remove_link(url) |
621 | 638 | | |
622 | 639 | | def set_user_agent(self): |
623 | | - | self.user_agent_draw() |
| 640 | + | self.draw_user_agent() |
624 | 641 | | try: |
625 | 642 | | @self.phantomjs_short_timeout |
626 | 643 | | def phantomjs_capabilities_update(): |
| skipped 2 lines |
629 | 646 | | except Exception as e: |
630 | 647 | | if self.debug: print('.update() exception:\n{}'.format(e)) |
631 | 648 | | |
def draw_user_agent(self, max_draws=10000):
    """Draw a random User-Agent string and store it in ``self.user_agent``.

    Two modes, selected by the module-level ``ua_parse_flag``:

    * Flag false: use ``self.fake.user_agent()`` with probability 0.95 and
      the module-level default ``user_agent`` otherwise (uniform drawing is
      mildly susceptible to ML).  Nothing is printed in this mode.
    * Flag true: rejection-sample candidates from ``self.fake.user_agent()``
      against the acceptance weights in ``self.property_pvals`` (keys
      ``'browser'``, ``'os'``, ``'is_pc'``, ``'is_touch_capable'``).  A
      candidate is accepted when an independent uniform draw falls at or
      below each of its four weights.  The chosen string is printed.

    Args:
        max_draws: Upper bound on the number of candidates drawn.  If no
            candidate is accepted within the bound, the last one drawn is
            kept.  At least one candidate is always drawn, so values <= 0
            behave like 1 instead of failing on an unbound local.
    """
    if not ua_parse_flag:
        self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent
        return

    # Draw User-Agent from the pre-defined property distribution.
    property_pvals = self.property_pvals

    def family_weight(table, family):
        # Weight of the first regex key (case-insensitive) found in `family`;
        # falls back to the catch-all 'noneoftheabove' entry.
        for pattern in table:
            if re.search(pattern, family, flags=re.IGNORECASE):
                return table[pattern]
        return table['noneoftheabove']

    draws = 0
    while True:
        uap = ua.parse(self.fake.user_agent())
        draws += 1
        p_browser = family_weight(property_pvals['browser'], uap.browser.family)
        p_os = family_weight(property_pvals['os'], uap.os.family)
        p_pc = property_pvals['is_pc'][uap.is_pc]
        p_touch_capable = property_pvals['is_touch_capable'][uap.is_touch_capable]
        # Short-circuit order is kept so the sequence of random draws per
        # candidate matches the previous implementation.
        accepted = (npr.uniform() <= p_browser
                    and npr.uniform() <= p_os
                    and npr.uniform() <= p_pc
                    and npr.uniform() <= p_touch_capable)
        # Stop on acceptance, or once the draw budget is used up (the last
        # candidate is then kept as a fallback).
        if accepted or draws >= max_draws:
            break
    self.user_agent = uap.ua_string
    print(self.user_agent, flush=True)
658 | 680 | | |
659 | 681 | | def draw_link(self,log_sampling=True): |
660 | 682 | | """ Draw a single, random link. """ |
| skipped 441 lines |