skipped 18 lines 19 19 # You should have received a copy of the GNU General Public License 20 20 # along with this program. If not, see <http://www.gnu.org/licenses/>. 21 21 22 - __version__ = '1.3 ' 22 + __version__ = '1.4 ' 23 23 24 24 import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn 25 25 import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs skipped 2 lines 28 28 from selenium.webdriver.support.ui import WebDriverWait 29 29 from io import BytesIO 30 30 from faker import Factory 31 + 32 + # parse User-Agent for matching distribution 33 + ua_parse_flag = True 34 + try: 35 + # pip install user-agents 36 + import user_agents as ua 37 + except ImportError: 38 + ua_parse_flag = False 31 39 32 40 # ensure pyopenssl exists to address SNI support 33 41 # https://stackoverflow.com/questions/18578439/using-requests-with-tls-doesnt-give-sni-support/18579484#18579484 skipped 33 lines 67 75 # commercial usage. This includes all kinds of private usage. 68 76 # The lists must not be given to any third party. 
69 77 70 - # tell my ISP that I use a really awful browser, along with random user agents (below) 71 - user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko' 78 + # property value distribution to match household 79 + property_pvals = \ 80 + {'DNT': # Do Not Track HTTP header 81 + {True: 0.8, False: 0.2}, 82 + 'browser': 83 + {'Safari': 6, 'Firefox': 3, 'Chrome': 2, 'noneoftheabove': 1}, 84 + 'os': 85 + {r'Mac\s*OS': 3, r'iOS': 6, r'Linux': 1, r'Windows': 1, 'noneoftheabove': 1}, 86 + 'is_pc': 87 + {True: 4, False: 6}, 88 + } 89 + 90 + # tell ISP that an iPad is being used 91 + user_agent = 'Mozilla/5.0 (iPad; CPU OS 6_1 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B141 Safari/8536.25' 72 92 73 93 # bias the content with non-random, diverse, link-heavy, popular content 74 94 seed_bias_links = ['http://my.xfinity.com/news', skipped 101 lines 176 196 max_links_cached=max_links_cached, 177 197 max_links_per_page=max_links_per_page, 178 198 max_links_per_domain=max_links_per_domain, 179 - user_agent=user_agent, 199 + property_pvals=property_pvals, 200 + user_agent=user_agent, 180 201 blacklist_url=blacklist_url, 181 202 wordsite_url=wordsite_url, 182 203 seed_bias_links=seed_bias_links, skipped 4 lines 187 208 self.max_links_cached = max_links_cached 188 209 self.max_links_per_page = max_links_per_page 189 210 self.max_links_per_domain = max_links_per_domain 211 + self.property_pvals = property_pvals 190 212 self.user_agent = user_agent 191 213 self.blacklist_url = blacklist_url 192 214 self.wordsite_url = wordsite_url skipped 23 lines 216 238 self.data_usage = 0 217 239 self.get_blacklist() 218 240 self.get_random_words() 241 + self.set_user_agent() 219 242 self.pollute_forever() 220 243 221 244 def parseArgs(self): skipped 47 lines 269 292 dcap['acceptSslCerts'] = ( True ) 270 293 dcap['applicationCacheEnabled'] = ( True ) 271 294 dcap['handlesAlerts'] = ( False ) 272 - dcap['phantomjs.page.customHeaders'] = 
( { 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, sdch' } ) 295 + dcap['phantomjs.page.customHeaders'] = ( { 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, sdch', ' DNT ' : ' 1 ' } ) 273 296 phantomjs_service_args = ['--disk-cache=false','--ignore-ssl-errors=false','--ssl-protocol=TLSv1.2'] 274 297 if self.proxy is not None: 275 298 phantomjs_service_args = ['--proxy={}'.format(self.proxy)] + phantomjs_service_args skipped 321 lines 597 620 self.remove_link(url) 598 621 599 622 def set_user_agent(self): 600 - global user_agent 601 - self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent 623 + self.user_agent_draw() 602 624 try: 603 625 @self.phantomjs_short_timeout 604 626 def phantomjs_capabilities_update(): skipped 1 lines 606 628 phantomjs_capabilities_update() 607 629 except Exception as e: 608 630 if self.debug: print('.update() exception:\n{}'.format(e)) 631 + 632 + def user_agent_draw(self): 633 + """Draw a random User-Agent either uniformly (mildly susceptible to ML), or from a distribution. 
""" 634 + global ua_parse_flag, user_agent 635 + if not ua_parse_flag: # 636 + self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent 637 + return 638 + # Draw User-Agent from pre-defined property distribution 639 + property_pvals = self.property_pvals 640 + while True: 641 + uap = ua.parse(self.fake.user_agent()) 642 + # print(uap.ua_string) 643 + p_browser = property_pvals['browser']['noneoftheabove'] 644 + for k in property_pvals['browser']: 645 + if bool(re.findall(k, uap.browser.family, flags=re.IGNORECASE)): 646 + p_browser = property_pvals['browser'][k] 647 + break 648 + p_os = property_pvals['os']['noneoftheabove'] 649 + for k in property_pvals['os']: 650 + if bool(re.findall(k, uap.os.family, flags=re.IGNORECASE)): 651 + p_os = property_pvals['os'][k] 652 + break 653 + p_pc = property_pvals['is_pc'][uap.is_pc] 654 + if npr.uniform() <= p_browser \ 655 + and npr.uniform() <= p_os \ 656 + and npr.uniform() <= p_pc: break 657 + self.user_agent = uap.ua_string 609 658 610 659 def draw_link(self,log_sampling=True): 611 660 """ Draw a single, random link. """ skipped 441 lines