🤬
  • ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 18 lines
    19 19  # You should have received a copy of the GNU General Public License
    20 20  # along with this program. If not, see <http://www.gnu.org/licenses/>.
    21 21   
    22  -__version__ = '1.3'
     22 +__version__ = '1.4'
    23 23   
    24 24  import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn
    25 25  import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
    skipped 2 lines
    28 28  from selenium.webdriver.support.ui import WebDriverWait
    29 29  from io import BytesIO
    30 30  from faker import Factory
     31 + 
     32 +# parse User-Agent for matching distribution
     33 +ua_parse_flag = True
     34 +try:
     35 + # pip install user-agents
     36 + import user_agents as ua
     37 +except ImportError:
     38 + ua_parse_flag = False
    31 39   
    32 40  # ensure pyopenssl exists to address SNI support
    33 41  # https://stackoverflow.com/questions/18578439/using-requests-with-tls-doesnt-give-sni-support/18579484#18579484
    skipped 33 lines
    67 75  # commercial usage. This includes all kinds of private usage.
    68 76  # The lists must not be given to any third party.
    69 77   
    70  -# tell my ISP that I use a really awful browser, along with random user agents (below)
    71  -user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
     78 +# property value distribution to match household
     79 +property_pvals = \
     80 + {'DNT': # Do Not Track HTTP header
     81 + {True: 0.8, False: 0.2},
     82 + 'browser':
     83 + {'Safari': 6, 'Firefox': 3, 'Chrome': 2, 'noneoftheabove': 1},
     84 + 'os':
     85 + {r'Mac\s*OS': 3, r'iOS': 6, r'Linux': 1, r'Windows': 1, 'noneoftheabove': 1},
     86 + 'is_pc':
     87 + {True: 4, False: 6},
     88 + }
     89 + 
     90 +# tell ISP that an iPad is being used
     91 +user_agent = 'Mozilla/5.0 (iPad; CPU OS 6_1 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B141 Safari/8536.25'
    72 92   
    73 93  # bias the content with non-random, diverse, link-heavy, popular content
    74 94  seed_bias_links = ['http://my.xfinity.com/news',
    skipped 101 lines
    176 196   max_links_cached=max_links_cached,
    177 197   max_links_per_page=max_links_per_page,
    178 198   max_links_per_domain=max_links_per_domain,
    179  - user_agent=user_agent,
     199 + property_pvals=property_pvals,
      200 + user_agent=user_agent,
    180 201   blacklist_url=blacklist_url,
    181 202   wordsite_url=wordsite_url,
    182 203   seed_bias_links=seed_bias_links,
    skipped 4 lines
    187 208   self.max_links_cached = max_links_cached
    188 209   self.max_links_per_page = max_links_per_page
    189 210   self.max_links_per_domain = max_links_per_domain
     211 + self.property_pvals = property_pvals
    190 212   self.user_agent = user_agent
    191 213   self.blacklist_url = blacklist_url
    192 214   self.wordsite_url = wordsite_url
    skipped 23 lines
    216 238   self.data_usage = 0
    217 239   self.get_blacklist()
    218 240   self.get_random_words()
     241 + self.set_user_agent()
    219 242   self.pollute_forever()
    220 243   
    221 244   def parseArgs(self):
    skipped 47 lines
    269 292   dcap['acceptSslCerts'] = ( True )
    270 293   dcap['applicationCacheEnabled'] = ( True )
    271 294   dcap['handlesAlerts'] = ( False )
    272  - dcap['phantomjs.page.customHeaders'] = ( { 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, sdch' } )
     295 + dcap['phantomjs.page.customHeaders'] = ( { 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, sdch', 'DNT': '1' } )
    273 296   phantomjs_service_args = ['--disk-cache=false','--ignore-ssl-errors=false','--ssl-protocol=TLSv1.2']
    274 297   if self.proxy is not None:
    275 298   phantomjs_service_args = ['--proxy={}'.format(self.proxy)] + phantomjs_service_args
    skipped 321 lines
    597 620   self.remove_link(url)
    598 621   
    599 622   def set_user_agent(self):
    600  - global user_agent
    601  - self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent
     623 + self.user_agent_draw()
    602 624   try:
    603 625   @self.phantomjs_short_timeout
    604 626   def phantomjs_capabilities_update():
    skipped 1 lines
    606 628   phantomjs_capabilities_update()
    607 629   except Exception as e:
    608 630   if self.debug: print('.update() exception:\n{}'.format(e))
     631 + 
     632 + def user_agent_draw(self):
      633 + """Draw a random User-Agent, either uniformly (mildly susceptible to ML) or from a property distribution."""
     634 + global ua_parse_flag, user_agent
      635 + if not ua_parse_flag:  # user_agents unavailable: fall back to uniform faker draw
     636 + self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent
     637 + return
     638 + # Draw User-Agent from pre-defined property distribution
     639 + property_pvals = self.property_pvals
     640 + while True:
     641 + uap = ua.parse(self.fake.user_agent())
     642 + # print(uap.ua_string)
     643 + p_browser = property_pvals['browser']['noneoftheabove']
     644 + for k in property_pvals['browser']:
     645 + if bool(re.findall(k, uap.browser.family, flags=re.IGNORECASE)):
     646 + p_browser = property_pvals['browser'][k]
     647 + break
     648 + p_os = property_pvals['os']['noneoftheabove']
     649 + for k in property_pvals['os']:
     650 + if bool(re.findall(k, uap.os.family, flags=re.IGNORECASE)):
     651 + p_os = property_pvals['os'][k]
     652 + break
     653 + p_pc = property_pvals['is_pc'][uap.is_pc]
     654 + if npr.uniform() <= p_browser \
     655 + and npr.uniform() <= p_os \
     656 + and npr.uniform() <= p_pc: break
     657 + self.user_agent = uap.ua_string
    609 658   
    610 659   def draw_link(self,log_sampling=True):
    611 660   """ Draw a single, random link. """
    skipped 441 lines
Please wait...
Page is in error, reload to recover