🤬
  • ■ ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 3 lines
    4 4   
    5 5  # isp_data_pollution: bandwidth-limited ISP data pollution
    6 6   
    7  -# Copyright 2017 Steven T. Smith <steve dot t dot smith at gmail dot com>, GPL
     7 +# Copyright 2017-2018 Steven T. Smith <steve dot t dot smith at gmail dot com>, GPL
    8 8   
    9 9  # This program is free software: you can redistribute it and/or modify
    10 10  # it under the terms of the GNU General Public License as published by
    skipped 8 lines
    19 19  # You should have received a copy of the GNU General Public License
    20 20  # along with this program. If not, see <http://www.gnu.org/licenses/>.
    21 21   
    22  -__version__ = '1.4'
     22 +__version__ = '2.0'
    23 23   
    24 24  import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn
    25 25  import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
    26 26  from selenium import webdriver
    27  -from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    28 27  from selenium.webdriver.support.ui import WebDriverWait
    29 28  from io import BytesIO
    30  -from faker import Factory
     29 +import fake_useragent as fake_ua
    31 30   
    32 31  # parse User-Agent for matching distribution
    33 32  ua_parse_flag = True
    skipped 21 lines
    55 54  # nice this process on UNIX
    56 55  if hasattr(os,'nice'): os.nice(15)
    57 56   
    58  -gb_per_month = 50 # How many gigabytes to pollute per month
     57 +gb_per_month = 100 # How many gigabytes to pollute per month
    59 58  max_links_cached = 100000 # Maximum number of links to cache for download
    60 59  max_links_per_page = 200 # Maximum number of links to add per page
    61 60  max_links_per_domain = 400 # Maximum number of links to add per domain
    62 61  wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain'
    63  -timeout = 20
    64  -short_timeout = 3
    65  -phantomjs_rss_limit_mb = 1024 # Default maximum meory limit of phantomjs processs (MB)
     62 +timeout = 45
     63 +short_timeout = 10
     64 +browserdriver_rss_limit_mb = 1024 # Default maximum memory limit of browserdriver (chromedriver) processs (MB)
    66 65  terminal_width = 80 # tty width, standard is 80 chars; add code to adapt later
    67 66   
    68 67  blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz'
    skipped 29 lines
    98 97   
    99 98  # tell ISP that an iPad is being used
    100 99  user_agent = 'Mozilla/5.0 (iPad; CPU OS 6_1 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B141 Safari/8536.25'
     100 + 
     101 +# Tor browser size on Linux
     102 +window_size = (1296,1018)
    101 103   
    102 104  # bias the content with non-random, diverse, link-heavy, popular content
    103 105  seed_bias_links = ['http://my.xfinity.com/news',
    skipped 102 lines
    206 208   max_links_per_page=max_links_per_page,
    207 209   max_links_per_domain=max_links_per_domain,
    208 210   property_pvals=property_pvals,
    209  - user_egent=user_agent,
     211 + user_agent=user_agent,
    210 212   blacklist_url=blacklist_url,
    211 213   wordsite_url=wordsite_url,
    212 214   seed_bias_links=seed_bias_links,
    213 215   timeout=timeout, diurnal_flag=True,
    214 216   quit_driver_every_call=False,
    215 217   blacklist=True,verbose=True):
    216  - print('This is ISP Data Pollution ����, Version {}'.format(__version__))
     218 + print(f'This is ISP Data Pollution ����, Version {__version__}')
    217 219   self.max_links_cached = max_links_cached
    218 220   self.max_links_per_page = max_links_per_page
    219 221   self.max_links_per_domain = max_links_per_domain
    skipped 10 lines
    230 232   # self.debug = debug # set in parseArgs
    231 233   self.args = self.args = self.parseArgs()
    232 234   # timeout configurable decorators
    233  - self.phantomjs_timeout = self.block_timeout(self.phantomjs_hang_handler, \
     235 + self.chromedriver_timeout = self.block_timeout(self.chromedriver_hang_handler, \
    234 236   alarm_time=self.timeout+2,errors=(self.TimeoutError,), debug=self.debug)
    235  - self.phantomjs_short_timeout = self.block_timeout(self.phantomjs_hang_handler, \
    236  - alarm_time=short_timeout+1,errors=(self.TimeoutError,Exception), debug=self.debug)
    237  - self.phantomjs_quit_timeout = self.block_timeout(self.phantomjs_quit_hang_handler, \
    238  - alarm_time=short_timeout+1,errors=(self.TimeoutError,Exception), debug=self.debug)
     237 + self.chromedriver_short_timeout = self.block_timeout(self.chromedriver_hang_handler, \
     238 + alarm_time=short_timeout+2,errors=(self.TimeoutError,Exception), debug=self.debug)
     239 + self.chromedriver_quit_timeout = self.block_timeout(self.chromedriver_quit_hang_handler, \
     240 + alarm_time=short_timeout+2,errors=(self.TimeoutError,Exception), debug=self.debug)
    239 241   self.robots_timeout = self.block_timeout(self.robots_hang_handler, \
    240  - alarm_time=short_timeout+1,errors=(self.TimeoutError,), debug=self.debug)
    241  - self.check_phantomjs_version()
    242  - self.fake = Factory.create()
     242 + alarm_time=short_timeout+2,errors=(self.TimeoutError,), debug=self.debug)
     243 + self.check_chromedriver_version()
     244 + self.fake_ua = fake_ua.UserAgent()
    243 245   self.hour_trigger = True
    244 246   self.twentyfour_hour_trigger = True
    245 247   self.domain_links = dict()
    skipped 1 lines
    247 249   self.data_usage = 0
    248 250   self.get_blacklist()
    249 251   self.get_random_words()
    250  - self.set_user_agent()
    251 252   self.pollute_forever()
    252 253   
    253 254   def parseArgs(self):
    254 255   parser = ap.ArgumentParser()
    255 256   parser.add_argument('-bw', '--gb_per_month', help="GB per month", type=int, default=gb_per_month)
    256 257   parser.add_argument('-mm', '--maxmemory',
    257  - help="Maximum memory of phantomjs (MB); 0=>restart every link",
     258 + help="Maximum memory of chromedriver (MB); 0=>restart every link",
    258 259   type=int, default=1024)
    259  - parser.add_argument('-P', '--phantomjs-binary-path', help="Path to phantomjs binary", type=str, default=None)
    260  - parser.add_argument('-p', '--proxy', help="Proxy for phantomjs", type=str, default=None)
     260 + parser.add_argument('-P', '--chromedriver-binary-path', help="Path to chromedriver binary", type=str, default=None)
     261 + parser.add_argument('-p', '--proxy', help="Proxy for chromedriver", type=str, default=None)
    261 262   parser.add_argument('-g', '--debug', help="Debug flag", action='store_true')
    262 263   args = parser.parse_args()
    263 264   for k in args.__dict__: setattr(self,k,getattr(args,k))
    skipped 3 lines
    267 268   def sanity_check_arguments(self):
    268 269   self.gb_per_month = min(2048,max(1,self.gb_per_month)) # min-max bandwidth limits
    269 270   if self.maxmemory == 0: self.quit_driver_every_call = True
    270  - self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max bandwidth limits
     271 + self.chromedriver_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max bandwidth limits
    271 272   
    272  - def check_phantomjs_version(self,recommended_version=(2,1)):
     273 + def check_chromedriver_version(self,recommended_version=(2,41)):
    273 274   self.open_driver()
    274 275   if self.debug:
    275  - print("{} version is {}, {} version is {}".format(self.driver.capabilities["browserName"],
     276 + print("{} version is {}, chromedriver version is {}".format(self.driver.capabilities["browserName"],
    276 277   self.driver.capabilities["version"],
    277  - self.driver.capabilities["driverName"],
    278  - self.driver.capabilities["driverVersion"]))
    279  - phantomjs_version = tuple(int(i) for i in self.driver.capabilities["version"].split('.'))
    280  - if phantomjs_version < recommended_version:
     278 + self.driver.capabilities["chrome"]["chromedriverVersion"]))
     279 + chromedriver_version = tuple(int(i) for i in
     280 + re.sub(r'([\d.]+?) .*','\\1',self.driver.capabilities["chrome"]["chromedriverVersion"]).split('.'))
     281 + if chromedriver_version < recommended_version:
    281 282   warn.warn("""{} version is {};
    282  -please upgrade to at least version {} from http://phantomjs.org.
     283 +please upgrade to at least version {} from http://chromedriver.chromium.org/downloads.
    283 284  """.format(self.driver.capabilities["browserName"],self.driver.capabilities["version"],
    284 285   '.'.join(str(i) for i in recommended_version)))
    285 286   self.quit_driver()
    286 287   
    287 288   def open_driver(self):
    288 289   self.quit_driver()
    289  - if not hasattr(self, 'driver') or not isinstance(self.driver,webdriver.phantomjs.webdriver.WebDriver):
    290  - # phantomjs driver
    291  - # http://engineering.shapesecurity.com/2015/01/detecting-phantomjs-based-visitors.html
    292  - # https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string
    293  - # http://phantomjs.org/api/webpage/property/settings.html
    294  - # http://stackoverflow.com/questions/23390974/phantomjs-keeping-cache
    295  - dcap = dict(DesiredCapabilities.PHANTOMJS)
    296  - # dcap['browserName'] = 'Chrome'
    297  - dcap['phantomjs.page.settings.userAgent'] = ( self.user_agent )
    298  - dcap['phantomjs.page.settings.loadImages'] = ( 'false' )
    299  - dcap['phantomjs.page.settings.clearMemoryCaches'] = ( 'true' )
    300  - dcap['phantomjs.page.settings.resourceTimeout'] = ( max(2000,int(self.timeout * 1000)) )
    301  - dcap['acceptSslCerts'] = ( True )
    302  - dcap['applicationCacheEnabled'] = ( True )
    303  - dcap['handlesAlerts'] = ( False )
    304  - dcap['phantomjs.page.customHeaders'] = ( { 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, sdch', 'DNT': '1' } )
    305  - phantomjs_service_args = ['--disk-cache=false','--ignore-ssl-errors=false','--ssl-protocol=TLSv1.2']
     290 + if not hasattr(self, 'driver') or not isinstance(self.driver,webdriver.chrome.webdriver.WebDriver):
     291 + # chromedriver
     292 + chrome_options = webdriver.ChromeOptions()
     293 + chrome_options.add_argument('headless')
     294 + chrome_options.add_argument(f'user-agent={self.user_agent}')
     295 + chrome_options.add_argument('window-size={:d},{:d}'.format(window_size[0],window_size[1]))
     296 + # Disable image downloads; see https://stackoverflow.com/questions/18657976/disable-images-in-selenium-google-chromedriver
     297 + chrome_options.add_argument('blink-settings=imagesEnabled=false')
     298 + chrome_options.add_argument('mute-audio')
    306 299   if self.proxy is not None:
    307  - phantomjs_service_args = ['--proxy={}'.format(self.proxy)] + phantomjs_service_args
    308  - if self.phantomjs_binary_path is None:
    309  - driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=phantomjs_service_args)
     300 + chrome_options.add_argument(f'proxy-server={self.proxy}')
     301 + if self.chromedriver_binary_path is None:
     302 + driver = webdriver.Chrome(chrome_options=chrome_options)
    310 303   else:
    311  - driver = webdriver.PhantomJS(self.phantomjs_binary_path,desired_capabilities=dcap,service_args=phantomjs_service_args)
    312  - driver.set_window_size(1296,1018) # Tor browser size on Linux
     304 + chrome_options.binary_location = self.chromedriver_binary_path
     305 + driver = webdriver.Chrome(self.chromedriver_binary_path,chrome_options=chrome_options)
     306 + driver.set_window_size(window_size[0],window_size[1])
    313 307   driver.implicitly_wait(self.timeout)
    314 308   driver.set_page_load_timeout(self.timeout)
    315 309   driver.set_script_timeout(self.timeout)
    316 310   self.driver = driver
    317 311   
    318  - def quit_driver(self,hard_quit=False,pid=None,phantomjs_short_timeout_decorator=None):
     312 + def quit_driver(self,hard_quit=False,pid=None,chromedriver_short_timeout_decorator=None):
    319 313   """
    320 314   close, kill -9, quit, del
    321 315   :param hard_quit:
    322 316   :param pid:
    323 317   :return:
    324  - """
     318 + """
     319 + # Use original phantomjs code for chromedriver, even though chromedriver is likely far more robust
    325 320   # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution
    326  - if phantomjs_short_timeout_decorator is None:
    327  - phantomjs_short_timeout_decorator = self.phantomjs_short_timeout
     321 + if chromedriver_short_timeout_decorator is None:
     322 + chromedriver_short_timeout_decorator = self.chromedriver_short_timeout
    328 323   if hasattr(self,'driver'):
    329 324   if not hard_quit:
    330  - @phantomjs_short_timeout_decorator
    331  - def phantomjs_close(): self.driver.close()
    332  - phantomjs_close()
     325 + @chromedriver_short_timeout_decorator
     326 + def chromedriver_close(): self.driver.close()
     327 + chromedriver_close()
    333 328   try:
    334  - @phantomjs_short_timeout_decorator
    335  - def phantomjs_send_signal(): self.driver.service.process.send_signal(signal.SIGTERM)
    336  - phantomjs_send_signal()
     329 + if pid is None:
     330 + @chromedriver_short_timeout_decorator
     331 + def chromedriver_process_pid(): return self.driver.service.process.pid
     332 + pid = chromedriver_process_pid()
     333 + @chromedriver_short_timeout_decorator
     334 + def chromedriver_send_signal():
     335 + # Google Chrome is a child process of chromedriver
     336 + for c in psutil.Process(pid).children(): c.send_signal(signal.SIGTERM)
     337 + self.driver.service.process.send_signal(signal.SIGTERM)
     338 + chromedriver_send_signal()
    337 339   except Exception as e:
    338  - if self.debug: print('.send_signal() exception:\n{}'.format(e))
    339  - if pid is None:
    340  - @phantomjs_short_timeout_decorator
    341  - def phantomjs_process_pid(): return self.driver.service.process.pid
    342  - pid = phantomjs_process_pid()
     340 + if self.debug: print(f'.send_signal() exception:\n{e}')
    343 341   if isinstance(pid,int):
    344 342   try:
     343 + # Google Chrome is a child process of chromedriver
     344 + for c in psutil.Process(pid).children(): os.kill(c.pid, signal.SIGTERM)
    345 345   os.kill(pid, signal.SIGTERM) # overkill (pun intended)
    346 346   except Exception as e:
    347  - if self.debug: print('.kill() exception:\n{}'.format(e))
     347 + if self.debug: print(f'.kill() exception:\n{e}')
    348 348   try:
    349  - @phantomjs_short_timeout_decorator
    350  - def phantomjs_quit(): self.driver.quit()
    351  - phantomjs_quit()
     349 + @chromedriver_short_timeout_decorator
     350 + def chromedriver_quit(): self.driver.quit()
     351 + chromedriver_quit()
    352 352   except Exception as e:
    353  - if self.debug: print('.quit() exception:\n{}'.format(e))
     353 + if self.debug: print(f'.quit() exception:\n{e}')
    354 354   del self.driver
    355 355   
    356 356   def clear_driver(self):
    357 357   # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver
    358 358   if hasattr(self, 'driver'):
    359 359   try:
    360  - @self.phantomjs_short_timeout
    361  - def phantomjs_delete_all_cookies(): self.driver.delete_all_cookies()
    362  - phantomjs_delete_all_cookies()
     360 + @self.chromedriver_short_timeout
     361 + def chromedriver_delete_all_cookies(): self.driver.delete_all_cookies()
     362 + chromedriver_delete_all_cookies()
    363 363   except Exception as e:
    364  - if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
     364 + if self.debug: print(f'.delete_all_cookies() exception:\n{e}')
    365 365   try:
    366  - @self.phantomjs_short_timeout
    367  - def phantomjs_clear():
    368  - self.driver.execute_script('window.localStorage.clear();')
    369  - self.driver.execute_script('window.sessionStorage.clear();')
    370  - phantomjs_clear()
     366 + @self.chromedriver_short_timeout
     367 + def chromedriver_clear():
     368 + pass
     369 + # Neither of these methods appear to work for chromedriver
     370 + # self.driver.execute_script('window.localStorage.clear();')
     371 + # self.driver.execute_script('window.sessionStorage.clear();')
     372 + chromedriver_clear()
    371 373   except Exception as e:
    372  - if self.debug: print('.execute_script() exception:\n{}'.format(e))
     374 + if self.debug: print(f'.execute_script() exception:\n{e}')
    373 375   
    374 376   def get_blacklist(self,update_flag=False):
    375 377   blacklist_domains = getattr(self,'blacklist_domains',set())
    skipped 53 lines
    429 431   # drugs hospitals porn spyware
    430 432   # dynamic imagehosting radiotv tracker
    431 433   for member in [ 'downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware', 'updatesites', 'urlshortener', 'violence', 'warez', 'weapons' ]:
    432  - self.blacklist_domains |= set(tgz.extractfile('BL/{}/domains'.format(member)).read().decode('utf-8').splitlines())
    433  - self.blacklist_urls |= set(tgz.extractfile('BL/{}/urls'.format(member)).read().decode('utf-8').splitlines())
     434 + self.blacklist_domains |= set(tgz.extractfile(f'BL/{member}/domains').read().decode('utf-8').splitlines())
     435 + self.blacklist_urls |= set(tgz.extractfile(f'BL/{member}/urls').read().decode('utf-8').splitlines())
    434 436   tgz.close()
    435 437   tmpfile.close()
    436 438   
    skipped 20 lines
    457 459   self.words = response.content.decode('utf-8').splitlines()
    458 460   reqsession.close()
    459 461   except Exception as e:
    460  - if self.debug: print('requests exception:\n{}'.format(e))
     462 + if self.debug: print(f'requests exception:\n{e}')
    461 463   self.words = [ 'FUBAR' ]
    462 464   # if self.debug: print('There are {:d} words.'.format(len(self.words)))
    463 465   
    skipped 19 lines
    483 485   self.every_hour_tasks()
    484 486   time.sleep(self.chi2_mean_std(0.5,0.2))
    485 487   except Exception as e:
    486  - if self.debug: print('.pollute() exception:\n{}'.format(e))
     488 + if self.debug: print(f'.pollute() exception:\n{e}')
    487 489   
    488 490   def pollute(self):
    489  - if not self.quit_driver_every_call: self.check_phantomjs_process()
     491 + if not self.quit_driver_every_call: self.check_chromedriver_process()
    490 492   if self.link_count() < 2000:
    491 493   if self.quit_driver_every_call: self.open_driver()
    492 494   self.seed_links()
    skipped 42 lines
    535 537   else: # quote the first two words together
    536 538   word = ' '.join(['"{}"'.format(' '.join(random.sample(self.words, 2))),
    537 539   ' '.join(random.sample(self.words, num_words-2))])
    538  - if self.debug: print('Seeding with search for \'{}\'…'.format(word))
     540 + if self.debug: print(f'Seeding with search for \'{word}\'…')
    539 541   self.get_websearch(word)
    540 542   
    541 543   def bias_links(self):
    skipped 44 lines
    586 588   if self.hour_trigger:
    587 589   if hasattr(self,'driver'):
    588 590   self.set_user_agent()
    589  - if True:
     591 + if True: pass
     592 + elif False:
     593 + # `set_user_agent` reopens chromedriver now
    590 594   self.quit_driver()
    591 595   self.open_driver()
    592 596   else:
    593 597   try:
    594  - @self.phantomjs_short_timeout
    595  - def phantomjs_delete_all_cookies(): self.driver.delete_all_cookies()
    596  - phantomjs_delete_all_cookies()
     598 + @self.chromedriver_short_timeout
     599 + def chromedriver_delete_all_cookies(): self.driver.delete_all_cookies()
     600 + chromedriver_delete_all_cookies()
    597 601   except Exception as e:
    598  - if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
     602 + if self.debug: print(f'.delete_all_cookies() exception:\n{e}')
    599 603   self.seed_links()
    600 604   else: self.open_driver()
    601 605   self.hour_trigger = False
    skipped 36 lines
    638 642   
    639 643   def set_user_agent(self):
    640 644   self.draw_user_agent()
    641  - try:
    642  - @self.phantomjs_short_timeout
    643  - def phantomjs_capabilities_update():
    644  - self.driver.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent})
    645  - phantomjs_capabilities_update()
    646  - except Exception as e:
    647  - if self.debug: print('.update() exception:\n{}'.format(e))
     645 + # chromedriver cannot reset the User-Agent in runtime, so it must be restarted with a new UA
     646 + # https://stackoverflow.com/questions/50375628/how-to-change-useragent-string-in-runtime-chromedriver-selenium/50375914#50375914
     647 + self.open_driver()
    648 648   
    649 649   def draw_user_agent(self,max_draws=10000):
    650 650   """Draw a random User-Agent either uniformly (mildly susceptible to ML), or from a matched distribution."""
    651 651   global ua_parse_flag, user_agent
    652 652   if not ua_parse_flag:
    653  - self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent
     653 + self.user_agent = self.fake_ua.random if npr.random() < 0.95 else user_agent
    654 654   return
    655 655   # Draw User-Agent from pre-defined property distribution
    656 656   property_pvals = self.property_pvals
    657 657   k = 0
    658 658   while k < max_draws:
    659  - uap = ua.parse(self.fake.user_agent())
     659 + uap = ua.parse(self.fake_ua.random)
    660 660   # print(uap.ua_string)
    661 661   p_browser = property_pvals['browser']['noneoftheabove']
    662 662   for ky in property_pvals['browser']:
    skipped 88 lines
    751 751   self.domain_links.setdefault(domain, set())
    752 752   self.domain_links[domain].add(url)
    753 753   result = True
    754  - # if self.debug: print('\tAdded link \'{}\'…'.format(url))
     754 + # if self.debug: print(f'\tAdded link \'{url}\'…')
    755 755   return result
    756 756   
    757 757   def remove_link(self,url):
    skipped 20 lines
    778 778   self.SafeSearch.query_parameter,uprs.quote_plus(query),
    779 779   self.SafeSearch.additional_parameters,self.SafeSearch.safe_parameter)))
    780 780   if self.verbose: self.print_url(url)
    781  - @self.phantomjs_timeout
    782  - def phantomjs_get(): self.driver.get(url) # selenium driver
    783  - phantomjs_get()
    784  - @self.phantomjs_short_timeout
    785  - def phantomjs_page_source(): self.data_usage += len(self.driver.page_source)
    786  - phantomjs_page_source()
     781 + @self.chromedriver_timeout
     782 + def chromedriver_get(): self.driver.get(url) # selenium driver
     783 + chromedriver_get()
     784 + @self.chromedriver_short_timeout
     785 + def chromedriver_page_source(): self.data_usage += len(self.driver.page_source)
     786 + chromedriver_page_source()
    787 787   new_links = self.websearch_links()
    788 788   if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
    789 789   
    skipped 7 lines
    797 797   :return:
    798 798   """
    799 799   # https://github.com/detro/ghostdriver/issues/169
    800  - @self.phantomjs_short_timeout
    801  - def phantomjs_find_elements_by_css_selector():
     800 + @self.chromedriver_short_timeout
     801 + def chromedriver_find_elements_by_css_selector():
    802 802   return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector(self.SafeSearch.css_selector))
    803  - elements = phantomjs_find_elements_by_css_selector()
     803 + elements = chromedriver_find_elements_by_css_selector()
    804 804   # get links in random order until max. per page
    805 805   k = 0
    806 806   links = []
    807 807   try:
    808 808   for elt in sorted(elements,key=lambda k: random.random()):
    809  - @self.phantomjs_short_timeout
    810  - def phantomjs_find_element_by_tag_name(): return elt.find_element_by_tag_name('a')
    811  - a_tag = phantomjs_find_element_by_tag_name()
    812  - @self.phantomjs_short_timeout
    813  - def phantomjs_get_attribute(): return a_tag.get_attribute('href')
    814  - href = phantomjs_get_attribute()
     809 + @self.chromedriver_short_timeout
     810 + def chromedriver_find_element_by_tag_name(): return elt.find_element_by_tag_name('a')
     811 + a_tag = chromedriver_find_element_by_tag_name()
     812 + @self.chromedriver_short_timeout
     813 + def chromedriver_get_attribute(): return a_tag.get_attribute('href')
     814 + href = chromedriver_get_attribute()
    815 815   if href is not None:
    816 816   href = self.SafeSearch.result_extraction(href)
    817 817   links.append(href)
    818 818   k += 1
    819 819   if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break
    820 820   except Exception as e:
    821  - if self.debug: print('.find_element_by_tag_name.get_attribute() exception:\n{}'.format(e))
     821 + if self.debug: print(f'.find_element_by_tag_name.get_attribute() exception:\n{e}')
    822 822   return links
    823 823   
    824 824   def get_url(self,url):
    skipped 3 lines
    828 828   :return:
    829 829   """
    830 830   if not self.check_robots(url): return # bail out if robots.txt says to
    831  - @self.phantomjs_timeout
    832  - def phantomjs_get(): self.driver.get(url) # selenium driver
    833  - phantomjs_get()
    834  - @self.phantomjs_short_timeout
    835  - def phantomjs_page_source(): self.data_usage += len(self.driver.page_source)
    836  - phantomjs_page_source()
     831 + @self.chromedriver_timeout
     832 + def chromedriver_get(): self.driver.get(url) # selenium driver
     833 + chromedriver_get()
     834 + @self.chromedriver_short_timeout
     835 + def chromedriver_page_source(): self.data_usage += len(self.driver.page_source)
     836 + chromedriver_page_source()
    837 837   new_links = self.url_links()
    838 838   if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
    839 839   
    840 840   def url_links(self):
    841 841   """Generic webpage link finder format."""
    842 842   # https://github.com/detro/ghostdriver/issues/169
    843  - @self.phantomjs_short_timeout
    844  - def phantomjs_find_elements_by_tag_name():
    845  - return WebDriverWait(self.driver,3).until(lambda x: x.find_elements_by_tag_name('a'))
    846  - elements = phantomjs_find_elements_by_tag_name()
     843 + @self.chromedriver_short_timeout
     844 + def chromedriver_find_elements_by_tag_name():
     845 + return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_tag_name('a'))
     846 + elements = chromedriver_find_elements_by_tag_name()
    847 847   
    848 848   # get links in random order until max. per page
    849 849   k = 0
    850 850   links = []
    851 851   try:
    852 852   for a in sorted(elements,key=lambda k: random.random()):
    853  - @self.phantomjs_short_timeout
    854  - def phantomjs_get_attribute(): return a.get_attribute('href')
    855  - href = phantomjs_get_attribute()
     853 + @self.chromedriver_short_timeout
     854 + def chromedriver_get_attribute(): return a.get_attribute('href')
     855 + href = chromedriver_get_attribute()
    856 856   if href is not None: links.append(href)
    857 857   k += 1
    858 858   if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break
    859 859   except Exception as e:
    860  - if self.debug: print('.get_attribute() exception:\n{}'.format(e))
     860 + if self.debug: print(f'.get_attribute() exception:\n{e}')
    861 861   return links
    862 862   
    863 863   def check_robots(self,url):
    skipped 21 lines
    885 885   if self.verbose or self.debug:
    886 886   current_url = url # default
    887 887   try:
    888  - @self.phantomjs_short_timeout
    889  - def phantomjs_current_url(): return self.driver.current_url
    890  - current_url = phantomjs_current_url()
     888 + @self.chromedriver_short_timeout
     889 + def chromedriver_current_url(): return self.driver.current_url
     890 + current_url = chromedriver_current_url()
    891 891   # the current_url method breaks on a lot of sites, e.g.
    892 892   # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()'
    893 893   except Exception as e:
    894  - if self.debug: print('.current_url exception:\n{}'.format(e))
     894 + if self.debug: print(f'.current_url exception:\n{e}')
    895 895   if self.debug:
    896 896   print("{}: {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy()))
    897 897   elif self.verbose:
    skipped 27 lines
    925 925   else:
    926 926   if len(url) + chars_used > terminal_width:
    927 927   url = url[:terminal_width-chars_used-1] + '…'
    928  - text = "{}{}".format(url,text_suffix) # added white space necessary
     928 + text = f"{url}{text_suffix}" # added white space necessary
    929 929   text = text[:min(terminal_width,len(text))] + ' ' * max(0,terminal_width-len(text))
    930 930   print(text,end='',flush=True)
    931 931   time.sleep(0.01)
    skipped 5 lines
    937 937   def bandwidth_test(self):
    938 938   running_bandwidth = self.data_usage/(self.elapsed_time+900.)
    939 939   running_bandwidth = running_bandwidth/407. # Convert to GB/month, 2**30/(3600*24*30.5)
    940  - # if self.debug: print('Using {} GB/month'.format(running_bandwidth))
     940 + # if self.debug: print(f'Using {running_bandwidth} GB/month')
    941 941   return running_bandwidth > self.gb_per_month
    942 942   
    943  - # handle phantomjs timeouts
    944  - # configurable decorator to timeout phantomjs and robotparser calls
     943 + # handle chromedriver timeouts
     944 + # configurable decorator to timeout chromedriver and robotparser calls
    945 945   # http://stackoverflow.com/questions/15572288/general-decorator-to-wrap-try-except-in-python
    946 946   # Syntax:
    947  - # phantomjs_timeout = block_timeout(phantomjs_hang_handler)
    948  - # @phantomjs_timeout
    949  - # def phantomjs_block():
    950  - # # phantomjs stuff
     947 + # chromedriver_timeout = block_timeout(chromedriver_hang_handler)
     948 + # @chromedriver_timeout
     949 + # def chromedriver_block():
     950 + # # chromedriver stuff
    951 951   # pass
    952  - # phantomjs_block()
     952 + # chromedriver_block()
    953 953   
    954 954   def block_timeout(self,hang_handler, alarm_time=timeout, errors=(Exception,), debug=False):
    955 955   def decorator(func):
    skipped 4 lines
    960 960   try:
    961 961   result = func(*args, **kwargs)
    962 962   except errors as e:
    963  - if debug: print('{} exception:\n{}'.format(func.__name__, e))
     963 + if debug: print(f'{func.__name__} exception:\n{e}')
    964 964   finally:
    965 965   signal.alarm(0) # cancel the alarm
    966 966   return result
    skipped 3 lines
    970 970   class TimeoutError(Exception):
    971 971   pass
    972 972   
    973  - def phantomjs_hang_handler(self, signum, frame):
     973 + def chromedriver_hang_handler(self, signum, frame):
    974 974   # https://github.com/detro/ghostdriver/issues/334
    975 975   # http://stackoverflow.com/questions/492519/timeout-on-a-function-call
    976  - if self.debug: print('Looks like phantomjs has hung.')
     976 + if self.debug: print('Looks like chromedriver has hung.')
    977 977   try:
    978  - self.quit_driver(phantomjs_short_timeout_decorator=self.phantomjs_quit_timeout)
     978 + self.quit_driver(chromedriver_short_timeout_decorator=self.chromedriver_quit_timeout)
    979 979   except Exception as e:
    980 980   if self.debug: print(e)
    981 981   self.open_driver()
    982 982   
    983  - def phantomjs_quit_hang_handler(self, signum, frame):
    984  - raise self.TimeoutError('phantomjs .quit method is taking too long')
     983 + def chromedriver_quit_hang_handler(self, signum, frame):
     984 + raise self.TimeoutError('chromedriver .quit method is taking too long')
    985 985   
    986 986   def robots_hang_handler(self, signum, frame):
    987 987   if self.debug: print('Looks like robotparser has hung.')
    988 988   raise self.TimeoutError('robotparser is taking too long')
    989 989   
    990  - def check_phantomjs_process(self):
     990 + def check_chromedriver_process(self):
    991 991   """
    992  - Check if phantomjs is running.
     992 + Check if chromedriver is running.
    993 993   :return:
    994 994   """
    995 995   # Check rss and restart if too large, then check existence
    996 996   # http://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python
    997 997   try:
    998 998   if not hasattr(self,'driver'): self.open_driver()
    999  - pid, rss_mb = self.phantomjs_pid_and_memory()
    1000  - if rss_mb > self.phantomjs_rss_limit_mb: # memory limit
     999 + pid, rss_mb = self.chromedriver_pid_and_memory()
     1000 + if rss_mb > self.chromedriver_rss_limit_mb: # memory limit
    1001 1001   self.quit_driver(pid=pid)
    1002 1002   self.open_driver()
    1003  - pid, _ = self.phantomjs_pid_and_memory()
     1003 + pid, _ = self.chromedriver_pid_and_memory()
    1004 1004   # check existence
    1005 1005   os.kill(pid, 0)
    1006 1006   except (OSError,psutil.NoSuchProcess,Exception) as e:
    1007  - if self.debug: print('.phantomjs_pid_and_memory() exception:\n{}'.format(e))
     1007 + if self.debug: print(f'.chromedriver_pid_and_memory() exception:\n{e}')
    1008 1008   if issubclass(type(e),psutil.NoSuchProcess):
    1009  - raise Exception("There's a phantomjs zombie, and the thread shouldn't have reached this statement.")
     1009 + raise Exception("There's a chromedriver zombie, and the thread shouldn't have reached this statement.")
    1010 1010   return False
    1011 1011   else:
    1012 1012   return True
    1013 1013   
    1014  - def phantomjs_pid_and_memory(self):
    1015  - """ Return the pid and memory (MB) of the phantomjs process,
     1014 + def chromedriver_pid_and_memory(self):
     1015 + """ Return the pid and memory (MB) of the chromedriver process,
    1016 1016   restart if it's a zombie, and exit if a restart isn't working
    1017 1017   after three attempts. """
    1018 1018   for k in range(3): # three strikes
    1019 1019   try:
    1020  - @self.phantomjs_short_timeout
    1021  - def phantomjs_process_pid(): return self.driver.service.process.pid
    1022  - pid = phantomjs_process_pid()
     1020 + @self.chromedriver_short_timeout
     1021 + def chromedriver_process_pid(): return self.driver.service.process.pid
     1022 + pid = chromedriver_process_pid()
    1023 1023   rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20)
    1024 1024   break
    1025 1025   except (psutil.NoSuchProcess,Exception) as e:
    1026  - if self.debug: print('.service.process.pid exception:\n{}'.format(e))
     1026 + if self.debug: print(f'.service.process.pid exception:\n{e}')
    1027 1027   self.quit_driver(pid=pid)
    1028 1028   self.open_driver()
    1029  - else: # throw in the towel and exit if no viable phantomjs process after multiple attempts
    1030  - print('No viable phantomjs process after multiple attempts!')
     1029 + else: # throw in the towel and exit if no viable chromedriver process after multiple attempts
     1030 + print('No viable chromedriver process after multiple attempts!')
    1031 1031   sys.exit(1)
    1032 1032   return (pid, rss_mb)
    1033 1033   
    skipped 89 lines
Please wait...
Page is in error, reload to recover