skipped 3 lines 4 4 5 5 # isp_data_pollution: bandwidth-limited ISP data pollution 6 6 7 - # Copyright 2017 Steven T. Smith <steve dot t dot smith at gmail dot com>, GPL 7 + # Copyright 2017– 2018 Steven T. Smith <steve dot t dot smith at gmail dot com>, GPL 8 8 9 9 # This program is free software: you can redistribute it and/or modify 10 10 # it under the terms of the GNU General Public License as published by skipped 8 lines 19 19 # You should have received a copy of the GNU General Public License 20 20 # along with this program. If not, see <http://www.gnu.org/licenses/>. 21 21 22 - __version__ = '1 .4 ' 22 + __version__ = '2 .0 ' 23 23 24 24 import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn 25 25 import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs 26 26 from selenium import webdriver 27 - from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 28 27 from selenium.webdriver.support.ui import WebDriverWait 29 28 from io import BytesIO 30 - from faker import Factory 29 + import fake_useragent as fake_ua 31 30 32 31 # parse User-Agent for matching distribution 33 32 ua_parse_flag = True skipped 21 lines 55 54 # nice this process on UNIX 56 55 if hasattr(os,'nice'): os.nice(15) 57 56 58 - gb_per_month = 50 # How many gigabytes to pollute per month 57 + gb_per_month = 100 # How many gigabytes to pollute per month 59 58 max_links_cached = 100000 # Maximum number of links to cache for download 60 59 max_links_per_page = 200 # Maximum number of links to add per page 61 60 max_links_per_domain = 400 # Maximum number of links to add per domain 62 61 wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain' 63 - timeout = 20 64 - short_timeout = 3 65 - phantomjs_rss_limit_mb = 1024 # Default maximum meory limit of phantomjs processs (MB) 62 + timeout = 45 63 + short_timeout = 10 64 + browserdriver_rss_limit_mb = 1024 # Default maximum memory limit of browserdriver (chromedriver) processs (MB) 66 65 terminal_width = 80 # tty width, standard is 80 chars; add code to adapt later 67 66 68 67 blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz' skipped 29 lines 98 97 99 98 # tell ISP that an iPad is being used 100 99 user_agent = 'Mozilla/5.0 (iPad; CPU OS 6_1 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B141 Safari/8536.25' 100 + 101 + # Tor browser size on Linux 102 + window_size = (1296,1018) 101 103 102 104 # bias the content with non-random, diverse, link-heavy, popular content 103 105 seed_bias_links = ['http://my.xfinity.com/news', skipped 102 lines 206 208 max_links_per_page=max_links_per_page, 207 209 max_links_per_domain=max_links_per_domain, 208 210 property_pvals=property_pvals, 209 - user_egent =user_agent, 211 + user_agent =user_agent, 210 212 blacklist_url=blacklist_url, 211 213 wordsite_url=wordsite_url, 212 214 seed_bias_links=seed_bias_links, 213 215 timeout=timeout, diurnal_flag=True, 214 216 quit_driver_every_call=False, 215 217 blacklist=True,verbose=True): 216 - print('This is ISP Data Pollution , Version {}'. format ( __version__ ) ) 218 + print(f 'This is ISP Data Pollution , Version {__version__ }') 217 219 self.max_links_cached = max_links_cached 218 220 self.max_links_per_page = max_links_per_page 219 221 self.max_links_per_domain = max_links_per_domain skipped 10 lines 230 232 # self.debug = debug # set in parseArgs 231 233 self.args = self.args = self.parseArgs() 232 234 # timeout configurable decorators 233 - self.phantomjs_timeout = self.block_timeout(self.phantomjs_hang_handler, \ 235 + self.chromedriver_timeout = self.block_timeout(self.chromedriver_hang_handler, \ 234 236 alarm_time=self.timeout+2,errors=(self.TimeoutError,), debug=self.debug) 235 - self.phantomjs_short_timeout = self.block_timeout(self.phantomjs_hang_handler, \ 236 - alarm_time=short_timeout+1 ,errors=(self.TimeoutError,Exception), debug=self.debug) 237 - self.phantomjs_quit_timeout = self.block_timeout(self.phantomjs_quit_hang_handler, \ 238 - alarm_time=short_timeout+1 ,errors=(self.TimeoutError,Exception), debug=self.debug) 237 + self.chromedriver_short_timeout = self.block_timeout(self.chromedriver_hang_handler, \ 238 + alarm_time=short_timeout+2 ,errors=(self.TimeoutError,Exception), debug=self.debug) 239 + self.chromedriver_quit_timeout = self.block_timeout(self.chromedriver_quit_hang_handler, \ 240 + alarm_time=short_timeout+2 ,errors=(self.TimeoutError,Exception), debug=self.debug) 239 241 self.robots_timeout = self.block_timeout(self.robots_hang_handler, \ 240 - alarm_time=short_timeout+1 ,errors=(self.TimeoutError,), debug=self.debug) 241 - self.check_phantomjs_version() 242 - self.fake = Factory.create() 242 + alarm_time=short_timeout+2 ,errors=(self.TimeoutError,), debug=self.debug) 243 + self.check_chromedriver_version() 244 + self.fake_ua = fake_ua.UserAgent() 243 245 self.hour_trigger = True 244 246 self.twentyfour_hour_trigger = True 245 247 self.domain_links = dict() skipped 1 lines 247 249 self.data_usage = 0 248 250 self.get_blacklist() 249 251 self.get_random_words() 250 - self.set_user_agent() 251 252 self.pollute_forever() 252 253 253 254 def parseArgs(self): 254 255 parser = ap.ArgumentParser() 255 256 parser.add_argument('-bw', '--gb_per_month', help="GB per month", type=int, default=gb_per_month) 256 257 parser.add_argument('-mm', '--maxmemory', 257 - help="Maximum memory of phantomjs (MB); 0=>restart every link", 258 + help="Maximum memory of chromedriver (MB); 0=>restart every link", 258 259 type=int, default=1024) 259 - parser.add_argument('-P', '--phantomjs -binary-path', help="Path to phantomjs binary", type=str, default=None) 260 - parser.add_argument('-p', '--proxy', help="Proxy for phantomjs ", type=str, default=None) 260 + parser.add_argument('-P', '--chromedriver -binary-path', help="Path to chromedriver binary", type=str, default=None) 261 + parser.add_argument('-p', '--proxy', help="Proxy for chromedriver ", type=str, default=None) 261 262 parser.add_argument('-g', '--debug', help="Debug flag", action='store_true') 262 263 args = parser.parse_args() 263 264 for k in args.__dict__: setattr(self,k,getattr(args,k)) skipped 3 lines 267 268 def sanity_check_arguments(self): 268 269 self.gb_per_month = min(2048,max(1,self.gb_per_month)) # min-max bandwidth limits 269 270 if self.maxmemory == 0: self.quit_driver_every_call = True 270 - self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max bandwidth limits 271 + self.chromedriver_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max bandwidth limits 271 272 272 - def check_phantomjs_version (self,recommended_version=(2,1 )): 273 + def check_chromedriver_version (self,recommended_version=(2,41 )): 273 274 self.open_driver() 274 275 if self.debug: 275 - print("{} version is {}, { } version is {}".format(self.driver.capabilities["browserName"], 276 + print("{} version is {}, chromedriver version is {}".format(self.driver.capabilities["browserName"], 276 277 self.driver.capabilities["version"], 277 - self.driver.capabilities["driverName "], 278 - self.driver.capabilities["driverVersion"])) 279 - phantomjs_version = tuple(int(i) for i in self.driver.capabilities["version"].split('.')) 280 - if phantomjs_version < recommended_version: 278 + self.driver.capabilities["chrome "][ " chromedriverVersion " ] ) ) 279 + chromedriver_version = tuple(int(i) for i in 280 + re.sub(r'([\d.]+?) .*','\\1',self.driver.capabilities["chrome"]["chromedriverVersion"]).split('.')) 281 + if chromedriver_version < recommended_version: 281 282 warn.warn("""{} version is {}; 282 - please upgrade to at least version {} from http://phantomjs .org. 283 + please upgrade to at least version {} from http://chromedriver . chromium .org/ downloads . 283 284 """.format(self.driver.capabilities["browserName"],self.driver.capabilities["version"], 284 285 '.'.join(str(i) for i in recommended_version))) 285 286 self.quit_driver() 286 287 287 288 def open_driver(self): 288 289 self.quit_driver() 289 - if not hasattr(self, 'driver') or not isinstance(self.driver,webdriver.phantomjs .webdriver.WebDriver): 290 - # phantomjs driver 291 - # http://engineering.shapesecurity.com/2015/01/detecting-phantomjs-based-visitors.html 292 - # https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string 293 - # http://phantomjs.org/api/webpage/property/settings.html 294 - # http://stackoverflow.com/questions/23390974/phantomjs-keeping-cache 295 - dcap = dict(DesiredCapabilities.PHANTOMJS) 296 - # dcap['browserName'] = 'Chrome' 297 - dcap['phantomjs.page.settings.userAgent'] = ( self.user_agent ) 298 - dcap['phantomjs.page.settings.loadImages'] = ( 'false' ) 299 - dcap['phantomjs.page.settings.clearMemoryCaches'] = ( 'true' ) 300 - dcap['phantomjs.page.settings.resourceTimeout'] = ( max(2000,int(self.timeout * 1000)) ) 301 - dcap['acceptSslCerts'] = ( True ) 302 - dcap['applicationCacheEnabled'] = ( True ) 303 - dcap['handlesAlerts'] = ( False ) 304 - dcap['phantomjs.page.customHeaders'] = ( { 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, sdch', 'DNT': '1' } ) 305 - phantomjs_service_args = ['--disk-cache=false','--ignore-ssl-errors=false','--ssl-protocol=TLSv1.2'] 290 + if not hasattr(self, 'driver') or not isinstance(self.driver,webdriver.chrome .webdriver.WebDriver): 291 + # chromedriver 292 + chrome_options = webdriver.ChromeOptions() 293 + chrome_options.add_argument('headless') 294 + chrome_options.add_argument(f'user-agent={self.user_agent}') 295 + chrome_options.add_argument('window-size={:d},{:d}'.format(window_size[0],window_size[1])) 296 + # Disable image downloads; see https://stackoverflow.com/questions/18657976/disable-images-in-selenium-google-chromedriver 297 + chrome_options.add_argument('blink-settings=imagesEnabled=false') 298 + chrome_options.add_argument('mute-audio') 306 299 if self.proxy is not None: 307 - phantomjs_service_args = ['--proxy={}'.format(self.proxy)] + phantomjs_service_args 308 - if self.phantomjs_binary_path is None: 309 - driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=phantomjs_service_args) 300 + chrome_options.add_argument(f'proxy-server={self.proxy}') 301 + if self.chromedriver_binary_path is None: 302 + driver = webdriver.Chrome(chrome_options=chrome_options) 310 303 else: 311 - driver = webdriver.PhantomJS(self.phantomjs_binary_path,desired_capabilities=dcap,service_args=phantomjs_service_args) 312 - driver.set_window_size(1296,1018) # Tor browser size on Linux 304 + chrome_options.binary_location = self.chromedriver_binary_path 305 + driver = webdriver.Chrome(self.chromedriver_binary_path,chrome_options=chrome_options) 306 + driver.set_window_size(window_size[0],window_size[1]) 313 307 driver.implicitly_wait(self.timeout) 314 308 driver.set_page_load_timeout(self.timeout) 315 309 driver.set_script_timeout(self.timeout) 316 310 self.driver = driver 317 311 318 - def quit_driver(self,hard_quit=False,pid=None,phantomjs_short_timeout_decorator =None): 312 + def quit_driver(self,hard_quit=False,pid=None,chromedriver_short_timeout_decorator =None): 319 313 """ 320 314 close, kill -9, quit, del 321 315 :param hard_quit: 322 316 :param pid: 323 317 :return: 324 - """ 318 + """ 319 + # Use original phantomjs code for chromedriver, even though chromedriver is likely far more robust 325 320 # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution 326 - if phantomjs_short_timeout_decorator is None: 327 - phantomjs_short_timeout_decorator = self.phantomjs_short_timeout 321 + if chromedriver_short_timeout_decorator is None: 322 + chromedriver_short_timeout_decorator = self.chromedriver_short_timeout 328 323 if hasattr(self,'driver'): 329 324 if not hard_quit: 330 - @phantomjs_short_timeout_decorator 331 - def phantomjs_close (): self.driver.close() 332 - phantomjs_close() 325 + @chromedriver_short_timeout_decorator 326 + def chromedriver_close (): self.driver.close() 327 + chromedriver_close() 333 328 try: 334 - @phantomjs_short_timeout_decorator 335 - def phantomjs_send_signal(): self.driver.service.process.send_signal(signal.SIGTERM) 336 - phantomjs_send_signal() 329 + if pid is None: 330 + @chromedriver_short_timeout_decorator 331 + def chromedriver_process_pid(): return self.driver.service.process.pid 332 + pid = chromedriver_process_pid() 333 + @chromedriver_short_timeout_decorator 334 + def chromedriver_send_signal(): 335 + # Google Chrome is a child process of chromedriver 336 + for c in psutil.Process(pid).children(): c.send_signal(signal.SIGTERM) 337 + self.driver.service.process.send_signal(signal.SIGTERM) 338 + chromedriver_send_signal() 337 339 except Exception as e: 338 - if self.debug: print('.send_signal() exception:\n{}'. format ( e ) ) 339 - if pid is None: 340 - @phantomjs_short_timeout_decorator 341 - def phantomjs_process_pid(): return self.driver.service.process.pid 342 - pid = phantomjs_process_pid() 340 + if self.debug: print(f '.send_signal() exception:\n{e }') 343 341 if isinstance(pid,int): 344 342 try: 343 + # Google Chrome is a child process of chromedriver 344 + for c in psutil.Process(pid).children(): os.kill(c.pid, signal.SIGTERM) 345 345 os.kill(pid, signal.SIGTERM) # overkill (pun intended) 346 346 except Exception as e: 347 - if self.debug: print('.kill() exception:\n{}'. format ( e ) ) 347 + if self.debug: print(f '.kill() exception:\n{e }') 348 348 try: 349 - @phantomjs_short_timeout_decorator 350 - def phantomjs_quit (): self.driver.quit() 351 - phantomjs_quit() 349 + @chromedriver_short_timeout_decorator 350 + def chromedriver_quit (): self.driver.quit() 351 + chromedriver_quit() 352 352 except Exception as e: 353 - if self.debug: print('.quit() exception:\n{}'. format ( e ) ) 353 + if self.debug: print(f '.quit() exception:\n{e }') 354 354 del self.driver 355 355 356 356 def clear_driver(self): 357 357 # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver 358 358 if hasattr(self, 'driver'): 359 359 try: 360 - @self.phantomjs_short_timeout 361 - def phantomjs_delete_all_cookies (): self.driver.delete_all_cookies() 362 - phantomjs_delete_all_cookies() 360 + @self.chromedriver_short_timeout 361 + def chromedriver_delete_all_cookies (): self.driver.delete_all_cookies() 362 + chromedriver_delete_all_cookies() 363 363 except Exception as e: 364 - if self.debug: print('.delete_all_cookies() exception:\n{}'. format ( e ) ) 364 + if self.debug: print(f '.delete_all_cookies() exception:\n{e }') 365 365 try: 366 - @self.phantomjs_short_timeout 367 - def phantomjs_clear(): 368 - self.driver.execute_script('window.localStorage.clear();') 369 - self.driver.execute_script('window.sessionStorage.clear();') 370 - phantomjs_clear() 366 + @self.chromedriver_short_timeout 367 + def chromedriver_clear(): 368 + pass 369 + # Neither of these methods appear to work for chromedriver 370 + # self.driver.execute_script('window.localStorage.clear();') 371 + # self.driver.execute_script('window.sessionStorage.clear();') 372 + chromedriver_clear() 371 373 except Exception as e: 372 - if self.debug: print('.execute_script() exception:\n{}'. format ( e ) ) 374 + if self.debug: print(f '.execute_script() exception:\n{e }') 373 375 374 376 def get_blacklist(self,update_flag=False): 375 377 blacklist_domains = getattr(self,'blacklist_domains',set()) skipped 53 lines 429 431 # drugs hospitals porn spyware 430 432 # dynamic imagehosting radiotv tracker 431 433 for member in [ 'downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware', 'updatesites', 'urlshortener', 'violence', 'warez', 'weapons' ]: 432 - self.blacklist_domains |= set(tgz.extractfile('BL/{}/domains'. format ( member ) ).read().decode('utf-8').splitlines()) 433 - self.blacklist_urls |= set(tgz.extractfile('BL/{}/urls'. format ( member ) ).read().decode('utf-8').splitlines()) 434 + self.blacklist_domains |= set(tgz.extractfile(f 'BL/{member }/domains').read().decode('utf-8').splitlines()) 435 + self.blacklist_urls |= set(tgz.extractfile(f 'BL/{member }/urls').read().decode('utf-8').splitlines()) 434 436 tgz.close() 435 437 tmpfile.close() 436 438 skipped 20 lines 457 459 self.words = response.content.decode('utf-8').splitlines() 458 460 reqsession.close() 459 461 except Exception as e: 460 - if self.debug: print('requests exception:\n{}'. format ( e ) ) 462 + if self.debug: print(f 'requests exception:\n{e }') 461 463 self.words = [ 'FUBAR' ] 462 464 # if self.debug: print('There are {:d} words.'.format(len(self.words))) 463 465 skipped 19 lines 483 485 self.every_hour_tasks() 484 486 time.sleep(self.chi2_mean_std(0.5,0.2)) 485 487 except Exception as e: 486 - if self.debug: print('.pollute() exception:\n{}'. format ( e ) ) 488 + if self.debug: print(f '.pollute() exception:\n{e }') 487 489 488 490 def pollute(self): 489 - if not self.quit_driver_every_call: self.check_phantomjs_process () 491 + if not self.quit_driver_every_call: self.check_chromedriver_process () 490 492 if self.link_count() < 2000: 491 493 if self.quit_driver_every_call: self.open_driver() 492 494 self.seed_links() skipped 42 lines 535 537 else: # quote the first two words together 536 538 word = ' '.join(['"{}"'.format(' '.join(random.sample(self.words, 2))), 537 539 ' '.join(random.sample(self.words, num_words-2))]) 538 - if self.debug: print('Seeding with search for \'{}\'…'. format ( word ) ) 540 + if self.debug: print(f 'Seeding with search for \'{word }\'…') 539 541 self.get_websearch(word) 540 542 541 543 def bias_links(self): skipped 44 lines 586 588 if self.hour_trigger: 587 589 if hasattr(self,'driver'): 588 590 self.set_user_agent() 589 - if True: 591 + if True: pass 592 + elif False: 593 + # `set_user_agent` reopens chromedriver now 590 594 self.quit_driver() 591 595 self.open_driver() 592 596 else: 593 597 try: 594 - @self.phantomjs_short_timeout 595 - def phantomjs_delete_all_cookies (): self.driver.delete_all_cookies() 596 - phantomjs_delete_all_cookies() 598 + @self.chromedriver_short_timeout 599 + def chromedriver_delete_all_cookies (): self.driver.delete_all_cookies() 600 + chromedriver_delete_all_cookies() 597 601 except Exception as e: 598 - if self.debug: print('.delete_all_cookies() exception:\n{}'. format ( e ) ) 602 + if self.debug: print(f '.delete_all_cookies() exception:\n{e }') 599 603 self.seed_links() 600 604 else: self.open_driver() 601 605 self.hour_trigger = False skipped 36 lines 638 642 639 643 def set_user_agent(self): 640 644 self.draw_user_agent() 641 - try: 642 - @self.phantomjs_short_timeout 643 - def phantomjs_capabilities_update(): 644 - self.driver.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent}) 645 - phantomjs_capabilities_update() 646 - except Exception as e: 647 - if self.debug: print('.update() exception:\n{}'.format(e)) 645 + # chromedriver cannot reset the User-Agent in runtime, so it must be restarted with a new UA 646 + # https://stackoverflow.com/questions/50375628/how-to-change-useragent-string-in-runtime-chromedriver-selenium/50375914#50375914 647 + self.open_driver() 648 648 649 649 def draw_user_agent(self,max_draws=10000): 650 650 """Draw a random User-Agent either uniformly (mildly susceptible to ML), or from a matched distribution.""" 651 651 global ua_parse_flag, user_agent 652 652 if not ua_parse_flag: 653 - self.user_agent = self.fake .user_agent ( ) if npr.random() < 0.95 else user_agent 653 + self.user_agent = self.fake_ua .random if npr.random() < 0.95 else user_agent 654 654 return 655 655 # Draw User-Agent from pre-defined property distribution 656 656 property_pvals = self.property_pvals 657 657 k = 0 658 658 while k < max_draws: 659 - uap = ua.parse(self.fake .user_agent ( ) ) 659 + uap = ua.parse(self.fake_ua .random ) 660 660 # print(uap.ua_string) 661 661 p_browser = property_pvals['browser']['noneoftheabove'] 662 662 for ky in property_pvals['browser']: skipped 88 lines 751 751 self.domain_links.setdefault(domain, set()) 752 752 self.domain_links[domain].add(url) 753 753 result = True 754 - # if self.debug: print('\tAdded link \'{}\'…'. format ( url ) ) 754 + # if self.debug: print(f '\tAdded link \'{url }\'…') 755 755 return result 756 756 757 757 def remove_link(self,url): skipped 20 lines 778 778 self.SafeSearch.query_parameter,uprs.quote_plus(query), 779 779 self.SafeSearch.additional_parameters,self.SafeSearch.safe_parameter))) 780 780 if self.verbose: self.print_url(url) 781 - @self.phantomjs_timeout 782 - def phantomjs_get (): self.driver.get(url) # selenium driver 783 - phantomjs_get() 784 - @self.phantomjs_short_timeout 785 - def phantomjs_page_source (): self.data_usage += len(self.driver.page_source) 786 - phantomjs_page_source() 781 + @self.chromedriver_timeout 782 + def chromedriver_get (): self.driver.get(url) # selenium driver 783 + chromedriver_get() 784 + @self.chromedriver_short_timeout 785 + def chromedriver_page_source (): self.data_usage += len(self.driver.page_source) 786 + chromedriver_page_source() 787 787 new_links = self.websearch_links() 788 788 if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) 789 789 skipped 7 lines 797 797 :return: 798 798 """ 799 799 # https://github.com/detro/ghostdriver/issues/169 800 - @self.phantomjs_short_timeout 801 - def phantomjs_find_elements_by_css_selector(): 800 + @self.chromedriver_short_timeout 801 + def chromedriver_find_elements_by_css_selector(): 802 802 return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector(self.SafeSearch.css_selector)) 803 - elements = phantomjs_find_elements_by_css_selector() 803 + elements = chromedriver_find_elements_by_css_selector() 804 804 # get links in random order until max. per page 805 805 k = 0 806 806 links = [] 807 807 try: 808 808 for elt in sorted(elements,key=lambda k: random.random()): 809 - @self.phantomjs_short_timeout 810 - def phantomjs_find_element_by_tag_name (): return elt.find_element_by_tag_name('a') 811 - a_tag = phantomjs_find_element_by_tag_name() 812 - @self.phantomjs_short_timeout 813 - def phantomjs_get_attribute (): return a_tag.get_attribute('href') 814 - href = phantomjs_get_attribute() 809 + @self.chromedriver_short_timeout 810 + def chromedriver_find_element_by_tag_name (): return elt.find_element_by_tag_name('a') 811 + a_tag = chromedriver_find_element_by_tag_name() 812 + @self.chromedriver_short_timeout 813 + def chromedriver_get_attribute (): return a_tag.get_attribute('href') 814 + href = chromedriver_get_attribute() 815 815 if href is not None: 816 816 href = self.SafeSearch.result_extraction(href) 817 817 links.append(href) 818 818 k += 1 819 819 if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break 820 820 except Exception as e: 821 - if self.debug: print('.find_element_by_tag_name.get_attribute() exception:\n{}'. format ( e ) ) 821 + if self.debug: print(f '.find_element_by_tag_name.get_attribute() exception:\n{e }') 822 822 return links 823 823 824 824 def get_url(self,url): skipped 3 lines 828 828 :return: 829 829 """ 830 830 if not self.check_robots(url): return # bail out if robots.txt says to 831 - @self.phantomjs_timeout 832 - def phantomjs_get (): self.driver.get(url) # selenium driver 833 - phantomjs_get() 834 - @self.phantomjs_short_timeout 835 - def phantomjs_page_source (): self.data_usage += len(self.driver.page_source) 836 - phantomjs_page_source() 831 + @self.chromedriver_timeout 832 + def chromedriver_get (): self.driver.get(url) # selenium driver 833 + chromedriver_get() 834 + @self.chromedriver_short_timeout 835 + def chromedriver_page_source (): self.data_usage += len(self.driver.page_source) 836 + chromedriver_page_source() 837 837 new_links = self.url_links() 838 838 if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) 839 839 840 840 def url_links(self): 841 841 """Generic webpage link finder format.""" 842 842 # https://github.com/detro/ghostdriver/issues/169 843 - @self.phantomjs_short_timeout 844 - def phantomjs_find_elements_by_tag_name(): 845 - return WebDriverWait(self.driver,3 ).until(lambda x: x.find_elements_by_tag_name('a')) 846 - elements = phantomjs_find_elements_by_tag_name() 843 + @self.chromedriver_short_timeout 844 + def chromedriver_find_elements_by_tag_name(): 845 + return WebDriverWait(self.driver,short_timeout ).until(lambda x: x.find_elements_by_tag_name('a')) 846 + elements = chromedriver_find_elements_by_tag_name() 847 847 848 848 # get links in random order until max. per page 849 849 k = 0 850 850 links = [] 851 851 try: 852 852 for a in sorted(elements,key=lambda k: random.random()): 853 - @self.phantomjs_short_timeout 854 - def phantomjs_get_attribute (): return a.get_attribute('href') 855 - href = phantomjs_get_attribute() 853 + @self.chromedriver_short_timeout 854 + def chromedriver_get_attribute (): return a.get_attribute('href') 855 + href = chromedriver_get_attribute() 856 856 if href is not None: links.append(href) 857 857 k += 1 858 858 if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break 859 859 except Exception as e: 860 - if self.debug: print('.get_attribute() exception:\n{}'. format ( e ) ) 860 + if self.debug: print(f '.get_attribute() exception:\n{e }') 861 861 return links 862 862 863 863 def check_robots(self,url): skipped 21 lines 885 885 if self.verbose or self.debug: 886 886 current_url = url # default 887 887 try: 888 - @self.phantomjs_short_timeout 889 - def phantomjs_current_url (): return self.driver.current_url 890 - current_url = phantomjs_current_url() 888 + @self.chromedriver_short_timeout 889 + def chromedriver_current_url (): return self.driver.current_url 890 + current_url = chromedriver_current_url() 891 891 # the current_url method breaks on a lot of sites, e.g. 892 892 # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()' 893 893 except Exception as e: 894 - if self.debug: print('.current_url exception:\n{}'. format ( e ) ) 894 + if self.debug: print(f '.current_url exception:\n{e }') 895 895 if self.debug: 896 896 print("{}: {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy())) 897 897 elif self.verbose: skipped 27 lines 925 925 else: 926 926 if len(url) + chars_used > terminal_width: 927 927 url = url[:terminal_width-chars_used-1] + '…' 928 - text = "{}{}". format ( url , text_suffix ) # added white space necessary 928 + text = f "{url }{text_suffix }" # added white space necessary 929 929 text = text[:min(terminal_width,len(text))] + ' ' * max(0,terminal_width-len(text)) 930 930 print(text,end='',flush=True) 931 931 time.sleep(0.01) skipped 5 lines 937 937 def bandwidth_test(self): 938 938 running_bandwidth = self.data_usage/(self.elapsed_time+900.) 939 939 running_bandwidth = running_bandwidth/407. # Convert to GB/month, 2**30/(3600*24*30.5) 940 - # if self.debug: print('Using {} GB/month'. format ( running_bandwidth ) ) 940 + # if self.debug: print(f 'Using {running_bandwidth } GB/month') 941 941 return running_bandwidth > self.gb_per_month 942 942 943 - # handle phantomjs timeouts 944 - # configurable decorator to timeout phantomjs and robotparser calls 943 + # handle chromedriver timeouts 944 + # configurable decorator to timeout chromedriver and robotparser calls 945 945 # http://stackoverflow.com/questions/15572288/general-decorator-to-wrap-try-except-in-python 946 946 # Syntax: 947 - # phantomjs_timeout = block_timeout(phantomjs_hang_handler) 948 - # @phantomjs_timeout 949 - # def phantomjs_block(): 950 - # # phantomjs stuff 947 + # chromedriver_timeout = block_timeout(chromedriver_hang_handler) 948 + # @chromedriver_timeout 949 + # def chromedriver_block(): 950 + # # chromedriver stuff 951 951 # pass 952 - # phantomjs_block() 952 + # chromedriver_block() 953 953 954 954 def block_timeout(self,hang_handler, alarm_time=timeout, errors=(Exception,), debug=False): 955 955 def decorator(func): skipped 4 lines 960 960 try: 961 961 result = func(*args, **kwargs) 962 962 except errors as e: 963 - if debug: print('{} exception:\n{}'. format ( func . __name__ , e ) ) 963 + if debug: print(f '{func . __name__ } exception:\n{e }') 964 964 finally: 965 965 signal.alarm(0) # cancel the alarm 966 966 return result skipped 3 lines 970 970 class TimeoutError(Exception): 971 971 pass 972 972 973 - def phantomjs_hang_handler(self, signum, frame): 973 + def chromedriver_hang_handler(self, signum, frame): 974 974 # https://github.com/detro/ghostdriver/issues/334 975 975 # http://stackoverflow.com/questions/492519/timeout-on-a-function-call 976 - if self.debug: print('Looks like phantomjs has hung.') 976 + if self.debug: print('Looks like chromedriver has hung.') 977 977 try: 978 - self.quit_driver(phantomjs_short_timeout_decorator=self.phantomjs_quit_timeout) 978 + self.quit_driver(chromedriver_short_timeout_decorator=self.chromedriver_quit_timeout) 979 979 except Exception as e: 980 980 if self.debug: print(e) 981 981 self.open_driver() 982 982 983 - def phantomjs_quit_hang_handler(self, signum, frame): 984 - raise self.TimeoutError('phantomjs .quit method is taking too long') 983 + def chromedriver_quit_hang_handler(self, signum, frame): 984 + raise self.TimeoutError('chromedriver .quit method is taking too long') 985 985 986 986 def robots_hang_handler(self, signum, frame): 987 987 if self.debug: print('Looks like robotparser has hung.') 988 988 raise self.TimeoutError('robotparser is taking too long') 989 989 990 - def check_phantomjs_process(self): 990 + def check_chromedriver_process(self): 991 991 """ 992 - Check if phantomjs is running. 992 + Check if chromedriver is running. 993 993 :return: 994 994 """ 995 995 # Check rss and restart if too large, then check existence 996 996 # http://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python 997 997 try: 998 998 if not hasattr(self,'driver'): self.open_driver() 999 - pid, rss_mb = self.phantomjs_pid_and_memory() 1000 - if rss_mb > self.phantomjs_rss_limit_mb : # memory limit 999 + pid, rss_mb = self.chromedriver_pid_and_memory() 1000 + if rss_mb > self.chromedriver_rss_limit_mb : # memory limit 1001 1001 self.quit_driver(pid=pid) 1002 1002 self.open_driver() 1003 - pid, _ = self.phantomjs_pid_and_memory() 1003 + pid, _ = self.chromedriver_pid_and_memory() 1004 1004 # check existence 1005 1005 os.kill(pid, 0) 1006 1006 except (OSError,psutil.NoSuchProcess,Exception) as e: 1007 - if self.debug: print('.phantomjs_pid_and_memory () exception:\n{}'. format ( e ) ) 1007 + if self.debug: print(f '.chromedriver_pid_and_memory () exception:\n{e }') 1008 1008 if issubclass(type(e),psutil.NoSuchProcess): 1009 - raise Exception("There's a phantomjs zombie, and the thread shouldn't have reached this statement.") 1009 + raise Exception("There's a chromedriver zombie, and the thread shouldn't have reached this statement.") 1010 1010 return False 1011 1011 else: 1012 1012 return True 1013 1013 1014 - def phantomjs_pid_and_memory(self): 1015 - """ Return the pid and memory (MB) of the phantomjs process, 1014 + def chromedriver_pid_and_memory(self): 1015 + """ Return the pid and memory (MB) of the chromedriver process, 1016 1016 restart if it's a zombie, and exit if a restart isn't working 1017 1017 after three attempts. """ 1018 1018 for k in range(3): # three strikes 1019 1019 try: 1020 - @self.phantomjs_short_timeout 1021 - def phantomjs_process_pid (): return self.driver.service.process.pid 1022 - pid = phantomjs_process_pid() 1020 + @self.chromedriver_short_timeout 1021 + def chromedriver_process_pid (): return self.driver.service.process.pid 1022 + pid = chromedriver_process_pid() 1023 1023 rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20) 1024 1024 break 1025 1025 except (psutil.NoSuchProcess,Exception) as e: 1026 - if self.debug: print('.service.process.pid exception:\n{}'. format ( e ) ) 1026 + if self.debug: print(f '.service.process.pid exception:\n{e }') 1027 1027 self.quit_driver(pid=pid) 1028 1028 self.open_driver() 1029 - else: # throw in the towel and exit if no viable phantomjs process after multiple attempts 1030 - print('No viable phantomjs process after multiple attempts!') 1029 + else: # throw in the towel and exit if no viable chromedriver process after multiple attempts 1030 + print('No viable chromedriver process after multiple attempts!') 1031 1031 sys.exit(1) 1032 1032 return (pid, rss_mb) 1033 1033 skipped 89 lines