🤬
  • ■ ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 85 lines
    86 86   
    87 87  # monkeypatch the read class method in RobotFileParser
    88 88  # many sites will block access to robots.txt without a standard User-Agent header
    89  -robot_timeout = 3
     89 +short_timeout = 3
    90 90  class RobotFileParserUserAgent(robotparser.RobotFileParser):
    91 91   
    92  - timeout = robot_timeout # short-term timeout
     92 + timeout = short_timeout # short-term timeout
    93 93   
    94 94   def read(self):
    95 95   """Reads the robots.txt URL and feeds it to the parser."""
    skipped 14 lines
    110 110  # Notes for the future:
    111 111  # 1. The bandwidth usage is undoubtedly (much) smaller because gzip encoding is used
    112 112  # 2. A lightweight proxy could be used for accurate bandwidth, and header editing
     113 + 
    113 114   
    114 115  class ISPDataPollution:
    115 116   '''Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy
    skipped 43 lines
    159 160   # self.gb_per_month = gb_per_month # set in parseArgs
    160 161   # self.debug = debug # set in parseArgs
    161 162   self.args = self.args = self.parseArgs()
    162  - signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler
     163 + # timeout configurable decorators
     164 + self.phantomjs_timeout = self.block_timeout(self.phantomjs_hang_handler, \
     165 + alarm_time=self.timeout+2,errors=(self.TimeoutError,), debug=self.debug)
     166 + self.phantomjs_short_timeout = self.block_timeout(self.phantomjs_hang_handler, \
     167 + alarm_time=short_timeout+1,errors=(self.TimeoutError,Exception), debug=self.debug)
     168 + self.robots_timeout = self.block_timeout(self.robots_hang_handler, \
     169 + alarm_time=short_timeout+1,errors=(self.TimeoutError,), debug=self.debug)
    163 170   self.fake = Factory.create()
    164 171   self.hour_trigger = True
    165 172   self.twentyfour_hour_trigger = True
    skipped 45 lines
    211 218   # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution
    212 219   if hasattr(self,'session'):
    213 220   if not hard_quit:
    214  - signal.alarm(3)
    215  - try:
    216  - self.session.close()
    217  - except self.TimeoutError as e:
    218  - if self.debug: print('.close() timeout exception:\n{}'.format(e))
    219  - except Exception as e:
    220  - if self.debug: print('.close() exception:\n{}'.format(e))
    221  - finally:
    222  - signal.alarm(0) # cancel the alarm
     221 + @self.phantomjs_short_timeout
     222 + def phantomjs_close(): self.session.close()
     223 + phantomjs_close()
    223 224   try:
    224  - self.session.service.process.send_signal(signal.SIGTERM)
     225 + @self.phantomjs_short_timeout
     226 + def phantomjs_send_signal(): self.session.service.process.send_signal(signal.SIGTERM)
     227 + phantomjs_send_signal()
    225 228   except Exception as e:
    226 229   if self.debug: print('.send_signal() exception:\n{}'.format(e))
    227 230   try:
    skipped 5 lines
    233 236   except Exception as e:
    234 237   if self.debug: print('.kill() exception:\n{}'.format(e))
    235 238   try:
    236  - self.session.quit()
    237  - del self.session # only delete session if quit is successful
     239 + @self.phantomjs_short_timeout
     240 + def phantomjs_quit():
     241 + self.session.quit()
     242 + del self.session # only delete session if quit is successful
     243 + phantomjs_quit()
    238 244   except Exception as e:
    239 245   if self.debug: print('.quit() exception:\n{}'.format(e))
    240 246   
    skipped 1 lines
    242 248   # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver
    243 249   if hasattr(self, 'session'):
    244 250   try:
    245  - self.session.delete_all_cookies()
     251 + @self.phantomjs_short_timeout
     252 + def phantomjs_delete_all_cookies(): self.session.delete_all_cookies()
     253 + phantomjs_delete_all_cookies()
    246 254   except Exception as e:
    247 255   if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
    248 256   try:
    249  - self.session.execute_script('window.localStorage.clear();')
    250  - self.session.execute_script('window.sessionStorage.clear();')
     257 + @self.phantomjs_short_timeout
     258 + def phantomjs_clear():
     259 + self.session.execute_script('window.localStorage.clear();')
     260 + self.session.execute_script('window.sessionStorage.clear();')
     261 + phantomjs_clear()
    251 262   except Exception as e:
    252 263   if self.debug: print('.execute_script() exception:\n{}'.format(e))
    253 264   
    skipped 162 lines
    416 427   self.set_user_agent()
    417 428   if hasattr(self,'session'):
    418 429   try:
    419  - self.session.delete_all_cookies()
     430 + @self.phantomjs_short_timeout
     431 + def phantomjs_delete_all_cookies(): self.session.delete_all_cookies()
     432 + phantomjs_delete_all_cookies()
    420 433   except Exception as e:
    421 434   if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
    422 435   self.hour_trigger = False
    skipped 32 lines
    455 468   global user_agent
    456 469   self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent
    457 470   try:
    458  - self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent})
     471 + @self.phantomjs_short_timeout
     472 + def phantomjs_capabilities_update():
     473 + self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent})
     474 + phantomjs_capabilities_update()
    459 475   except Exception as e:
    460 476   if self.debug: print('.update() exception:\n{}'.format(e))
    461 477   
    skipped 57 lines
    519 535   def get_websearch(self,query):
    520 536   '''HTTP GET of a websearch, then add any embedded links.'''
    521 537   url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query)))
    522  - signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler
    523  - signal.alarm(self.timeout+2) # set an alarm
    524  - try:
    525  - self.session.get(url) # selenium driver
    526  - except self.TimeoutError as e:
    527  - if self.debug: print('.get() exception:\n{}'.format(e))
    528  - finally:
    529  - signal.alarm(0) # cancel the alarm
    530  - try:
    531  - self.data_usage += len(self.session.page_source)
    532  - except Exception as e:
    533  - if self.debug: print('.page_source exception:\n{}'.format(e))
     538 + @self.phantomjs_timeout
     539 + def phantomjs_get(): self.session.get(url) # selenium driver
     540 + phantomjs_get()
     541 + @self.phantomjs_short_timeout
     542 + def phantomjs_page_source(): self.data_usage += len(self.session.page_source)
     543 + phantomjs_page_source()
    534 544   new_links = self.websearch_links()
    535 545   if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
    536 546   
    537 547   def websearch_links(self):
    538 548   '''Webpage format for a popular search engine, <div class="g">'''
     549 + # https://github.com/detro/ghostdriver/issues/169
     550 + @self.phantomjs_short_timeout
     551 + def phantomjs_find_elements_by_css_selector():
     552 + return WebDriverWait(self.session, 3).until(lambda x: x.find_elements_by_css_selector('div.g'))
     553 + elements = phantomjs_find_elements_by_css_selector()
     554 + # get links in random order until max. per page
     555 + k = 0
     556 + links = []
    539 557   try:
    540  - # https://github.com/detro/ghostdriver/issues/169
    541  - elements = WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_css_selector('div.g'))
    542  - return [ div.find_element_by_tag_name('a').get_attribute('href') \
    543  - for div in elements \
    544  - if div.find_element_by_tag_name('a').get_attribute('href') is not None ]
     558 + for div in sorted(elements,key=lambda k: random.random()):
     559 + @self.phantomjs_short_timeout
     560 + def phantomjs_find_element_by_tag_name():
     561 + if div.find_element_by_tag_name('a').get_attribute('href') is not None:
     562 + links.append(div.find_element_by_tag_name('a').get_attribute('href'))
     563 + phantomjs_find_element_by_tag_name()
     564 + k += 1
     565 + if k > self.max_links_per_page: break
    545 566   except Exception as e:
    546 567   if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e))
    547  - return []
     568 + return links
    548 569   
    549 570   def get_url(self,url):
    550 571   '''HTTP GET of the url, and add any embedded links.'''
    551 572   if not self.check_robots(url): return # bail out if robots.txt says to
    552  - signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler
    553  - signal.alarm(self.timeout+2) # set an alarm
    554  - try:
    555  - self.session.get(url) # selenium driver
    556  - except self.TimeoutError as e:
    557  - if self.debug: print('.get() exception:\n{}'.format(e))
    558  - finally:
    559  - signal.alarm(0) # cancel the alarm
    560  - try:
    561  - self.data_usage += len(self.session.page_source)
    562  - except Exception as e:
    563  - if self.debug: print('.page_source exception:\n{}'.format(e))
     573 + @self.phantomjs_timeout
     574 + def phantomjs_get(): self.session.get(url) # selenium driver
     575 + phantomjs_get()
     576 + @self.phantomjs_short_timeout
     577 + def phantomjs_page_source(): self.data_usage += len(self.session.page_source)
     578 + phantomjs_page_source()
    564 579   new_links = self.url_links()
    565 580   if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
    566 581   
    567 582   def url_links(self):
    568  - '''Generic webpage link finder format.'''
     583 + """Generic webpage link finder format."""
     584 + # https://github.com/detro/ghostdriver/issues/169
     585 + @self.phantomjs_short_timeout
     586 + def phantomjs_find_elements_by_tag_name():
     587 + return WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_tag_name('a'))
     588 + elements = phantomjs_find_elements_by_tag_name()
     589 + 
     590 + # get links in random order until max. per page
     591 + k = 0
     592 + links = []
    569 593   try:
    570  - # https://github.com/detro/ghostdriver/issues/169
    571  - elements = WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_tag_name('a'))
    572  - return [ a.get_attribute('href') \
    573  - for a in elements if a.get_attribute('href') is not None ]
    574  - except Exception as e:
     594 + for a in sorted(elements,key=lambda k: random.random()):
     595 + @self.phantomjs_short_timeout
     596 + def phantomjs_get_attribute():
     597 + if a.get_attribute('href') is not None:
     598 + links.append(a.get_attribute('href'))
     599 + phantomjs_get_attribute()
     600 + k += 1
     601 + if k > self.max_links_per_page: break
     602 + except Exception as a:
    575 603   if self.debug: print('.get_attribute() exception:\n{}'.format(e))
    576  - return []
     604 + return links
    577 605   
    578 606   def check_robots(self,url):
    579 607   result = True
    580  - url_robots = uprs.urlunparse(
    581  - uprs.urlparse(url)._replace(scheme='https', path='/robots.txt', query='', params=''))
    582  - signal.signal(signal.SIGALRM, self.robot_hang_handler) # register hang handler
    583  -# signal.alarm(robot_timeout+1) # set a short-term alarm a little longer than robot_timeout
    584  - try:
     608 + url_robots = uprs.urlunparse(uprs.urlparse(url)._replace(scheme='https',
     609 + path='/robots.txt', query='', params=''))
     610 + @self.robots_timeout
     611 + def robots_read():
    585 612   rp = RobotFileParserUserAgent()
    586 613   rp.set_url(url_robots)
    587 614   rp.read()
    588 615   result = rp.can_fetch(self.user_agent,url)
    589  - except (self.TimeoutError,Exception) as e:
    590  - if self.debug: print('rp.read() exception:\n{}'.format(e))
    591  - finally:
    592  - signal.alarm(0) # cancel the alarm
    593  - del rp # ensure self.close() in urllib
     616 + del rp # ensure self.close() in urllib
     617 + robots_read()
    594 618   return result
    595 619   
    596 620   def add_url_links(self,links,url=''):
    skipped 6 lines
    603 627   if self.verbose or self.debug:
    604 628   current_url = url # default
    605 629   try:
    606  - current_url = self.session.current_url
     630 + @self.phantomjs_short_timeout
     631 + def phantomjs_current_url(): return self.session.current_url
     632 + current_url = phantomjs_current_url()
    607 633   # the current_url method breaks on a lot of sites, e.g.
    608 634   # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()'
    609 635   except Exception as e:
    skipped 25 lines
    635 661   return running_bandwidth > self.gb_per_month
    636 662   
    637 663   # handle phantomjs timeouts
     664 + # configurable decorator to timeout phantomjs and robotparser calls
     665 + # http://stackoverflow.com/questions/15572288/general-decorator-to-wrap-try-except-in-python
     666 + # Syntax:
     667 + # phantomjs_timeout = block_timeout(phantomjs_hang_handler)
     668 + # @phantomjs_timeout
     669 + # def phantomjs_block():
     670 + # # phantomjs stuff
     671 + # pass
     672 + # phantomjs_block()
     673 + 
     674 + def block_timeout(self,hang_handler, alarm_time=timeout, errors=(Exception,), debug=False):
     675 + def decorator(func):
     676 + def call_func(*args, **kwargs):
     677 + signal.signal(signal.SIGALRM, hang_handler) # register hang handler
     678 + signal.alarm(alarm_time) # set an alarm
     679 + result = None
     680 + try:
     681 + result = func(*args, **kwargs)
     682 + except errors as e:
     683 + if debug: print('{} exception:\n{}'.format(func.__name__, e))
     684 + finally:
     685 + signal.alarm(0) # cancel the alarm
     686 + return result
     687 + return call_func
     688 + return decorator
     689 + 
    638 690   class TimeoutError(Exception):
    639 691   pass
    640 692   
    skipped 9 lines
    650 702   raise self.TimeoutError('Unable to quit the session as well.')
    651 703   raise self.TimeoutError('phantomjs is taking too long')
    652 704   
    653  - def robot_hang_handler(self, signum, frame):
     705 + def robots_hang_handler(self, signum, frame):
    654 706   if self.debug: print('Looks like robotparser has hung.')
    655 707   raise self.TimeoutError('robotparser is taking too long')
    656 708   
    skipped 24 lines
    681 733   after three attempts. """
    682 734   for k in range(3): # three strikes
    683 735   try:
    684  - pid = self.session.service.process.pid
     736 + @self.phantomjs_short_timeout
     737 + def phantomjs_process_pid(): return self.session.service.process.pid
     738 + pid = phantomjs_process_pid()
    685 739   rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20)
    686 740   break
    687 741   except (psutil.NoSuchProcess,Exception) as e:
    skipped 10 lines
Please wait...
Page is in error, reload to recover