    isp_data_pollution.py
    skipped 85 lines
@@ -86,13 +86,17 @@
 
 # monkeypatch the read class method in RobotFileParser
 # many sites will block access to robots.txt without a standard User-Agent header
+robot_timeout = 3
 class RobotFileParserUserAgent(robotparser.RobotFileParser):
+
+    timeout = robot_timeout # short-term timeout
+
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
         try:
             headers = {'User-Agent': user_agent, }
             request = urllib.request.Request(self.url, None, headers)
-            f = urllib.request.urlopen(request)
+            f = urllib.request.urlopen(request,timeout=self.timeout)
             # f = urllib.request.urlopen(self.url) #! original code
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
    skipped 416 lines
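Aside on the hunk above: the monkeypatched read() keeps the User-Agent workaround and now also passes a socket timeout, so a robots.txt server that accepts the connection but never responds cannot stall the crawler. A minimal standalone sketch of the same idea follows; the Mozilla/5.0 user_agent string, the example.com URL, and the fail-open handling of generic network errors are illustrative assumptions, not the script's exact behavior.

import urllib.error
import urllib.request
import urllib.robotparser as robotparser

user_agent = 'Mozilla/5.0'  # stand-in for the script's configured User-Agent
robot_timeout = 3           # seconds; keeps a dead robots.txt server from hanging the fetch

class RobotFileParserUserAgent(robotparser.RobotFileParser):
    '''RobotFileParser whose read() sends a User-Agent header and honors a socket timeout.'''
    timeout = robot_timeout

    def read(self):
        try:
            request = urllib.request.Request(self.url, None, {'User-Agent': user_agent})
            f = urllib.request.urlopen(request, timeout=self.timeout)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True   # auth-protected robots.txt: deny everything
            elif 400 <= err.code < 500:
                self.allow_all = True      # missing robots.txt: allow everything
        except Exception:
            self.allow_all = True          # network trouble: fail open (assumption in this sketch)
        else:
            self.parse(f.read().decode('utf-8').splitlines())

rp = RobotFileParserUserAgent()
rp.set_url('https://example.com/robots.txt')
rp.read()
print(rp.can_fetch(user_agent, 'https://example.com/'))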
@@ -515,6 +519,7 @@
     def get_websearch(self,query):
         '''HTTP GET of a websearch, then add any embedded links.'''
         url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query)))
+        signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler
         signal.alarm(self.timeout+2) # set an alarm
         try:
             self.session.get(url) # selenium driver
    skipped 23 lines
@@ -544,6 +549,7 @@
     def get_url(self,url):
         '''HTTP GET of the url, and add any embedded links.'''
         if not self.check_robots(url): return # bail out if robots.txt says to
+        signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler
         signal.alarm(self.timeout+2) # set an alarm
         try:
             self.session.get(url) # selenium driver
    skipped 20 lines
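Both get_websearch() and get_url() above now re-register phantomjs_hang_handler for SIGALRM immediately before arming the alarm; that matters because check_robots() (next hunk) installs a different handler for the same signal, so each caller must claim SIGALRM for itself right before using it. A generic sketch of that register/arm/cancel pattern, using hypothetical names (fetch_with_alarm, do_fetch, hang_handler) that are not part of the script:

import signal

class TimeoutError(Exception):
    pass

def hang_handler(signum, frame):
    # raised inside whatever code happens to be running when the alarm fires
    raise TimeoutError('operation is taking too long')

def fetch_with_alarm(do_fetch, timeout):
    '''Run do_fetch(), aborting it via SIGALRM after `timeout` seconds (Unix main thread only).'''
    signal.signal(signal.SIGALRM, hang_handler)  # (re)claim SIGALRM for this call
    signal.alarm(timeout)                        # arm the alarm
    try:
        return do_fetch()
    except TimeoutError as e:
        print('aborted: {}'.format(e))
    finally:
        signal.alarm(0)                          # always disarm, even on success

# e.g. fetch_with_alarm(lambda: session.get(url), timeout=10)  # session, url assumed defined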
@@ -570,15 +576,20 @@
             return []
 
     def check_robots(self,url):
-        result = False
+        result = True
+        url_robots = uprs.urlunparse(
+            uprs.urlparse(url)._replace(scheme='https', path='/robots.txt', query='', params=''))
+        signal.signal(signal.SIGALRM, self.robot_hang_handler) # register hang handler
+# signal.alarm(robot_timeout+1) # set a short-term alarm a little longer than robot_timeout
         try:
-            url_robots = uprs.urlunparse(uprs.urlparse(url)._replace(scheme='https',path='/robots.txt',query='',params=''))
             rp = RobotFileParserUserAgent()
             rp.set_url(url_robots)
             rp.read()
             result = rp.can_fetch(self.user_agent,url)
-        except Exception as e:
+        except (self.TimeoutError,Exception) as e:
             if self.debug: print('rp.read() exception:\n{}'.format(e))
+        finally:
+            signal.alarm(0) # cancel the alarm
         del rp # ensure self.close() in urllib
         return result
 
    skipped 53 lines
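Two behavioral notes on the check_robots() hunk above: result now starts as True, so a robots.txt fetch that errors or times out no longer blocks the URL (fail open), and the finally clause guarantees the alarm is cancelled on every exit path. The robots.txt URL itself is derived from the target URL with urllib.parse's _replace; a standalone illustration with a made-up input URL:

from urllib import parse as uprs

url = 'http://example.com/some/page?x=1'  # made-up target URL
url_robots = uprs.urlunparse(
    uprs.urlparse(url)._replace(scheme='https', path='/robots.txt', query='', params=''))
print(url_robots)  # https://example.com/robots.txt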
@@ -638,6 +649,10 @@
             if self.debug: print('.quit_session() exception:\n{}'.format(e))
             raise self.TimeoutError('Unable to quit the session as well.')
         raise self.TimeoutError('phantomjs is taking too long')
+
+    def robot_hang_handler(self, signum, frame):
+        if self.debug: print('Looks like robotparser has hung.')
+        raise self.TimeoutError('robotparser is taking too long')
 
     def check_phantomjs_process(self):
         '''Check if phantomjs is running.'''
    skipped 39 lines