| skipped 85 lines |
86 | 86 | |
87 | 87 | | # monkeypatch the read class method in RobotFileParser |
88 | 88 | | # many sites will block access to robots.txt without a standard User-Agent header |
| 89 | + | robot_timeout = 3 # seconds; bounds each robots.txt fetch
89 | 90 | | class RobotFileParserUserAgent(robotparser.RobotFileParser): |
| 91 | + | |
| 92 | + | timeout = robot_timeout # short timeout so a stalled robots.txt fetch cannot hang the crawl
| 93 | + | |
90 | 94 | | def read(self): |
91 | 95 | | """Reads the robots.txt URL and feeds it to the parser.""" |
92 | 96 | | try: |
93 | 97 | | headers = {'User-Agent': user_agent, } |
94 | 98 | | request = urllib.request.Request(self.url, None, headers) |
95 | | - | f = urllib.request.urlopen(request) |
| 99 | + | f = urllib.request.urlopen(request, timeout=self.timeout)
96 | 100 | | # f = urllib.request.urlopen(self.url) #! original code |
97 | 101 | | except urllib.error.HTTPError as err: |
98 | 102 | | if err.code in (401, 403): |
| skipped 416 lines |
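For context, here is a minimal, self-contained sketch of the subclass in the hunk above. The tail of `read()` beyond the shown lines is filled in following the stdlib `RobotFileParser.read()` implementation, and the `user_agent` string and example URL are placeholders, not values from this codebase:

```python
import urllib.error
import urllib.request
import urllib.robotparser as robotparser

user_agent = 'Mozilla/5.0 (compatible; ExampleBot/1.0)'  # placeholder UA string

robot_timeout = 3  # seconds; bounds the robots.txt fetch

class RobotFileParserUserAgent(robotparser.RobotFileParser):

    timeout = robot_timeout

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            headers = {'User-Agent': user_agent}
            request = urllib.request.Request(self.url, None, headers)
            f = urllib.request.urlopen(request, timeout=self.timeout)
        except urllib.error.HTTPError as err:
            # mirror the stdlib's handling of HTTP errors
            if err.code in (401, 403):
                self.disallow_all = True
            elif 400 <= err.code < 500:
                self.allow_all = True
        else:
            self.parse(f.read().decode('utf-8').splitlines())

rp = RobotFileParserUserAgent()
rp.set_url('https://example.com/robots.txt')
rp.read()
print(rp.can_fetch(user_agent, 'https://example.com/some/page'))
```

Note that a fetch that hits the socket timeout surfaces as an exception from `urlopen()` rather than a return value, which is why `check_robots()` below wraps `rp.read()` in a try/except.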
515 | 519 | | def get_websearch(self,query): |
516 | 520 | | '''HTTP GET of a websearch, then add any embedded links.''' |
517 | 521 | | url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query))) |
| 522 | + | signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler; SIGALRM has a single process-wide handler slot
518 | 523 | | signal.alarm(self.timeout+2) # set an alarm |
519 | 524 | | try: |
520 | 525 | | self.session.get(url) # selenium driver |
| skipped 23 lines |
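The pattern added in this hunk, and again in `get_url()` below, is a SIGALRM watchdog around a blocking call: register the handler, arm the alarm, do the work, cancel the alarm. A standalone sketch of the idea with illustrative names (the real code raises its own `self.TimeoutError` from the registered handler):

```python
import signal

class WatchdogTimeout(Exception):
    """Raised by the alarm handler when the guarded call runs too long."""

def hang_handler(signum, frame):
    raise WatchdogTimeout('operation is taking too long')

def guarded(work, timeout):
    # Re-register before every use: a process has one SIGALRM handler
    # slot, so another caller may have installed its own handler since.
    signal.signal(signal.SIGALRM, hang_handler)
    signal.alarm(timeout)          # deliver SIGALRM after `timeout` seconds
    try:
        return work()              # the blocking call being guarded
    except WatchdogTimeout:
        return None
    finally:
        signal.alarm(0)            # always cancel any pending alarm

# usage: guarded(lambda: some_blocking_fetch(url), timeout=5)
```

SIGALRM exists only on Unix and fires only in the main thread, so this guard does not port to Windows or to worker threads.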
544 | 549 | | def get_url(self,url): |
545 | 550 | | '''HTTP GET of the url, and add any embedded links.''' |
546 | 551 | | if not self.check_robots(url): return # bail out if robots.txt says to |
| 552 | + | signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # re-register hang handler; check_robots() above installs robot_hang_handler
547 | 553 | | signal.alarm(self.timeout+2) # set an alarm |
548 | 554 | | try: |
549 | 555 | | self.session.get(url) # selenium driver |
| skipped 20 lines |
570 | 576 | | return [] |
571 | 577 | | |
572 | 578 | | def check_robots(self,url): |
573 | | - | result = False |
| 579 | + | result = True |
| 580 | + | url_robots = uprs.urlunparse( |
| 581 | + | uprs.urlparse(url)._replace(scheme='https', path='/robots.txt', query='', params='')) |
| 582 | + | signal.signal(signal.SIGALRM, self.robot_hang_handler) # register hang handler |
| 583 | + | # signal.alarm(robot_timeout+1) # set a short-term alarm a little longer than robot_timeout |
574 | 584 | | try: |
575 | | - | url_robots = uprs.urlunparse(uprs.urlparse(url)._replace(scheme='https',path='/robots.txt',query='',params='')) |
576 | 585 | | rp = RobotFileParserUserAgent() |
577 | 586 | | rp.set_url(url_robots) |
578 | 587 | | rp.read() |
579 | 588 | | result = rp.can_fetch(self.user_agent,url) |
580 | | - | except Exception as e: |
| 589 | + | except (self.TimeoutError, Exception) as e:
581 | 590 | | if self.debug: print('rp.read() exception:\n{}'.format(e)) |
| 591 | + | finally: |
| 592 | + | signal.alarm(0) # cancel the alarm |
582 | 593 | | del rp # ensure self.close() in urllib |
583 | 594 | | return result |
584 | 595 | | |
| skipped 53 lines |
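The rewritten `check_robots()` above also flips the default to `result = True`, so a site whose robots.txt cannot be fetched in time is now crawled (fail-open) rather than skipped. The robots.txt address itself is derived by swapping fields on the parsed URL; a standalone sketch, assuming `uprs` aliases `urllib.parse` as the code's usage suggests:

```python
import urllib.parse as uprs

def robots_url(url):
    # ParseResult is a namedtuple, so _replace() swaps fields immutably
    parts = uprs.urlparse(url)._replace(
        scheme='https', path='/robots.txt', query='', params='')
    return uprs.urlunparse(parts)

print(robots_url('http://example.com/products/widget?id=42'))
# -> https://example.com/robots.txt
```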
638 | 649 | | if self.debug: print('.quit_session() exception:\n{}'.format(e)) |
639 | 650 | | raise self.TimeoutError('Unable to quit the session as well.') |
640 | 651 | | raise self.TimeoutError('phantomjs is taking too long') |
| 652 | + | |
| 653 | + | def robot_hang_handler(self, signum, frame): |
| 654 | + | if self.debug: print('Looks like robotparser has hung.') |
| 655 | + | raise self.TimeoutError('robotparser is taking too long') |
641 | 656 | | |
642 | 657 | | def check_phantomjs_process(self): |
643 | 658 | | '''Check if phantomjs is running.''' |
| skipped 39 lines |
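Finally, the new `robot_hang_handler()` mirrors the existing `phantomjs_hang_handler()`. Because a process has only one SIGALRM handler slot, whichever handler was registered most recently receives the signal, which is why each method re-registers its own handler before use. A runnable sketch of that interplay (Unix-only, with hypothetical stand-ins for the class internals):

```python
import signal
import time

class TimeoutError(Exception):  # stand-in for the class's self.TimeoutError
    pass

def phantomjs_hang_handler(signum, frame):
    raise TimeoutError('phantomjs is taking too long')

def robot_hang_handler(signum, frame):
    raise TimeoutError('robotparser is taking too long')

signal.signal(signal.SIGALRM, robot_hang_handler)       # check_robots() path
signal.signal(signal.SIGALRM, phantomjs_hang_handler)   # get_url() re-registers
signal.alarm(2)  # from here on, a hang surfaces as the phantomjs timeout
try:
    time.sleep(5)              # stand-in for a hung session.get(url)
except TimeoutError as e:
    print(e)                   # -> phantomjs is taking too long
finally:
    signal.alarm(0)            # cancel the pending alarm
```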