| skipped 85 lines |
86 | 86 | | |
87 | 87 | | # monkeypatch the read class method in RobotFileParser |
88 | 88 | | # many sites will block access to robots.txt without a standard User-Agent header |
89 | | - | robot_timeout = 3 |
| 89 | + | short_timeout = 3 |
90 | 90 | | class RobotFileParserUserAgent(robotparser.RobotFileParser): |
91 | 91 | | |
92 | | - | timeout = robot_timeout # short-term timeout |
| 92 | + | timeout = short_timeout # short-term timeout |
93 | 93 | | |
94 | 94 | | def read(self): |
95 | 95 | | """Reads the robots.txt URL and feeds it to the parser.""" |
| skipped 14 lines |
110 | 110 | | # Notes for the future: |
111 | 111 | | # 1. The bandwidth usage is undoubtedly (much) smaller because gzip encoding is used |
112 | 112 | | # 2. A lightweight proxy could be used for accurate bandwidth, and header editing |
| 113 | + | |
113 | 114 | | |
114 | 115 | | class ISPDataPollution: |
115 | 116 | | '''Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy |
| skipped 43 lines |
159 | 160 | | # self.gb_per_month = gb_per_month # set in parseArgs |
160 | 161 | | # self.debug = debug # set in parseArgs |
161 | 162 | | self.args = self.args = self.parseArgs() |
162 | | - | signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler |
| 163 | + | # timeout configurable decorators |
| 164 | + | self.phantomjs_timeout = self.block_timeout(self.phantomjs_hang_handler, \ |
| 165 | + | alarm_time=self.timeout+2,errors=(self.TimeoutError,), debug=self.debug) |
| 166 | + | self.phantomjs_short_timeout = self.block_timeout(self.phantomjs_hang_handler, \ |
| 167 | + | alarm_time=short_timeout+1,errors=(self.TimeoutError,Exception), debug=self.debug) |
| 168 | + | self.robots_timeout = self.block_timeout(self.robots_hang_handler, \ |
| 169 | + | alarm_time=short_timeout+1,errors=(self.TimeoutError,), debug=self.debug) |
163 | 170 | | self.fake = Factory.create() |
164 | 171 | | self.hour_trigger = True |
165 | 172 | | self.twentyfour_hour_trigger = True |
| skipped 45 lines |
211 | 218 | | # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution |
212 | 219 | | if hasattr(self,'session'): |
213 | 220 | | if not hard_quit: |
214 | | - | signal.alarm(3) |
215 | | - | try: |
216 | | - | self.session.close() |
217 | | - | except self.TimeoutError as e: |
218 | | - | if self.debug: print('.close() timeout exception:\n{}'.format(e)) |
219 | | - | except Exception as e: |
220 | | - | if self.debug: print('.close() exception:\n{}'.format(e)) |
221 | | - | finally: |
222 | | - | signal.alarm(0) # cancel the alarm |
| 221 | + | @self.phantomjs_short_timeout |
| 222 | + | def phantomjs_close(): self.session.close() |
| 223 | + | phantomjs_close() |
223 | 224 | | try: |
224 | | - | self.session.service.process.send_signal(signal.SIGTERM) |
| 225 | + | @self.phantomjs_short_timeout |
| 226 | + | def phantomjs_send_signal(): self.session.service.process.send_signal(signal.SIGTERM) |
| 227 | + | phantomjs_send_signal() |
225 | 228 | | except Exception as e: |
226 | 229 | | if self.debug: print('.send_signal() exception:\n{}'.format(e)) |
227 | 230 | | try: |
| skipped 5 lines |
233 | 236 | | except Exception as e: |
234 | 237 | | if self.debug: print('.kill() exception:\n{}'.format(e)) |
235 | 238 | | try: |
236 | | - | self.session.quit() |
237 | | - | del self.session # only delete session if quit is successful |
| 239 | + | @self.phantomjs_short_timeout |
| 240 | + | def phantomjs_quit(): |
| 241 | + | self.session.quit() |
| 242 | + | del self.session # only delete session if quit is successful |
| 243 | + | phantomjs_quit() |
238 | 244 | | except Exception as e: |
239 | 245 | | if self.debug: print('.quit() exception:\n{}'.format(e)) |
240 | 246 | | |
| skipped 1 lines |
242 | 248 | | # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver |
243 | 249 | | if hasattr(self, 'session'): |
244 | 250 | | try: |
245 | | - | self.session.delete_all_cookies() |
| 251 | + | @self.phantomjs_short_timeout |
| 252 | + | def phantomjs_delete_all_cookies(): self.session.delete_all_cookies() |
| 253 | + | phantomjs_delete_all_cookies() |
246 | 254 | | except Exception as e: |
247 | 255 | | if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e)) |
248 | 256 | | try: |
249 | | - | self.session.execute_script('window.localStorage.clear();') |
250 | | - | self.session.execute_script('window.sessionStorage.clear();') |
| 257 | + | @self.phantomjs_short_timeout |
| 258 | + | def phantomjs_clear(): |
| 259 | + | self.session.execute_script('window.localStorage.clear();') |
| 260 | + | self.session.execute_script('window.sessionStorage.clear();') |
| 261 | + | phantomjs_clear() |
251 | 262 | | except Exception as e: |
252 | 263 | | if self.debug: print('.execute_script() exception:\n{}'.format(e)) |
253 | 264 | | |
| skipped 162 lines |
416 | 427 | | self.set_user_agent() |
417 | 428 | | if hasattr(self,'session'): |
418 | 429 | | try: |
419 | | - | self.session.delete_all_cookies() |
| 430 | + | @self.phantomjs_short_timeout |
| 431 | + | def phantomjs_delete_all_cookies(): self.session.delete_all_cookies() |
| 432 | + | phantomjs_delete_all_cookies() |
420 | 433 | | except Exception as e: |
421 | 434 | | if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e)) |
422 | 435 | | self.hour_trigger = False |
| skipped 32 lines |
455 | 468 | | global user_agent |
456 | 469 | | self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent |
457 | 470 | | try: |
458 | | - | self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent}) |
| 471 | + | @self.phantomjs_short_timeout |
| 472 | + | def phantomjs_capabilities_update(): |
| 473 | + | self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent}) |
| 474 | + | phantomjs_capabilities_update() |
459 | 475 | | except Exception as e: |
460 | 476 | | if self.debug: print('.update() exception:\n{}'.format(e)) |
461 | 477 | | |
| skipped 57 lines |
519 | 535 | | def get_websearch(self,query): |
520 | 536 | | '''HTTP GET of a websearch, then add any embedded links.''' |
521 | 537 | | url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query))) |
522 | | - | signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler |
523 | | - | signal.alarm(self.timeout+2) # set an alarm |
524 | | - | try: |
525 | | - | self.session.get(url) # selenium driver |
526 | | - | except self.TimeoutError as e: |
527 | | - | if self.debug: print('.get() exception:\n{}'.format(e)) |
528 | | - | finally: |
529 | | - | signal.alarm(0) # cancel the alarm |
530 | | - | try: |
531 | | - | self.data_usage += len(self.session.page_source) |
532 | | - | except Exception as e: |
533 | | - | if self.debug: print('.page_source exception:\n{}'.format(e)) |
| 538 | + | @self.phantomjs_timeout |
| 539 | + | def phantomjs_get(): self.session.get(url) # selenium driver |
| 540 | + | phantomjs_get() |
| 541 | + | @self.phantomjs_short_timeout |
| 542 | + | def phantomjs_page_source(): self.data_usage += len(self.session.page_source) |
| 543 | + | phantomjs_page_source() |
534 | 544 | | new_links = self.websearch_links() |
535 | 545 | | if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) |
536 | 546 | | |
537 | 547 | | def websearch_links(self): |
538 | 548 | | '''Webpage format for a popular search engine, <div class="g">''' |
| 549 | + | # https://github.com/detro/ghostdriver/issues/169 |
| 550 | + | @self.phantomjs_short_timeout |
| 551 | + | def phantomjs_find_elements_by_css_selector(): |
| 552 | + | return WebDriverWait(self.session, 3).until(lambda x: x.find_elements_by_css_selector('div.g')) |
| 553 | + | elements = phantomjs_find_elements_by_css_selector() |
| 554 | + | # get links in random order until max. per page |
| 555 | + | k = 0 |
| 556 | + | links = [] |
539 | 557 | | try: |
540 | | - | # https://github.com/detro/ghostdriver/issues/169 |
541 | | - | elements = WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_css_selector('div.g')) |
542 | | - | return [ div.find_element_by_tag_name('a').get_attribute('href') \ |
543 | | - | for div in elements \ |
544 | | - | if div.find_element_by_tag_name('a').get_attribute('href') is not None ] |
| 558 | + | for div in sorted(elements,key=lambda k: random.random()): |
| 559 | + | @self.phantomjs_short_timeout |
| 560 | + | def phantomjs_find_element_by_tag_name(): |
| 561 | + | href = div.find_element_by_tag_name('a').get_attribute('href')
| 562 | + | if href is not None: links.append(href)
| 563 | + | phantomjs_find_element_by_tag_name() |
| 564 | + | k += 1 |
| 565 | + | if k >= self.max_links_per_page: break
545 | 566 | | except Exception as e: |
546 | 567 | | if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e)) |
547 | | - | return [] |
| 568 | + | return links |
548 | 569 | | |
549 | 570 | | def get_url(self,url): |
550 | 571 | | '''HTTP GET of the url, and add any embedded links.''' |
551 | 572 | | if not self.check_robots(url): return # bail out if robots.txt says to |
552 | | - | signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler |
553 | | - | signal.alarm(self.timeout+2) # set an alarm |
554 | | - | try: |
555 | | - | self.session.get(url) # selenium driver |
556 | | - | except self.TimeoutError as e: |
557 | | - | if self.debug: print('.get() exception:\n{}'.format(e)) |
558 | | - | finally: |
559 | | - | signal.alarm(0) # cancel the alarm |
560 | | - | try: |
561 | | - | self.data_usage += len(self.session.page_source) |
562 | | - | except Exception as e: |
563 | | - | if self.debug: print('.page_source exception:\n{}'.format(e)) |
| 573 | + | @self.phantomjs_timeout |
| 574 | + | def phantomjs_get(): self.session.get(url) # selenium driver |
| 575 | + | phantomjs_get() |
| 576 | + | @self.phantomjs_short_timeout |
| 577 | + | def phantomjs_page_source(): self.data_usage += len(self.session.page_source) |
| 578 | + | phantomjs_page_source() |
564 | 579 | | new_links = self.url_links() |
565 | 580 | | if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) |
566 | 581 | | |
567 | 582 | | def url_links(self): |
568 | | - | '''Generic webpage link finder format.''' |
| 583 | + | """Generic webpage link finder format.""" |
| 584 | + | # https://github.com/detro/ghostdriver/issues/169 |
| 585 | + | @self.phantomjs_short_timeout |
| 586 | + | def phantomjs_find_elements_by_tag_name(): |
| 587 | + | return WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_tag_name('a')) |
| 588 | + | elements = phantomjs_find_elements_by_tag_name() |
| 589 | + | |
| 590 | + | # get links in random order until max. per page |
| 591 | + | k = 0 |
| 592 | + | links = [] |
569 | 593 | | try: |
570 | | - | # https://github.com/detro/ghostdriver/issues/169 |
571 | | - | elements = WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_tag_name('a')) |
572 | | - | return [ a.get_attribute('href') \ |
573 | | - | for a in elements if a.get_attribute('href') is not None ] |
574 | | - | except Exception as e: |
| 594 | + | for a in sorted(elements,key=lambda k: random.random()): |
| 595 | + | @self.phantomjs_short_timeout |
| 596 | + | def phantomjs_get_attribute(): |
| 597 | + | href = a.get_attribute('href')
| 598 | + | if href is not None: links.append(href)
| 599 | + | phantomjs_get_attribute() |
| 600 | + | k += 1 |
| 601 | + | if k >= self.max_links_per_page: break
| 602 | + | except Exception as e:
575 | 603 | | if self.debug: print('.get_attribute() exception:\n{}'.format(e)) |
576 | | - | return [] |
| 604 | + | return links |
577 | 605 | | |
578 | 606 | | def check_robots(self,url): |
579 | 607 | | result = True |
580 | | - | url_robots = uprs.urlunparse( |
581 | | - | uprs.urlparse(url)._replace(scheme='https', path='/robots.txt', query='', params='')) |
582 | | - | signal.signal(signal.SIGALRM, self.robot_hang_handler) # register hang handler |
583 | | - | # signal.alarm(robot_timeout+1) # set a short-term alarm a little longer than robot_timeout |
584 | | - | try: |
| 608 | + | url_robots = uprs.urlunparse(uprs.urlparse(url)._replace(scheme='https', |
| 609 | + | path='/robots.txt', query='', params='')) |
| 610 | + | @self.robots_timeout |
| 611 | + | def robots_read(): |
585 | 612 | | rp = RobotFileParserUserAgent() |
586 | 613 | | rp.set_url(url_robots) |
587 | 614 | | rp.read() |
588 | 615 | | result = rp.can_fetch(self.user_agent,url) |
589 | | - | except (self.TimeoutError,Exception) as e: |
590 | | - | if self.debug: print('rp.read() exception:\n{}'.format(e)) |
591 | | - | finally: |
592 | | - | signal.alarm(0) # cancel the alarm |
593 | | - | del rp # ensure self.close() in urllib |
| 616 | + | del rp; return result # ensure self.close() in urllib
| 617 | + | result = robots_read() is not False # timeout returns None: default to allow
594 | 618 | | return result |
595 | 619 | | |
596 | 620 | | def add_url_links(self,links,url=''): |
| skipped 6 lines |
603 | 627 | | if self.verbose or self.debug: |
604 | 628 | | current_url = url # default |
605 | 629 | | try: |
606 | | - | current_url = self.session.current_url |
| 630 | + | @self.phantomjs_short_timeout |
| 631 | + | def phantomjs_current_url(): return self.session.current_url |
| 632 | + | current_url = phantomjs_current_url() or current_url # keep url default if timeout returned None
607 | 633 | | # the current_url method breaks on a lot of sites, e.g. |
608 | 634 | | # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()' |
609 | 635 | | except Exception as e: |
| skipped 25 lines |
635 | 661 | | return running_bandwidth > self.gb_per_month |
636 | 662 | | |
637 | 663 | | # handle phantomjs timeouts |
| 664 | + | # configurable decorator to timeout phantomjs and robotparser calls |
| 665 | + | # http://stackoverflow.com/questions/15572288/general-decorator-to-wrap-try-except-in-python |
| 666 | + | # Syntax: |
| 667 | + | # phantomjs_timeout = block_timeout(phantomjs_hang_handler) |
| 668 | + | # @phantomjs_timeout |
| 669 | + | # def phantomjs_block(): |
| 670 | + | # # phantomjs stuff |
| 671 | + | # pass |
| 672 | + | # phantomjs_block() |
| 673 | + | |
| 674 | + | def block_timeout(self,hang_handler, alarm_time=timeout, errors=(Exception,), debug=False): |
| 675 | + | def decorator(func): |
| 676 | + | def call_func(*args, **kwargs): |
| 677 | + | signal.signal(signal.SIGALRM, hang_handler) # register hang handler |
| 678 | + | signal.alarm(alarm_time) # set an alarm |
| 679 | + | result = None |
| 680 | + | try: |
| 681 | + | result = func(*args, **kwargs) |
| 682 | + | except errors as e: |
| 683 | + | if debug: print('{} exception:\n{}'.format(func.__name__, e)) |
| 684 | + | finally: |
| 685 | + | signal.alarm(0) # cancel the alarm |
| 686 | + | return result |
| 687 | + | return call_func |
| 688 | + | return decorator |
| 689 | + | |
638 | 690 | | class TimeoutError(Exception): |
639 | 691 | | pass |
640 | 692 | | |
| skipped 9 lines |
650 | 702 | | raise self.TimeoutError('Unable to quit the session as well.') |
651 | 703 | | raise self.TimeoutError('phantomjs is taking too long') |
652 | 704 | | |
653 | | - | def robot_hang_handler(self, signum, frame): |
| 705 | + | def robots_hang_handler(self, signum, frame): |
654 | 706 | | if self.debug: print('Looks like robotparser has hung.') |
655 | 707 | | raise self.TimeoutError('robotparser is taking too long') |
656 | 708 | | |
| skipped 24 lines |
681 | 733 | | after three attempts. """ |
682 | 734 | | for k in range(3): # three strikes |
683 | 735 | | try: |
684 | | - | pid = self.session.service.process.pid |
| 736 | + | @self.phantomjs_short_timeout |
| 737 | + | def phantomjs_process_pid(): return self.session.service.process.pid |
| 738 | + | pid = int(phantomjs_process_pid()) # raises TypeError on timeout (None) so the except retries, instead of psutil.Process(None) silently meaning the current process
685 | 739 | | rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20) |
686 | 740 | | break |
687 | 741 | | except (psutil.NoSuchProcess,Exception) as e: |
| skipped 10 lines |