| skipped 23 lines |
24 | 24 | | import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs |
25 | 25 | | from selenium import webdriver |
26 | 26 | | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities |
| 27 | + | from selenium.webdriver.support.ui import WebDriverWait |
27 | 28 | | from io import BytesIO |
28 | 29 | | from faker import Factory |
29 | 30 | | |
| skipped 172 lines |
202 | 203 | | driver.set_page_load_timeout(self.timeout+10) |
203 | 204 | | self.session = driver |
204 | 205 | | |
205 | | - | def quit_session(self,pid=None): |
| 206 | + | def quit_session(self,hard_quit=False,pid=None): |
206 | 207 | | ''' close, kill -9, quit, del ''' |
207 | 208 | | # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution |
208 | 209 | | if hasattr(self,'session'): |
209 | | - | try: |
210 | | - | self.session.close() |
211 | | - | except Exception as e: |
212 | | - | if self.debug: print('.close() exception:\n{}'.format(e)) |
| 210 | + | if not hard_quit: |
| 211 | + | signal.alarm(3) |
| 212 | + | try: |
| 213 | + | self.session.close() |
| 214 | + | except self.TimeoutError as e: |
| 215 | + | if self.debug: print('.close() timeout exception:\n{}'.format(e)) |
| 216 | + | except Exception as e: |
| 217 | + | if self.debug: print('.close() exception:\n{}'.format(e)) |
| 218 | + | finally: |
| 219 | + | signal.alarm(0) # cancel the alarm |
213 | 220 | | try: |
214 | 221 | | self.session.service.process.send_signal(signal.SIGTERM) |
215 | 222 | | except Exception as e: |
| skipped 253 lines |
469 | 476 | | def websearch_links(self): |
470 | 477 | | '''Webpage format for a popular search engine, <div class="g">''' |
471 | 478 | | try: |
| 479 | + | # https://github.com/detro/ghostdriver/issues/169 |
| 480 | + | elements = WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_css_selector('div.g')) |
472 | 481 | | return [ div.find_element_by_tag_name('a').get_attribute('href') \ |
473 | | - | for div in self.session.find_elements_by_css_selector('div.g') \ |
| 482 | + | for div in elements \ |
474 | 483 | | if div.find_element_by_tag_name('a').get_attribute('href') is not None ] |
475 | 484 | | except Exception as e: |
476 | 485 | | if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e)) |
| skipped 19 lines |
def url_links(self):
    '''Generic webpage link finder format.

    Waits up to 3 seconds for the session's page to yield a non-empty
    list of <a> elements, then returns the href of every anchor whose
    href attribute is not None.

    Returns an empty list if the wait or any WebDriver call raises; the
    exception is printed only when self.debug is set.
    '''
    try:
        # https://github.com/detro/ghostdriver/issues/169
        elements = WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_tag_name('a'))
        # Read each href exactly once: every get_attribute() is its own
        # WebDriver call, so asking twice doubles the traffic and the two
        # answers can disagree if the page changes in between.
        hrefs = ( a.get_attribute('href') for a in elements )
        return [ href for href in hrefs if href is not None ]
    except Exception as e:
        # Deliberate best-effort: a hung or stale page just means no links.
        if self.debug: print('.get_attribute() exception:\n{}'.format(e))
        return []
| skipped 61 lines |
566 | 576 | | # http://stackoverflow.com/questions/492519/timeout-on-a-function-call |
567 | 577 | | if self.debug: print('Looks like phantomjs has hung.') |
568 | 578 | | try: |
569 | | - | self.quit_session() |
| 579 | + | self.quit_session(hard_quit=True) |
570 | 580 | | self.open_session() |
571 | 581 | | except Exception as e: |
572 | 582 | | if self.debug: print('.quit_session() exception:\n{}'.format(e)) |
| skipped 44 lines |