🤬
  • Use WebDriverWait to mitigate "Element is no longer attached to the DOM" errors; add a hard_quit option to quit_session for hung phantomjs sessions

  • Loading...
  • Steven Thomas Smith committed 7 years ago
    8205f091
    1 parent 25645bbd
  • ■ ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 23 lines
    24 24  import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
    25 25  from selenium import webdriver
    26 26  from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
     27 +from selenium.webdriver.support.ui import WebDriverWait
    27 28  from io import BytesIO
    28 29  from faker import Factory
    29 30   
    skipped 172 lines
    202 203   driver.set_page_load_timeout(self.timeout+10)
    203 204   self.session = driver
    204 205   
    205  - def quit_session(self,pid=None):
     206 + def quit_session(self,hard_quit=False,pid=None):
    206 207   ''' close, kill -9, quit, del '''
    207 208   # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution
    208 209   if hasattr(self,'session'):
    209  - try:
    210  - self.session.close()
    211  - except Exception as e:
    212  - if self.debug: print('.close() exception:\n{}'.format(e))
     210 + if not hard_quit:
     211 + signal.alarm(3)
     212 + try:
     213 + self.session.close()
     214 + except self.TimeoutError as e:
     215 + if self.debug: print('.close() timeout exception:\n{}'.format(e))
     216 + except Exception as e:
     217 + if self.debug: print('.close() exception:\n{}'.format(e))
     218 + finally:
     219 + signal.alarm(0) # cancel the alarm
    213 220   try:
    214 221   self.session.service.process.send_signal(signal.SIGTERM)
    215 222   except Exception as e:
    skipped 253 lines
    469 476   def websearch_links(self):
    470 477   '''Webpage format for a popular search engine, <div class="g">'''
    471 478   try:
     479 + # https://github.com/detro/ghostdriver/issues/169
     480 + elements = WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_css_selector('div.g'))
    472 481   return [ div.find_element_by_tag_name('a').get_attribute('href') \
    473  - for div in self.session.find_elements_by_css_selector('div.g') \
     482 + for div in elements \
    474 483   if div.find_element_by_tag_name('a').get_attribute('href') is not None ]
    475 484   except Exception as e:
    476 485   if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e))
    skipped 19 lines
    496 505   def url_links(self):
    497 506   '''Generic webpage link finder format.'''
    498 507   try:
     508 + # https://github.com/detro/ghostdriver/issues/169
     509 + elements = WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_tag_name('a'))
    499 510   return [ a.get_attribute('href') \
    500  - for a in self.session.find_elements_by_tag_name('a') \
    501  - if a.get_attribute('href') is not None ]
     511 + for a in elements if a.get_attribute('href') is not None ]
    502 512   except Exception as e:
    503 513   if self.debug: print('.get_attribute() exception:\n{}'.format(e))
    504 514   return []
    skipped 61 lines
    566 576   # http://stackoverflow.com/questions/492519/timeout-on-a-function-call
    567 577   if self.debug: print('Looks like phantomjs has hung.')
    568 578   try:
    569  - self.quit_session()
     579 + self.quit_session(hard_quit=True)
    570 580   self.open_session()
    571 581   except Exception as e:
    572 582   if self.debug: print('.quit_session() exception:\n{}'.format(e))
    skipped 44 lines
Please wait...
Page is in error, reload to recover