🤬
  • ■ ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 198 lines
    199 199   parser.add_argument('-mm', '--maxmemory',
    200 200   help="Maximum memory of phantomjs (MB); 0=>restart every link",
    201 201   type=int, default=1024)
    202  - # parser.add_argument('-P', '--phantomjs-binary-path', help="Path to phantomjs binary", type=int, default=phantomjs_rss_limit_mb)
     202 + parser.add_argument('-P', '--phantomjs-binary-path', help="Path to phantomjs binary", type=str, default=None)
     203 + parser.add_argument('-p', '--proxy', help="Proxy for phantomjs", type=str, default=None)
    203 204   parser.add_argument('-g', '--debug', help="Debug flag", action='store_true')
    204 205   args = parser.parse_args()
    205 206   for k in args.__dict__: setattr(self,k,getattr(args,k))
    skipped 8 lines
    214 215   def check_phantomjs_version(self,recommended_version=(2,1)):
    215 216   self.open_session()
    216 217   if self.debug:
    217  - print("{} version is {}, {} version is {}".format(self.session.capabilities["browserName"],
    218  - self.session.capabilities["version"],
    219  - self.session.capabilities["driverName"],
    220  - self.session.capabilities["driverVersion"]))
    221  - phantomjs_version = tuple(int(i) for i in self.session.capabilities["version"].split('.'))
     218 + print("{} version is {}, {} version is {}".format(self.driver.capabilities["browserName"],
     219 + self.driver.capabilities["version"],
     220 + self.driver.capabilities["driverName"],
     221 + self.driver.capabilities["driverVersion"]))
     222 + phantomjs_version = tuple(int(i) for i in self.driver.capabilities["version"].split('.'))
    222 223   if phantomjs_version < recommended_version:
    223 224   warn.warn("""{} version is {};
    224 225  please upgrade to at least version {} from http://phantomjs.org.
    225  -""".format(self.session.capabilities["browserName"],self.session.capabilities["version"],
     226 +""".format(self.driver.capabilities["browserName"],self.driver.capabilities["version"],
    226 227   '.'.join(str(i) for i in recommended_version)))
    227 228   self.quit_session()
    228 229   
    229 230   def open_session(self):
    230 231   self.quit_session()
    231  - if not hasattr(self, 'session') or not isinstance(self.session,webdriver.phantomjs.webdriver.WebDriver):
     232 + if not hasattr(self, 'session') or not isinstance(self.driver,webdriver.phantomjs.webdriver.WebDriver):
    232 233   # phantomjs session
    233 234   # http://engineering.shapesecurity.com/2015/01/detecting-phantomjs-based-visitors.html
    234 235   # https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string
    skipped 1 lines
    236 237   # http://stackoverflow.com/questions/23390974/phantomjs-keeping-cache
    237 238   dcap = dict(DesiredCapabilities.PHANTOMJS)
    238 239   # dcap['browserName'] = 'Chrome'
    239  - # if hasattr(self,'phantomjs_binary_path'): dcap['phantomjs.binary.path'] = ( self.phantomjs_binary_path )
    240 240   dcap['phantomjs.page.settings.userAgent'] = ( self.user_agent )
    241 241   dcap['phantomjs.page.settings.loadImages'] = ( 'false' )
    242 242   dcap['phantomjs.page.settings.clearMemoryCaches'] = ( 'true' )
    skipped 2 lines
    245 245   dcap['applicationCacheEnabled'] = ( True )
    246 246   dcap['handlesAlerts'] = ( False )
    247 247   dcap['phantomjs.page.customHeaders'] = ( { 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, sdch' } )
    248  - driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=['--disk-cache=false','--ignore-ssl-errors=false','--ssl-protocol=TLSv1.2'])
    249  - # if hasattr(self,'phantomjs_binary_path'): driver.capabilities.setdefault("phantomjs.binary.path", self.phantomjs_binary_path)
     248 + phantomjs_service_args = ['--disk-cache=false','--ignore-ssl-errors=false','--ssl-protocol=TLSv1.2']
     249 + if self.proxy is not None:
     250 + phantomjs_service_args = ['--proxy={}'.format(self.proxy)] + phantomjs_service_args
     251 + if self.phantomjs_binary_path is None:
     252 + driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=phantomjs_service_args)
     253 + else:
     254 + driver = webdriver.PhantomJS(self.phantomjs_binary_path,desired_capabilities=dcap,service_args=phantomjs_service_args)
    250 255   driver.set_window_size(1296,1018) # Tor browser size on Linux
    251  - driver.implicitly_wait(self.timeout+10)
    252  - driver.set_page_load_timeout(self.timeout+10)
    253  - self.session = driver
     256 + driver.implicitly_wait(self.timeout)
     257 + driver.set_page_load_timeout(self.timeout)
     258 + driver.set_script_timeout(self.timeout)
     259 + self.driver = driver
    254 260   
    255 261   def quit_session(self,hard_quit=False,pid=None,phantomjs_short_timeout_decorator=None):
    256 262   """
    skipped 8 lines
    265 271   if hasattr(self,'session'):
    266 272   if not hard_quit:
    267 273   @phantomjs_short_timeout_decorator
    268  - def phantomjs_close(): self.session.close()
     274 + def phantomjs_close(): self.driver.close()
    269 275   phantomjs_close()
    270 276   try:
    271 277   @phantomjs_short_timeout_decorator
    272  - def phantomjs_send_signal(): self.session.service.process.send_signal(signal.SIGTERM)
     278 + def phantomjs_send_signal(): self.driver.service.process.send_signal(signal.SIGTERM)
    273 279   phantomjs_send_signal()
    274 280   except Exception as e:
    275 281   if self.debug: print('.send_signal() exception:\n{}'.format(e))
    276  - try:
    277  - if pid is None: pid, _ = self.phantomjs_pid_and_memory()
    278  - except Exception as e:
    279  - if self.debug: print('.phantomjs_pid_and_memory() exception:\n{}'.format(e))
    280  - try:
    281  - os.kill(pid, signal.SIGTERM) # overkill (pun intended)
    282  - except Exception as e:
    283  - if self.debug: print('.kill() exception:\n{}'.format(e))
     282 + if pid is None:
     283 + @phantomjs_short_timeout_decorator
     284 + def phantomjs_process_pid(): return self.driver.service.process.pid
     285 + pid = phantomjs_process_pid()
     286 + if isinstance(pid,int):
     287 + try:
     288 + os.kill(pid, signal.SIGTERM) # overkill (pun intended)
     289 + except Exception as e:
     290 + if self.debug: print('.kill() exception:\n{}'.format(e))
    284 291   try:
    285 292   @phantomjs_short_timeout_decorator
    286  - def phantomjs_quit(): self.session.quit()
     293 + def phantomjs_quit(): self.driver.quit()
    287 294   phantomjs_quit()
    288 295   except Exception as e:
    289 296   if self.debug: print('.quit() exception:\n{}'.format(e))
    290  - del self.session
     297 + del self.driver
    291 298   
    292 299   def clear_session(self):
    293 300   # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver
    294 301   if hasattr(self, 'session'):
    295 302   try:
    296 303   @self.phantomjs_short_timeout
    297  - def phantomjs_delete_all_cookies(): self.session.delete_all_cookies()
     304 + def phantomjs_delete_all_cookies(): self.driver.delete_all_cookies()
    298 305   phantomjs_delete_all_cookies()
    299 306   except Exception as e:
    300 307   if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
    301 308   try:
    302 309   @self.phantomjs_short_timeout
    303 310   def phantomjs_clear():
    304  - self.session.execute_script('window.localStorage.clear();')
    305  - self.session.execute_script('window.sessionStorage.clear();')
     311 + self.driver.execute_script('window.localStorage.clear();')
     312 + self.driver.execute_script('window.sessionStorage.clear();')
    306 313   phantomjs_clear()
    307 314   except Exception as e:
    308 315   if self.debug: print('.execute_script() exception:\n{}'.format(e))
    skipped 183 lines
    492 499   else:
    493 500   try:
    494 501   @self.phantomjs_short_timeout
    495  - def phantomjs_delete_all_cookies(): self.session.delete_all_cookies()
     502 + def phantomjs_delete_all_cookies(): self.driver.delete_all_cookies()
    496 503   phantomjs_delete_all_cookies()
    497 504   except Exception as e:
    498 505   if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
    skipped 42 lines
    541 548   try:
    542 549   @self.phantomjs_short_timeout
    543 550   def phantomjs_capabilities_update():
    544  - self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent})
     551 + self.driver.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent})
    545 552   phantomjs_capabilities_update()
    546 553   except Exception as e:
    547 554   if self.debug: print('.update() exception:\n{}'.format(e))
    skipped 97 lines
    645 652   url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query)))
    646 653   if self.verbose: self.print_url(url)
    647 654   @self.phantomjs_timeout
    648  - def phantomjs_get(): self.session.get(url) # selenium driver
     655 + def phantomjs_get(): self.driver.get(url) # selenium driver
    649 656   phantomjs_get()
    650 657   @self.phantomjs_short_timeout
    651  - def phantomjs_page_source(): self.data_usage += len(self.session.page_source)
     658 + def phantomjs_page_source(): self.data_usage += len(self.driver.page_source)
    652 659   phantomjs_page_source()
    653 660   new_links = self.websearch_links()
    654 661   if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
    skipped 6 lines
    661 668   # https://github.com/detro/ghostdriver/issues/169
    662 669   @self.phantomjs_short_timeout
    663 670   def phantomjs_find_elements_by_css_selector():
    664  - return WebDriverWait(self.session,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g'))
     671 + return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g'))
    665 672   elements = phantomjs_find_elements_by_css_selector()
    666 673   # get links in random order until max. per page
    667 674   k = 0
    skipped 8 lines
    676 683   href = phantomjs_get_attribute()
    677 684   if href is not None: links.append(href)
    678 685   k += 1
    679  - if k > self.max_links_per_page: break
     686 + if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break
    680 687   except Exception as e:
    681 688   if self.debug: print('.find_element_by_tag_name.get_attribute() exception:\n{}'.format(e))
    682 689   return links
    skipped 6 lines
    689 696   """
    690 697   if not self.check_robots(url): return # bail out if robots.txt says to
    691 698   @self.phantomjs_timeout
    692  - def phantomjs_get(): self.session.get(url) # selenium driver
     699 + def phantomjs_get(): self.driver.get(url) # selenium driver
    693 700   phantomjs_get()
    694 701   @self.phantomjs_short_timeout
    695  - def phantomjs_page_source(): self.data_usage += len(self.session.page_source)
     702 + def phantomjs_page_source(): self.data_usage += len(self.driver.page_source)
    696 703   phantomjs_page_source()
    697 704   new_links = self.url_links()
    698 705   if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
    skipped 3 lines
    702 709   # https://github.com/detro/ghostdriver/issues/169
    703 710   @self.phantomjs_short_timeout
    704 711   def phantomjs_find_elements_by_tag_name():
    705  - return WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_tag_name('a'))
     712 + return WebDriverWait(self.driver,3).until(lambda x: x.find_elements_by_tag_name('a'))
    706 713   elements = phantomjs_find_elements_by_tag_name()
    707 714   
    708 715   # get links in random order until max. per page
    skipped 6 lines
    715 722   href = phantomjs_get_attribute()
    716 723   if href is not None: links.append(href)
    717 724   k += 1
    718  - if k > self.max_links_per_page: break
     725 + if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break
    719 726   except Exception as e:
    720 727   if self.debug: print('.get_attribute() exception:\n{}'.format(e))
    721 728   return links
    skipped 24 lines
    746 753   current_url = url # default
    747 754   try:
    748 755   @self.phantomjs_short_timeout
    749  - def phantomjs_current_url(): return self.session.current_url
     756 + def phantomjs_current_url(): return self.driver.current_url
    750 757   current_url = phantomjs_current_url()
    751 758   # the current_url method breaks on a lot of sites, e.g.
    752 759   # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()'
    skipped 125 lines
    878 885   for k in range(3): # three strikes
    879 886   try:
    880 887   @self.phantomjs_short_timeout
    881  - def phantomjs_process_pid(): return self.session.service.process.pid
     888 + def phantomjs_process_pid(): return self.driver.service.process.pid
    882 889   pid = phantomjs_process_pid()
    883 890   rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20)
    884 891   break
    skipped 11 lines
Please wait...
Page is in error, reload to recover