🤬
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)
  • ■ ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 212 lines
    213 213   self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max bandwidth limits
    214 214   
    215 215   def check_phantomjs_version(self,recommended_version=(2,1)):
    216  - self.open_session()
     216 + self.open_driver()
    217 217   if self.debug:
    218 218   print("{} version is {}, {} version is {}".format(self.driver.capabilities["browserName"],
    219 219   self.driver.capabilities["version"],
    skipped 5 lines
    225 225  please upgrade to at least version {} from http://phantomjs.org.
    226 226  """.format(self.driver.capabilities["browserName"],self.driver.capabilities["version"],
    227 227   '.'.join(str(i) for i in recommended_version)))
    228  - self.quit_session()
     228 + self.quit_driver()
    229 229   
    230  - def open_session(self):
    231  - self.quit_session()
    232  - if not hasattr(self, 'session') or not isinstance(self.driver,webdriver.phantomjs.webdriver.WebDriver):
    233  - # phantomjs session
     230 + def open_driver(self):
     231 + self.quit_driver()
     232 + if not hasattr(self, 'driver') or not isinstance(self.driver,webdriver.phantomjs.webdriver.WebDriver):
     233 + # phantomjs driver
    234 234   # http://engineering.shapesecurity.com/2015/01/detecting-phantomjs-based-visitors.html
    235 235   # https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string
    236 236   # http://phantomjs.org/api/webpage/property/settings.html
    skipped 21 lines
    258 258   driver.set_script_timeout(self.timeout)
    259 259   self.driver = driver
    260 260   
    261  - def quit_session(self,hard_quit=False,pid=None,phantomjs_short_timeout_decorator=None):
     261 + def quit_driver(self,hard_quit=False,pid=None,phantomjs_short_timeout_decorator=None):
    262 262   """
    263 263   close, kill -9, quit, del
    264 264   :param hard_quit:
    skipped 3 lines
    268 268   # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution
    269 269   if phantomjs_short_timeout_decorator is None:
    270 270   phantomjs_short_timeout_decorator = self.phantomjs_short_timeout
    271  - if hasattr(self,'session'):
     271 + if hasattr(self,'driver'):
    272 272   if not hard_quit:
    273 273   @phantomjs_short_timeout_decorator
    274 274   def phantomjs_close(): self.driver.close()
    skipped 21 lines
    296 296   if self.debug: print('.quit() exception:\n{}'.format(e))
    297 297   del self.driver
    298 298   
    299  - def clear_session(self):
     299 + def clear_driver(self):
    300 300   # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver
    301  - if hasattr(self, 'session'):
     301 + if hasattr(self, 'driver'):
    302 302   try:
    303 303   @self.phantomjs_short_timeout
    304 304   def phantomjs_delete_all_cookies(): self.driver.delete_all_cookies()
    skipped 68 lines
    373 373  Downloading: website.com; NNNNN links [in library], H(domain)= B bits [entropy]
    374 374  Downloaded: website.com: +LLL/NNNNN links [added], H(domain)= B bits [entropy]
    375 375  """)
    376  - self.open_session()
     376 + self.open_driver()
    377 377   self.seed_links()
    378  - self.clear_session()
    379  - if self.quit_driver_every_call: self.quit_session()
     378 + self.clear_driver()
     379 + if self.quit_driver_every_call: self.quit_driver()
    380 380   while True: # pollute forever, pausing only to meet the bandwidth requirement
    381 381   try:
    382 382   if (not self.diurnal_flag) or self.diurnal_cycle_test():
    skipped 12 lines
    395 395   def pollute(self):
    396 396   if not self.quit_driver_every_call: self.check_phantomjs_process()
    397 397   if self.link_count() < 2000:
    398  - if self.quit_driver_every_call: self.open_session()
     398 + if self.quit_driver_every_call: self.open_driver()
    399 399   self.seed_links()
    400  - self.clear_session()
    401  - if self.quit_driver_every_call: self.quit_session()
     400 + self.clear_driver()
     401 + if self.quit_driver_every_call: self.quit_driver()
    402 402   url = self.pop_link()
    403 403   if self.verbose: self.print_url(url)
    404  - if self.quit_driver_every_call: self.open_session()
     404 + if self.quit_driver_every_call: self.open_driver()
    405 405   self.get_url(url)
    406  - self.clear_session()
    407  - if self.quit_driver_every_call: self.quit_session()
     406 + self.clear_driver()
     407 + if self.quit_driver_every_call: self.quit_driver()
    408 408   
    409 409   def link_count(self):
    410 410   return int(np.array([len(self.domain_links[dmn]) for dmn in self.domain_links]).sum())
    skipped 80 lines
    491 491   if int(self.elapsed_time/60. % 60.) == 59:
    492 492   # reset user agent, clear out cookies, seed more links
    493 493   if self.hour_trigger:
    494  - if hasattr(self,'session'):
     494 + if hasattr(self,'driver'):
    495 495   self.set_user_agent()
    496 496   if True:
    497  - self.quit_session()
    498  - self.open_session()
     497 + self.quit_driver()
     498 + self.open_driver()
    499 499   else:
    500 500   try:
    501 501   @self.phantomjs_short_timeout
    skipped 2 lines
    504 504   except Exception as e:
    505 505   if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
    506 506   self.seed_links()
    507  - else: self.open_session()
     507 + else: self.open_driver()
    508 508   self.hour_trigger = False
    509 509   else:
    510 510   self.hour_trigger = True
    skipped 4 lines
    515 515   if int(self.elapsed_time/3600. % 24.) == 23:
    516 516   # clear out cookies every day, decimate, and seed more links
    517 517   if self.twentyfour_hour_trigger:
    518  - if hasattr(self,'session'):
     518 + if hasattr(self,'driver'):
    519 519   self.seed_links()
    520  - # restart the session
    521  - self.quit_session()
    522  - self.open_session()
     520 + # restart the driver
     521 + self.quit_driver()
     522 + self.open_driver()
    523 523   else:
    524  - self.open_session()
     524 + self.open_driver()
    525 525   self.decimate_links(total_frac=0.667, decimate_frac=0.1)
    526 526   self.seed_links()
    527  - if self.quit_driver_every_call: self.quit_session()
     527 + if self.quit_driver_every_call: self.quit_driver()
    528 528   self.twentyfour_hour_trigger = False
    529 529   else:
    530 530   self.twentyfour_hour_trigger = True
    skipped 311 lines
    842 842   # http://stackoverflow.com/questions/492519/timeout-on-a-function-call
    843 843   if self.debug: print('Looks like phantomjs has hung.')
    844 844   try:
    845  - self.quit_session(phantomjs_short_timeout_decorator=self.phantomjs_quit_timeout)
     845 + self.quit_driver(phantomjs_short_timeout_decorator=self.phantomjs_quit_timeout)
    846 846   except Exception as e:
    847 847   if self.debug: print(e)
    848  - self.open_session()
     848 + self.open_driver()
    849 849   
    850 850   def phantomjs_quit_hang_handler(self, signum, frame):
    851 851   raise self.TimeoutError('phantomjs .quit method is taking too long')
    skipped 10 lines
    862 862   # Check rss and restart if too large, then check existence
    863 863   # http://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python
    864 864   try:
    865  - if not hasattr(self,'session'): self.open_session()
     865 + if not hasattr(self,'driver'): self.open_driver()
    866 866   pid, rss_mb = self.phantomjs_pid_and_memory()
    867 867   if rss_mb > self.phantomjs_rss_limit_mb: # memory limit
    868  - self.quit_session(pid=pid)
    869  - self.open_session()
     868 + self.quit_driver(pid=pid)
     869 + self.open_driver()
    870 870   pid, _ = self.phantomjs_pid_and_memory()
    871 871   # check existence
    872 872   os.kill(pid, 0)
    skipped 18 lines
    891 891   break
    892 892   except (psutil.NoSuchProcess,Exception) as e:
    893 893   if self.debug: print('.service.process.pid exception:\n{}'.format(e))
    894  - self.quit_session(pid=pid)
    895  - self.open_session()
     894 + self.quit_driver(pid=pid)
     895 + self.open_driver()
    896 896   else: # throw in the towel and exit if no viable phantomjs process after multiple attempts
    897 897   sys.exit()
    898 898   return (pid, rss_mb)
    skipped 4 lines
Please wait...
Page is in error, reload to recover