| skipped 212 lines |
213 | 213 | | self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max bandwidth limits |
214 | 214 | | |
215 | 215 | | def check_phantomjs_version(self,recommended_version=(2,1)): |
216 | | - | self.open_session() |
| 216 | + | self.open_driver() |
217 | 217 | | if self.debug: |
218 | 218 | | print("{} version is {}, {} version is {}".format(self.driver.capabilities["browserName"], |
219 | 219 | | self.driver.capabilities["version"], |
| skipped 5 lines |
225 | 225 | | please upgrade to at least version {} from http://phantomjs.org. |
226 | 226 | | """.format(self.driver.capabilities["browserName"],self.driver.capabilities["version"], |
227 | 227 | | '.'.join(str(i) for i in recommended_version))) |
228 | | - | self.quit_session() |
| 228 | + | self.quit_driver() |
229 | 229 | | |
230 | | - | def open_session(self): |
231 | | - | self.quit_session() |
232 | | - | if not hasattr(self, 'session') or not isinstance(self.driver,webdriver.phantomjs.webdriver.WebDriver): |
233 | | - | # phantomjs session |
| 230 | + | def open_driver(self): |
| 231 | + | self.quit_driver() |
| 232 | + | if not hasattr(self, 'driver') or not isinstance(self.driver,webdriver.phantomjs.webdriver.WebDriver): |
| 233 | + | # phantomjs driver |
234 | 234 | | # http://engineering.shapesecurity.com/2015/01/detecting-phantomjs-based-visitors.html |
235 | 235 | | # https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string |
236 | 236 | | # http://phantomjs.org/api/webpage/property/settings.html |
| skipped 21 lines |
258 | 258 | | driver.set_script_timeout(self.timeout) |
259 | 259 | | self.driver = driver |
260 | 260 | | |
261 | | - | def quit_session(self,hard_quit=False,pid=None,phantomjs_short_timeout_decorator=None): |
| 261 | + | def quit_driver(self,hard_quit=False,pid=None,phantomjs_short_timeout_decorator=None): |
262 | 262 | | """ |
263 | 263 | | close, kill -9, quit, del |
264 | 264 | | :param hard_quit: |
| skipped 3 lines |
268 | 268 | | # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution |
269 | 269 | | if phantomjs_short_timeout_decorator is None: |
270 | 270 | | phantomjs_short_timeout_decorator = self.phantomjs_short_timeout |
271 | | - | if hasattr(self,'session'): |
| 271 | + | if hasattr(self,'driver'): |
272 | 272 | | if not hard_quit: |
273 | 273 | | @phantomjs_short_timeout_decorator |
274 | 274 | | def phantomjs_close(): self.driver.close() |
| skipped 21 lines |
296 | 296 | | if self.debug: print('.quit() exception:\n{}'.format(e)) |
297 | 297 | | del self.driver |
298 | 298 | | |
299 | | - | def clear_session(self): |
| 299 | + | def clear_driver(self): |
300 | 300 | | # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver |
301 | | - | if hasattr(self, 'session'): |
| 301 | + | if hasattr(self, 'driver'): |
302 | 302 | | try: |
303 | 303 | | @self.phantomjs_short_timeout |
304 | 304 | | def phantomjs_delete_all_cookies(): self.driver.delete_all_cookies() |
| skipped 68 lines |
373 | 373 | | Downloading: website.com; NNNNN links [in library], H(domain)= B bits [entropy] |
374 | 374 | | Downloaded: website.com: +LLL/NNNNN links [added], H(domain)= B bits [entropy] |
375 | 375 | | """) |
376 | | - | self.open_session() |
| 376 | + | self.open_driver() |
377 | 377 | | self.seed_links() |
378 | | - | self.clear_session() |
379 | | - | if self.quit_driver_every_call: self.quit_session() |
| 378 | + | self.clear_driver() |
| 379 | + | if self.quit_driver_every_call: self.quit_driver() |
380 | 380 | | while True: # pollute forever, pausing only to meet the bandwidth requirement |
381 | 381 | | try: |
382 | 382 | | if (not self.diurnal_flag) or self.diurnal_cycle_test(): |
| skipped 12 lines |
395 | 395 | | def pollute(self): |
396 | 396 | | if not self.quit_driver_every_call: self.check_phantomjs_process() |
397 | 397 | | if self.link_count() < 2000: |
398 | | - | if self.quit_driver_every_call: self.open_session() |
| 398 | + | if self.quit_driver_every_call: self.open_driver() |
399 | 399 | | self.seed_links() |
400 | | - | self.clear_session() |
401 | | - | if self.quit_driver_every_call: self.quit_session() |
| 400 | + | self.clear_driver() |
| 401 | + | if self.quit_driver_every_call: self.quit_driver() |
402 | 402 | | url = self.pop_link() |
403 | 403 | | if self.verbose: self.print_url(url) |
404 | | - | if self.quit_driver_every_call: self.open_session() |
| 404 | + | if self.quit_driver_every_call: self.open_driver() |
405 | 405 | | self.get_url(url) |
406 | | - | self.clear_session() |
407 | | - | if self.quit_driver_every_call: self.quit_session() |
| 406 | + | self.clear_driver() |
| 407 | + | if self.quit_driver_every_call: self.quit_driver() |
408 | 408 | | |
409 | 409 | | def link_count(self): |
410 | 410 | | return int(np.array([len(self.domain_links[dmn]) for dmn in self.domain_links]).sum()) |
| skipped 80 lines |
491 | 491 | | if int(self.elapsed_time/60. % 60.) == 59: |
492 | 492 | | # reset user agent, clear out cookies, seed more links |
493 | 493 | | if self.hour_trigger: |
494 | | - | if hasattr(self,'session'): |
| 494 | + | if hasattr(self,'driver'): |
495 | 495 | | self.set_user_agent() |
496 | 496 | | if True: |
497 | | - | self.quit_session() |
498 | | - | self.open_session() |
| 497 | + | self.quit_driver() |
| 498 | + | self.open_driver() |
499 | 499 | | else: |
500 | 500 | | try: |
501 | 501 | | @self.phantomjs_short_timeout |
| skipped 2 lines |
504 | 504 | | except Exception as e: |
505 | 505 | | if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e)) |
506 | 506 | | self.seed_links() |
507 | | - | else: self.open_session() |
| 507 | + | else: self.open_driver() |
508 | 508 | | self.hour_trigger = False |
509 | 509 | | else: |
510 | 510 | | self.hour_trigger = True |
| skipped 4 lines |
515 | 515 | | if int(self.elapsed_time/3600. % 24.) == 23: |
516 | 516 | | # clear out cookies every day, decimate, and seed more links |
517 | 517 | | if self.twentyfour_hour_trigger: |
518 | | - | if hasattr(self,'session'): |
| 518 | + | if hasattr(self,'driver'): |
519 | 519 | | self.seed_links() |
520 | | - | # restart the session |
521 | | - | self.quit_session() |
522 | | - | self.open_session() |
| 520 | + | # restart the driver |
| 521 | + | self.quit_driver() |
| 522 | + | self.open_driver() |
523 | 523 | | else: |
524 | | - | self.open_session() |
| 524 | + | self.open_driver() |
525 | 525 | | self.decimate_links(total_frac=0.667, decimate_frac=0.1) |
526 | 526 | | self.seed_links() |
527 | | - | if self.quit_driver_every_call: self.quit_session() |
| 527 | + | if self.quit_driver_every_call: self.quit_driver() |
528 | 528 | | self.twentyfour_hour_trigger = False |
529 | 529 | | else: |
530 | 530 | | self.twentyfour_hour_trigger = True |
| skipped 311 lines |
842 | 842 | | # http://stackoverflow.com/questions/492519/timeout-on-a-function-call |
843 | 843 | | if self.debug: print('Looks like phantomjs has hung.') |
844 | 844 | | try: |
845 | | - | self.quit_session(phantomjs_short_timeout_decorator=self.phantomjs_quit_timeout) |
| 845 | + | self.quit_driver(phantomjs_short_timeout_decorator=self.phantomjs_quit_timeout) |
846 | 846 | | except Exception as e: |
847 | 847 | | if self.debug: print(e) |
848 | | - | self.open_session() |
| 848 | + | self.open_driver() |
849 | 849 | | |
850 | 850 | | def phantomjs_quit_hang_handler(self, signum, frame): |
851 | 851 | | raise self.TimeoutError('phantomjs .quit method is taking too long') |
| skipped 10 lines |
862 | 862 | | # Check rss and restart if too large, then check existence |
863 | 863 | | # http://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python |
864 | 864 | | try: |
865 | | - | if not hasattr(self,'session'): self.open_session() |
| 865 | + | if not hasattr(self,'driver'): self.open_driver() |
866 | 866 | | pid, rss_mb = self.phantomjs_pid_and_memory() |
867 | 867 | | if rss_mb > self.phantomjs_rss_limit_mb: # memory limit |
868 | | - | self.quit_session(pid=pid) |
869 | | - | self.open_session() |
| 868 | + | self.quit_driver(pid=pid) |
| 869 | + | self.open_driver() |
870 | 870 | | pid, _ = self.phantomjs_pid_and_memory() |
871 | 871 | | # check existence |
872 | 872 | | os.kill(pid, 0) |
| skipped 18 lines |
891 | 891 | | break |
892 | 892 | | except (psutil.NoSuchProcess,Exception) as e: |
893 | 893 | | if self.debug: print('.service.process.pid exception:\n{}'.format(e)) |
894 | | - | self.quit_session(pid=pid) |
895 | | - | self.open_session() |
| 894 | + | self.quit_driver(pid=pid) |
| 895 | + | self.open_driver() |
896 | 896 | | else: # throw in the towel and exit if no viable phantomjs process after multiple attempts |
897 | 897 | | sys.exit() |
898 | 898 | | return (pid, rss_mb) |
| skipped 4 lines |