| skipped 198 lines |
199 | 199 | | parser.add_argument('-mm', '--maxmemory', |
200 | 200 | | help="Maximum memory of phantomjs (MB); 0=>restart every link", |
201 | 201 | | type=int, default=1024) |
202 | | - | # parser.add_argument('-P', '--phantomjs-binary-path', help="Path to phantomjs binary", type=int, default=phantomjs_rss_limit_mb) |
| 202 | + | parser.add_argument('-P', '--phantomjs-binary-path', help="Path to phantomjs binary", type=str, default=None) |
| 203 | + | parser.add_argument('-p', '--proxy', help="Proxy for phantomjs", type=str, default=None) |
203 | 204 | | parser.add_argument('-g', '--debug', help="Debug flag", action='store_true') |
204 | 205 | | args = parser.parse_args() |
205 | 206 | | for k in args.__dict__: setattr(self,k,getattr(args,k)) |
| skipped 8 lines |
def check_phantomjs_version(self,recommended_version=(2,1)):
    """
    Warn if the PhantomJS binary in use is older than `recommended_version`.

    Opens a throwaway webdriver session via self.open_session(), reads the
    browser version from self.driver.capabilities, and issues a warning when
    it compares lower than `recommended_version`. The session is always shut
    down with self.quit_session(), even if reading or parsing the
    capabilities raises.

    :param recommended_version: minimum version as a tuple of ints, e.g. (2,1).
    """
    self.open_session()
    try:
        capabilities = self.driver.capabilities
        if self.debug:
            print("{} version is {}, {} version is {}".format(capabilities["browserName"],
                                                              capabilities["version"],
                                                              capabilities["driverName"],
                                                              capabilities["driverVersion"]))
        # Keep only the leading digits of each dotted component so that a
        # suffixed version such as '2.5.0-development' does not raise
        # ValueError; parsing stops at the first component carrying a suffix.
        version_numbers = []
        for component in str(capabilities["version"]).split('.'):
            digits = ''
            for character in component:
                if character not in '0123456789': break
                digits += character
            if not digits: break
            version_numbers.append(int(digits))
            if len(digits) < len(component): break
        phantomjs_version = tuple(version_numbers)
        if phantomjs_version < recommended_version:
            warn.warn("""{} version is {};
please upgrade to at least version {} from http://phantomjs.org.
""".format(capabilities["browserName"],capabilities["version"],
           '.'.join(str(i) for i in recommended_version)))
    finally:
        # never leave the probe session's PhantomJS process running
        self.quit_session()
228 | 229 | | |
229 | 230 | | def open_session(self): |
230 | 231 | | self.quit_session() |
231 | | - | if not hasattr(self, 'session') or not isinstance(self.session,webdriver.phantomjs.webdriver.WebDriver): |
| 232 | + | if not hasattr(self, 'session') or not isinstance(self.driver,webdriver.phantomjs.webdriver.WebDriver): |
232 | 233 | | # phantomjs session |
233 | 234 | | # http://engineering.shapesecurity.com/2015/01/detecting-phantomjs-based-visitors.html |
234 | 235 | | # https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string |
| skipped 1 lines |
236 | 237 | | # http://stackoverflow.com/questions/23390974/phantomjs-keeping-cache |
237 | 238 | | dcap = dict(DesiredCapabilities.PHANTOMJS) |
238 | 239 | | # dcap['browserName'] = 'Chrome' |
239 | | - | # if hasattr(self,'phantomjs_binary_path'): dcap['phantomjs.binary.path'] = ( self.phantomjs_binary_path ) |
240 | 240 | | dcap['phantomjs.page.settings.userAgent'] = ( self.user_agent ) |
241 | 241 | | dcap['phantomjs.page.settings.loadImages'] = ( 'false' ) |
242 | 242 | | dcap['phantomjs.page.settings.clearMemoryCaches'] = ( 'true' ) |
| skipped 2 lines |
245 | 245 | | dcap['applicationCacheEnabled'] = ( True ) |
246 | 246 | | dcap['handlesAlerts'] = ( False ) |
247 | 247 | | dcap['phantomjs.page.customHeaders'] = ( { 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate, sdch' } ) |
248 | | - | driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=['--disk-cache=false','--ignore-ssl-errors=false','--ssl-protocol=TLSv1.2']) |
249 | | - | # if hasattr(self,'phantomjs_binary_path'): driver.capabilities.setdefault("phantomjs.binary.path", self.phantomjs_binary_path) |
| 248 | + | phantomjs_service_args = ['--disk-cache=false','--ignore-ssl-errors=false','--ssl-protocol=TLSv1.2'] |
| 249 | + | if self.proxy is not None: |
| 250 | + | phantomjs_service_args = ['--proxy={}'.format(self.proxy)] + phantomjs_service_args |
| 251 | + | if self.phantomjs_binary_path is None: |
| 252 | + | driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=phantomjs_service_args) |
| 253 | + | else: |
| 254 | + | driver = webdriver.PhantomJS(self.phantomjs_binary_path,desired_capabilities=dcap,service_args=phantomjs_service_args) |
250 | 255 | | driver.set_window_size(1296,1018) # Tor browser size on Linux |
251 | | - | driver.implicitly_wait(self.timeout+10) |
252 | | - | driver.set_page_load_timeout(self.timeout+10) |
253 | | - | self.session = driver |
| 256 | + | driver.implicitly_wait(self.timeout) |
| 257 | + | driver.set_page_load_timeout(self.timeout) |
| 258 | + | driver.set_script_timeout(self.timeout) |
| 259 | + | self.driver = driver |
254 | 260 | | |
255 | 261 | | def quit_session(self,hard_quit=False,pid=None,phantomjs_short_timeout_decorator=None): |
256 | 262 | | """ |
| skipped 8 lines |
265 | 271 | | if hasattr(self,'session'): |
266 | 272 | | if not hard_quit: |
267 | 273 | | @phantomjs_short_timeout_decorator |
268 | | - | def phantomjs_close(): self.session.close() |
| 274 | + | def phantomjs_close(): self.driver.close() |
269 | 275 | | phantomjs_close() |
270 | 276 | | try: |
271 | 277 | | @phantomjs_short_timeout_decorator |
272 | | - | def phantomjs_send_signal(): self.session.service.process.send_signal(signal.SIGTERM) |
| 278 | + | def phantomjs_send_signal(): self.driver.service.process.send_signal(signal.SIGTERM) |
273 | 279 | | phantomjs_send_signal() |
274 | 280 | | except Exception as e: |
275 | 281 | | if self.debug: print('.send_signal() exception:\n{}'.format(e)) |
276 | | - | try: |
277 | | - | if pid is None: pid, _ = self.phantomjs_pid_and_memory() |
278 | | - | except Exception as e: |
279 | | - | if self.debug: print('.phantomjs_pid_and_memory() exception:\n{}'.format(e)) |
280 | | - | try: |
281 | | - | os.kill(pid, signal.SIGTERM) # overkill (pun intended) |
282 | | - | except Exception as e: |
283 | | - | if self.debug: print('.kill() exception:\n{}'.format(e)) |
| 282 | + | if pid is None: |
| 283 | + | @phantomjs_short_timeout_decorator |
| 284 | + | def phantomjs_process_pid(): return self.driver.service.process.pid |
| 285 | + | pid = phantomjs_process_pid() |
| 286 | + | if isinstance(pid,int): |
| 287 | + | try: |
| 288 | + | os.kill(pid, signal.SIGTERM) # overkill (pun intended) |
| 289 | + | except Exception as e: |
| 290 | + | if self.debug: print('.kill() exception:\n{}'.format(e)) |
284 | 291 | | try: |
285 | 292 | | @phantomjs_short_timeout_decorator |
286 | | - | def phantomjs_quit(): self.session.quit() |
| 293 | + | def phantomjs_quit(): self.driver.quit() |
287 | 294 | | phantomjs_quit() |
288 | 295 | | except Exception as e: |
289 | 296 | | if self.debug: print('.quit() exception:\n{}'.format(e)) |
290 | | - | del self.session |
| 297 | + | del self.driver |
291 | 298 | | |
def clear_session(self):
    """
    Best-effort wipe of browser state held by the current webdriver session.

    Deletes all cookies, then clears window.localStorage and
    window.sessionStorage through injected JavaScript. Each step runs under
    self.phantomjs_short_timeout; a failure in either step is reported only
    when self.debug is set and does not prevent the other step. Does nothing
    when no driver is currently open.
    """
    # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver
    # The webdriver handle is stored as self.driver, so that is the attribute
    # whose presence signals an open session.
    if not hasattr(self, 'driver'): return
    try:
        @self.phantomjs_short_timeout
        def phantomjs_delete_all_cookies(): self.driver.delete_all_cookies()
        phantomjs_delete_all_cookies()
    except Exception as e:
        if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
    try:
        @self.phantomjs_short_timeout
        def phantomjs_clear():
            self.driver.execute_script('window.localStorage.clear();')
            self.driver.execute_script('window.sessionStorage.clear();')
        phantomjs_clear()
    except Exception as e:
        if self.debug: print('.execute_script() exception:\n{}'.format(e))
| skipped 183 lines |
492 | 499 | | else: |
493 | 500 | | try: |
494 | 501 | | @self.phantomjs_short_timeout |
495 | | - | def phantomjs_delete_all_cookies(): self.session.delete_all_cookies() |
| 502 | + | def phantomjs_delete_all_cookies(): self.driver.delete_all_cookies() |
496 | 503 | | phantomjs_delete_all_cookies() |
497 | 504 | | except Exception as e: |
498 | 505 | | if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e)) |
| skipped 42 lines |
541 | 548 | | try: |
542 | 549 | | @self.phantomjs_short_timeout |
543 | 550 | | def phantomjs_capabilities_update(): |
544 | | - | self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent}) |
| 551 | + | self.driver.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent}) |
545 | 552 | | phantomjs_capabilities_update() |
546 | 553 | | except Exception as e: |
547 | 554 | | if self.debug: print('.update() exception:\n{}'.format(e)) |
| skipped 97 lines |
645 | 652 | | url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query))) |
646 | 653 | | if self.verbose: self.print_url(url) |
647 | 654 | | @self.phantomjs_timeout |
648 | | - | def phantomjs_get(): self.session.get(url) # selenium driver |
| 655 | + | def phantomjs_get(): self.driver.get(url) # selenium driver |
649 | 656 | | phantomjs_get() |
650 | 657 | | @self.phantomjs_short_timeout |
651 | | - | def phantomjs_page_source(): self.data_usage += len(self.session.page_source) |
| 658 | + | def phantomjs_page_source(): self.data_usage += len(self.driver.page_source) |
652 | 659 | | phantomjs_page_source() |
653 | 660 | | new_links = self.websearch_links() |
654 | 661 | | if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) |
| skipped 6 lines |
661 | 668 | | # https://github.com/detro/ghostdriver/issues/169 |
662 | 669 | | @self.phantomjs_short_timeout |
663 | 670 | | def phantomjs_find_elements_by_css_selector(): |
664 | | - | return WebDriverWait(self.session,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g')) |
| 671 | + | return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g')) |
665 | 672 | | elements = phantomjs_find_elements_by_css_selector() |
666 | 673 | | # get links in random order until max. per page |
667 | 674 | | k = 0 |
| skipped 8 lines |
676 | 683 | | href = phantomjs_get_attribute() |
677 | 684 | | if href is not None: links.append(href) |
678 | 685 | | k += 1 |
679 | | - | if k > self.max_links_per_page: break |
| 686 | + | if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break |
680 | 687 | | except Exception as e: |
681 | 688 | | if self.debug: print('.find_element_by_tag_name.get_attribute() exception:\n{}'.format(e)) |
682 | 689 | | return links |
| skipped 6 lines |
689 | 696 | | """ |
690 | 697 | | if not self.check_robots(url): return # bail out if robots.txt says to |
691 | 698 | | @self.phantomjs_timeout |
692 | | - | def phantomjs_get(): self.session.get(url) # selenium driver |
| 699 | + | def phantomjs_get(): self.driver.get(url) # selenium driver |
693 | 700 | | phantomjs_get() |
694 | 701 | | @self.phantomjs_short_timeout |
695 | | - | def phantomjs_page_source(): self.data_usage += len(self.session.page_source) |
| 702 | + | def phantomjs_page_source(): self.data_usage += len(self.driver.page_source) |
696 | 703 | | phantomjs_page_source() |
697 | 704 | | new_links = self.url_links() |
698 | 705 | | if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) |
| skipped 3 lines |
702 | 709 | | # https://github.com/detro/ghostdriver/issues/169 |
703 | 710 | | @self.phantomjs_short_timeout |
704 | 711 | | def phantomjs_find_elements_by_tag_name(): |
705 | | - | return WebDriverWait(self.session,3).until(lambda x: x.find_elements_by_tag_name('a')) |
| 712 | + | return WebDriverWait(self.driver,3).until(lambda x: x.find_elements_by_tag_name('a')) |
706 | 713 | | elements = phantomjs_find_elements_by_tag_name() |
707 | 714 | | |
708 | 715 | | # get links in random order until max. per page |
| skipped 6 lines |
715 | 722 | | href = phantomjs_get_attribute() |
716 | 723 | | if href is not None: links.append(href) |
717 | 724 | | k += 1 |
718 | | - | if k > self.max_links_per_page: break |
| 725 | + | if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break |
719 | 726 | | except Exception as e: |
720 | 727 | | if self.debug: print('.get_attribute() exception:\n{}'.format(e)) |
721 | 728 | | return links |
| skipped 24 lines |
746 | 753 | | current_url = url # default |
747 | 754 | | try: |
748 | 755 | | @self.phantomjs_short_timeout |
749 | | - | def phantomjs_current_url(): return self.session.current_url |
| 756 | + | def phantomjs_current_url(): return self.driver.current_url |
750 | 757 | | current_url = phantomjs_current_url() |
751 | 758 | | # the current_url method breaks on a lot of sites, e.g. |
752 | 759 | | # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()' |
| skipped 125 lines |
878 | 885 | | for k in range(3): # three strikes |
879 | 886 | | try: |
880 | 887 | | @self.phantomjs_short_timeout |
881 | | - | def phantomjs_process_pid(): return self.session.service.process.pid |
| 888 | + | def phantomjs_process_pid(): return self.driver.service.process.pid |
882 | 889 | | pid = phantomjs_process_pid() |
883 | 890 | | rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20) |
884 | 891 | | break |
| skipped 11 lines |