| skipped 208 lines |
209 | 209 | | try: |
210 | 210 | | self.session.close() |
211 | 211 | | except Exception as e: |
212 | | - | if self.debug: print(e) |
| 212 | + | if self.debug: print('.close() exception:\n{}'.format(e)) |
213 | 213 | | try: |
214 | 214 | | self.session.service.process.send_signal(signal.SIGTERM) |
215 | 215 | | except Exception as e: |
216 | | - | if self.debug: print(e) |
| 216 | + | if self.debug: print('.send_signal() exception:\n{}'.format(e)) |
217 | 217 | | try: |
218 | 218 | | if pid is None: pid, _ = self.phantomjs_pid_and_memory() |
219 | 219 | | except Exception as e: |
220 | | - | if self.debug: print(e) |
| 220 | + | if self.debug: print('.phantomjs_pid_and_memory() exception:\n{}'.format(e)) |
221 | 221 | | try: |
222 | 222 | | os.kill(pid, signal.SIGTERM) # overkill (pun intended) |
223 | 223 | | except Exception as e: |
224 | | - | if self.debug: print(e) |
| 224 | + | if self.debug: print('.kill() exception:\n{}'.format(e)) |
225 | 225 | | try: |
226 | 226 | | self.session.quit() |
227 | 227 | | del self.session # only delete session if quit is successful |
228 | 228 | | except Exception as e: |
229 | | - | if self.debug: print(e) |
| 229 | + | if self.debug: print('.quit() exception:\n{}'.format(e)) |
230 | 230 | | |
231 | 231 | | def clear_session(self): |
232 | 232 | | # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver |
233 | 233 | | if hasattr(self, 'session'): |
234 | | - | self.session.delete_all_cookies() |
| 234 | + | try: |
| 235 | + | self.session.delete_all_cookies() |
| 236 | + | except Exception as e: |
| 237 | + | if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e)) |
235 | 238 | | try: |
236 | 239 | | self.session.execute_script('window.localStorage.clear();') |
237 | 240 | | self.session.execute_script('window.sessionStorage.clear();') |
238 | 241 | | except Exception as e: |
239 | | - | if self.debug: print(e) |
| 242 | + | if self.debug: print('.execute_script() exception:\n{}'.format(e)) |
240 | 243 | | |
241 | 244 | | def get_blacklist(self): |
242 | 245 | | self.blacklist_domains = set() |
| skipped 45 lines |
288 | 291 | | self.words = response.content.decode('utf-8').splitlines() |
289 | 292 | | reqsession.close() |
290 | 293 | | except Exception as e: |
291 | | - | if self.debug: print(e) |
| 294 | + | if self.debug: print('requests exception:\n{}'.format(e)) |
292 | 295 | | self.words = [ 'FUBAR' ] |
293 | 296 | | # if self.debug: print('There are {:d} words.'.format(len(self.words))) |
294 | 297 | | |
| skipped 14 lines |
309 | 312 | | self.every_hour_tasks() |
310 | 313 | | time.sleep(self.chi2_mean_std(0.5,0.2)) |
311 | 314 | | except Exception as e: |
312 | | - | if self.debug: print(e) |
| 315 | + | if self.debug: print('.pollute() exception:\n{}'.format(e)) |
313 | 316 | | |
314 | 317 | | def pollute(self): |
315 | 318 | | if not self.quit_driver_every_call: self.check_phantomjs_process() |
| skipped 53 lines |
369 | 372 | | if self.hour_trigger: |
370 | 373 | | self.set_user_agent() |
371 | 374 | | if hasattr(self,'session'): |
372 | | - | # self.session.cookies.clear() # requests session |
373 | | - | self.session.delete_all_cookies() |
| 375 | + | try: |
| 376 | + | self.session.delete_all_cookies() |
| 377 | + | except Exception as e: |
| 378 | + | if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e)) |
374 | 379 | | self.hour_trigger = False |
375 | 380 | | else: |
376 | 381 | | self.hour_trigger = True |
| skipped 29 lines |
406 | 411 | | def set_user_agent(self): |
407 | 412 | | global user_agent |
408 | 413 | | self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent |
409 | | - | self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent}) |
| 414 | + | try: |
| 415 | + | self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent}) |
| 416 | + | except Exception as e: |
| 417 | + | if self.debug: print('.update() exception:\n{}'.format(e)) |
410 | 418 | | |
411 | 419 | | def remove_link(self): |
412 | 420 | | url = random.sample(self.links,1)[0] |
| skipped 31 lines |
444 | 452 | | def get_websearch(self,query): |
445 | 453 | | '''HTTP GET of a websearch, then add any embedded links.''' |
446 | 454 | | url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query))) |
447 | | - | # return self.session.get(url) |
448 | 455 | | signal.alarm(self.timeout+2) # set an alarm |
449 | 456 | | try: |
450 | 457 | | self.session.get(url) # selenium driver |
451 | 458 | | except self.TimeoutError as e: |
452 | | - | if self.debug: print(e) |
| 459 | + | if self.debug: print('.get() exception:\n{}'.format(e)) |
453 | 460 | | finally: |
454 | 461 | | signal.alarm(0) # cancel the alarm |
455 | | - | self.data_usage += len(self.session.page_source) |
| 462 | + | try: |
| 463 | + | self.data_usage += len(self.session.page_source) |
| 464 | + | except Exception as e: |
| 465 | + | if self.debug: print('.page_source exception:\n{}'.format(e)) |
456 | 466 | | new_links = self.websearch_links() |
457 | 467 | | if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url) |
458 | 468 | | |
| skipped 4 lines |
463 | 473 | | for div in self.session.find_elements_by_css_selector('div.g') \ |
464 | 474 | | if div.find_element_by_tag_name('a').get_attribute('href') is not None ] |
465 | 475 | | except Exception as e: |
466 | | - | if self.debug: print(e) |
| 476 | + | if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e)) |
467 | 477 | | return [] |
468 | 478 | | |
469 | 479 | | def get_url(self,url): |
| skipped 3 lines |
473 | 483 | | try: |
474 | 484 | | self.session.get(url) # selenium driver |
475 | 485 | | except self.TimeoutError as e: |
476 | | - | if self.debug: print(e) |
| 486 | + | if self.debug: print('.get() exception:\n{}'.format(e)) |
477 | 487 | | finally: |
478 | 488 | | signal.alarm(0) # cancel the alarm |
479 | | - | self.data_usage += len(self.session.page_source) |
| 489 | + | try: |
| 490 | + | self.data_usage += len(self.session.page_source) |
| 491 | + | except Exception as e: |
| 492 | + | if self.debug: print('.page_source exception:\n{}'.format(e)) |
480 | 493 | | new_links = self.url_links() |
481 | 494 | | if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url) |
482 | 495 | | |
| skipped 4 lines |
487 | 500 | | for a in self.session.find_elements_by_tag_name('a') \ |
488 | 501 | | if a.get_attribute('href') is not None ] |
489 | 502 | | except Exception as e: |
490 | | - | if self.debug: print(e) |
| 503 | + | if self.debug: print('.get_attribute() exception:\n{}'.format(e)) |
491 | 504 | | return [] |
492 | 505 | | |
493 | 506 | | def check_robots(self,url): |
| skipped 5 lines |
499 | 512 | | rp.read() |
500 | 513 | | result = rp.can_fetch(self.user_agent,url) |
501 | 514 | | except Exception as e: |
502 | | - | if self.debug: print(e) |
| 515 | + | if self.debug: print('rp.read() exception:\n{}'.format(e)) |
503 | 516 | | del rp # ensure self.close() in urllib |
504 | 517 | | return result |
505 | 518 | | |
| skipped 11 lines |
517 | 530 | | # the current_url method breaks on a lot of sites, e.g. |
518 | 531 | | # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()' |
519 | 532 | | except Exception as e: |
520 | | - | if self.debug: print(e) |
| 533 | + | if self.debug: print('.current_url exception:\n{}'.format(e)) |
521 | 534 | | if self.debug: |
522 | 535 | | print("'{}': {:d} links added, {:d} total".format(current_url,k,len(self.links))) |
523 | 536 | | elif self.verbose: |
| skipped 32 lines |
556 | 569 | | self.quit_session() |
557 | 570 | | self.open_session() |
558 | 571 | | except Exception as e: |
559 | | - | if self.debug: print(e) |
| 572 | + | if self.debug: print('.quit_session() exception:\n{}'.format(e)) |
560 | 573 | | raise self.TimeoutError('Unable to quit the session as well.') |
561 | 574 | | raise self.TimeoutError('phantomjs is taking too long') |
562 | 575 | | |
| skipped 11 lines |
574 | 587 | | # check existence |
575 | 588 | | os.kill(pid, 0) |
576 | 589 | | except (OSError,psutil.NoSuchProcess,Exception) as e: |
577 | | - | if self.debug: print(e) |
| 590 | + | if self.debug: print('.phantomjs_pid_and_memory() exception:\n{}'.format(e)) |
578 | 591 | | if issubclass(type(e),psutil.NoSuchProcess): |
579 | 592 | | raise Exception("There's a phantomjs zombie, and the thread shouldn't have reached this statement.") |
580 | 593 | | return False |
| skipped 10 lines |
591 | 604 | | rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20) |
592 | 605 | | break |
593 | 606 | | except (psutil.NoSuchProcess,Exception) as e: |
594 | | - | if self.debug: print(e) |
| 607 | + | if self.debug: print('.service.process.pid exception:\n{}'.format(e)) |
595 | 608 | | self.quit_session(pid=pid) |
596 | 609 | | self.open_session() |
597 | 610 | | else: # throw in the towel and exit if no viable phantomjs process after multiple attempts |
| skipped 6 lines |