🤬
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)
  • ■ ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 208 lines
    209 209   try:
    210 210   self.session.close()
    211 211   except Exception as e:
    212  - if self.debug: print(e)
     212 + if self.debug: print('.close() exception:\n{}'.format(e))
    213 213   try:
    214 214   self.session.service.process.send_signal(signal.SIGTERM)
    215 215   except Exception as e:
    216  - if self.debug: print(e)
     216 + if self.debug: print('.send_signal() exception:\n{}'.format(e))
    217 217   try:
    218 218   if pid is None: pid, _ = self.phantomjs_pid_and_memory()
    219 219   except Exception as e:
    220  - if self.debug: print(e)
     220 + if self.debug: print('.phantomjs_pid_and_memory() exception:\n{}'.format(e))
    221 221   try:
    222 222   os.kill(pid, signal.SIGTERM) # overkill (pun intended)
    223 223   except Exception as e:
    224  - if self.debug: print(e)
     224 + if self.debug: print('.kill() exception:\n{}'.format(e))
    225 225   try:
    226 226   self.session.quit()
    227 227   del self.session # only delete session if quit is successful
    228 228   except Exception as e:
    229  - if self.debug: print(e)
     229 + if self.debug: print('.quit() exception:\n{}'.format(e))
    230 230   
    231 231   def clear_session(self):
    232 232   # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver
    233 233   if hasattr(self, 'session'):
    234  - self.session.delete_all_cookies()
     234 + try:
     235 + self.session.delete_all_cookies()
     236 + except Exception as e:
     237 + if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
    235 238   try:
    236 239   self.session.execute_script('window.localStorage.clear();')
    237 240   self.session.execute_script('window.sessionStorage.clear();')
    238 241   except Exception as e:
    239  - if self.debug: print(e)
     242 + if self.debug: print('.execute_script() exception:\n{}'.format(e))
    240 243   
    241 244   def get_blacklist(self):
    242 245   self.blacklist_domains = set()
    skipped 45 lines
    288 291   self.words = response.content.decode('utf-8').splitlines()
    289 292   reqsession.close()
    290 293   except Exception as e:
    291  - if self.debug: print(e)
     294 + if self.debug: print('requests exception:\n{}'.format(e))
    292 295   self.words = [ 'FUBAR' ]
    293 296   # if self.debug: print('There are {:d} words.'.format(len(self.words)))
    294 297   
    skipped 14 lines
    309 312   self.every_hour_tasks()
    310 313   time.sleep(self.chi2_mean_std(0.5,0.2))
    311 314   except Exception as e:
    312  - if self.debug: print(e)
     315 + if self.debug: print('.pollute() exception:\n{}'.format(e))
    313 316   
    314 317   def pollute(self):
    315 318   if not self.quit_driver_every_call: self.check_phantomjs_process()
    skipped 53 lines
    369 372   if self.hour_trigger:
    370 373   self.set_user_agent()
    371 374   if hasattr(self,'session'):
    372  - # self.session.cookies.clear() # requests session
    373  - self.session.delete_all_cookies()
     375 + try:
     376 + self.session.delete_all_cookies()
     377 + except Exception as e:
     378 + if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
    374 379   self.hour_trigger = False
    375 380   else:
    376 381   self.hour_trigger = True
    skipped 29 lines
    406 411   def set_user_agent(self):
    407 412   global user_agent
    408 413   self.user_agent = self.fake.user_agent() if npr.random() < 0.95 else user_agent
    409  - self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent})
     414 + try:
     415 + self.session.capabilities.update({'phantomjs.page.settings.userAgent': self.user_agent})
     416 + except Exception as e:
     417 + if self.debug: print('.update() exception:\n{}'.format(e))
    410 418   
    411 419   def remove_link(self):
    412 420   url = random.sample(self.links,1)[0]
    skipped 31 lines
    444 452   def get_websearch(self,query):
    445 453   '''HTTP GET of a websearch, then add any embedded links.'''
    446 454   url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query)))
    447  - # return self.session.get(url)
    448 455   signal.alarm(self.timeout+2) # set an alarm
    449 456   try:
    450 457   self.session.get(url) # selenium driver
    451 458   except self.TimeoutError as e:
    452  - if self.debug: print(e)
     459 + if self.debug: print('.get() exception:\n{}'.format(e))
    453 460   finally:
    454 461   signal.alarm(0) # cancel the alarm
    455  - self.data_usage += len(self.session.page_source)
     462 + try:
     463 + self.data_usage += len(self.session.page_source)
     464 + except Exception as e:
     465 + if self.debug: print('.page_source exception:\n{}'.format(e))
    456 466   new_links = self.websearch_links()
    457 467   if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url)
    458 468   
    skipped 4 lines
    463 473   for div in self.session.find_elements_by_css_selector('div.g') \
    464 474   if div.find_element_by_tag_name('a').get_attribute('href') is not None ]
    465 475   except Exception as e:
    466  - if self.debug: print(e)
     476 + if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e))
    467 477   return []
    468 478   
    469 479   def get_url(self,url):
    skipped 3 lines
    473 483   try:
    474 484   self.session.get(url) # selenium driver
    475 485   except self.TimeoutError as e:
    476  - if self.debug: print(e)
     486 + if self.debug: print('.get() exception:\n{}'.format(e))
    477 487   finally:
    478 488   signal.alarm(0) # cancel the alarm
    479  - self.data_usage += len(self.session.page_source)
     489 + try:
     490 + self.data_usage += len(self.session.page_source)
     491 + except Exception as e:
     492 + if self.debug: print('.page_source exception:\n{}'.format(e))
    480 493   new_links = self.url_links()
    481 494   if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url)
    482 495   
    skipped 4 lines
    487 500   for a in self.session.find_elements_by_tag_name('a') \
    488 501   if a.get_attribute('href') is not None ]
    489 502   except Exception as e:
    490  - if self.debug: print(e)
     503 + if self.debug: print('.get_attribute() exception:\n{}'.format(e))
    491 504   return []
    492 505   
    493 506   def check_robots(self,url):
    skipped 5 lines
    499 512   rp.read()
    500 513   result = rp.can_fetch(self.user_agent,url)
    501 514   except Exception as e:
    502  - if self.debug: print(e)
     515 + if self.debug: print('rp.read() exception:\n{}'.format(e))
    503 516   del rp # ensure self.close() in urllib
    504 517   return result
    505 518   
    skipped 11 lines
    517 530   # the current_url method breaks on a lot of sites, e.g.
    518 531   # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()'
    519 532   except Exception as e:
    520  - if self.debug: print(e)
     533 + if self.debug: print('.current_url exception:\n{}'.format(e))
    521 534   if self.debug:
    522 535   print("'{}': {:d} links added, {:d} total".format(current_url,k,len(self.links)))
    523 536   elif self.verbose:
    skipped 32 lines
    556 569   self.quit_session()
    557 570   self.open_session()
    558 571   except Exception as e:
    559  - if self.debug: print(e)
     572 + if self.debug: print('.quit_session() exception:\n{}'.format(e))
    560 573   raise self.TimeoutError('Unable to quit the session as well.')
    561 574   raise self.TimeoutError('phantomjs is taking too long')
    562 575   
    skipped 11 lines
    574 587   # check existence
    575 588   os.kill(pid, 0)
    576 589   except (OSError,psutil.NoSuchProcess,Exception) as e:
    577  - if self.debug: print(e)
     590 + if self.debug: print('.phantomjs_pid_and_memory() exception:\n{}'.format(e))
    578 591   if issubclass(type(e),psutil.NoSuchProcess):
    579 592   raise Exception("There's a phantomjs zombie, and the thread shouldn't have reached this statement.")
    580 593   return False
    skipped 10 lines
    591 604   rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20)
    592 605   break
    593 606   except (psutil.NoSuchProcess,Exception) as e:
    594  - if self.debug: print(e)
     607 + if self.debug: print('.service.process.pid exception:\n{}'.format(e))
    595 608   self.quit_session(pid=pid)
    596 609   self.open_session()
    597 610   else: # throw in the towel and exit if no viable phantomjs process after multiple attempts
    skipped 6 lines
Please wait...
Page is in error, reload to recover