🤬
  • ■ ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 384 lines
    385 385   word = ' '.join(['"{}"'.format(' '.join(random.sample(self.words, 2))),
    386 386   ' '.join(random.sample(self.words, num_words-2))])
    387 387   if self.debug: print('Seeding with search for \'{}\'…'.format(word))
    388  - # self.add_url_links(self.websearch(word).content.decode('utf-8'))
    389 388   self.get_websearch(word)
    390 389   
    391 390   def bias_links(self):
    skipped 27 lines
    419 418   
    420 419   def every_hour_tasks(self):
    421 420   if int(self.elapsed_time/60. % 60.) == 59:
    422  - # reset user agent, clear out cookies
     421 + # reset user agent, clear out cookies, seed more links
    423 422   if self.hour_trigger:
    424  - self.set_user_agent()
    425 423   if hasattr(self,'session'):
     424 + self.set_user_agent()
    426 425   try:
    427 426   @self.phantomjs_short_timeout
    428 427   def phantomjs_delete_all_cookies(): self.session.delete_all_cookies()
    429 428   phantomjs_delete_all_cookies()
    430 429   except Exception as e:
    431 430   if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e))
     431 + self.seed_links()
     432 + else: self.open_session()
    432 433   self.hour_trigger = False
    433 434   else:
    434 435   self.hour_trigger = True
    skipped 25 lines
    460 461   self.data_usage = 0
    461 462   self.decimate_links(total_frac=0.49, decimate_frac=0.333)
    462 463   
    463  - def decimate_links(self, total_frac=0.81, decimate_frac=0.1): # decimate the stack
     464 + def decimate_links(self, total_frac=0.81, decimate_frac=0.1, log_sampling=False):
     465 + """ Delete `decimate_frac` of links if the total exceeds `total_frac` of the maximum allowed. """
    464 466   if self.link_count() > int(np.ceil(total_frac * self.max_links_cached)):
    465  - for url in self.draw_links(n=int(np.ceil(self.link_count() * decimate_frac))):
     467 + for url in self.draw_links(n=int(np.ceil(self.link_count()*decimate_frac)),log_sampling=log_sampling):
    466 468   self.remove_link(url)
    467 469   
    468 470   def set_user_agent(self):
    skipped 287 lines
Please wait...
Page is in error, reload to recover