| skipped 384 lines |
385 | 385 | | word = ' '.join(['"{}"'.format(' '.join(random.sample(self.words, 2))), |
386 | 386 | | ' '.join(random.sample(self.words, num_words-2))]) |
387 | 387 | | if self.debug: print('Seeding with search for \'{}\'…'.format(word)) |
388 | | - | # self.add_url_links(self.websearch(word).content.decode('utf-8')) |
389 | 388 | | self.get_websearch(word) |
390 | 389 | | |
391 | 390 | | def bias_links(self): |
| skipped 27 lines |
419 | 418 | | |
420 | 419 | | def every_hour_tasks(self): |
421 | 420 | | if int(self.elapsed_time/60. % 60.) == 59: |
422 | | - | # reset user agent, clear out cookies |
| 421 | + | # reset user agent, clear out cookies, seed more links |
423 | 422 | | if self.hour_trigger: |
424 | | - | self.set_user_agent() |
425 | 423 | | if hasattr(self,'session'): |
| 424 | + | self.set_user_agent() |
426 | 425 | | try: |
427 | 426 | | @self.phantomjs_short_timeout |
428 | 427 | | def phantomjs_delete_all_cookies(): self.session.delete_all_cookies() |
429 | 428 | | phantomjs_delete_all_cookies() |
430 | 429 | | except Exception as e: |
431 | 430 | | if self.debug: print('.delete_all_cookies() exception:\n{}'.format(e)) |
| 431 | + | self.seed_links() |
| 432 | + | else: self.open_session() |
432 | 433 | | self.hour_trigger = False |
433 | 434 | | else: |
434 | 435 | | self.hour_trigger = True |
| skipped 25 lines |
460 | 461 | | self.data_usage = 0 |
461 | 462 | | self.decimate_links(total_frac=0.49, decimate_frac=0.333) |
462 | 463 | | |
463 | | - | def decimate_links(self, total_frac=0.81, decimate_frac=0.1): # decimate the stack |
| 464 | + | def decimate_links(self, total_frac=0.81, decimate_frac=0.1, log_sampling=False): |
| 465 | + | """ Delete `decimate_frac` of links if the total exceeds `total_frac` of the maximum allowed. """ |
464 | 466 | | if self.link_count() > int(np.ceil(total_frac * self.max_links_cached)): |
465 | | - | for url in self.draw_links(n=int(np.ceil(self.link_count() * decimate_frac))): |
| 467 | + | for url in self.draw_links(n=int(np.ceil(self.link_count()*decimate_frac)),log_sampling=log_sampling): |
466 | 468 | | self.remove_link(url) |
467 | 469 | | |
468 | 470 | | def set_user_agent(self): |
| skipped 287 lines |