| skipped 45 lines |
46 | 46 | | search_url = 'http://www.google.com/search' # keep unencrypted for ISP DPI |
47 | 47 | | wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain' |
48 | 48 | | timeout = 20 |
| 49 | + | short_timeout = 3 |
49 | 50 | | |
50 | 51 | | blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz' |
51 | 52 | | # Usage of the Shalla Blacklists: |
| skipped 34 lines |
86 | 87 | | |
87 | 88 | | # monkeypatch the read class method in RobotFileParser |
88 | 89 | | # many sites will block access to robots.txt without a standard User-Agent header |
89 | | - | short_timeout = 3 |
90 | 90 | | class RobotFileParserUserAgent(robotparser.RobotFileParser): |
91 | 91 | | |
92 | 92 | | timeout = short_timeout # short-term timeout |
| skipped 321 lines |
414 | 414 | | |
415 | 415 | | def exceeded_bandwidth_tasks(self): |
416 | 416 | | if self.bandwidth_test(): |
417 | | - | # decimate the stack and clear the cookies |
418 | | - | if self.link_count() > int(np.ceil(0.81*self.max_links_cached)): |
419 | | - | for url in self.draw_links(n=int(np.ceil(self.link_count()/10.))): |
420 | | - | self.pop_link() |
| 417 | + | self.decimate_links(total_frac=0.81,decimate_frac=0.1) |
421 | 418 | | time.sleep(120) |
422 | 419 | | |
423 | 420 | | def every_hour_tasks(self): |
| skipped 16 lines |
440 | 437 | | |
441 | 438 | | def every_day_tasks(self): |
442 | 439 | | if int(self.elapsed_time/3600. % 24.) == 23: |
443 | | - | # clear out cookies every day, and seed more links |
| 440 | + | # clear out cookies every day, decimate, and seed more links |
444 | 441 | | if self.twentyfour_hour_trigger: |
445 | 442 | | if hasattr(self,'session'): |
446 | 443 | | self.seed_links() |
| skipped 2 lines |
449 | 446 | | self.open_session() |
450 | 447 | | else: |
451 | 448 | | self.open_session() |
| 449 | + | self.decimate_links(total_frac=0.667, decimate_frac=0.1) |
452 | 450 | | self.seed_links() |
453 | 451 | | if self.quit_driver_every_call: self.quit_session() |
454 | 452 | | self.twentyfour_hour_trigger = False |
| skipped 5 lines |
460 | 458 | | # reset bw stats and (really) decimate the stack every couple of weeks |
461 | 459 | | self.start_time = time.time() |
462 | 460 | | self.data_usage = 0 |
463 | | - | if self.link_count() > int(np.ceil(0.49*self.max_links_cached)): |
464 | | - | for url in self.draw_links(n=int(np.ceil(self.link_count()/3.))): |
465 | | - | self.pop_link(url) |
| 461 | + | self.decimate_links(total_frac=0.49, decimate_frac=0.333) |
| 462 | + | |
| 463 | + | def decimate_links(self, total_frac=0.81, decimate_frac=0.1): # decimate the stack |
| 464 | + | if self.link_count() > int(np.ceil(total_frac * self.max_links_cached)): |
| 465 | + | for url in self.draw_links(n=int(np.ceil(self.link_count() * decimate_frac))): |
| 466 | + | self.remove_link(url) |
466 | 467 | | |
467 | 468 | | def set_user_agent(self): |
468 | 469 | | global user_agent |
| skipped 11 lines |
480 | 481 | | |
481 | 482 | | def draw_links(self,n=1,log_sampling=False): |
482 | 483 | | urls = [] |
483 | | - | domain_count = np.array([(dmn,len(self.domain_links[dmn])) for dmn in self.domain_links]) |
484 | | - | p = np.array([np.float(c) for d,c in domain_count]) |
| 484 | + | domain_array = np.array([dmn for dmn in self.domain_links]) |
| 485 | + | domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])]) |
| 486 | + | p = np.array([float(c) for c in domain_count]) |
485 | 487 | | count_total = p.sum() |
486 | 488 | | if log_sampling: # log-sampling [log(x+1)] to bias lower count domains |
487 | 489 | | p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype) |
| skipped 1 lines |
489 | 491 | | p = p/p.sum() |
490 | 492 | | cnts = npr.multinomial(n, pvals=p) |
491 | 493 | | if n > 1: |
492 | | - | for k in range(len(cnts)): |
493 | | - | domain = domain_count[k][0] |
494 | | - | cnt = min(cnts[k],domain_count[k][1]) |
| 494 | + | for k in range(cnts.shape[0]): |
| 495 | + | domain = domain_array[k] |
| 496 | + | cnt = min(cnts[k],domain_count[k]) |
495 | 497 | | for url in random.sample(self.domain_links[domain],cnt): |
496 | 498 | | urls.append(url) |
497 | 499 | | else: |
| skipped 51 lines |
549 | 551 | | # https://github.com/detro/ghostdriver/issues/169 |
550 | 552 | | @self.phantomjs_short_timeout |
551 | 553 | | def phantomjs_find_elements_by_css_selector(): |
552 | | - | return WebDriverWait(self.session, 3).until(lambda x: x.find_elements_by_css_selector('div.g')) |
| 554 | + | return WebDriverWait(self.session,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g')) |
553 | 555 | | elements = phantomjs_find_elements_by_css_selector() |
554 | 556 | | # get links in random order until max. per page |
555 | 557 | | k = 0 |
| skipped 1 lines |
557 | 559 | | try: |
558 | 560 | | for div in sorted(elements,key=lambda k: random.random()): |
559 | 561 | | @self.phantomjs_short_timeout |
560 | | - | def phantomjs_find_element_by_tag_name(): |
561 | | - | if div.find_element_by_tag_name('a').get_attribute('href') is not None: |
562 | | - | links.append(div.find_element_by_tag_name('a').get_attribute('href')) |
563 | | - | phantomjs_find_element_by_tag_name() |
| 562 | + | def phantomjs_find_element_by_tag_name(): return div.find_element_by_tag_name('a') |
| 563 | + | a_tag = phantomjs_find_element_by_tag_name() |
| 564 | + | @self.phantomjs_short_timeout |
| 565 | + | def phantomjs_get_attribute(): return a_tag.get_attribute('href') |
| 566 | + | href = phantomjs_get_attribute() |
| 567 | + | if href is not None: links.append(href) |
564 | 568 | | k += 1 |
565 | 569 | | if k > self.max_links_per_page: break |
566 | 570 | | except Exception as e: |
567 | | - | if self.debug: print('.find_element_by_tag_name() exception:\n{}'.format(e)) |
| 571 | + | if self.debug: print('.find_element_by_tag_name.get_attribute() exception:\n{}'.format(e)) |
568 | 572 | | return links |
569 | 573 | | |
570 | 574 | | def get_url(self,url): |
| skipped 22 lines |
593 | 597 | | try: |
594 | 598 | | for a in sorted(elements,key=lambda k: random.random()): |
595 | 599 | | @self.phantomjs_short_timeout |
596 | | - | def phantomjs_get_attribute(): |
597 | | - | if a.get_attribute('href') is not None: |
598 | | - | links.append(a.get_attribute('href')) |
599 | | - | phantomjs_get_attribute() |
| 600 | + | def phantomjs_get_attribute(): return a.get_attribute('href') |
| 601 | + | href = phantomjs_get_attribute() |
| 602 | + | if href is not None: links.append(href) |
600 | 603 | | k += 1 |
601 | 604 | | if k > self.max_links_per_page: break |
602 | | - | except Exception as a: |
| 605 | + | except Exception as e: |
603 | 606 | | if self.debug: print('.get_attribute() exception:\n{}'.format(e)) |
604 | 607 | | return links |
605 | 608 | | |
| skipped 8 lines |
614 | 617 | | rp.read() |
615 | 618 | | result = rp.can_fetch(self.user_agent,url) |
616 | 619 | | del rp # ensure self.close() in urllib |
617 | | - | robots_read() |
| 620 | + | return result |
| 621 | + | result = robots_read() |
618 | 622 | | return result |
619 | 623 | | |
620 | 624 | | def add_url_links(self,links,url=''): |
| skipped 131 lines |