| skipped 46 lines |
47 | 47 | | wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain' |
48 | 48 | | timeout = 20 |
49 | 49 | | short_timeout = 3 |
| 50 | + | phantomjs_rss_limit_mb = 1024 # Default maximum memory limit of the phantomjs process (MB) |
| 51 | + | terminal_width = 80 # tty width, standard is 80 chars; add code to adapt later |
50 | 52 | | |
51 | 53 | | blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz' |
52 | 54 | | # Usage of the Shalla Blacklists: |
| skipped 60 lines |
113 | 115 | | |
114 | 116 | | |
115 | 117 | | class ISPDataPollution: |
116 | | - | '''Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy |
| 118 | + | """ |
| 119 | + | Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy |
117 | 120 | | |
118 | 121 | | I pay my ISP a lot for data usage every month. I typically don't use |
119 | 122 | | all the bandwidth that I pay for. If my ISP is going to sell my |
| skipped 12 lines |
132 | 135 | | The crawler uses the Python requests and lxml.html libraries, is hardcoded |
133 | 136 | | to download html without javascript processing, will not download |
134 | 137 | | images, and respects robots.txt, which all provide good security. |
135 | | - | ''' |
| 138 | + | """ |
136 | 139 | | |
137 | 140 | | def __init__(self,gb_per_month=gb_per_month, |
138 | 141 | | max_links_cached=max_links_cached, |
| skipped 4 lines |
143 | 146 | | blacklist_url=blacklist_url, |
144 | 147 | | wordsite_url=wordsite_url, |
145 | 148 | | seed_bias_links=seed_bias_links, |
146 | | - | timeout=timeout, |
| 149 | + | timeout=timeout, diurnal_flag=True, |
147 | 150 | | quit_driver_every_call=False, |
148 | 151 | | blacklist=True,verbose=True): |
149 | 152 | | self.max_links_cached = max_links_cached |
| skipped 6 lines |
156 | 159 | | self.seed_bias_links = seed_bias_links |
157 | 160 | | self.blacklist = blacklist; self.verbose = verbose |
158 | 161 | | self.timeout = timeout |
| 162 | + | self.diurnal_flag = diurnal_flag |
159 | 163 | | self.quit_driver_every_call = quit_driver_every_call |
160 | 164 | | # self.gb_per_month = gb_per_month # set in parseArgs |
161 | 165 | | # self.debug = debug # set in parseArgs |
| skipped 18 lines |
180 | 184 | | def parseArgs(self): |
181 | 185 | | parser = ap.ArgumentParser() |
182 | 186 | | parser.add_argument('-bw', '--gb_per_month', help="GB per month", type=int, default=gb_per_month) |
| 187 | + | parser.add_argument('-mm', '--maxmemory', help="Maximum memory of phantomjs (MB); 0=>restart every link", type=int, default=phantomjs_rss_limit_mb) |
183 | 188 | | parser.add_argument('-g', '--debug', help="Debug flag", action='store_true') |
184 | 189 | | args = parser.parse_args() |
185 | 190 | | for k in args.__dict__: setattr(self,k,getattr(args,k)) |
| skipped 2 lines |
188 | 193 | | |
189 | 194 | | def sanity_check_arguments(self): |
190 | 195 | | self.gb_per_month = min(2048,max(1,self.gb_per_month)) # min-max bandwidth limits |
| 196 | + | if self.maxmemory == 0: self.quit_driver_every_call = True |
| 197 | + | self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max memory limits (MB) |
191 | 198 | | |
192 | 199 | | def open_session(self): |
193 | 200 | | self.quit_session() |
| skipped 20 lines |
214 | 221 | | self.session = driver |
215 | 222 | | |
216 | 223 | | def quit_session(self,hard_quit=False,pid=None): |
217 | | - | ''' close, kill -9, quit, del ''' |
| 224 | + | """ |
| 225 | + | close, kill -9, quit, del |
| 226 | + | :param hard_quit: |
| 227 | + | :param pid: |
| 228 | + | :return: |
| 229 | + | """ |
218 | 230 | | # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution |
219 | 231 | | if hasattr(self,'session'): |
220 | 232 | | if not hard_quit: |
| skipped 102 lines |
323 | 335 | | if self.quit_driver_every_call: self.quit_session() |
324 | 336 | | while True: # pollute forever, pausing only to meet the bandwidth requirement |
325 | 337 | | try: |
326 | | - | if self.diurnal_cycle_test(): |
| 338 | + | if (not self.diurnal_flag) or self.diurnal_cycle_test(): |
327 | 339 | | self.pollute() |
328 | 340 | | else: |
329 | 341 | | time.sleep(self.chi2_mean_std(3.,1.)) |
| skipped 13 lines |
343 | 355 | | self.clear_session() |
344 | 356 | | if self.quit_driver_every_call: self.quit_session() |
345 | 357 | | url = self.pop_link() |
| 358 | + | if self.verbose: self.print_url(url) |
346 | 359 | | if self.quit_driver_every_call: self.open_session() |
347 | 360 | | self.get_url(url) |
348 | 361 | | self.clear_session() |
| skipped 55 lines |
404 | 417 | | return npr.uniform() < val |
405 | 418 | | |
406 | 419 | | def chi2_mean_std(self,mean=1.,std=0.1): |
407 | | - | ''' |
| 420 | + | """ |
408 | 421 | | Chi-squared random variable with given mean and standard deviation. |
409 | | - | ''' |
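| 422 | + | :param mean: target mean of the returned variate |
| 423 | + | :param std: spread parameter. NOTE(review): with scale = 2*mean/std the variance of the result works out to `std` (standard deviation sqrt(std)), not `std`**2 -- confirm which was intended |
| 424 | + | :return: one draw of npr.chisquare(nu)/scale, where nu = mean*scale |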
| 425 | + | """ |
410 | 426 | | scale = 2.*mean/std |
411 | 427 | | nu = mean*scale |
412 | 428 | | return npr.chisquare(nu)/scale |
| skipped 124 lines |
537 | 553 | | return '.'.join(uprs.urlparse(url).netloc.split('.')[-2:]) |
538 | 554 | | |
539 | 555 | | def get_websearch(self,query): |
540 | | - | '''HTTP GET of a websearch, then add any embedded links.''' |
| 556 | + | """ |
| 557 | + | HTTP GET of a websearch, then add any embedded links. |
| 558 | + | :param query: |
| 559 | + | :return: |
| 560 | + | """ |
541 | 561 | | url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query))) |
| 562 | + | if self.verbose: self.print_url(url) |
542 | 563 | | @self.phantomjs_timeout |
543 | 564 | | def phantomjs_get(): self.session.get(url) # selenium driver |
544 | 565 | | phantomjs_get() |
| skipped 4 lines |
549 | 570 | | if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) |
550 | 571 | | |
551 | 572 | | def websearch_links(self): |
552 | | - | '''Webpage format for a popular search engine, <div class="g">''' |
| 573 | + | """ |
| 574 | + | Webpage format for a popular search engine, <div class="g">. |
| 575 | + | :return: |
| 576 | + | """ |
553 | 577 | | # https://github.com/detro/ghostdriver/issues/169 |
554 | 578 | | @self.phantomjs_short_timeout |
555 | 579 | | def phantomjs_find_elements_by_css_selector(): |
| skipped 18 lines |
574 | 598 | | return links |
575 | 599 | | |
576 | 600 | | def get_url(self,url): |
577 | | - | '''HTTP GET of the url, and add any embedded links.''' |
| 601 | + | """ |
| 602 | + | HTTP GET of the url, and add any embedded links. |
| 603 | + | :param url: |
| 604 | + | :return: |
| 605 | + | """ |
578 | 606 | | if not self.check_robots(url): return # bail out if robots.txt says to |
579 | 607 | | @self.phantomjs_timeout |
580 | 608 | | def phantomjs_get(): self.session.get(url) # selenium driver |
| skipped 60 lines |
641 | 669 | | except Exception as e: |
642 | 670 | | if self.debug: print('.current_url exception:\n{}'.format(e)) |
643 | 671 | | if self.debug: |
644 | | - | print("'{}': {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy())) |
| 672 | + | print("{}: {:d} links added, {:d} total, {:.1f} bits domain entropy".format(current_url,k,self.link_count(),self.domain_entropy())) |
645 | 673 | | elif self.verbose: |
646 | | - | self.print_progress(k,current_url) |
| 674 | + | self.print_progress(current_url,num_links=k) |
647 | 675 | | |
648 | | - | def print_progress(self,num_links,url,terminal_width=80): |
649 | | - | # truncate or fill with white space |
650 | | - | text_suffix = ': +{:d}/{:d} links, H(domain)={:.1f} b'.format(num_links,self.link_count(),self.domain_entropy()) |
651 | | - | chars_used = 2 + len(text_suffix) |
652 | | - | if len(url) + chars_used > terminal_width: |
653 | | - | url = url[:terminal_width-chars_used-1] + '…' |
654 | | - | text = "'{}'{}".format(url,text_suffix) |
| 676 | + | def print_url(self,url): |
| 677 | + | if self.debug: print(url + ' …') |
| 678 | + | else: self.print_progress(url) |
| 679 | + | |
| 680 | + | def print_progress(self,url,num_links=None): |
| 681 | + | if num_links is not None: |
| 682 | + | text_suffix = ': +{:d}/{:d} links, H(domain)={:.1f} b'.format(num_links,self.link_count(),self.domain_entropy()) |
| 683 | + | else: |
| 684 | + | text_suffix = ': {:d} links, H(domain)={:.1f} b …'.format(self.link_count(),self.domain_entropy()) |
| 685 | + | self.print_truncated_line(url,text_suffix) |
| 686 | + | |
| 687 | + | def print_truncated_line(self,url,text_suffix='',terminal_width=terminal_width): |
| 688 | + | """ |
| 689 | + | Print truncated `url` + `text_suffix` to fill `terminal_width` |
| 690 | + | :param url: |
| 691 | + | :param text_suffix: |
| 692 | + | :param terminal_width: |
| 693 | + | :return: |
| 694 | + | """ |
| 695 | + | chars_used = len(text_suffix) |
| 696 | + | if text_suffix == '…': |
| 697 | + | if len(url) >= terminal_width: |
| 698 | + | url = url[:terminal_width-1] # add '…' below |
| 699 | + | elif len(url) < terminal_width-1: |
| 700 | + | url += ' ' # add an extra space before the ellipsis |
| 701 | + | else: |
| 702 | + | if len(url) + chars_used > terminal_width: |
| 703 | + | url = url[:terminal_width-chars_used-1] + '…' |
| 704 | + | text = "{}{}".format(url,text_suffix) # added white space necessary |
655 | 705 | | text = text[:min(terminal_width,len(text))] + ' ' * max(0,terminal_width-len(text)) |
656 | 706 | | print(text,end='',flush=True) |
657 | 707 | | time.sleep(0.01) |
| skipped 55 lines |
713 | 763 | | raise self.TimeoutError('robotparser is taking too long') |
714 | 764 | | |
715 | 765 | | def check_phantomjs_process(self): |
716 | | - | '''Check if phantomjs is running.''' |
| 766 | + | """ |
| 767 | + | Check if phantomjs is running. |
| 768 | + | :return: |
| 769 | + | """ |
717 | 770 | | # Check rss and restart if too large, then check existence |
718 | 771 | | # http://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python |
719 | 772 | | try: |
720 | 773 | | if not hasattr(self,'session'): self.open_session() |
721 | 774 | | pid, rss_mb = self.phantomjs_pid_and_memory() |
722 | | - | if rss_mb > 1024: # 1 GB rss limit |
| 775 | + | if rss_mb > self.phantomjs_rss_limit_mb: # memory limit |
723 | 776 | | self.quit_session(pid=pid) |
724 | 777 | | self.open_session() |
725 | 778 | | pid, _ = self.phantomjs_pid_and_memory() |
| skipped 32 lines |