| skipped 18 lines |
19 | 19 | | # You should have received a copy of the GNU General Public License |
20 | 20 | | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
21 | 21 | | |
22 | | - | __version__ = '1.0' |
| 22 | + | __version__ = '1.1' |
23 | 23 | | |
24 | 24 | | import argparse as ap, datetime as dt, numpy as np, numpy.random as npr, os, psutil, random, requests, signal, sys, tarfile, time |
25 | 25 | | import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs |
| skipped 124 lines |
150 | 150 | | timeout=timeout, diurnal_flag=True, |
151 | 151 | | quit_driver_every_call=False, |
152 | 152 | | blacklist=True,verbose=True): |
| 153 | + | print('This is ISP Data Pollution 🐙💨, Version {}'.format(__version__)) |
153 | 154 | | self.max_links_cached = max_links_cached |
154 | 155 | | self.max_links_per_page = max_links_per_page |
155 | 156 | | self.max_links_per_domain = max_links_per_domain |
| skipped 18 lines |
174 | 175 | | alarm_time=short_timeout+1,errors=(self.TimeoutError,Exception), debug=self.debug) |
175 | 176 | | self.robots_timeout = self.block_timeout(self.robots_hang_handler, \ |
176 | 177 | | alarm_time=short_timeout+1,errors=(self.TimeoutError,), debug=self.debug) |
| 178 | + | self.check_phantomjs_version() |
177 | 179 | | self.fake = Factory.create() |
178 | 180 | | self.hour_trigger = True |
179 | 181 | | self.twentyfour_hour_trigger = True |
| skipped 2 lines |
182 | 184 | | self.data_usage = 0 |
183 | 185 | | self.get_blacklist() |
184 | 186 | | self.get_random_words() |
185 | | - | print('This is ISP Data Pollution 🐙💨, Version {}'.format(__version__)) |
186 | 187 | | self.pollute_forever() |
187 | 188 | | |
188 | 189 | | def parseArgs(self): |
| skipped 1 lines |
190 | 191 | | parser.add_argument('-bw', '--gb_per_month', help="GB per month", type=int, default=gb_per_month) |
191 | 192 | | parser.add_argument('-mm', '--maxmemory', |
192 | 193 | | help="Maximum memory of phantomjs (MB); 0=>restart every link", |
193 | | - | type=int, default=0) |
| 194 | + | type=int, default=1024) |
194 | 195 | | # parser.add_argument('-P', '--phantomjs-binary-path', help="Path to phantomjs binary", type=int, default=phantomjs_rss_limit_mb) |
195 | 196 | | parser.add_argument('-g', '--debug', help="Debug flag", action='store_true') |
196 | 197 | | args = parser.parse_args() |
| skipped 6 lines |
203 | 204 | | if self.maxmemory == 0: self.quit_driver_every_call = True |
204 | 205 | | self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max bandwidth limits |
205 | 206 | | |
def check_phantomjs_version(self,recommended_version=(2,1)):
    """ Print an upgrade notice if the browser is older than `recommended_version`.

    A session is opened just long enough to read `self.session.capabilities`
    and is always quit again, even if reading the capabilities raises.

    `recommended_version` is a tuple of ints that is compared against the
    dotted "version" capability reported by the session. """
    self.open_session()
    try:
        capabilities = self.session.capabilities
        if self.debug:
            print("{} version is {}, {} version is {}".format(capabilities["browserName"],
                capabilities["version"],
                capabilities["driverName"],
                capabilities["driverVersion"]))
        # Keep only the leading digits of each dotted field so that suffixed
        # versions such as "2.0.1-development" parse instead of raising ValueError.
        parsed_fields = []
        for field in str(capabilities["version"]).split('.'):
            digits = field[:len(field)-len(field.lstrip('0123456789'))]
            if not digits: break
            parsed_fields.append(int(digits))
        phantomjs_version = tuple(parsed_fields)
        # An unparseable version (empty tuple) is not reported as outdated.
        if phantomjs_version and phantomjs_version < tuple(recommended_version):
            print("{} version is {};\nplease upgrade to at least version {} from http://phantomjs.org.\n".format(
                capabilities["browserName"],capabilities["version"],
                '.'.join(str(i) for i in recommended_version)))
    finally:
        # never leave the probe session running
        self.quit_session()
| 221 | + | |
206 | 222 | | def open_session(self): |
207 | 223 | | self.quit_session() |
208 | 224 | | if not hasattr(self, 'session') or not isinstance(self.session,webdriver.phantomjs.webdriver.WebDriver): |
| skipped 147 lines |
356 | 372 | | if npr.uniform() < 0.005: self.set_user_agent() # reset the user agent occasionally |
357 | 373 | | self.elapsed_time = time.time() - self.start_time |
358 | 374 | | self.exceeded_bandwidth_tasks() |
| 375 | + | self.random_interval_tasks() |
359 | 376 | | self.every_hour_tasks() |
360 | 377 | | time.sleep(self.chi2_mean_std(0.5,0.2)) |
361 | 378 | | except Exception as e: |
| skipped 84 lines |
446 | 463 | | self.decimate_links(total_frac=0.81,decimate_frac=0.1) |
447 | 464 | | time.sleep(120) |
448 | 465 | | |
def random_interval_tasks(self,random_interval=None):
    """ Run the tasks that fire at random intervals.

    `self.random_interval` holds the absolute time (seconds since the epoch)
    at which the tasks next fire. When that time has passed, a new deadline
    is scheduled and `self.current_preferred_domain` is redrawn with
    `self.draw_domain()`.

    `random_interval` is the delay in seconds until the next deadline; if
    None, it is sampled from `self.chi2_mean_std(2*3600.,3600.)`. """
    def schedule_next_deadline():
        # sample the delay only when a deadline is actually being scheduled,
        # not on every call of this method
        delay = self.chi2_mean_std(2*3600.,3600.) if random_interval is None else random_interval
        self.random_start_time = time.time()
        self.random_interval = self.random_start_time + delay
    if not hasattr(self,'random_interval'): schedule_next_deadline()
    if time.time() > self.random_interval:
        schedule_next_deadline()  # reinitialize random interval
        self.current_preferred_domain = self.draw_domain()
| 475 | + | |
449 | 476 | | def every_hour_tasks(self): |
450 | 477 | | if int(self.elapsed_time/60. % 60.) == 59: |
451 | 478 | | # reset user agent, clear out cookies, seed more links |
| skipped 61 lines |
513 | 540 | | if self.debug: print('.update() exception:\n{}'.format(e)) |
514 | 541 | | |
def draw_link(self,log_sampling=True):
    """ Draw one random link: the first element of a single-link `draw_links` call. """
    single_draw = self.draw_links(n=1,log_sampling=log_sampling)
    return single_draw[0]
517 | 545 | | |
518 | 546 | | def draw_links(self,n=1,log_sampling=False): |
| 547 | + | """ Draw multiple random links. """ |
519 | 548 | | urls = [] |
520 | 549 | | domain_array = np.array([dmn for dmn in self.domain_links]) |
521 | 550 | | domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])]) |
| skipped 17 lines |
539 | 568 | | urls.append(url) |
540 | 569 | | return urls |
541 | 570 | | |
542 | | - | def pop_link(self): |
543 | | - | url = self.draw_link() |
544 | | - | if npr.uniform() < 0.95: # 95% 1 GET, ~5% 2 GETs, .2% three GETs |
def draw_domain(self,log_sampling=False):
    """ Draw a single, random domain.

    Each domain in `self.domain_links` is drawn with probability proportional
    to its number of links, or to log(count+1) if `log_sampling` is set, which
    biases the draw toward lower count domains.

    Returns the drawn domain key, or None if there are no links at all. """
    domains = list(self.domain_links)
    # builtin float dtype: the `np.float` alias no longer exists in current NumPy
    weights = np.array([len(self.domain_links[dmn]) for dmn in domains], dtype=float)
    if not weights.sum() > 0:
        return None  # no domains, or every domain is empty
    if log_sampling:  # log-sampling [log(x+1)] to bias lower count domains
        weights = np.log1p(weights)
    cnts = npr.multinomial(1, pvals=weights/weights.sum())
    # exactly one entry of the single-trial draw is 1; argmax returns its index
    return domains[int(np.argmax(cnts))]
| 586 | + | |
def draw_link_from_domain(self,domain):
    """ Draw a single, random link from a specific domain.

    Returns None if `domain` is not in `self.domain_links` or has no links. """
    links = self.domain_links.get(domain)
    if not links:
        return None
    # copy into a tuple first: sampling directly from a set is not supported
    # by the `random` module on current Python versions
    return random.choice(tuple(links))
| 592 | + | |
def pop_link(self,remove_link_fraction=0.95,current_preferred_domain_fraction=0.1):
    """ Pop a link from the collected list.

    If `self.current_preferred_domain` is defined, then a link from this domain
    is drawn a fraction `current_preferred_domain_fraction` of the time. If that
    domain has no links, a new preferred domain is drawn once with
    `self.draw_domain()`. Otherwise, or if no preferred link is found, the link
    comes from `self.draw_link()`.

    The returned link is removed with `self.remove_link` a fraction
    `remove_link_fraction` of the time. """
    url = None
    if hasattr(self,'current_preferred_domain') and npr.uniform() < current_preferred_domain_fraction:
        url = self.draw_link_from_domain(self.current_preferred_domain)
        if url is None:
            # The preferred domain has run out of links: pick a new one and try
            # once more. The retry is bounded so that this cannot spin forever
            # when no domain has any links.
            self.current_preferred_domain = self.draw_domain()
            if self.current_preferred_domain is not None:
                url = self.draw_link_from_domain(self.current_preferred_domain)
    if url is None: url = self.draw_link()
    if npr.uniform() < remove_link_fraction:  # 95% 1 GET, ~5% 2 GETs, .2% three GETs
        self.remove_link(url)  # pop a random item from the stack
    return url
547 | 606 | | |
| skipped 282 lines |