🤬
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexing completes)
  • ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 18 lines
    19 19  # You should have received a copy of the GNU General Public License
    20 20  # along with this program. If not, see <http://www.gnu.org/licenses/>.
    21 21   
    22  -__version__ = '1.0'
     22 +__version__ = '1.1'
    23 23   
    24 24  import argparse as ap, datetime as dt, numpy as np, numpy.random as npr, os, psutil, random, requests, signal, sys, tarfile, time
    25 25  import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
    skipped 124 lines
    150 150   timeout=timeout, diurnal_flag=True,
    151 151   quit_driver_every_call=False,
    152 152   blacklist=True,verbose=True):
     153 + print('This is ISP Data Pollution 🐙💨, Version {}'.format(__version__))
    153 154   self.max_links_cached = max_links_cached
    154 155   self.max_links_per_page = max_links_per_page
    155 156   self.max_links_per_domain = max_links_per_domain
    skipped 18 lines
    174 175   alarm_time=short_timeout+1,errors=(self.TimeoutError,Exception), debug=self.debug)
    175 176   self.robots_timeout = self.block_timeout(self.robots_hang_handler, \
    176 177   alarm_time=short_timeout+1,errors=(self.TimeoutError,), debug=self.debug)
     178 + self.check_phantomjs_version()
    177 179   self.fake = Factory.create()
    178 180   self.hour_trigger = True
    179 181   self.twentyfour_hour_trigger = True
    skipped 2 lines
    182 184   self.data_usage = 0
    183 185   self.get_blacklist()
    184 186   self.get_random_words()
    185  - print('This is ISP Data Pollution 🐙💨, Version {}'.format(__version__))
    186 187   self.pollute_forever()
    187 188   
    188 189   def parseArgs(self):
    skipped 1 lines
    190 191   parser.add_argument('-bw', '--gb_per_month', help="GB per month", type=int, default=gb_per_month)
    191 192   parser.add_argument('-mm', '--maxmemory',
    192 193   help="Maximum memory of phantomjs (MB); 0=>restart every link",
    193  - type=int, default=0)
     194 + type=int, default=1024)
    194 195   # parser.add_argument('-P', '--phantomjs-binary-path', help="Path to phantomjs binary", type=int, default=phantomjs_rss_limit_mb)
    195 196   parser.add_argument('-g', '--debug', help="Debug flag", action='store_true')
    196 197   args = parser.parse_args()
    skipped 6 lines
    203 204   if self.maxmemory == 0: self.quit_driver_every_call = True
    204 205   self.phantomjs_rss_limit_mb = min(4096,max(256,self.maxmemory)) # min-max bandwidth limits
    205 206   
     207 + def check_phantomjs_version(self,recommended_version=(2,1)):
     208 + self.open_session()
     209 + if self.debug:
     210 + print("{} version is {}, {} version is {}".format(self.session.capabilities["browserName"],
     211 + self.session.capabilities["version"],
     212 + self.session.capabilities["driverName"],
     213 + self.session.capabilities["driverVersion"]))
     214 + phantomjs_version = tuple(int(i) for i in self.session.capabilities["version"].split('.'))
     215 + if phantomjs_version < recommended_version:
     216 + print("""{} version is {};
     217 +please upgrade to at least version {} from http://phantomjs.org.
     218 +""".format(self.session.capabilities["browserName"],self.session.capabilities["version"],
     219 + '.'.join(str(i) for i in recommended_version)))
     220 + self.quit_session()
     221 + 
    206 222   def open_session(self):
    207 223   self.quit_session()
    208 224   if not hasattr(self, 'session') or not isinstance(self.session,webdriver.phantomjs.webdriver.WebDriver):
    skipped 147 lines
    356 372   if npr.uniform() < 0.005: self.set_user_agent() # reset the user agent occasionally
    357 373   self.elapsed_time = time.time() - self.start_time
    358 374   self.exceeded_bandwidth_tasks()
     375 + self.random_interval_tasks()
    359 376   self.every_hour_tasks()
    360 377   time.sleep(self.chi2_mean_std(0.5,0.2))
    361 378   except Exception as e:
    skipped 84 lines
    446 463   self.decimate_links(total_frac=0.81,decimate_frac=0.1)
    447 464   time.sleep(120)
    448 465   
    def random_interval_tasks(self, random_interval=None):
        """Periodically (chi²-distributed interval, mean ~2 h, std ~1 h) pick a new
        `self.current_preferred_domain` that `pop_link()` will favor.

        :param random_interval: interval in seconds; when None a fresh chi² sample
            is drawn each time the deadline is (re)initialized.
        """
        def reset_deadline():
            # Draw the interval lazily, only when actually resetting: the original
            # sampled chi2_mean_std() on every call (once per crawl-loop iteration),
            # wasting work and churning RNG state even when the deadline was unexpired.
            interval = random_interval if random_interval is not None \
                else self.chi2_mean_std(2*3600., 3600.)
            self.random_start_time = time.time()
            # NOTE: despite the name, self.random_interval holds an absolute deadline
            self.random_interval = self.random_start_time + interval
        if not hasattr(self, 'random_interval'):
            reset_deadline()
        if time.time() > self.random_interval:
            reset_deadline()  # reinitialize random interval
            self.current_preferred_domain = self.draw_domain()
     475 + 
    449 476   def every_hour_tasks(self):
    450 477   if int(self.elapsed_time/60. % 60.) == 59:
    451 478   # reset user agent, clear out cookies, seed more links
    skipped 61 lines
    513 540   if self.debug: print('.update() exception:\n{}'.format(e))
    514 541   
    def draw_link(self, log_sampling=True):
        """Return one randomly drawn link (thin wrapper around draw_links)."""
        urls = self.draw_links(n=1, log_sampling=log_sampling)
        return urls[0]
    517 545   
    518 546   def draw_links(self,n=1,log_sampling=False):
     547 + """ Draw multiple random links. """
    519 548   urls = []
    520 549   domain_array = np.array([dmn for dmn in self.domain_links])
    521 550   domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])])
    skipped 17 lines
    539 568   urls.append(url)
    540 569   return urls
    541 570   
    542  - def pop_link(self):
    543  - url = self.draw_link()
    544  - if npr.uniform() < 0.95: # 95% 1 GET, ~5% 2 GETs, .2% three GETs
    def draw_domain(self, log_sampling=False):
        """ Draw a single, random domain, weighted by each domain's link count.

        :param log_sampling: when True, weight by log(count+1) to bias toward
            lower-count domains.
        :return: a domain string, or None when no links have been collected yet.
        """
        domain = None
        domain_array = np.array([dmn for dmn in self.domain_links])
        domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])])
        # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # converting via .astype(float) is equivalent and version-safe
        p = domain_count.astype(float)
        count_total = p.sum()
        if log_sampling:  # log-sampling [log(x+1)] to bias lower count domains
            p = np.log1p(p)
        if count_total > 0:
            p = p/p.sum()
            cnts = npr.multinomial(1, pvals=p)
            # nonzero() yields an index array; take its single element explicitly
            k = int(np.nonzero(cnts)[0][0])
            domain = domain_array[k]
        return domain
     586 + 
    def draw_link_from_domain(self, domain):
        """ Draw a single, random link from a specific domain.

        :param domain: domain key into self.domain_links (a dict of sets).
        :return: a link URL, or None when the domain is unknown or has no links.
        """
        links = self.domain_links.get(domain, set())
        # BUG FIX: random.sample() on a set was deprecated in Python 3.9 and raises
        # TypeError on 3.11+; materialize a sequence first. random.choice() on the
        # tuple gives the same uniform single draw, and the .get() result is reused
        # instead of a second dict lookup.
        return random.choice(tuple(links)) if links else None
     592 + 
     593 + def pop_link(self,remove_link_fraction=0.95,current_preferred_domain_fraction=0.1):
     594 + """ Pop a link from the collected list.
     595 +If `self.current_preferred_domain` is defined, then a link from this domain is drawn
     596 +a fraction of the time. """
     597 + url = None
     598 + if hasattr(self,'current_preferred_domain') and npr.uniform() < current_preferred_domain_fraction:
     599 + while url is not None: # loop until `self.current_preferred_domain` has a url
     600 + url = self.draw_link_from_domain(self.current_preferred_domain)
     601 + if url is None: self.current_preferred_domain = self.draw_domain()
     602 + if url is None: url = self.draw_link()
     603 + if npr.uniform() < remove_link_fraction: # 95% 1 GET, ~5% 2 GETs, .2% three GETs
    545 604   self.remove_link(url) # pop a random item from the stack
    546 605   return url
    547 606   
    skipped 282 lines
Please wait...
Page is in error, reload to recover