    isp_data_pollution.py
    skipped 349 lines
    350 350           return -np.fromiter((self.xlgx(x) for x in p.flatten()),dtype=p.dtype).sum()
    351 351   
    352 352       def xlgx(self,x):
    353      -        x = max(0.,min(1.,x))
        353  +        x = np.abs(x)
    354 354           y = 0.
    355 355           if not (x == 0. or x == 1.):
    356 356               y = x*np.log2(x)
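Note on this hunk: xlgx() is the x*log2(x) term of a Shannon entropy, and the caller at line 350 negates the summed terms over a probability array; swapping the old [0,1] clamp for np.abs() folds small negative floating-point noise back to a positive value instead of silently zeroing it, while the x == 0 / x == 1 guard still returns 0 at the endpoints. A minimal standalone sketch of the idea (the enclosing method at line 350 is not named in the visible diff, so the entropy() wrapper below is only illustrative):

import numpy as np

def xlgx(x):
    # x*log2(x) with abs() folding sign noise; returns 0 at the x=0 and x=1 endpoints.
    x = np.abs(x)
    y = 0.
    if not (x == 0. or x == 1.):
        y = x*np.log2(x)
    return y

def entropy(p):
    # Shannon entropy in bits: -sum_i p_i*log2(p_i).
    return -np.fromiter((xlgx(x) for x in np.asarray(p, dtype=float).flatten()), dtype=float).sum()

print(entropy([0.5, 0.5]))              # 1.0
print(entropy([0.25, 0.25, 0.25, 0.25]))  # 2.0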
    skipped 106 lines
    463 463           domain_count = np.array([(dmn,len(self.domain_links[dmn])) for dmn in self.domain_links])
    464 464           p = np.array([np.float(c) for d,c in domain_count])
    465 465           count_total = p.sum()
        466  +        # log-sampling [log(x+1)] to bias lower count domains
        467  +        p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype)
    466 468           if count_total > 0:
    467 469               p = p/p.sum()
    468 470               cnts = npr.multinomial(n, pvals=p)
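The added log-sampling step compresses the per-domain link counts before they are normalized into pvals for npr.multinomial, so heavily-linked domains no longer dominate the draw. A quick sketch with made-up counts (not values from the repository) shows the effect of log(x+1) weighting versus raw proportional weighting:

import numpy as np
import numpy.random as npr

npr.seed(0)
counts = np.array([1., 10., 100., 1000.])   # hypothetical cached links per domain

def sample(weights, n=1000):
    # Normalize the weights into a probability vector and draw n picks across domains.
    return npr.multinomial(n, pvals=weights/weights.sum())

print(sample(counts))            # raw counts: almost every pick lands on the 1000-link domain
print(sample(np.log1p(counts)))  # log(x+1): lower-count domains get a much larger share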
    skipped 20 lines
    489 491           result = False
    490 492           domain = self.domain_name(url)
    491 493           if self.link_count() < self.max_links_cached \
    492      -            and len(getattr(self.domain_links,domain,[])) < self.max_links_per_domain \
    493      -            and url not in getattr(self.domain_links,domain,set()):
        494  +            and len(self.domain_links.get(domain,[])) < self.max_links_per_domain \
        495  +            and url not in self.domain_links.get(domain,set()):
    494 496               self.domain_links.setdefault(domain, set())
    495 497               self.domain_links[domain].add(url)
    496 498               result = True
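The getattr() → dict.get() change here is a real bug fix: domain_links is a dict, and getattr() looks up attributes of the dict object rather than its keys, so both lookups always fell back to their defaults and the per-domain cap and duplicate-URL checks could never trigger. A short illustration:

domain_links = {'example.com': {'http://example.com/a'}}

print(getattr(domain_links, 'example.com', set()))  # set()  -- attribute lookup misses the key
print(domain_links.get('example.com', set()))       # {'http://example.com/a'}  -- key lookup works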
    skipped 3 lines
    500 502       def remove_link(self,url):
    501 503           result = False
    502 504           domain = self.domain_name(url)
    503      -        if url in getattr(self.domain_links,domain,set()):
        505  +        if url in self.domain_links.get(domain,set()):
    504 506               self.domain_links[domain].remove(url)
    505 507               if len(self.domain_links[domain]) == 0:
    506 508                   del self.domain_links[domain]
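Same fix in remove_link(). For reference, a condensed module-level sketch of the add/remove bookkeeping (illustrative only: the domain is passed in directly instead of being derived via self.domain_name(), and max_per_domain stands in for max_links_per_domain); removing the last URL for a domain also deletes the now-empty domain entry:

domain_links = {}

def add_link(domain, url, max_per_domain=4):
    if len(domain_links.get(domain, [])) < max_per_domain \
            and url not in domain_links.get(domain, set()):
        domain_links.setdefault(domain, set()).add(url)
        return True
    return False

def remove_link(domain, url):
    if url in domain_links.get(domain, set()):
        domain_links[domain].remove(url)
        if len(domain_links[domain]) == 0:
            del domain_links[domain]   # drop the empty domain entry
        return True
    return False

add_link('example.com', 'http://example.com/a')
remove_link('example.com', 'http://example.com/a')
print(domain_links)   # {}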
    skipped 174 lines