| skipped 349 lines |
350 | 350 | | return -np.fromiter((self.xlgx(x) for x in p.flatten()),dtype=p.dtype).sum() |
351 | 351 | | |
352 | 352 | | def xlgx(self,x): |
353 | | - | x = max(0.,min(1.,x)) |
| 353 | + | x = np.abs(x) |
354 | 354 | | y = 0. |
355 | 355 | | if not (x == 0. or x == 1.): |
356 | 356 | | y = x*np.log2(x) |
| skipped 106 lines |
463 | 463 | | domain_count = np.array([(dmn,len(self.domain_links[dmn])) for dmn in self.domain_links]) |
464 | 464 | | p = np.array([np.float(c) for d,c in domain_count]) |
465 | 465 | | count_total = p.sum() |
| 466 | + | # log-scale the counts [log1p(x) = log(x+1)] so sampling is biased toward lower-count domains
| 467 | + | p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype) |
466 | 468 | | if count_total > 0: |
467 | 469 | | p = p/p.sum() |
468 | 470 | | cnts = npr.multinomial(n, pvals=p) |
| skipped 20 lines |
489 | 491 | | result = False |
490 | 492 | | domain = self.domain_name(url) |
491 | 493 | | if self.link_count() < self.max_links_cached \ |
492 | | - | and len(getattr(self.domain_links,domain,[])) < self.max_links_per_domain \ |
493 | | - | and url not in getattr(self.domain_links,domain,set()): |
| 494 | + | and len(self.domain_links.get(domain,[])) < self.max_links_per_domain \ |
| 495 | + | and url not in self.domain_links.get(domain,set()): |
494 | 496 | | self.domain_links.setdefault(domain, set()) |
495 | 497 | | self.domain_links[domain].add(url) |
496 | 498 | | result = True |
| skipped 3 lines |
500 | 502 | | def remove_link(self,url): |
501 | 503 | | result = False |
502 | 504 | | domain = self.domain_name(url) |
503 | | - | if url in getattr(self.domain_links,domain,set()): |
| 505 | + | if url in self.domain_links.get(domain,set()): |
504 | 506 | | self.domain_links[domain].remove(url) |
505 | 507 | | if len(self.domain_links[domain]) == 0: |
506 | 508 | | del self.domain_links[domain] |
| skipped 174 lines |