| skipped 335 lines |
def link_count(self):
    """Return the total number of links stored across all domains, as an int."""
    # Add up the per-domain container sizes; an empty mapping gives 0.
    return int(sum(len(self.domain_links[key]) for key in self.domain_links))
338 | 338 | | |
def domain_entropy(self):
    """Return the entropy of the links-per-domain distribution.

    Every key of ``self.domain_links`` is weighted by the number of links
    stored under it. The weights are normalised so they sum to one and the
    resulting vector is handed to ``self.entropy``.

    Returns:
        Whatever ``self.entropy`` returns for the normalised weights, or
        ``0.`` when there are no links at all (nothing to normalise).
    """
    # Build the counts directly as floats. Packing (domain, count) tuples
    # into a single array coerces the counts to strings, and parsing them
    # back relied on ``np.float``, an alias removed in NumPy 1.24.
    counts = np.array(
        [len(self.domain_links[dmn]) for dmn in self.domain_links],
        dtype=float,
    )
    count_total = counts.sum()
    if count_total > 0:
        return self.entropy(counts / count_total)
    # No domains, or only empty ones: avoid dividing by zero.
    return 0.
| 348 | + | |
def entropy(self, p):
    """Return the negated sum of ``self.xlgx(x)`` over every element of ``p``.

    ``p`` must be a NumPy array: it is flattened before iteration and its
    dtype is reused for the intermediate array of per-element terms.
    """
    terms = np.fromiter(map(self.xlgx, p.flatten()), dtype=p.dtype)
    return -terms.sum()
| 351 | + | |
def xlgx(self, x):
    """Return ``x * log2(x)`` after clamping ``x`` to the interval [0, 1].

    Both endpoints return ``0.``: at 0 this sidesteps ``log2(0)``, and at 1
    the logarithm is zero anyway.
    """
    clamped = max(0., min(1., x))
    if clamped == 0. or clamped == 1.:
        return 0.
    return clamped * np.log2(clamped)
| 358 | + | |
339 | 359 | | def seed_links(self): |
340 | 360 | | # bias with non-random seed links |
341 | 361 | | self.bias_links() |
| skipped 234 lines |
576 | 596 | | except Exception as e: |
577 | 597 | | if self.debug: print('.current_url exception:\n{}'.format(e)) |
578 | 598 | | if self.debug: |
579 | | - | print("'{}': {:d} links added, {:d} total".format(current_url,k,self.link_count())) |
| 599 | + | print("'{}': {:d} links added, {:d} total, {:d} bits domain entropy".format(current_url,k,self.link_count(),int(np.round(self.domain_entropy())))) |
580 | 600 | | elif self.verbose: |
581 | 601 | | self.print_progress(k,current_url) |
582 | 602 | | |
| skipped 78 lines |