| skipped 158 lines |
159 | 159 | | self.fake = Factory.create() |
160 | 160 | | self.hour_trigger = True |
161 | 161 | | self.twentyfour_hour_trigger = True |
162 | | - | self.links = set() |
163 | | - | self.link_count = dict() |
| 162 | + | self.domain_links = dict() |
164 | 163 | | self.start_time = time.time() |
165 | 164 | | self.data_usage = 0 |
166 | 165 | | self.get_blacklist() |
| skipped 156 lines |
323 | 322 | | |
def pollute(self):
    """Run one cycle: top up the link cache when it is low, then visit one link.

    When ``self.quit_driver_every_call`` is set, every browser interaction
    is bracketed by ``open_session()`` / ``quit_session()``; otherwise the
    PhantomJS process is checked once at the start of the cycle.
    """
    if not self.quit_driver_every_call:
        self.check_phantomjs_process()

    # Re-seed while fewer than 2000 links are cached.
    if self.link_count() < 2000:
        if self.quit_driver_every_call:
            self.open_session()
        self.seed_links()
        self.clear_session()
        if self.quit_driver_every_call:
            self.quit_session()

    # Visit one link drawn from the cache.
    target = self.pop_link()
    if self.quit_driver_every_call:
        self.open_session()
    self.get_url(target)
    self.clear_session()
    if self.quit_driver_every_call:
        self.quit_session()
| 335 | + | |
def link_count(self):
    """Return the total number of cached links across all domains.

    ``self.domain_links`` maps a domain name to the collection of URLs
    cached for it; the result is the sum of those collections' sizes
    (``0`` when nothing is cached).
    """
    # Plain integer sum: this is called on every add, so avoid building a
    # temporary NumPy array just to add a handful of lengths.
    return sum(len(links) for links in self.domain_links.values())
336 | 338 | | |
def seed_links(self):
    """Bias the cache with the fixed seed links, then seed it from a web search.

    ``bias_links()`` always runs. A search is issued only while the cache
    holds fewer than ``self.max_links_cached`` links. The query is built
    from words sampled from ``self.words``; for multi-word queries the
    first two words are wrapped in double quotes half of the time.
    """
    # bias with non-random seed links
    self.bias_links()
    if self.link_count() >= self.max_links_cached:
        return

    # 1 + Poisson(1.33) words, so a query always has at least one word.
    num_words = max(1, npr.poisson(1.33) + 1)
    # Draw all terms in a single sample so a word cannot repeat in a query.
    terms = random.sample(self.words, num_words)
    if num_words > 1 and npr.uniform() >= 0.5:
        # quote the first two words together; joining the remaining terms
        # afterwards avoids a trailing space on two-word queries
        terms = ['"{} {}"'.format(terms[0], terms[1])] + terms[2:]
    word = ' '.join(terms)

    if self.debug:
        print('Seeding with search for \'{}\'…'.format(word))
    self.get_websearch(word)
| 355 | + | |
def bias_links(self):
    """Offer every URL in ``self.seed_bias_links`` to ``add_link``."""
    for seed_url in self.seed_bias_links:
        self.add_link(seed_url)
346 | 358 | | |
347 | 359 | | def diurnal_cycle_test(self): |
348 | 360 | | now = dt.datetime.now() |
| skipped 19 lines |
def exceeded_bandwidth_tasks(self):
    """Thin out the link cache and pause when ``bandwidth_test()`` is truthy.

    If the cache holds more than 81% of ``self.max_links_cached`` links,
    roughly a tenth of them (drawn with ``draw_links``) are removed. The
    method then sleeps for 120 seconds. Nothing happens when
    ``bandwidth_test()`` is falsy.
    """
    if not self.bandwidth_test():
        return

    # decimate the stack
    if self.link_count() > int(np.ceil(0.81 * self.max_links_cached)):
        for url in self.draw_links(n=int(np.ceil(self.link_count() / 10.))):
            # Remove exactly the link that was drawn; pop_link() would draw
            # a different random link and only remove it some of the time.
            self.remove_link(url)
    time.sleep(120)
375 | 387 | | |
376 | 388 | | def every_hour_tasks(self): |
| skipped 34 lines |
411 | 423 | | # reset bw stats and (really) decimate the stack every couple of weeks |
412 | 424 | | self.start_time = time.time() |
413 | 425 | | self.data_usage = 0 |
414 | | - | if len(self.links) > int(np.ceil(0.49*self.max_links_cached)): |
415 | | - | for url in random.sample(self.links,int(np.ceil(len(self.links)/3.))): |
416 | | - | self.remove_link(url) |
| 426 | + | if self.link_count() > int(np.ceil(0.49*self.max_links_cached)): |
| 427 | + | for url in self.draw_links(n=int(np.ceil(self.link_count()/3.))): |
| 428 | + | self.pop_link(url) |
417 | 429 | | |
418 | 430 | | def set_user_agent(self): |
419 | 431 | | global user_agent |
| skipped 3 lines |
423 | 435 | | except Exception as e: |
424 | 436 | | if self.debug: print('.update() exception:\n{}'.format(e)) |
425 | 437 | | |
426 | | - | def remove_link(self): |
427 | | - | url = random.sample(self.links,1)[0] |
def draw_link(self):
    """Return a single link chosen by ``draw_links(n=1)``.

    Raises:
        IndexError: if ``draw_links`` returns an empty list.
    """
    drawn = self.draw_links(n=1)
    return drawn[0]
| 440 | + | |
def draw_links(self, n=1):
    """Draw up to ``n`` cached links without removing them.

    The ``n`` draws are split across domains with a multinomial whose
    probabilities are proportional to each domain's cached-link count.
    Within a domain, links are sampled without replacement, and a domain
    never yields more links than it holds, so fewer than ``n`` links may
    be returned.

    Args:
        n: number of draws to distribute across the domains.

    Returns:
        A list of URLs; empty when no links are cached.
    """
    # Keep names and counts in parallel plain lists: packing (str, int)
    # pairs into one NumPy array would coerce the counts to strings.
    domains = [dmn for dmn, links in self.domain_links.items() if links]
    sizes = [len(self.domain_links[dmn]) for dmn in domains]
    total = sum(sizes)
    if total == 0:
        return []

    pvals = np.array(sizes, dtype=float) / total
    draws = npr.multinomial(n, pvals=pvals)

    urls = []
    for domain, size, drawn in zip(domains, sizes, draws):
        take = min(int(drawn), size)
        if take:
            # random.sample needs a sequence; sets are rejected on 3.11+.
            urls.extend(random.sample(list(self.domain_links[domain]), take))
    return urls
| 461 | + | |
def pop_link(self):
    """Draw a link and, most of the time, remove it from the cache.

    The link is left in the cache with probability 0.05 so that it can be
    drawn again later (95% 1 GET, ~5% 2 GETs, .2% three GETs).

    Returns:
        The drawn URL, whether or not it was removed.
    """
    url = self.draw_link()
    keep_cached = npr.uniform() >= 0.95
    if not keep_cached:
        # pop a random item from the stack
        self.remove_link(url)
    return url
432 | 467 | | |
def add_link(self, url):
    """Cache ``url`` under its domain unless a limit is hit or it is a duplicate.

    A link is rejected when the cache already holds
    ``self.max_links_cached`` links, when its domain already holds
    ``self.max_links_per_domain`` links, or when the URL is already cached.

    Args:
        url: the link to cache.

    Returns:
        True if the link was added, False otherwise.
    """
    domain = self.domain_name(url)
    # Key lookup on the dict (getattr would look for an attribute and
    # always fall back to the default, disabling the per-domain cap).
    cached = self.domain_links.get(domain, ())
    if (self.link_count() >= self.max_links_cached
            or len(cached) >= self.max_links_per_domain
            or url in cached):
        return False
    self.domain_links.setdefault(domain, set()).add(url)
    # if self.debug: print('\tAdded link \'{}\'…'.format(url))
    return True
445 | 479 | | |
446 | | - | def decrement_link_count(self,url,domain=None): |
447 | | - | if domain is None: domain = self.domain_name(url) |
448 | | - | self.link_count.setdefault(domain,0) |
449 | | - | if self.link_count[domain] > 0: self.link_count[domain] -= 1 |
450 | | - | |
451 | | - | def increment_link_count(self,url,domain=None): |
452 | | - | if domain is None: domain = self.domain_name(url) |
453 | | - | self.link_count.setdefault(domain,0) |
454 | | - | self.link_count[domain] += 1 |
def remove_link(self, url):
    """Remove ``url`` from the cache, dropping its domain entry once empty.

    Args:
        url: the link to remove.

    Returns:
        True if the link was cached and has been removed, False otherwise.
    """
    domain = self.domain_name(url)
    # Key lookup on the dict; getattr would never find the domain.
    cached = self.domain_links.get(domain)
    if not cached or url not in cached:
        return False
    cached.remove(url)
    if not cached:
        # dicts have no .remove(); delete the now-empty domain entry so it
        # no longer takes part in domain-weighted draws.
        del self.domain_links[domain]
    return True
455 | 489 | | |
def domain_name(self, url):
    """Return the last two dot-separated labels of ``url``'s network location.

    For example ``https://www.example.com/x`` gives ``example.com``. A
    netloc with fewer than two labels is returned unchanged.
    """
    netloc = uprs.urlparse(url).netloc
    labels = netloc.split('.')
    return '.'.join(labels[-2:])
| skipped 13 lines |
471 | 505 | | except Exception as e: |
472 | 506 | | if self.debug: print('.page_source exception:\n{}'.format(e)) |
473 | 507 | | new_links = self.websearch_links() |
474 | | - | if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url) |
| 508 | + | if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) |
475 | 509 | | |
476 | 510 | | def websearch_links(self): |
477 | 511 | | '''Webpage format for a popular search engine, <div class="g">''' |
| skipped 22 lines |
500 | 534 | | except Exception as e: |
501 | 535 | | if self.debug: print('.page_source exception:\n{}'.format(e)) |
502 | 536 | | new_links = self.url_links() |
503 | | - | if len(self.links) < self.max_links_cached: self.add_url_links(new_links,url) |
| 537 | + | if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url) |
504 | 538 | | |
505 | 539 | | def url_links(self): |
506 | 540 | | '''Generic webpage link finder format.''' |
| skipped 35 lines |
542 | 576 | | except Exception as e: |
543 | 577 | | if self.debug: print('.current_url exception:\n{}'.format(e)) |
544 | 578 | | if self.debug: |
545 | | - | print("'{}': {:d} links added, {:d} total".format(current_url,k,len(self.links))) |
| 579 | + | print("'{}': {:d} links added, {:d} total".format(current_url,k,self.link_count())) |
546 | 580 | | elif self.verbose: |
547 | 581 | | self.print_progress(k,current_url) |
548 | 582 | | |
549 | 583 | | def print_progress(self,num_links,url,terminal_width=80): |
550 | 584 | | # truncate or fill with white space |
551 | | - | text_suffix = ': {:d} links added, {:d} total'.format(num_links,len(self.links)) |
| 585 | + | text_suffix = ': {:d} links added, {:d} total'.format(num_links,self.link_count()) |
552 | 586 | | chars_used = 2 + len(text_suffix) |
553 | 587 | | if len(url) + chars_used > terminal_width: |
554 | 588 | | url = url[:terminal_width-chars_used-1] + '…' |
| skipped 72 lines |