    isp_data_pollution.py
    skipped 18 lines
    19 19  # You should have received a copy of the GNU General Public License
    20 20  # along with this program. If not, see <http://www.gnu.org/licenses/>.
    21 21   
    22  -__version__ = '1.1'
     22 +__version__ = '1.2'
    23 23   
    24  -import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, requests, signal, sys, tarfile, time, warnings as warn
     24 +import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn
    25 25  import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
    26 26  from selenium import webdriver
    27 27  from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    skipped 23 lines
    51 51  max_links_cached = 100000 # Maximum number of links to cache for download
    52 52  max_links_per_page = 200 # Maximum number of links to add per page
    53 53  max_links_per_domain = 400 # Maximum number of links to add per domain
    54  -search_url = 'http://www.google.com/search' # keep unencrypted for ISP DPI
    55 54  wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain'
    56 55  timeout = 20
    57 56  short_timeout = 3
    skipped 63 lines
    121 120  # 1. The bandwidth usage is undoubtedly (much) smaller because gzip encoding is used
    122 121  # 2. A lightweight proxy could be used for accurate bandwidth, and header editing
    123 122   
     123 +# Safe search options
     124 +class SafeWebSearch():
     125 +    """ Safe web search class with default Google parameters.
     126 +    Use unencrypted HTTP for ISP DPI.
     127 +    """
     128 +    def __init__(self,
     129 +                 search_url='http://www.google.com/search', # search engine
     130 +                 query_parameter='q', # query parameter
     131 +                 safe_parameter='safe=active', # query parameter for safe searches
     132 +                 css_selector='div.g', # css selector to harvest search results
     133 +                 additional_parameters='', # additional parameters required to get results
     134 +                 result_extraction=lambda x: x): # function to extract the link
     135 +        self.search_url = search_url
     136 +        self.query_parameter = query_parameter
     137 +        self.safe_parameter = safe_parameter
     138 +        self.css_selector = css_selector
     139 +        self.additional_parameters = additional_parameters
     140 +        self.result_extraction = result_extraction
     141 +
     142 +SafeGoogle = SafeWebSearch()
     143 +SafeBing = SafeWebSearch(search_url='http://www.bing.com/search',
     144 +                         safe_parameter='adlt=strict',css_selector='li.b_algo')
     145 +yahoo_search_reprog = re.compile(r'/RU=(.+?)/R[A-Z]=')
     146 +SafeYahoo = SafeWebSearch(search_url='http://search.yahoo.com/search', query_parameter='p',
     147 +                          safe_parameter='vm=r',css_selector='div.compTitle', # 'vm=r' is Yahoo's strict SafeSearch parameter; 'adlt=strict' is Bing-specific
     148 +                          result_extraction=lambda x: yahoo_search_reprog.findall(uprs.parse_qs(x)['_ylu'][0])[0])
     149 +SafeDuckDuckGo = SafeWebSearch(search_url='http://www.duckduckgo.com/',
     150 +                              safe_parameter='kp=1',css_selector='div.result__body')
    124 151   
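The SafeWebSearch pattern keeps each engine's quirks (query parameter, safe-search flag, result markup, link extraction) in one object, so supporting another engine is a single constructor call. A minimal sketch of what that would look like, using a hypothetical Startpage entry; the parameter names, flag value, and CSS selector below are illustrative assumptions, not verified against Startpage's actual interface:

    # hypothetical example engine; query_parameter, safe_parameter, and
    # css_selector are illustrative assumptions, not verified values
    SafeStartpage = SafeWebSearch(search_url='http://www.startpage.com/do/search',
                                  query_parameter='query',
                                  safe_parameter='ff=1',
                                  css_selector='div.result')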
    125 152  class ISPDataPollution:
    126 153      """
    skipped 23 lines
    150 177                   max_links_per_page=max_links_per_page,
    151 178                   max_links_per_domain=max_links_per_domain,
    152 179                   user_agent=user_agent,
    153      -                 search_url=search_url,
    154 180                   blacklist_url=blacklist_url,
    155 181                   wordsite_url=wordsite_url,
    156 182                   seed_bias_links=seed_bias_links,
    skipped 5 lines
    162 188          self.max_links_per_page = max_links_per_page
    163 189          self.max_links_per_domain = max_links_per_domain
    164 190          self.user_agent = user_agent
    165      -        self.search_url = search_url
    166 191          self.blacklist_url = blacklist_url
    167 192          self.wordsite_url = wordsite_url
    168 193          self.seed_bias_links = seed_bias_links
    skipped 480 lines
    649 674          :param query:
    650 675          :return:
    651 676          """
    652      -        url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query)))
        677 +        self.select_random_search_engine()
        678 +        url = uprs.urlunparse(uprs.urlparse(self.SafeSearch.search_url)._replace(query='{}={}{}&{}'.format(
        679 +            self.SafeSearch.query_parameter,uprs.quote_plus(query),
        680 +            self.SafeSearch.additional_parameters,self.SafeSearch.safe_parameter)))
    653 681          if self.verbose: self.print_url(url)
    654 682          @self.phantomjs_timeout
    655 683          def phantomjs_get(): self.driver.get(url) # selenium driver
    skipped 4 lines
    660 688          new_links = self.websearch_links()
    661 689          if self.link_count() < self.max_links_cached: self.add_url_links(new_links,url)
    662 690   
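To see what the new query construction yields end to end, here is a standalone reproduction of the urlunparse/_replace logic, run against the SafeBing settings from this diff ('test query' is an arbitrary example string):

    import urllib.parse as uprs

    search_url, query_parameter = 'http://www.bing.com/search', 'q'
    additional_parameters, safe_parameter = '', 'adlt=strict'
    query = 'test query'
    # same construction as the diff: swap in a new query component on the parsed URL
    url = uprs.urlunparse(uprs.urlparse(search_url)._replace(query='{}={}{}&{}'.format(
        query_parameter, uprs.quote_plus(query),
        additional_parameters, safe_parameter)))
    print(url)  # http://www.bing.com/search?q=test+query&adlt=strict

Note that additional_parameters is concatenated directly after the quoted query, so a non-empty value is expected to carry its own leading '&'.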
        691 +    def select_random_search_engine(self):
        692 +        self.SafeSearch = random.choice([SafeGoogle, SafeBing, SafeYahoo, SafeDuckDuckGo])
        693 +        return self.SafeSearch
        694 +
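random.choice draws each engine uniformly per query. If one engine should carry more or less of the traffic, random.choices (Python 3.6+) accepts per-item weights; a sketch of that variant, with illustrative weights that are not tuned values:

    # hypothetical weighted variant; the weights are illustrative only
    def select_random_search_engine(self):
        self.SafeSearch = random.choices([SafeGoogle, SafeBing, SafeYahoo, SafeDuckDuckGo],
                                         weights=[4, 2, 2, 1])[0]  # choices() returns a list
        return self.SafeSearch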
    663 695      def websearch_links(self):
    664 696          """
    665 697          Webpage format for a popular search engine, e.g. <div class="g"> for Google.
    skipped 2 lines
    668 700          # https://github.com/detro/ghostdriver/issues/169
    669 701          @self.phantomjs_short_timeout
    670 702          def phantomjs_find_elements_by_css_selector():
    671      -            return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector('div.g'))
        703 +            return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector(self.SafeSearch.css_selector))
    672 704          elements = phantomjs_find_elements_by_css_selector()
    673 705          # get links in random order until max. per page
    674 706          k = 0
    675 707          links = []
    676 708          try:
    677      -            for div in sorted(elements,key=lambda k: random.random()):
        709 +            for elt in sorted(elements,key=lambda k: random.random()):
    678 710                  @self.phantomjs_short_timeout
    679      -                def phantomjs_find_element_by_tag_name(): return div.find_element_by_tag_name('a')
        711 +                def phantomjs_find_element_by_tag_name(): return elt.find_element_by_tag_name('a')
    680 712                  a_tag = phantomjs_find_element_by_tag_name()
    681 713                  @self.phantomjs_short_timeout
    682 714                  def phantomjs_get_attribute(): return a_tag.get_attribute('href')
    683 715                  href = phantomjs_get_attribute()
    684      -                if href is not None: links.append(href)
        716 +                if href is not None:
        717 +                    href = self.SafeSearch.result_extraction(href)
        718 +                    links.append(href)
    685 719                  k += 1
    686 720                  if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break
    687 721          except Exception as e:
    skipped 215 lines
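SafeYahoo's result_extraction is the one non-trivial extractor: Yahoo wraps each hit in a redirect URL whose _ylu segment embeds the percent-encoded target between /RU= and the next /R<letter>= marker. A standalone check with a made-up redirect href is below. Two details worth noting: parse_qs percent-decodes values, which is why the lambda needs no separate unquote; and before the Python 3.9.2 security fix parse_qs split fields on both '&' and ';' by default, which the lambda relies on, so this check passes separator=';' explicitly for newer interpreters:

    import re, urllib.parse as uprs

    yahoo_search_reprog = re.compile(r'/RU=(.+?)/R[A-Z]=')
    # made-up example of a Yahoo redirect href
    href = ('http://r.search.yahoo.com/_ylt=A0abc;'
            '_ylu=X3oDMTBy/RV=2/RO=10/RU=http%3a%2f%2fwww.example.com%2f/RK=0/RS=x')
    ylu = uprs.parse_qs(href, separator=';')['_ylu'][0]  # value is percent-decoded here
    print(yahoo_search_reprog.findall(ylu)[0])           # http://www.example.com/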