# Safe search options
class SafeWebSearch:
    """Describe how to run a safe search on one search engine.

    The defaults are the Google parameters.  Use unencrypted HTTP for
    ISP DPI.

    :param search_url: base URL of the search endpoint.
    :param query_parameter: name of the query-string key carrying the
        search terms (``q`` for Google, ``p`` for Yahoo).
    :param safe_parameter: complete ``key=value`` pair that turns on the
        engine's safe-search mode.
    :param css_selector: CSS selector matching one element per search
        result on the results page.
    :param additional_parameters: extra query-string text required to get
        results.  The caller concatenates it directly after the encoded
        query, so a non-empty value should begin with ``&``.
    :param result_extraction: callable mapping a harvested ``href`` to the
        link that should be stored; the identity function by default.
    """

    def __init__(self,
                 search_url='http://www.google.com/search',  # search engine
                 query_parameter='q',  # query parameter
                 safe_parameter='safe=active',  # query parameter for safe searches
                 css_selector='div.g',  # css selector to harvest search results
                 additional_parameters='',  # additional parameters required to get results
                 result_extraction=lambda x: x):  # function to extract the link
        self.search_url = search_url
        self.query_parameter = query_parameter
        self.safe_parameter = safe_parameter
        self.css_selector = css_selector
        self.additional_parameters = additional_parameters
        self.result_extraction = result_extraction

    def __repr__(self):
        return '{}(search_url={!r})'.format(type(self).__name__, self.search_url)


# Yahoo wraps each result in a redirect whose path carries the real,
# percent-encoded target between '/RU=' and the next '/R?=' segment.
yahoo_search_reprog = re.compile(r'/RU=(.+?)/R[A-Z]=')


def _extract_yahoo_result(href):
    """Return the target URL embedded in a Yahoo redirect link.

    The pattern is applied to the raw href and the capture is then
    percent-decoded.  This avoids ``urllib.parse.parse_qs``, which no
    longer splits on ``;`` by default and therefore never yields the
    ``_ylu`` key on current Python releases.  An href that is not a
    redirect is returned unchanged instead of raising, so one odd link
    does not abort harvesting of the remaining results.
    """
    match = yahoo_search_reprog.search(href)
    if match is None:
        return href
    return uprs.unquote(match.group(1))


SafeGoogle = SafeWebSearch()
SafeBing = SafeWebSearch(search_url='http://www.bing.com/search',
                         safe_parameter='adlt=strict',
                         css_selector='li.b_algo')
SafeYahoo = SafeWebSearch(search_url='http://search.yahoo.com/search',
                          query_parameter='p',
                          safe_parameter='adlt=strict',
                          css_selector='div.compTitle',
                          result_extraction=_extract_yahoo_result)
SafeDuckDuckGo = SafeWebSearch(search_url='http://www.duckduckgo.com/',
                               safe_parameter='kp=1',
                               css_selector='div.result__body')
def select_random_search_engine(self):
    """Pick one of the predefined safe-search engines at random.

    The choice is stored on ``self.SafeSearch`` so that the URL builder
    and the result harvester both use the same engine.

    :return: the selected search-engine configuration object.
    """
    engines = [SafeGoogle, SafeBing, SafeYahoo, SafeDuckDuckGo]
    chosen = random.choice(engines)
    self.SafeSearch = chosen
    return chosen
per page 674 706 k = 0 675 707 links = [] 676 708 try: 677 - for div in sorted(elements,key=lambda k: random.random()): 709 + for elt in sorted(elements,key=lambda k: random.random()): 678 710 @self.phantomjs_short_timeout 679 - def phantomjs_find_element_by_tag_name(): return div .find_element_by_tag_name('a') 711 + def phantomjs_find_element_by_tag_name(): return elt .find_element_by_tag_name('a') 680 712 a_tag = phantomjs_find_element_by_tag_name() 681 713 @self.phantomjs_short_timeout 682 714 def phantomjs_get_attribute(): return a_tag.get_attribute('href') 683 715 href = phantomjs_get_attribute() 684 - if href is not None: links . append ( href ) 716 + if href is not None: 717 + href = self.SafeSearch.result_extraction(href) 718 + links.append(href) 685 719 k += 1 686 720 if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break 687 721 except Exception as e: skipped 215 lines