🤬
  • ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 20 lines
    21 21   
    22 22  __version__ = '1.1'
    23 23   
    24  -import argparse as ap, datetime as dt, numpy as np, numpy.random as npr, os, psutil, random, requests, signal, sys, tarfile, time
     24 +import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, requests, signal, sys, tarfile, time, warnings as warn
    25 25  import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
    26 26  from selenium import webdriver
    27 27  from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    28 28  from selenium.webdriver.support.ui import WebDriverWait
    29 29  from io import BytesIO
    30 30  from faker import Factory
     31 + 
     32 +# ensure pyopenssl exists to address SNI support
     33 +# https://stackoverflow.com/questions/18578439/using-requests-with-tls-doesnt-give-sni-support/18579484#18579484
     34 +if importlib.util.find_spec('OpenSSL') is None:
     35 + msg = 'Use the pyopenssl package to enable SNI support for TLS-protected hosted domains.'
     36 + print(msg)
     37 + warn.warn(msg)
    31 38   
    32 39  # headless Raspberry Pi
    33 40  try:
    skipped 179 lines
    213 220   self.session.capabilities["driverVersion"]))
    214 221   phantomjs_version = tuple(int(i) for i in self.session.capabilities["version"].split('.'))
    215 222   if phantomjs_version < recommended_version:
    216  - print("""{} version is {};
     223 + warn.warn("""{} version is {};
    217 224  please upgrade to at least version {} from http://phantomjs.org.
    218 225  """.format(self.session.capabilities["browserName"],self.session.capabilities["version"],
    219 226   '.'.join(str(i) for i in recommended_version)))
    skipped 376 lines
    596 603  a fraction of the time. """
    597 604   url = None
    598 605   if hasattr(self,'current_preferred_domain') and npr.uniform() < current_preferred_domain_fraction:
    599  - while url is not None: # loop until `self.current_preferred_domain` has a url
     606 + while url is not None and len(self.domain_links) > 0: # loop until `self.current_preferred_domain` has a url
    600 607   url = self.draw_link_from_domain(self.current_preferred_domain)
    601 608   if url is None: self.current_preferred_domain = self.draw_domain()
    602 609   if url is None: url = self.draw_link()
    skipped 286 lines
Please wait...
Page is in error, reload to recover