🤬
  • ■ ■ ■ ■ ■
    isp_data_pollution.py
    skipped 18 lines
    19 19  # You should have received a copy of the GNU General Public License
    20 20  # along with this program. If not, see <http://www.gnu.org/licenses/>.
    21 21   
    22  -__version__ = '1.2'
     22 +__version__ = '1.3'
    23 23   
    24 24  import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn
    25 25  import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
    skipped 318 lines
    344 344   self.blacklist_urls = set()
    345 345   try:
    346 346   if self.blacklist: # download the blacklist or not
    347  - if self.verbose: print('Downloading the blacklist… ',end='',flush=True)
     347 + if self.verbose: print('Downloading the blacklists… ',end='',flush=True)
    348 348   else:
    349 349   raise Exception('Skip downloading the blacklist.')
    350  - # http://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed
    351  - tgzstream = urllib.request.urlopen(urllib.request.Request(self.blacklist_url, headers={'User-Agent': self.user_agent}))
    352  - tmpfile = BytesIO()
    353  - while True:
    354  - s = tgzstream.read(16384)
    355  - if not s: break
    356  - tmpfile.write(s)
    357  - tgzstream.close()
    358  - tmpfile.seek(0)
    359  - tgz = tarfile.open(fileobj=tmpfile, mode='r:gz')
    360  - # bash$ ls BL
    361  - # COPYRIGHT education isp recreation updatesites
    362  - # adv finance jobsearch redirector urlshortener
    363  - # aggressive fortunetelling library religion violence
    364  - # alcohol forum military remotecontrol warez
    365  - # anonvpn gamble models ringtones weapons
    366  - # automobile global_usage movies science webmail
    367  - # chat government music searchengines webphone
    368  - # costtraps hacking news sex webradio
    369  - # dating hobby podcasts shopping webtv
    370  - # downloads homestyle politics socialnet
    371  - # drugs hospitals porn spyware
    372  - # dynamic imagehosting radiotv tracker
    373  - for member in [ 'downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware', 'updatesites', 'urlshortener', 'violence', 'warez', 'weapons' ]:
    374  - self.blacklist_domains |= set(tgz.extractfile('BL/{}/domains'.format(member)).read().decode('utf-8').splitlines())
    375  - self.blacklist_urls |= set(tgz.extractfile('BL/{}/urls'.format(member)).read().decode('utf-8').splitlines())
    376  - tgz.close()
    377  - tmpfile.close()
    378  - if self.verbose: print('done.',flush=True)
     350 + self.get_shalla_blacklist()
     351 + if self.verbose: print('Shallalist done… ', end='', flush=True)
     352 + self.get_easylist_blacklist()
     353 + if self.verbose: print('EasyList done.', flush=True)
    379 354   except Exception as e:
    380 355   if self.verbose: print(e)
     356 + # Make sure blacklists are not empty
     357 + if self.blacklist:
     358 + try:
     359 + assert self.blacklist_domains != set() or self.blacklist_urls != set()
     360 + except AssertionError as e:
     361 + print(e)
     362 + print('Empty blacklists! Exiting.')
     363 + sys.exit(1)
    381 364   # ignore problem urls
    382 365   self.blacklist_urls |= { 'about:blank' }
     366 + 
    def get_shalla_blacklist(self):
        """Download the Shallalist tarball and merge selected categories into the blacklists.

        Fetches `self.blacklist_url` (sending `self.user_agent`), opens the
        payload as a gzip'd tar archive, and for each selected category adds
        the lines of `BL/<category>/domains` to `self.blacklist_domains` and
        the lines of `BL/<category>/urls` to `self.blacklist_urls`.

        Category files that are absent from the archive are skipped, so one
        missing member does not discard the remaining categories. Network,
        archive-format and decoding errors propagate to the caller.
        """
        # bash$ ls BL
        # COPYRIGHT education isp recreation updatesites
        # adv finance jobsearch redirector urlshortener
        # aggressive fortunetelling library religion violence
        # alcohol forum military remotecontrol warez
        # anonvpn gamble models ringtones weapons
        # automobile global_usage movies science webmail
        # chat government music searchengines webphone
        # costtraps hacking news sex webradio
        # dating hobby podcasts shopping webtv
        # downloads homestyle politics socialnet
        # drugs hospitals porn spyware
        # dynamic imagehosting radiotv tracker
        categories = ('downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware',
                      'updatesites', 'urlshortener', 'violence', 'warez', 'weapons')

        request = urllib.request.Request(self.blacklist_url, headers={'User-Agent': self.user_agent})
        # The response stream cannot seek backwards, so buffer it in memory first:
        # http://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed
        with urllib.request.urlopen(request) as tgzstream:
            payload = tgzstream.read()

        # Context managers close the buffer and the archive even if a read fails.
        with BytesIO(payload) as tmpfile, tarfile.open(fileobj=tmpfile, mode='r:gz') as tgz:

            def read_lines(name):
                """Return the set of lines in archive member `name` (empty if absent)."""
                try:
                    member = tgz.extractfile(name)
                except KeyError:  # member not present in this archive
                    return set()
                if member is None:  # present, but not a regular file
                    return set()
                with member:
                    return set(member.read().decode('utf-8').splitlines())

            for category in categories:
                self.blacklist_domains |= read_lines('BL/{}/domains'.format(category))
                self.blacklist_urls |= read_lines('BL/{}/urls'.format(category))
     396 + 
    def get_easylist_blacklist(self, sources=None):
        """Download malware/spam block lists and merge them into the blacklists.

        `sources` maps each list URL to a flag: True when the list is in
        EasyList rule format (each line is handed to
        `self.parse_and_filter_rule_urls`), False when it is a plain list
        whose lines are added to `self.blacklist_domains` directly. When
        omitted, the default malware lists below are used.

        Blank lines and `#` lines in plain lists are skipped so that an empty
        string is never stored as a blacklisted domain. Network and decoding
        errors propagate to the caller.
        """
        if sources is None:
            # Malware lists from open source AdBlock and spam404.com lists
            sources = {
                'https://easylist-downloads.adblockplus.org/malwaredomains_full.txt': True,
                'https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt': True,
                # not EasyList format
                'https://raw.githubusercontent.com/Dawsey21/Lists/master/main-blacklist.txt': False,
            }

        for url, is_easylist in sources.items():
            request = urllib.request.Request(url, headers={'User-Agent': self.user_agent})
            # `with` closes the response even if parsing a line raises.
            with urllib.request.urlopen(request) as resp:
                for raw in resp:
                    line = raw.decode('utf-8').rstrip()
                    if is_easylist:
                        self.parse_and_filter_rule_urls(line)
                        continue
                    domain = line.strip()
                    if domain and not domain.startswith('#'):
                        self.blacklist_domains |= {domain}
    383 411   
    384 412   def get_random_words(self):
    385 413   try:
    skipped 174 lines
    560 588   self.start_time = time.time()
    561 589   self.data_usage = 0
    562 590   self.decimate_links(total_frac=0.49, decimate_frac=0.333)
     591 + self.get_blacklist() # reload the latest blacklists
    563 592   
    564 593   def decimate_links(self, total_frac=0.81, decimate_frac=0.1, log_sampling=False):
    565 594   """ Delete `decimate_frac` of links if the total exceeds `total_frac` of the maximum allowed. """
    skipped 362 lines
    928 957   self.quit_driver(pid=pid)
    929 958   self.open_driver()
    930 959   else: # throw in the towel and exit if no viable phantomjs process after multiple attempts
    931  - sys.exit()
     960 + print('No viable phantomjs process after multiple attempts!')
     961 + sys.exit(1)
    932 962   return (pid, rss_mb)
     963 + 
    def parse_and_filter_rule_urls(self, line):
        """Convert an EasyList domain-anchor rule to a blacklisted domain or url.

        Rules that are configuration, comment, exception, domain-specific,
        selector, regex, blank, or not anchored with `||` / `|scheme://` are
        ignored. For an accepted rule, a bare host is added to
        `self.blacklist_domains`; a host with a path is added to
        `self.blacklist_urls`.

        Returns the string that was added, or None when the rule is ignored.
        """
        line = line.rstrip()

        # filter out configuration, comment, exception lines, domain-specific, and selector rules
        rejected = (configuration_re, comment_re, exception_re, domain_option_re, selector_re)
        if any(re_test(pattern, line) for pattern in rejected):
            return None

        if re_test(option_re, line):
            line = option_re.sub('\\1', line)  # delete all the options and continue

        # blank url and blank line cases: ignore
        if re_test(httpempty_re, line) or not line:
            return None

        # regex case: ignore
        if re_test(regex_re, line):
            return None

        # Now that regexes are handled, delete unnecessary wildcards, e.g. /.../*
        # wildcard_begend_re has one capture group per alternative; use whichever
        # matched. A plain '\\1' replacement turns a leading-wildcard-only rule
        # into the empty string, because unmatched groups substitute as ''.
        line = wildcard_begend_re.sub(
            lambda m: m.group(1) if m.group(1) is not None else m.group(2), line)

        # domain anchors, || or '|http://a.b' -> domain anchor 'a.b'
        # strip off initial || or |scheme://; anything unanchored is ignored
        if re_test(domain_anch_re, line):
            line = domain_anch_re.sub('\\1', line)
        elif re_test(scheme_anchor_re, line):
            line = scheme_anchor_re.sub("", line)
        else:
            return None

        # host subcase
        if re_test(da_hostonly_re, line):
            line = da_hostonly_re.sub('\\1', line)
            if wildcard_ignore_test(line):  # regex subsubcase
                return None
            self.blacklist_domains |= {line}  # exact subsubcase
            return line

        # hostpath subcase
        if re_test(da_hostpath_re, line):
            line = da_hostpath_re.sub('\\1', line)
            if re_test(wild_sep_exc_noanch_re, line) or not re_test(pathend_re, line):
                return None  # regex subsubcase
            line = re.sub(r'[/|]$', '', line)  # strip EOL slashes and anchors
            if wildcard_ignore_test(line):
                return None
            self.blacklist_urls |= {line}  # exact subsubcase
            return line

        # hostpathquery default case
        if wildcard_ignore_test(line):
            return None
        self.blacklist_urls |= {line}
        return line
     1015 + 
     1016 + 
     1017 +# EasyList regular expressions
     1018 +# See https://github.com/essandess/easylist-pac-privoxy
     1019 +comment_re = re.compile(r'^\s*?!') # ! commment
     1020 +configuration_re = re.compile(r'^\s*?\[[^]]*?\]') # [Adblock Plus 2.0]
     1021 +easylist_opts = r'~?\b(?:third\-party|domain|script|image|stylesheet|object(?!-subrequest)|object\-subrequest|xmlhttprequest|subdocument|ping|websocket|webrtc|document|elemhide|generichide|genericblock|other|sitekey|match-case|collapse|donottrack|popup|media|font)\b'
     1022 +option_re = re.compile(r'^(.*?)\$(' + easylist_opts + r'.*?)$')
     1023 +# regex's used to exclude options for specific cases
     1024 +domain_option_re = re.compile(r'\$.*?(?:domain=)') # discards rules specific to links from specific domains
     1025 +selector_re = re.compile(r'^(.*?)#\@?#*?.*?$') # #@##div [should be #+?, but old style still used]
     1026 +regex_re = re.compile(r'^\@{0,2}\/(.*?)\/$')
     1027 +wildcard_begend_re = re.compile(r'^(?:\**?([^*]*?)\*+?|\*+?([^*]*?)\**?)$')
     1028 +wild_anch_sep_exc_re = re.compile(r'[*|^@]')
     1029 +wild_sep_exc_noanch_re = re.compile(r'(?:[*^@]|\|[\s\S])')
     1030 +exception_re = re.compile(r'^@@(.*?)$')
     1031 +httpempty_re = re.compile(r'^\|?https?://$')
     1032 +pathend_re = re.compile(r'(?i)(?:[/|]$|\.(?:jsp?|php|xml|jpe?g|png|p?gif|img|swf|flv|[sp]?html?|f?cgi|pl?|aspx|ashx|css|jsonp?|asp|search|cfm|ico|act|act(?:ion)?|spy|do|stm|cms|txt|imu|dll|io|smjs|xhr|ount|bin|py|dyn|gne|mvc|lv|nap|jam|nhn))',re.IGNORECASE)
     1033 + 
     1034 +domain_anch_re = re.compile(r'^\|\|(.+?)$')
     1035 +# omit scheme from start of rule -- this will also be done in JS for efficiency
     1036 +scheme_anchor_re = re.compile(r'^(\|?(?:[\w*+-]{1,15})?://)'); # e.g. '|http://' at start
     1037 + 
     1038 +# (Almost) fully-qualified domain name extraction (with EasyList wildcards)
     1039 +# Example case: banner.3ddownloads.com^
     1040 +da_hostonly_re = re.compile(r'^((?:[\w*-]+\.)+[a-zA-Z0-9*-]{1,24}\.?)(?:$|[/^?])$')
     1041 +da_hostpath_re = re.compile(r'^((?:[\w*-]+\.)+[a-zA-Z0-9*-]{1,24}\.?[\w~%./^*-]+?)\??$')
     1042 + 
     1043 +def re_test(regex,string):
     1044 + if isinstance(regex,str): regex = re.compile(regex)
     1045 + return bool(regex.search(string))
     1046 + 
     1047 +def wildcard_ignore_test(rule):
     1048 + return bool(wild_anch_sep_exc_re.search(rule))
    933 1049   
    934 1050  if __name__ == "__main__":
    935 1051   ISPDataPollution()
    skipped 1 lines
Please wait...
Page is in error, reload to recover