skipped 18 lines 19 19 # You should have received a copy of the GNU General Public License 20 20 # along with this program. If not, see <http://www.gnu.org/licenses/>. 21 21 22 - __version__ = '1.2 ' 22 + __version__ = '1.3 ' 23 23 24 24 import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn 25 25 import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs skipped 318 lines 344 344 self.blacklist_urls = set() 345 345 try: 346 346 if self.blacklist: # download the blacklist or not 347 - if self.verbose: print('Downloading the blacklist … ',end='',flush=True) 347 + if self.verbose: print('Downloading the blacklists … ',end='',flush=True) 348 348 else: 349 349 raise Exception('Skip downloading the blacklist.') 350 - # http://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed 351 - tgzstream = urllib.request.urlopen(urllib.request.Request(self.blacklist_url, headers={'User-Agent': self.user_agent})) 352 - tmpfile = BytesIO() 353 - while True: 354 - s = tgzstream.read(16384) 355 - if not s: break 356 - tmpfile.write(s) 357 - tgzstream.close() 358 - tmpfile.seek(0) 359 - tgz = tarfile.open(fileobj=tmpfile, mode='r:gz') 360 - # bash$ ls BL 361 - # COPYRIGHT education isp recreation updatesites 362 - # adv finance jobsearch redirector urlshortener 363 - # aggressive fortunetelling library religion violence 364 - # alcohol forum military remotecontrol warez 365 - # anonvpn gamble models ringtones weapons 366 - # automobile global_usage movies science webmail 367 - # chat government music searchengines webphone 368 - # costtraps hacking news sex webradio 369 - # dating hobby podcasts shopping webtv 370 - # downloads homestyle politics socialnet 371 - # drugs hospitals porn spyware 372 - # dynamic imagehosting radiotv tracker 373 - for member in [ 'downloads', 'drugs', 'hacking', 'gamble', 'porn', 
'spyware', 'updatesites', 'urlshortener', 'violence', 'warez', 'weapons' ]: 374 - self.blacklist_domains |= set(tgz.extractfile('BL/{}/domains'.format(member)).read().decode('utf-8').splitlines()) 375 - self.blacklist_urls |= set(tgz.extractfile('BL/{}/urls'.format(member)).read().decode('utf-8').splitlines()) 376 - tgz.close() 377 - tmpfile.close() 378 - if self.verbose: print('done.',flush=True) 350 + self.get_shalla_blacklist() 351 + if self.verbose: print('Shallalist done… ', end='', flush=True) 352 + self.get_easylist_blacklist() 353 + if self.verbose: print('EasyList done.', flush=True) 379 354 except Exception as e: 380 355 if self.verbose: print(e) 356 + # Make sure blacklists are not empty 357 + if self.blacklist: 358 + try: 359 + assert self.blacklist_domains != set() or self.blacklist_urls != set() 360 + except AssertionError as e: 361 + print(e) 362 + print('Empty blacklists! Exiting.') 363 + sys.exit(1) 381 364 # ignore problem urls 382 365 self.blacklist_urls |= { 'about:blank' } 366 + 367 + def get_shalla_blacklist(self): 368 + # http://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed 369 + tgzstream = urllib.request.urlopen(urllib.request.Request(self.blacklist_url, headers={'User-Agent': self.user_agent})) 370 + tmpfile = BytesIO() 371 + while True: 372 + s = tgzstream.read(16384) 373 + if not s: break 374 + tmpfile.write(s) 375 + tgzstream.close() 376 + tmpfile.seek(0) 377 + tgz = tarfile.open(fileobj=tmpfile, mode='r:gz') 378 + # bash$ ls BL 379 + # COPYRIGHT education isp recreation updatesites 380 + # adv finance jobsearch redirector urlshortener 381 + # aggressive fortunetelling library religion violence 382 + # alcohol forum military remotecontrol warez 383 + # anonvpn gamble models ringtones weapons 384 + # automobile global_usage movies science webmail 385 + # chat government music searchengines webphone 386 + # costtraps hacking news sex webradio 387 + # dating hobby podcasts shopping 
webtv 388 + # downloads homestyle politics socialnet 389 + # drugs hospitals porn spyware 390 + # dynamic imagehosting radiotv tracker 391 + for member in [ 'downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware', 'updatesites', 'urlshortener', 'violence', 'warez', 'weapons' ]: 392 + self.blacklist_domains |= set(tgz.extractfile('BL/{}/domains'.format(member)).read().decode('utf-8').splitlines()) 393 + self.blacklist_urls |= set(tgz.extractfile('BL/{}/urls'.format(member)).read().decode('utf-8').splitlines()) 394 + tgz.close() 395 + tmpfile.close() 396 + 397 + def get_easylist_blacklist(self): 398 + # Malware lists from open source AdBlock and spam404.com lists 399 + malwaredomains_full = 'https://easylist-downloads.adblockplus.org/malwaredomains_full.txt' 400 + spam404_com_adblock_list = 'https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt' 401 + spam404_com_main_blacklist = 'https://raw.githubusercontent.com/Dawsey21/Lists/master/main-blacklist.txt' # not EasyList format 402 + download_list = list(set([malwaredomains_full, spam404_com_adblock_list, spam404_com_main_blacklist])) 403 + download_parse = { malwaredomains_full: True, spam404_com_adblock_list: True, spam404_com_main_blacklist: False } 404 + 405 + for url in download_list: 406 + resp = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': self.user_agent})) 407 + for line in resp: 408 + line = line.decode('utf-8').rstrip() 409 + if download_parse[url]: self.parse_and_filter_rule_urls(line) 410 + else: self.blacklist_domains |= set([line]) 383 411 384 412 def get_random_words(self): 385 413 try: skipped 174 lines 560 588 self.start_time = time.time() 561 589 self.data_usage = 0 562 590 self.decimate_links(total_frac=0.49, decimate_frac=0.333) 591 + self.get_blacklist() # reload the latest blacklists 563 592 564 593 def decimate_links(self, total_frac=0.81, decimate_frac=0.1, log_sampling=False): 565 594 """ Delete `decimate_frac` of links if the total exceeds 
`total_frac` of the maximum allowed. """ skipped 362 lines 928 957 self.quit_driver(pid=pid) 929 958 self.open_driver() 930 959 else: # throw in the towel and exit if no viable phantomjs process after multiple attempts 931 - sys.exit() 960 + print('No viable phantomjs process after multiple attempts!') 961 + sys.exit(1) 932 962 return (pid, rss_mb) 963 + 964 + def parse_and_filter_rule_urls(self,line): 965 + """Convert EasyList domain anchor rule to domain or url.""" 966 + line = line.rstrip() 967 + # filter out configuration, comment, exception lines, domain-specific, and selector rules 968 + if re_test(configuration_re, line) or re_test(comment_re, line) or re_test(exception_re, line) or re_test( 969 + domain_option_re, line) or re_test(selector_re, line): return 970 + if re_test(option_re, line): 971 + line = option_re.sub('\\1', line) # delete all the options and continue 972 + # ignore these cases 973 + # blank url case: ignore 974 + if re_test(httpempty_re, line): return 975 + # blank line case: ignore 976 + if not bool(line): return 977 + # parse all remaining rules 978 + # treat each of these cases separately 979 + # regex case: ignore 980 + if re_test(regex_re, line): return 981 + # now that regexes are handled, delete unnecessary wildcards, e.g. 
/.../* 982 + line = wildcard_begend_re.sub('\\1', line) 983 + # domain anchors, || or '|http://a.b' -> domain anchor 'a.b' for regex efficiency in JS 984 + if re_test(domain_anch_re, line) or re_test(scheme_anchor_re, line): 985 + # strip off initial || or |scheme:// 986 + if re_test(domain_anch_re, line): 987 + line = domain_anch_re.sub('\\1', line) 988 + elif re_test(scheme_anchor_re, line): 989 + line = scheme_anchor_re.sub("", line) 990 + # host subcase 991 + if re_test(da_hostonly_re, line): 992 + line = da_hostonly_re.sub('\\1', line) 993 + if not re_test(wild_anch_sep_exc_re, line): # exact subsubcase 994 + if wildcard_ignore_test(line): return 995 + self.blacklist_domains |= set([line]) 996 + return line 997 + else: 998 + return # regex subsubcase 999 + # hostpath subcase 1000 + if re_test(da_hostpath_re, line): 1001 + line = da_hostpath_re.sub('\\1', line) 1002 + if not re_test(wild_sep_exc_noanch_re, line) and re_test(pathend_re, line): # exact subsubcase 1003 + line = re.sub(r'[/|]$', '', line) # strip EOL slashes and anchors 1004 + if wildcard_ignore_test(line): return 1005 + self.blacklist_urls |= set([line]) 1006 + return line 1007 + else: 1008 + return # regex subsubcase 1009 + # hostpathquery default case 1010 + if wildcard_ignore_test(line): return 1011 + self.blacklist_urls |= set([line]) 1012 + return line 1013 + # all other non-regex patterns in the path parts: ignore 1014 + return 1015 + 1016 + 1017 + # EasyList regular expressions 1018 + # See https://github.com/essandess/easylist-pac-privoxy 1019 + comment_re = re.compile(r'^\s*?!') # ! 
comment 1020 + configuration_re = re.compile(r'^\s*?\[[^]]*?\]') # [Adblock Plus 2.0] 1021 + easylist_opts = r'~?\b(?:third\-party|domain|script|image|stylesheet|object(?!-subrequest)|object\-subrequest|xmlhttprequest|subdocument|ping|websocket|webrtc|document|elemhide|generichide|genericblock|other|sitekey|match-case|collapse|donottrack|popup|media|font)\b' 1022 + option_re = re.compile(r'^(.*?)\$(' + easylist_opts + r'.*?)$') 1023 + # regexes used to exclude options for specific cases 1024 + domain_option_re = re.compile(r'\$.*?(?:domain=)') # discards rules specific to links from specific domains 1025 + selector_re = re.compile(r'^(.*?)#\@?#*?.*?$') # #@##div [should be #+?, but old style still used] 1026 + regex_re = re.compile(r'^\@{0,2}\/(.*?)\/$') 1027 + wildcard_begend_re = re.compile(r'^(?:\**?([^*]*?)\*+?|\*+?([^*]*?)\**?)$') 1028 + wild_anch_sep_exc_re = re.compile(r'[*|^@]') 1029 + wild_sep_exc_noanch_re = re.compile(r'(?:[*^@]|\|[\s\S])') 1030 + exception_re = re.compile(r'^@@(.*?)$') 1031 + httpempty_re = re.compile(r'^\|?https?://$') 1032 + pathend_re = re.compile(r'(?i)(?:[/|]$|\.(?:jsp?|php|xml|jpe?g|png|p?gif|img|swf|flv|[sp]?html?|f?cgi|pl?|aspx|ashx|css|jsonp?|asp|search|cfm|ico|act|act(?:ion)?|spy|do|stm|cms|txt|imu|dll|io|smjs|xhr|ount|bin|py|dyn|gne|mvc|lv|nap|jam|nhn))',re.IGNORECASE) 1033 + 1034 + domain_anch_re = re.compile(r'^\|\|(.+?)$') 1035 + # omit scheme from start of rule -- this will also be done in JS for efficiency 1036 + scheme_anchor_re = re.compile(r'^(\|?(?:[\w*+-]{1,15})?://)'); # e.g. 
'|http://' at start 1037 + 1038 + # (Almost) fully-qualified domain name extraction (with EasyList wildcards) 1039 + # Example case: banner.3ddownloads.com^ 1040 + da_hostonly_re = re.compile(r'^((?:[\w*-]+\.)+[a-zA-Z0-9*-]{1,24}\.?)(?:$|[/^?])$') 1041 + da_hostpath_re = re.compile(r'^((?:[\w*-]+\.)+[a-zA-Z0-9*-]{1,24}\.?[\w~%./^*-]+?)\??$') 1042 + 1043 + def re_test(regex,string): 1044 + if isinstance(regex,str): regex = re.compile(regex) 1045 + return bool(regex.search(string)) 1046 + 1047 + def wildcard_ignore_test(rule): 1048 + return bool(wild_anch_sep_exc_re.search(rule)) 933 1049 934 1050 if __name__ == "__main__": 935 1051 ISPDataPollution() skipped 1 lines