STRLCPY/isp-data-pollution

Version 1.3: Add EasyList blacklists; minor mods
Steven Thomas Smith committed 7 years ago

550aea7c

1 parent 7f29c114

Total 1 files

■ ■ ■ ■ ■ ■

isp_data_pollution.py

		skipped 18 lines
19	19		# You should have received a copy of the GNU General Public License
20	20		# along with this program. If not, see <http://www.gnu.org/licenses/>.
21	21
22		-	__version__ = '1.2'
	22	+	__version__ = '1.3'
23	23
24	24		import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn
25	25		import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
		skipped 318 lines
344	344		self.blacklist_urls = set()
345	345		try:
346	346		if self.blacklist: # download the blacklist or not
347		-	if self.verbose: print('Downloading the blacklist… ',end='',flush=True)
	347	+	if self.verbose: print('Downloading the blacklists… ',end='',flush=True)
348	348		else:
349	349		raise Exception('Skip downloading the blacklist.')
350		-	# http://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed
351		-	tgzstream = urllib.request.urlopen(urllib.request.Request(self.blacklist_url, headers={'User-Agent': self.user_agent}))
352		-	tmpfile = BytesIO()
353		-	while True:
354		-	s = tgzstream.read(16384)
355		-	if not s: break
356		-	tmpfile.write(s)
357		-	tgzstream.close()
358		-	tmpfile.seek(0)
359		-	tgz = tarfile.open(fileobj=tmpfile, mode='r:gz')
360		-	# bash$ ls BL
361		-	# COPYRIGHT education isp recreation updatesites
362		-	# adv finance jobsearch redirector urlshortener
363		-	# aggressive fortunetelling library religion violence
364		-	# alcohol forum military remotecontrol warez
365		-	# anonvpn gamble models ringtones weapons
366		-	# automobile global_usage movies science webmail
367		-	# chat government music searchengines webphone
368		-	# costtraps hacking news sex webradio
369		-	# dating hobby podcasts shopping webtv
370		-	# downloads homestyle politics socialnet
371		-	# drugs hospitals porn spyware
372		-	# dynamic imagehosting radiotv tracker
373		-	for member in [ 'downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware', 'updatesites', 'urlshortener', 'violence', 'warez', 'weapons' ]:
374		-	self.blacklist_domains \|= set(tgz.extractfile('BL/{}/domains'.format(member)).read().decode('utf-8').splitlines())
375		-	self.blacklist_urls \|= set(tgz.extractfile('BL/{}/urls'.format(member)).read().decode('utf-8').splitlines())
376		-	tgz.close()
377		-	tmpfile.close()
378		-	if self.verbose: print('done.',flush=True)
	350	+	self.get_shalla_blacklist()
	351	+	if self.verbose: print('Shallalist done… ', end='', flush=True)
	352	+	self.get_easylist_blacklist()
	353	+	if self.verbose: print('EasyList done.', flush=True)
379	354		except Exception as e:
380	355		if self.verbose: print(e)
	356	+	# Make sure blacklists are not empty
	357	+	if self.blacklist:
	358	+	try:
	359	+	assert self.blacklist_domains != set() or self.blacklist_urls != set()
	360	+	except AssertionError as e:
	361	+	print(e)
	362	+	print('Empty blacklists! Exiting.')
	363	+	sys.exit(1)
381	364		# ignore problem urls
382	365		self.blacklist_urls \|= { 'about:blank' }
	366	+
	367	+	def get_shalla_blacklist(self):
	368	+	# http://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed
	369	+	tgzstream = urllib.request.urlopen(urllib.request.Request(self.blacklist_url, headers={'User-Agent': self.user_agent}))
	370	+	tmpfile = BytesIO()
	371	+	while True:
	372	+	s = tgzstream.read(16384)
	373	+	if not s: break
	374	+	tmpfile.write(s)
	375	+	tgzstream.close()
	376	+	tmpfile.seek(0)
	377	+	tgz = tarfile.open(fileobj=tmpfile, mode='r:gz')
	378	+	# bash$ ls BL
	379	+	# COPYRIGHT education isp recreation updatesites
	380	+	# adv finance jobsearch redirector urlshortener
	381	+	# aggressive fortunetelling library religion violence
	382	+	# alcohol forum military remotecontrol warez
	383	+	# anonvpn gamble models ringtones weapons
	384	+	# automobile global_usage movies science webmail
	385	+	# chat government music searchengines webphone
	386	+	# costtraps hacking news sex webradio
	387	+	# dating hobby podcasts shopping webtv
	388	+	# downloads homestyle politics socialnet
	389	+	# drugs hospitals porn spyware
	390	+	# dynamic imagehosting radiotv tracker
	391	+	for member in [ 'downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware', 'updatesites', 'urlshortener', 'violence', 'warez', 'weapons' ]:
	392	+	self.blacklist_domains \|= set(tgz.extractfile('BL/{}/domains'.format(member)).read().decode('utf-8').splitlines())
	393	+	self.blacklist_urls \|= set(tgz.extractfile('BL/{}/urls'.format(member)).read().decode('utf-8').splitlines())
	394	+	tgz.close()
	395	+	tmpfile.close()
	396	+
	397	+	def get_easylist_blacklist(self):
	398	+	# Malware lists from open source AdBlock and spam404.com lists
	399	+	malwaredomains_full = 'https://easylist-downloads.adblockplus.org/malwaredomains_full.txt'
	400	+	spam404_com_adblock_list = 'https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt'
	401	+	spam404_com_main_blacklist = 'https://raw.githubusercontent.com/Dawsey21/Lists/master/main-blacklist.txt' # not EasyList format
	402	+	download_list = list(set([malwaredomains_full, spam404_com_adblock_list, spam404_com_main_blacklist]))
	403	+	download_parse = { malwaredomains_full: True, spam404_com_adblock_list: True, spam404_com_main_blacklist: False }
	404	+
	405	+	for url in download_list:
	406	+	resp = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': self.user_agent}))
	407	+	for line in resp:
	408	+	line = line.decode('utf-8').rstrip()
	409	+	if download_parse[url]: self.parse_and_filter_rule_urls(line)
	410	+	else: self.blacklist_domains \|= set([line])
383	411
384	412		def get_random_words(self):
385	413		try:
		skipped 174 lines
560	588		self.start_time = time.time()
561	589		self.data_usage = 0
562	590		self.decimate_links(total_frac=0.49, decimate_frac=0.333)
	591	+	self.get_blacklist() # reload the latest blacklists
563	592
564	593		def decimate_links(self, total_frac=0.81, decimate_frac=0.1, log_sampling=False):
565	594		""" Delete `decimate_frac` of links if the total exceeds `total_frac` of the maximum allowed. """
		skipped 362 lines
928	957		self.quit_driver(pid=pid)
929	958		self.open_driver()
930	959		else: # throw in the towel and exit if no viable phantomjs process after multiple attempts
931		-	sys.exit()
	960	+	print('No viable phantomjs process after multiple attempts!')
	961	+	sys.exit(1)
932	962		return (pid, rss_mb)
	963	+
	964	+	def parse_and_filter_rule_urls(self,line):
	965	+	"""Convert EasyList domain anchor rule to domain or url."""
	966	+	line = line.rstrip()
	967	+	# filter out configuration, comment, exception lines, domain-specific, and selector rules
	968	+	if re_test(configuration_re, line) or re_test(comment_re, line) or re_test(exception_re, line) or re_test(
	969	+	domain_option_re, line) or re_test(selector_re, line): return
	970	+	if re_test(option_re, line):
	971	+	line = option_re.sub('\\1', line) # delete all the options and continue
	972	+	# ignore these cases
	973	+	# blank url case: ignore
	974	+	if re_test(httpempty_re, line): return
	975	+	# blank line case: ignore
	976	+	if not bool(line): return
	977	+	# parse all remaining rules
	978	+	# treat each of the these cases separately
	979	+	# regex case: ignore
	980	+	if re_test(regex_re, line): return
	981	+	# now that regex's are handled, delete unnecessary wildcards, e.g. /.../*
	982	+	line = wildcard_begend_re.sub('\\1', line)
	983	+	# domain anchors, \|\| or '\|http://a.b' -> domain anchor 'a.b' for regex efficiency in JS
	984	+	if re_test(domain_anch_re, line) or re_test(scheme_anchor_re, line):
	985	+	# strip off initial \|\| or \|scheme://
	986	+	if re_test(domain_anch_re, line):
	987	+	line = domain_anch_re.sub('\\1', line)
	988	+	elif re_test(scheme_anchor_re, line):
	989	+	line = scheme_anchor_re.sub("", line)
	990	+	# host subcase
	991	+	if re_test(da_hostonly_re, line):
	992	+	line = da_hostonly_re.sub('\\1', line)
	993	+	if not re_test(wild_anch_sep_exc_re, line): # exact subsubcase
	994	+	if wildcard_ignore_test(line): return
	995	+	self.blacklist_domains \|= set([line])
	996	+	return line
	997	+	else:
	998	+	return # regex subsubcase
	999	+	# hostpath subcase
	1000	+	if re_test(da_hostpath_re, line):
	1001	+	line = da_hostpath_re.sub('\\1', line)
	1002	+	if not re_test(wild_sep_exc_noanch_re, line) and re_test(pathend_re, line): # exact subsubcase
	1003	+	line = re.sub(r'[/\|]$', '', line) # strip EOL slashes and anchors
	1004	+	if wildcard_ignore_test(line): return
	1005	+	self.blacklist_urls \|= set([line])
	1006	+	return line
	1007	+	else:
	1008	+	return # regex subsubcase
	1009	+	# hostpathquery default case
	1010	+	if wildcard_ignore_test(line): return
	1011	+	self.blacklist_urls \|= set([line])
	1012	+	return line
	1013	+	# all other non-regex patterns in for the path parts: ignore
	1014	+	return
	1015	+
	1016	+
	1017	+	# EasyList regular expressions
	1018	+	# See https://github.com/essandess/easylist-pac-privoxy
	1019	+	comment_re = re.compile(r'^\s*?!') # ! comment
	1020	+	configuration_re = re.compile(r'^\s?\[[^]]?\]') # [Adblock Plus 2.0]
	1021	+	easylist_opts = r'~?\b(?:third\-party\|domain\|script\|image\|stylesheet\|object(?!-subrequest)\|object\-subrequest\|xmlhttprequest\|subdocument\|ping\|websocket\|webrtc\|document\|elemhide\|generichide\|genericblock\|other\|sitekey\|match-case\|collapse\|donottrack\|popup\|media\|font)\b'
	1022	+	option_re = re.compile(r'^(.?)\$(' + easylist_opts + r'.?)$')
	1023	+	# regex's used to exclude options for specific cases
	1024	+	domain_option_re = re.compile(r'\$.*?(?:domain=)') # discards rules specific to links from specific domains
	1025	+	selector_re = re.compile(r'^(.?)#\@?#?.*?$') # #@##div [should be #+?, but old style still used]
	1026	+	regex_re = re.compile(r'^\@{0,2}\/(.*?)\/$')
	1027	+	wildcard_begend_re = re.compile(r'^(?:\*?([^]?)\+?\|\+?([^]?)\*?)$')
	1028	+	wild_anch_sep_exc_re = re.compile(r'[*\|^@]')
	1029	+	wild_sep_exc_noanch_re = re.compile(r'(?:[*^@]\|\\|[\s\S])')
	1030	+	exception_re = re.compile(r'^@@(.*?)$')
	1031	+	httpempty_re = re.compile(r'^\\|?https?://$')
	1032	+	pathend_re = re.compile(r'(?i)(?:[/\|]$\|\.(?:jsp?\|php\|xml\|jpe?g\|png\|p?gif\|img\|swf\|flv\|[sp]?html?\|f?cgi\|pl?\|aspx\|ashx\|css\|jsonp?\|asp\|search\|cfm\|ico\|act\|act(?:ion)?\|spy\|do\|stm\|cms\|txt\|imu\|dll\|io\|smjs\|xhr\|ount\|bin\|py\|dyn\|gne\|mvc\|lv\|nap\|jam\|nhn))',re.IGNORECASE)
	1033	+
	1034	+	domain_anch_re = re.compile(r'^\\|\\|(.+?)$')
	1035	+	# omit scheme from start of rule -- this will also be done in JS for efficiency
	1036	+	scheme_anchor_re = re.compile(r'^(\\|?(?:[\w*+-]{1,15})?://)'); # e.g. '\|http://' at start
	1037	+
	1038	+	# (Almost) fully-qualified domain name extraction (with EasyList wildcards)
	1039	+	# Example case: banner.3ddownloads.com^
	1040	+	da_hostonly_re = re.compile(r'^((?:[\w-]+\.)+[a-zA-Z0-9-]{1,24}\.?)(?:$\|[/^?])$')
	1041	+	da_hostpath_re = re.compile(r'^((?:[\w-]+\.)+[a-zA-Z0-9-]{1,24}\.?[\w~%./^*-]+?)\??$')
	1042	+
	1043	+	def re_test(regex,string):
	1044	+	if isinstance(regex,str): regex = re.compile(regex)
	1045	+	return bool(regex.search(string))
	1046	+
	1047	+	def wildcard_ignore_test(rule):
	1048	+	return bool(wild_anch_sep_exc_re.search(rule))
933	1049
934	1050		if __name__ == "__main__":
935	1051		ISPDataPollution()
		skipped 1 lines

Version 1.3: Add EasyList blacklists; minor mods