Projects STRLCPY maigret Commits a9543e83
🤬
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexing completes)
  • ■ ■ ■ ■ ■
    maigret/checking.py
    skipped 96 lines
    97 97   site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
    98 98   
    99 99   
    100  -# TODO: move info separate module
     100 +# TODO: move to separate class
    101 101  def detect_error_page(html_text, status_code, fail_flags, ignore_403):
    102 102   # Detect service restrictions such as a country restriction
    103 103   for flag, msg in fail_flags.items():
    skipped 166 lines
    270 270   new_usernames[v] = k
    271 271   
    272 272   results_info['ids_usernames'] = new_usernames
     273 + results_info['ids_links'] = eval(extracted_ids_data.get('links', '[]'))
    273 274   result.ids_data = extracted_ids_data
    274 275   
    275 276   # Notify caller about results of query.
    skipped 327 lines
  • ■ ■ ■ ■ ■ ■
    maigret/maigret.py
    skipped 324 lines
    325 325   # TODO: fix no site data issue
    326 326   if not dictionary:
    327 327   continue
     328 + 
    328 329   new_usernames = dictionary.get('ids_usernames')
    329 330   if new_usernames:
    330 331   for u, utype in new_usernames.items():
    331 332   usernames[u] = utype
     333 + 
     334 + for url in dictionary.get('ids_links', []):
     335 + for s in db.sites:
     336 + u = s.detect_username(url)
     337 + if u:
     338 + usernames[u] = 'username'
    332 339   
    333 340   # reporting for a one username
    334 341   if args.xmind:
    skipped 53 lines
  • ■ ■ ■ ■ ■ ■
    maigret/sites.py
    skipped 1 lines
    2 2  """Maigret Sites Information"""
    3 3  import copy
    4 4  import json
     5 +import re
    5 6  import sys
    6 7   
    7 8  import requests
    8 9   
    9  -from .utils import CaseConverter
     10 +from .utils import CaseConverter, URLMatcher
    10 11   
    11 12   
    12 13  class MaigretEngine:
    skipped 8 lines
    21 22   
    22 23   
    23 24  class MaigretSite:
    # Attribute names excluded from the serialized form produced by the
    # `json` property: runtime-only objects (futures, engine objects,
    # compiled regexps, stats) plus `name`, which is stored as the dict key.
    NOT_SERIALIZABLE_FIELDS = [
        'name',
        'engineData',
        'requestFuture',
        'detectedEngine',
        'engineObj',
        'stats',
        'urlRegexp',
    ]
     34 + 
    24 35   def __init__(self, name, information):
    25 36   self.name = name
    26 37   
    skipped 30 lines
    57 68   # We do not know the popularity, so make site go to bottom of list.
    58 69   self.alexa_rank = sys.maxsize
    59 70   
     71 + self.update_detectors()
    60 72   
    def __str__(self):
        """Human-readable form: site name followed by its main URL in parentheses."""
        return '{} ({})'.format(self.name, self.url_main)
    63 75   
    def update_detectors(self):
        """Recompile the profile-URL regexp from the site's URL template.

        Does nothing when the site has no `url` attribute yet.
        """
        if 'url' not in self.__dict__:
            return

        url_template = self.url
        # Expand {urlMain}/{urlSubpath} placeholders with the corresponding
        # snake_case attribute values before building the regexp.
        for placeholder in ('urlMain', 'urlSubpath'):
            if placeholder in url_template:
                attr_name = CaseConverter.camel_to_snake(placeholder)
                url_template = url_template.replace('{' + placeholder + '}', self.__dict__[attr_name])

        self.url_regexp = URLMatcher.make_profile_url_regexp(url_template, self.regex_check)
     84 + 
    def detect_username(self, url: str) -> str:
        """Extract a username from a profile URL via the site's URL regexp.

        Returns the captured username with any trailing slash stripped,
        or None when no regexp is compiled or the URL does not match.
        """
        # Fix: removed a stray, unused `import logging` that was left inside
        # the function body.
        if self.url_regexp:
            match_groups = self.url_regexp.match(url)
            if match_groups:
                # The username placeholder is the last capturing group.
                return match_groups.groups()[-1].rstrip('/')

        return None
     93 + 
    64 94   @property
    65 95   def json(self):
    66 96   result = {}
    skipped 3 lines
    70 100   # strip empty elements
    71 101   if v in (False, '', [], {}, None, sys.maxsize, 'username'):
    72 102   continue
    73  - if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj', 'stats']:
     103 + if field in self.NOT_SERIALIZABLE_FIELDS:
    74 104   continue
    75 105   result[field] = v
    76 106   
    skipped 1 lines
    78 108   
    def update(self, updates: dict) -> "MaigretSite":
        """Merge `updates` into the site's attributes in place.

        Rebuilds the profile-URL detector afterwards, since the update may
        have changed URL-related attributes. Returns self for chaining.
        """
        self.__dict__.update(updates)
        self.update_detectors()

        return self
    83 114   
    skipped 11 lines
    95 126   self.__dict__[field] = v
    96 127   
    97 128   self.engine_obj = engine
     129 + self.update_detectors()
    98 130   
    99 131   return self
    100 132   
    skipped 2 lines
    103 135   return self
    104 136   
    105 137   self.request_future = None
     138 + self.url_regexp = None
     139 + 
    106 140   self_copy = copy.deepcopy(self)
    107 141   engine_data = self_copy.engine_obj.site
    108 142   site_data_keys = list(self_copy.__dict__.keys())
    skipped 181 lines
  • ■ ■ ■ ■ ■ ■
    maigret/utils.py
    1 1  import re
     2 +import sys
    2 3   
    3 4   
    4 5  class CaseConverter:
    skipped 24 lines
    29 30   if link.startswith('www.') or (link.startswith('http') and '//' in link):
    30 31   return f'<a class="auto-link" href="{link}">{link}</a>'
    31 32   return link
     33 + 
     34 + 
     35 +class URLMatcher:
     36 + _HTTP_URL_RE_STR = '^https?://(www.)?(.+)$'
     37 + HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
     38 + UNSAFE_SYMBOLS = '.?'
     39 + 
     40 + @classmethod
     41 + def extract_main_part(self, url: str) -> str:
     42 + match = self.HTTP_URL_RE.search(url)
     43 + if match and match.group(2):
     44 + return match.group(2).rstrip('/')
     45 + 
     46 + return ''
     47 + 
     48 + @classmethod
     49 + def make_profile_url_regexp(self, url: str, username_regexp: str = ''):
     50 + url_main_part = self.extract_main_part(url)
     51 + for c in self.UNSAFE_SYMBOLS:
     52 + url_main_part = url_main_part.replace(c, f'\\{c}')
     53 + username_regexp = username_regexp or '.+?'
     54 + 
     55 + url_regexp = url_main_part.replace('{username}', f'({username_regexp})')
     56 + regexp_str = self._HTTP_URL_RE_STR.replace('(.+)', url_regexp)
     57 + 
     58 + return re.compile(regexp_str)
  • ■ ■ ■ ■ ■ ■
    tests/test_sites.py
    skipped 112 lines
    113 113   assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
    114 114   
    115 115   
     116 +def test_site_url_detector():
     117 + db = MaigretDatabase()
     118 + db.load_from_json(EXAMPLE_DB)
     119 + 
     120 + assert db.sites[0].url_regexp.pattern == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
     121 + assert db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test') == 'test'
     122 + 
     123 + 
    116 124  def test_ranked_sites_dict():
    117 125   db = MaigretDatabase()
    118 126   db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}))
    skipped 53 lines
  • ■ ■ ■ ■ ■ ■
    tests/test_utils.py
    1 1  """Maigret utils test functions"""
    2  -from maigret.utils import CaseConverter, is_country_tag, enrich_link_str
     2 +import itertools
     3 +import re
     4 +from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher
    3 5   
    4 6   
    5 7  def test_case_convert_camel_to_snake():
    skipped 27 lines
    33 35   assert enrich_link_str('test') == 'test'
    34 36   assert enrich_link_str(' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
    35 37   
     38 +def test_url_extract_main_part():
     39 + url_main_part = 'flickr.com/photos/alexaimephotography'
     40 + 
     41 + parts = [
     42 + ['http://', 'https://'],
     43 + ['www.', ''],
     44 + [url_main_part],
     45 + ['/', ''],
     46 + ]
     47 + 
     48 + url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
     49 + for url_parts in itertools.product(*parts):
     50 + url = ''.join(url_parts)
     51 + assert URLMatcher.extract_main_part(url) == url_main_part
     52 + assert not url_regexp.match(url) is None
     53 + 
     54 +def test_url_make_profile_url_regexp():
     55 + url_main_part = 'flickr.com/photos/{username}'
     56 + 
     57 + parts = [
     58 + ['http://', 'https://'],
     59 + ['www.', ''],
     60 + [url_main_part],
     61 + ['/', ''],
     62 + ]
     63 + 
     64 + for url_parts in itertools.product(*parts):
     65 + url = ''.join(url_parts)
     66 + assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
     67 + 
Please wait...
Page is in error, reload to recover