maigret: commit d23d24ee
  • maigret/maigret.py
    skipped 18 lines
    19 19   save_json_report
    20 20  from .sites import MaigretDatabase
    21 21  from .submit import submit_dialog
     22 +from .utils import get_dict_ascii_tree
    22 23   
    23 24  __version__ = '0.1.15'
    24 25   
    skipped 193 lines
    218 219          print("Using the proxy: " + args.proxy)
    219 220   
    220 221      if args.parse_url:
    221  -         page, _ = parse(args.parse_url, cookies_str='')
    222  -         info = extract(page)
    223  -         text = 'Extracted ID data from webpage: ' + ', '.join([f'{a}: {b}' for a, b in info.items()])
    224  -         print(text)
    225  -         for k, v in info.items():
    226  -             if 'username' in k:
    227  -                 usernames[v] = 'username'
    228  -             if k in supported_recursive_search_ids:
    229  -                 usernames[v] = k
     222 +         # url, headers
     223 +         reqs = [(args.parse_url, set())]
     224 +         try:
     225 +             # temporary workaround for URL mutations MVP
     226 +             from socid_extractor import mutate_url
     227 +             reqs += list(mutate_url(args.parse_url))
     228 +         except:
     229 +             pass
     230 + 
     231 +         for req in reqs:
     232 +             url, headers = req
     233 +             print(f'Scanning webpage by URL {url}...')
     234 +             page, _ = parse(url, cookies_str='', headers=headers)
     235 +             info = extract(page)
     236 +             if not info:
     237 +                 print('Nothing extracted')
     238 +             else:
     239 +                 print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
     240 +             for k, v in info.items():
     241 +                 if 'username' in k:
     242 +                     usernames[v] = 'username'
     243 +                 if k in supported_recursive_search_ids:
     244 +                     usernames[v] = k
    230 245   
    231 246      if args.tags:
    232 247          args.tags = list(set(str(args.tags).split(',')))
    skipped 184 lines
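
The changed args.parse_url handling above builds a list of (url, headers) requests, optionally extended by socid_extractor's URL mutations, and feeds every extracted identifier into the pool of usernames for recursive search. Below is a minimal standalone sketch of that flow; it assumes parse(), extract() and mutate_url() behave as they are called in the diff, and collect_usernames() is an illustrative helper, not code from this commit.

    # Illustrative sketch only; mirrors the diff above.
    from socid_extractor import extract, mutate_url, parse

    def collect_usernames(start_url, supported_recursive_search_ids):
        usernames = {}
        # each request is a (url, headers) pair; mutate_url() may yield extra candidates
        reqs = [(start_url, set())]
        try:
            reqs += list(mutate_url(start_url))
        except Exception:
            pass  # mutations are optional; fall back to the original URL only

        for url, headers in reqs:
            page, _ = parse(url, cookies_str='', headers=headers)
            info = extract(page)
            if not info:
                continue
            for k, v in info.items():
                if 'username' in k:
                    usernames[v] = 'username'
                if k in supported_recursive_search_ids:
                    usernames[v] = k
        return usernames
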
  • maigret/notify.py
    skipped 7 lines
    8 8  from colorama import Fore, Style, init
    9 9   
    10 10  from .result import QueryStatus
     11 +from .utils import get_dict_ascii_tree
    11 12   
    12 13   
    13 14  class QueryNotify():
    skipped 162 lines
    176 177          else:
    177 178              print(msg)
    178 179   
    179  -     def get_additional_data_text(self, items, prepend=''):
    180  -         text = ''
    181  -         for num, item in enumerate(items):
    182  -             box_symbol = '┣╸' if num != len(items) - 1 else '┗╸'
    183  - 
    184  -             if type(item) == tuple:
    185  -                 field_name, field_value = item
    186  -                 if field_value.startswith('[\''):
    187  -                     is_last_item = num == len(items) - 1
    188  -                     prepend_symbols = ' ' * 3 if is_last_item else ' ┃ '
    189  -                     field_value = self.get_additional_data_text(eval(field_value), prepend_symbols)
    190  -                 text += f'\n{prepend}{box_symbol}{field_name}: {field_value}'
    191  -             else:
    192  -                 text += f'\n{prepend}{box_symbol} {item}'
    193  - 
    194  -         return text
    195 180   
    196 181      def update(self, result, is_similar=False):
    197 182          """Notify Update.
    skipped 13 lines
    211 196          if not self.result.ids_data:
    212 197              ids_data_text = ""
    213 198          else:
    214  -             ids_data_text = self.get_additional_data_text(self.result.ids_data.items(), ' ')
     199 +             ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), ' ')
    215 200   
    216 201          def make_colored_terminal_notify(status, text, status_color, text_color, appendix):
    217 202              text = [
    skipped 76 lines
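
With the helper moved into maigret/utils.py, the printer only calls the shared function. A short usage sketch with sample values taken from the new test data (the surrounding QueryNotifyPrint class is omitted):

    from maigret.utils import get_dict_ascii_tree

    ids_data = {
        'instagram_username': 'Street.Reality.Photography',
        'twitter_username': 'Alexaimephotogr',
    }
    # second argument is the prepend string; every entry but the last is drawn
    # with '┣╸', the final one with '┗╸', each on its own line
    ids_data_text = get_dict_ascii_tree(ids_data.items(), ' ')
    print(ids_data_text)
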
  • maigret/resources/data.json
    skipped 22760 lines
    22761 22761     },
    22762 22762     "codeforces.com": {
    22763 22763         "tags": [
    22764  -               "in"
     22764 +               "coding"
    22765 22765         ],
     22766 +             "errors": {
     22767 +                 "The page is temporarily blocked by administrator.": "IP ban"
     22768 +             },
    22766 22769         "engine": "engineRedirect",
    22767 22770         "alexaRank": 8156,
    22768 22771         "url": "http://codeforces.com/profile/{username}",
    skipped 871 lines
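
The codeforces.com entry also gains an "errors" mapping from a page marker to a human-readable cause. The checker that consumes this mapping is not shown in this commit; the sketch below is only a guess at how such a lookup could work, and check_page_for_errors() is a hypothetical name.

    def check_page_for_errors(page_text, site_errors):
        """Return the first matching error label (e.g. 'IP ban'), or None."""
        for marker, label in site_errors.items():
            if marker in page_text:
                return label
        return None

    site_errors = {'The page is temporarily blocked by administrator.': 'IP ban'}
    html = '<html>The page is temporarily blocked by administrator.</html>'
    print(check_page_for_errors(html, site_errors))  # -> IP ban
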
  • maigret/utils.py
    skipped 55 lines
    56 56   
    57 57          return re.compile(regexp_str)
    58 58   
     59 + 
     60 +def get_dict_ascii_tree(items, prepend='', new_line=True):
     61 +    text = ''
     62 +    for num, item in enumerate(items):
     63 +        box_symbol = '┣╸' if num != len(items) - 1 else '┗╸'
     64 + 
     65 +        if type(item) == tuple:
     66 +            field_name, field_value = item
     67 +            if field_value.startswith('[\''):
     68 +                is_last_item = num == len(items) - 1
     69 +                prepend_symbols = ' ' * 3 if is_last_item else ' ┃ '
     70 +                field_value = get_dict_ascii_tree(eval(field_value), prepend_symbols)
     71 +            text += f'\n{prepend}{box_symbol}{field_name}: {field_value}'
     72 +        else:
     73 +            text += f'\n{prepend}{box_symbol} {item}'
     74 + 
     75 +    if not new_line:
     76 +        text = text[1:]
     77 + 
     78 +    return text
     79 + 
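
get_dict_ascii_tree() also handles values that are stringified lists (anything starting with "['"): they are eval()'ed and rendered as an indented sub-tree through the recursive call. A small usage sketch with illustrative data:

    from maigret.utils import get_dict_ascii_tree

    # example values are illustrative only
    data = {
        'username': 'alexaimephotographycars',
        'links': "['https://www.instagram.com/street.reality.photography/', 'https://500px.com/alexaimephotographycars']",
    }
    # the 'links' value starts with "['" , so it is eval()'ed and printed as a
    # nested branch under its key; the whole tree starts with a newline because
    # new_line defaults to True
    print(get_dict_ascii_tree(data.items()))
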
  • requirements.txt
    skipped 27 lines
    28 28  requests>=2.24.0
    29 29  requests-futures==1.0.0
    30 30  six==1.15.0
    31  -socid-extractor>=0.0.13
     31 +socid-extractor>=0.0.15
    32 32  soupsieve==2.1
    33 33  stem==1.8.0
    34 34  torrequest==0.1.0
    skipped 7 lines
  • tests/test_utils.py
    skipped 1 lines
    2 2  import itertools
    3 3  import re
    4 4   
    5  -from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher
     5 +from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher, get_dict_ascii_tree
    6 6   
    7 7   
    8 8  def test_case_convert_camel_to_snake():
    skipped 64 lines
    73 73      url = ''.join(url_parts)
    74 74      assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
    75 75   
     76 + 
     77 +def test_get_dict_ascii_tree():
     78 +    data = {'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==', 'legacy_id': '26403415', 'username': 'alexaimephotographycars', 'name': 'Alex Aimé', 'created_at': '2018-05-04T10:17:01.000+0000', 'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b', 'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201', 'website': 'www.instagram.com/street.reality.photography/', 'facebook_link': ' www.instagram.com/street.reality.photography/', 'instagram_username': 'Street.Reality.Photography', 'twitter_username': 'Alexaimephotogr'}
     79 + 
     80 +    ascii_tree = get_dict_ascii_tree(data.items())
     81 + 
     82 +    assert ascii_tree == """
     83 +┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
     84 +┣╸legacy_id: 26403415
     85 +┣╸username: alexaimephotographycars
     86 +┣╸name: Alex Aimé
     87 +┣╸created_at: 2018-05-04T10:17:01.000+0000
     88 +┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
     89 +┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
     90 +┣╸website: www.instagram.com/street.reality.photography/
     91 +┣╸facebook_link: www.instagram.com/street.reality.photography/
     92 +┣╸instagram_username: Street.Reality.Photography
     93 +┗╸twitter_username: Alexaimephotogr"""
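
As a quick sanity check of the branch symbols the expected string above relies on (a hedged illustration, not part of the test file):

    from maigret.utils import get_dict_ascii_tree

    # every entry but the last is prefixed with '┣╸', the final one with '┗╸';
    # the result starts with a newline because new_line defaults to True
    tree = get_dict_ascii_tree({'a': '1', 'b': '2'}.items())
    assert tree == '\n┣╸a: 1\n┗╸b: 2'
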