STRLCPY/maigret

■ ■ ■ ■ ■ ■

Makefile

		skipped 5 lines
6	6		coverage html
7	7
8	8		rerun-tests:
9		-	pytest --lf
	9	+	pytest --lf -vv
10	10
11	11		lint:
12	12		@echo 'syntax errors or undefined names'
		skipped 24 lines

■ ■ ■ ■ ■ ■

example.ipynb

1	+	{
2	+	"cells": [
3	+	{
4	+	"cell_type": "code",
5	+	"execution_count": null,
6	+	"metadata": {
7	+	"id": "8v6PEfyXb0Gx"
8	+	},
9	+	"outputs": [],
10	+	"source": [
11	+	"# clone the repo\n",
12	+	"!git clone https://github.com/soxoj/maigret\n",
13	+	"!pip3 install -r maigret/requirements.txt"
14	+	]
15	+	},
16	+	{
17	+	"cell_type": "code",
18	+	"execution_count": null,
19	+	"metadata": {
20	+	"id": "cXOQUAhDchkl"
21	+	},
22	+	"outputs": [],
23	+	"source": [
24	+	"# help\n",
25	+	"!python3 maigret/maigret.py --help"
26	+	]
27	+	},
28	+	{
29	+	"cell_type": "code",
30	+	"execution_count": null,
31	+	"metadata": {
32	+	"id": "SjDmpN4QGnJu"
33	+	},
34	+	"outputs": [],
35	+	"source": [
36	+	"# search\n",
37	+	"!python3 maigret/maigret.py user"
38	+	]
39	+	}
40	+	],
41	+	"metadata": {
42	+	"colab": {
43	+	"collapsed_sections": [],
44	+	"include_colab_link": true,
45	+	"name": "maigret.ipynb",
46	+	"provenance": []
47	+	},
48	+	"kernelspec": {
49	+	"display_name": "Python 3",
50	+	"language": "python",
51	+	"name": "python3"
52	+	},
53	+	"language_info": {
54	+	"codemirror_mode": {
55	+	"name": "ipython",
56	+	"version": 3
57	+	},
58	+	"file_extension": ".py",
59	+	"mimetype": "text/x-python",
60	+	"name": "python",
61	+	"nbconvert_exporter": "python",
62	+	"pygments_lexer": "ipython3",
63	+	"version": "3.7.10"
64	+	}
65	+	},
66	+	"nbformat": 4,
67	+	"nbformat_minor": 1
68	+	}
69	+

■ ■ ■ ■ ■ ■

maigret/checking.py

		skipped 35 lines
36	36
37	37
38	38		SUPPORTED_IDS = (
	39	+	"username",
39	40		"yandex_public_id",
40	41		"gaia_id",
41	42		"vk_id",
		skipped 840 lines

■ ■ ■ ■ ■ ■

maigret/maigret.py

		skipped 33 lines
34	34		save_json_report,
35	35		get_plaintext_report,
36	36		sort_report_by_data_points,
	37	+	save_graph_report,
37	38		)
38	39		from .sites import MaigretDatabase
39	40		from .submit import Submitter
		skipped 20 lines
60	61		query_notify.warning(
61	62		'You can see detailed site check errors with a flag `--print-errors`'
62	63		)
63		-
64		-
65		-	def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
66		-	results = {}
67		-	for s in db.sites:
68		-	result = s.extract_id_from_url(url)
69		-	if not result:
70		-	continue
71		-	_id, _type = result
72		-	results[_id] = _type
73		-	return results
74	64
75	65
76	66		def extract_ids_from_page(url, logger, timeout=5) -> dict:
		skipped 41 lines
118	108		ids_results[u] = utype
119	109
120	110		for url in dictionary.get('ids_links', []):
121		-	ids_results.update(extract_ids_from_url(url, db))
	111	+	ids_results.update(db.extract_ids_from_url(url))
122	112
123	113		return ids_results
124	114
		skipped 307 lines
432	422		help="Generate a PDF report (general report on all usernames).",
433	423		)
434	424		report_group.add_argument(
	425	+	"-G",
	426	+	"--graph",
	427	+	action="store_true",
	428	+	dest="graph",
	429	+	default=False,
	430	+	help="Generate a graph report (general report on all usernames).",
	431	+	)
	432	+	report_group.add_argument(
435	433		"-J",
436	434		"--json",
437	435		action="store",
		skipped 254 lines
692	690		filename = report_filepath_tpl.format(username=username, postfix='.pdf')
693	691		save_pdf_report(filename, report_context)
694	692		query_notify.warning(f'PDF report on all usernames saved in {filename}')
	693	+
	694	+	if args.graph:
	695	+	filename = report_filepath_tpl.format(username=username, postfix='.html')
	696	+	save_graph_report(filename, general_results, db)
	697	+	query_notify.warning(f'Graph report on all usernames saved in {filename}')
695	698
696	699		text_report = get_plaintext_report(report_context)
697	700		if text_report:
		skipped 19 lines

■ ■ ■ ■ ■ ■

maigret/report.py

	1	+	import ast
1	2		import csv
2	3		import io
3	4		import json
		skipped 7 lines
11	12		from dateutil.parser import parse as parse_datetime_str
12	13		from jinja2 import Template
13	14		from xhtml2pdf import pisa
	15	+	from pyvis.network import Network
	16	+	import networkx as nx
14	17
	18	+	from .checking import SUPPORTED_IDS
15	19		from .result import QueryStatus
	20	+	from .sites import MaigretDatabase
16	21		from .utils import is_country_tag, CaseConverter, enrich_link_str
17	22
18	23		SUPPORTED_JSON_REPORT_FORMATS = [
		skipped 61 lines
80	85		def save_json_report(filename: str, username: str, results: dict, report_type: str):
81	86		with open(filename, "w", encoding="utf-8") as f:
82	87		generate_json_report(username, results, f, report_type=report_type)
	88	+
	89	+
	90	+	class MaigretGraph:
	91	+	other_params = {'size': 10, 'group': 3}
	92	+	site_params = {'size': 15, 'group': 2}
	93	+	username_params = {'size': 20, 'group': 1}
	94	+
	95	+	def __init__(self, graph):
	96	+	self.G = graph
	97	+
	98	+	def add_node(self, key, value):
	99	+	node_name = f'{key}: {value}'
	100	+
	101	+	params = self.other_params
	102	+	if key in SUPPORTED_IDS:
	103	+	params = self.username_params
	104	+	elif value.startswith('http'):
	105	+	params = self.site_params
	106	+
	107	+	self.G.add_node(node_name, title=node_name, **params)
	108	+
	109	+	if value != value.lower():
	110	+	normalized_node_name = self.add_node(key, value.lower())
	111	+	self.link(node_name, normalized_node_name)
	112	+
	113	+	return node_name
	114	+
	115	+	def link(self, node1_name, node2_name):
	116	+	self.G.add_edge(node1_name, node2_name, weight=2)
	117	+
	118	+
	119	+	def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
	120	+	G = nx.Graph()
	121	+	graph = MaigretGraph(G)
	122	+
	123	+	for username, id_type, results in username_results:
	124	+	username_node_name = graph.add_node(id_type, username)
	125	+
	126	+	for website_name in results:
	127	+	dictionary = results[website_name]
	128	+	# TODO: fix no site data issue
	129	+	if not dictionary:
	130	+	continue
	131	+
	132	+	if dictionary.get("is_similar"):
	133	+	continue
	134	+
	135	+	status = dictionary.get("status")
	136	+	if not status: # FIXME: currently in case of timeout
	137	+	continue
	138	+
	139	+	if dictionary["status"].status != QueryStatus.CLAIMED:
	140	+	continue
	141	+
	142	+	site_fallback_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
	143	+	# site_node_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
	144	+	site_node_name = graph.add_node('site', site_fallback_name)
	145	+	graph.link(username_node_name, site_node_name)
	146	+
	147	+	def process_ids(parent_node, ids):
	148	+	for k, v in ids.items():
	149	+	if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
	150	+	continue
	151	+	if k in 'image':
	152	+	continue
	153	+
	154	+	v_data = v
	155	+	if v.startswith('['):
	156	+	try:
	157	+	v_data = ast.literal_eval(v)
	158	+	except Exception as e:
	159	+	logging.error(e)
	160	+
	161	+	# value is a list
	162	+	if isinstance(v_data, list):
	163	+	list_node_name = graph.add_node(k, site_fallback_name)
	164	+	for vv in v_data:
	165	+	data_node_name = graph.add_node(vv, site_fallback_name)
	166	+	graph.link(list_node_name, data_node_name)
	167	+
	168	+	add_ids = {a: b for b, a in db.extract_ids_from_url(vv).items()}
	169	+	if add_ids:
	170	+	process_ids(data_node_name, add_ids)
	171	+	else:
	172	+	# value is just a string
	173	+	# ids_data_name = f'{k}: {v}'
	174	+	# if ids_data_name == parent_node:
	175	+	# continue
	176	+
	177	+	ids_data_name = graph.add_node(k, v)
	178	+	# G.add_node(ids_data_name, size=10, title=ids_data_name, group=3)
	179	+	graph.link(parent_node, ids_data_name)
	180	+
	181	+	# check for username
	182	+	if 'username' in k or k in SUPPORTED_IDS:
	183	+	new_username_node_name = graph.add_node('username', v)
	184	+	graph.link(ids_data_name, new_username_node_name)
	185	+
	186	+	add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
	187	+	if add_ids:
	188	+	process_ids(ids_data_name, add_ids)
	189	+
	190	+	if status.ids_data:
	191	+	process_ids(site_node_name, status.ids_data)
	192	+
	193	+	nodes_to_remove = []
	194	+	for node in G.nodes:
	195	+	if len(str(node)) > 100:
	196	+	nodes_to_remove.append(node)
	197	+
	198	+	[G.remove_node(node) for node in nodes_to_remove]
	199	+
	200	+	nt = Network(notebook=True, height="750px", width="100%")
	201	+	nt.from_nx(G)
	202	+	nt.show(filename)
83	203
84	204
85	205		def get_plaintext_report(context: dict) -> str:
		skipped 314 lines

■ ■ ■ ■ ■ ■

maigret/resources/data.json

		skipped 3642 lines
3643	3643		"errors": {
3644	3644		"Invalid API key": "New API key needed"
3645	3645		},
	3646	+	"regexCheck": "^[^/]+$",
3646	3647		"urlProbe": "https://disqus.com/api/3.0/users/details?user=username%3A{username}&attach=userFlaggedUser&api_key=E8Uh5l5fHZ6gD8U3KycjAIAk46f68Zw7C6eW8WSjZvCLXebZ7p0r1yrYDrLilk2F",
3647	3648		"checkType": "status_code",
3648	3649		"presenseStrs": [
		skipped 9387 lines
13036	13037		"us"
13037	13038		],
13038	13039		"headers": {
13039		-	"authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
	13040	+	"authorization": "Bearer BQBbhm9gxBxIDmwZvO8mzV28G7V07L57WlKILvhXijRaTxwh9N03yHxSLADfioU3uWYDAjjq_mMWQSbQ2OA"
13040	13041		},
13041	13042		"errors": {
13042	13043		"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
		skipped 1420 lines
14463	14464		"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
14464	14465		"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
14465	14466		"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
14466		-	"x-guest-token": "1403829602053771266"
	14467	+	"x-guest-token": "1404906435025195008"
14467	14468		},
14468	14469		"errors": {
14469	14470		"Bad guest token": "x-guest-token update required"
		skipped 400 lines
14870	14871		"video"
14871	14872		],
14872	14873		"headers": {
14873		-	"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
	14874	+	"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM3OTYyNjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.ZUCw6WWIPIoVy1zoj8AKA1EMfX6ao7hJI2pWxgAZlac"
14874	14875		},
14875	14876		"activation": {
14876	14877		"url": "https://vimeo.com/_rv/viewer",
		skipped 13644 lines

■ ■ ■ ■ ■ ■

maigret/sites.py

		skipped 399 lines
400	400
401	401		return found_flags
402	402
	403	+
	404	+	def extract_ids_from_url(self, url: str) -> dict:
	405	+	results = {}
	406	+	for s in self._sites:
	407	+	result = s.extract_id_from_url(url)
	408	+	if not result:
	409	+	continue
	410	+	_id, _type = result
	411	+	results[_id] = _type
	412	+	return results
	413	+
	414	+
403	415		def get_db_stats(self, sites_dict):
404	416		if not sites_dict:
405	417		sites_dict = self.sites_dict()
		skipped 36 lines

■ ■ ■ ■ ■ ■

requirements.txt

		skipped 36 lines
37	37		xhtml2pdf==0.2.5
38	38		XMind==1.2.0
39	39		yarl==1.6.3
	40	+	networkx==2.5.1
	41	+	pyvis==0.1.9
40	42

■ ■ ■ ■ ■ ■

tests/test_cli.py

		skipped 12 lines
13	13		'disable_recursive_search': False,
14	14		'folderoutput': 'reports',
15	15		'html': False,
	16	+	'graph': False,
16	17		'id_type': 'username',
17	18		'ignore_ids_list': [],
18	19		'info': False,
		skipped 80 lines

■ ■ ■ ■ ■ ■

tests/test_maigret.py

		skipped 8 lines
9	9		from maigret.maigret import (
10	10		extract_ids_from_page,
11	11		extract_ids_from_results,
12		-	extract_ids_from_url,
13	12		)
14	13		from maigret.sites import MaigretSite
15	14		from maigret.result import QueryResult, QueryStatus
		skipped 128 lines
144	143
145	144
146	145		def test_extract_ids_from_url(default_db):
147		-	assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == {
	146	+	assert default_db.extract_ids_from_url('https://www.reddit.com/user/test') == {
148	147		'test': 'username'
149	148		}
150		-	assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 'vk_id'}
151		-	assert extract_ids_from_url('https://vk.com/ida123', default_db) == {
	149	+	assert default_db.extract_ids_from_url('https://vk.com/id123') == {'123': 'vk_id'}
	150	+	assert default_db.extract_ids_from_url('https://vk.com/ida123') == {
152	151		'ida123': 'username'
153	152		}
154		-	assert extract_ids_from_url(
155		-	'https://my.mail.ru/yandex.ru/dipres8904/', default_db
	153	+	assert default_db.extract_ids_from_url(
	154	+	'https://my.mail.ru/yandex.ru/dipres8904/'
156	155		) == {'dipres8904': 'username'}
157		-	assert extract_ids_from_url(
158		-	'https://reviews.yandex.ru/user/adbced123', default_db
	156	+	assert default_db.extract_ids_from_url(
	157	+	'https://reviews.yandex.ru/user/adbced123'
159	158		) == {'adbced123': 'yandex_public_id'}
160	159
161	160
		skipped 18 lines

Draft of graph report