maigret · commit 9b0acc09
  • Refactoring of submit module, some fixes

  • Soxoj committed 3 years ago
    9b0acc09
    1 parent eb721dc7
  • Makefile
    skipped 24 lines
    25 25  pull:
    26 26   git stash
    27 27   git checkout main
    28  - git pull origin head
     28 + git pull origin main
    29 29   git stash pop
    30 30   
    31 31  clean:
    skipped 5 lines
  • maigret/maigret.py
    skipped 35 lines
    36 36   sort_report_by_data_points,
    37 37  )
    38 38  from .sites import MaigretDatabase
    39  -from .submit import submit_dialog
     39 +from .submit import Submitter
    40 40  from .types import QueryResultWrapper
    41 41  from .utils import get_dict_ascii_tree
     42 +from .settings import Settings
    42 43   
    43 44   
    44 45  def notify_about_errors(search_results: QueryResultWrapper, query_notify):
    skipped 451 lines
    496 497   if args.tags:
    497 498   args.tags = list(set(str(args.tags).split(',')))
    498 499   
     500 + settings = Settings(
     501 + os.path.join(
     502 + os.path.dirname(os.path.realpath(__file__)), "resources/settings.json"
     503 + )
     504 + )
     505 + 
    499 506   if args.db_file is None:
    500 507   args.db_file = os.path.join(
    501 508   os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
    skipped 24 lines
    526 533   site_data = get_top_sites_for_id(args.id_type)
    527 534   
    528 535   if args.new_site_to_submit:
    529  - is_submitted = await submit_dialog(
    530  - db, args.new_site_to_submit, args.cookie_file, logger
    531  - )
     536 + submitter = Submitter(db=db, logger=logger, settings=settings)
     537 + is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
    532 538   if is_submitted:
    533 539   db.save_to_file(args.db_file)
    534 540   
    skipped 176 lines
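    Note: the hunks above replace the old submit_dialog(db, url, cookie_file, logger) call with the class-based API introduced in this commit. A minimal sketch of how the pieces now compose, assuming the MaigretDatabase.load_from_file loader used elsewhere in maigret; the function name and literal paths are illustrative:

        import logging

        from maigret.settings import Settings
        from maigret.sites import MaigretDatabase
        from maigret.submit import Submitter


        async def submit_new_site(url: str, db_file: str, cookie_file: str = None) -> bool:
            logger = logging.getLogger("maigret")

            # settings.json now carries strings previously hardcoded in submit.py
            settings = Settings("maigret/resources/settings.json")

            db = MaigretDatabase().load_from_file(db_file)
            submitter = Submitter(db=db, logger=logger, settings=settings)

            # dialog() drives the interactive checks; True means the user saved the site
            is_submitted = await submitter.dialog(url, cookie_file)
            if is_submitted:
                db.save_to_file(db_file)
            return is_submitted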
  • maigret/resources/data.json
    skipped 13035 lines
    13036 13036   "us"
    13037 13037   ],
    13038 13038   "headers": {
    13039  - "authorization": "Bearer BQCypIuUtz7zDFov8xN86mj1BelLf7Apf9WBaC5yYfNkmGe4r7Hz4Awp6dqPuCAP9K9F5yYtjbyZX_vlr4I"
     13039 + "authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
    13040 13040   },
    13041 13041   "errors": {
    13042 13042   "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
    skipped 947 lines
    13990 13990   "us"
    13991 13991   ],
    13992 13992   "errors": {
    13993  - "Website unavailable": "Site error"
     13993 + "Website unavailable": "Site error",
     13994 + "is currently offline": "Site error"
    13994 13995   },
    13995 13996   "checkType": "message",
    13996 13997   "absenceStrs": [
    skipped 465 lines
    14462 14463   "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
    14463 14464   "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    14464 14465   "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    14465  - "x-guest-token": "1400174453577900043"
     14466 + "x-guest-token": "1403829602053771266"
    14466 14467   },
    14467 14468   "errors": {
    14468 14469   "Bad guest token": "x-guest-token update required"
    skipped 400 lines
    14869 14870   "video"
    14870 14871   ],
    14871 14872   "headers": {
    14872  - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjI2NjcxMjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.V4VVbLzNwPU21rNP5moSxrPcPw--C7_Qz9VHgcJc1CA"
     14873 + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
    14873 14874   },
    14874 14875   "activation": {
    14875 14876   "url": "https://vimeo.com/_rv/viewer",
    skipped 13581 lines
    28457 28458   ]
    28458 28459   }
    28459 28460   }
    28460  - }
     28461 + },
     28462 + "tags": [
     28463 + "gaming",
     28464 + "coding",
     28465 + "photo",
     28466 + "music",
     28467 + "blog",
     28468 + "finance",
     28469 + "freelance",
     28470 + "dating",
     28471 + "tech",
     28472 + "forum",
     28473 + "porn",
     28474 + "erotic",
     28475 + "webcam",
     28476 + "video",
     28477 + "movies",
     28478 + "hacking",
     28479 + "art",
     28480 + "discussion",
     28481 + "sharing",
     28482 + "writing",
     28483 + "wiki",
     28484 + "business",
     28485 + "shopping",
     28486 + "sport",
     28487 + "books",
     28488 + "news",
     28489 + "documents",
     28490 + "travel",
     28491 + "maps",
     28492 + "hobby",
     28493 + "apps",
     28494 + "classified",
     28495 + "career",
     28496 + "geosocial",
     28497 + "streaming",
     28498 + "education",
     28499 + "networking",
     28500 + "torrent",
     28501 + "science",
     28502 + "medicine",
     28503 + "reading",
     28504 + "stock",
     28505 + "messaging",
     28506 + "trading",
     28507 + "links",
     28508 + "fashion",
     28509 + "tasks",
     28510 + "military",
     28511 + "auto",
     28512 + "gambling",
     28513 + "cybercriminal",
     28514 + "review",
     28515 + "bookmarks",
     28516 + "design",
     28517 + "tor",
     28518 + "i2p"
     28519 + ]
    28461 28520  }
  • maigret/resources/settings.json
     1 +{
     2 + "presence_strings": [
     3 + "username",
     4 + "not found",
     5 + "пользователь",
     6 + "profile",
     7 + "lastname",
     8 + "firstname",
     9 + "biography",
     10 + "birthday",
     11 + "репутация",
     12 + "информация",
     13 + "e-mail"
     14 + ],
     15 + "supposed_usernames": [
     16 + "alex", "god", "admin", "red", "blue", "john"]
     17 +}
  • maigret/settings.py
     1 +import json
     2 + 
     3 + 
     4 +class Settings:
     5 + presence_strings: list
     6 + supposed_usernames: list
     7 + 
     8 + def __init__(self, filename):
     9 + data = {}
     10 + 
     11 + try:
     12 + with open(filename, "r", encoding="utf-8") as file:
     13 + try:
     14 + data = json.load(file)
     15 + except Exception as error:
     16 + raise ValueError(
     17 + f"Problem with parsing json contents of "
     18 + f"settings file '{filename}': {str(error)}."
     19 + )
     20 + except FileNotFoundError as error:
     21 + raise FileNotFoundError(
     22 + f"Problem while attempting to access settings file '{filename}'."
     23 + ) from error
     24 + 
     25 + self.__dict__.update(data)
     26 + 
     27 + @property
     28 + def json(self):
     29 + return self.__dict__
     30 + 
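    A short usage sketch for the new Settings class: the constructor reads the JSON file and publishes its keys as instance attributes via self.__dict__.update(data), so a missing file surfaces as FileNotFoundError and malformed JSON as ValueError. The path below is the settings file added in this commit:

        from maigret.settings import Settings

        settings = Settings("maigret/resources/settings.json")

        print(settings.presence_strings[0])   # "username"
        print(settings.supposed_usernames)    # ["alex", "god", "admin", "red", "blue", "john"]
        print(settings.json)                  # the full parsed dict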
  • maigret/sites.py
    skipped 8 lines
    9 9   
    10 10  from .utils import CaseConverter, URLMatcher, is_country_tag
    11 11   
    12  -# TODO: move to data.json
    13  -SUPPORTED_TAGS = [
    14  - "gaming",
    15  - "coding",
    16  - "photo",
    17  - "music",
    18  - "blog",
    19  - "finance",
    20  - "freelance",
    21  - "dating",
    22  - "tech",
    23  - "forum",
    24  - "porn",
    25  - "erotic",
    26  - "webcam",
    27  - "video",
    28  - "movies",
    29  - "hacking",
    30  - "art",
    31  - "discussion",
    32  - "sharing",
    33  - "writing",
    34  - "wiki",
    35  - "business",
    36  - "shopping",
    37  - "sport",
    38  - "books",
    39  - "news",
    40  - "documents",
    41  - "travel",
    42  - "maps",
    43  - "hobby",
    44  - "apps",
    45  - "classified",
    46  - "career",
    47  - "geosocial",
    48  - "streaming",
    49  - "education",
    50  - "networking",
    51  - "torrent",
    52  - "science",
    53  - "medicine",
    54  - "reading",
    55  - "stock",
    56  - "messaging",
    57  - "trading",
    58  - "links",
    59  - "fashion",
    60  - "tasks",
    61  - "military",
    62  - "auto",
    63  - "gambling",
    64  - "cybercriminal",
    65  - "review",
    66  - "bookmarks",
    67  - "design",
    68  - "tor",
    69  - "i2p",
    70  -]
    71  - 
    72 12   
    73 13  class MaigretEngine:
    74 14   site: Dict[str, Any] = {}
    skipped 129 lines
    204 144   errors.update(self.errors)
    205 145   return errors
    206 146   
    207  - def get_url_type(self) -> str:
     147 + def get_url_template(self) -> str:
    208 148   url = URLMatcher.extract_main_part(self.url)
    209 149   if url.startswith("{username}"):
    210 150   url = "SUBDOMAIN"
    211 151   elif url == "":
    212  - url = f"{self.url} ({self.engine})"
     152 + url = f"{self.url} ({self.engine or 'no engine'})"
    213 153   else:
    214 154   parts = url.split("/")
    215 155   url = "/" + "/".join(parts[1:])
    skipped 57 lines
    273 213   
    274 214  class MaigretDatabase:
    275 215   def __init__(self):
    276  - self._sites = []
    277  - self._engines = []
     216 + self._tags: list = []
     217 + self._sites: list = []
     218 + self._engines: list = []
    278 219   
    279 220   @property
    280 221   def sites(self):
    skipped 73 lines
    354 295   db_data = {
    355 296   "sites": {site.name: site.strip_engine_data().json for site in self._sites},
    356 297   "engines": {engine.name: engine.json for engine in self._engines},
     298 + "tags": self._tags,
    357 299   }
    358 300   
    359 301   json_data = json.dumps(db_data, indent=4)
    skipped 7 lines
    367 309   # Add all of site information from the json file to internal site list.
    368 310   site_data = json_data.get("sites", {})
    369 311   engines_data = json_data.get("engines", {})
     312 + tags = json_data.get("tags", [])
     313 + 
     314 + self._tags += tags
    370 315   
    371 316   for engine_name in engines_data:
    372 317   self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
    skipped 96 lines
    469 414   if site.disabled:
    470 415   disabled_count += 1
    471 416   
    472  - url_type = site.get_url_type()
     417 + url_type = site.get_url_template()
    473 418   urls[url_type] = urls.get(url_type, 0) + 1
    474 419   
    475 420   if not site.tags:
    skipped 12 lines
    488 433   output += "Top tags:\n"
    489 434   for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
    490 435   mark = ""
    491  - if tag not in SUPPORTED_TAGS:
     436 + if tag not in self._tags:
    492 437   mark = " (non-standard)"
    493 438   output += f"{count}\t{tag}{mark}\n"
    494 439   
    skipped 2 lines
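    With the hardcoded SUPPORTED_TAGS list removed, the canonical tag list now travels inside data.json (the new top-level "tags" key above) and is merged into MaigretDatabase._tags on load. A standalone sketch of the round-trip the new save/load code performs, using plain json rather than the real classes:

        import json

        # save side: tags are serialized next to sites and engines in one document
        db_data = {"sites": {}, "engines": {}, "tags": ["gaming", "coding"]}
        raw = json.dumps(db_data, indent=4)

        # load side: same fallback as json_data.get("tags", []) in sites.py
        tags = json.loads(raw).get("tags", [])
        assert tags == ["gaming", "coding"]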
  • maigret/submit.py
    1 1  import asyncio
    2  -import difflib
    3 2  import re
    4 3  from typing import List
    5 4  import xml.etree.ElementTree as ET
    skipped 2 lines
    8 7  from .activation import import_aiohttp_cookies
    9 8  from .checking import maigret
    10 9  from .result import QueryStatus
     10 +from .settings import Settings
    11 11  from .sites import MaigretDatabase, MaigretSite, MaigretEngine
    12  -from .utils import get_random_user_agent
     12 +from .utils import get_random_user_agent, get_match_ratio
    13 13   
    14 14   
    15  -DESIRED_STRINGS = [
    16  - "username",
    17  - "not found",
    18  - "пользователь",
    19  - "profile",
    20  - "lastname",
    21  - "firstname",
    22  - "biography",
    23  - "birthday",
    24  - "репутация",
    25  - "информация",
    26  - "e-mail",
    27  -]
     15 +class Submitter:
     16 + HEADERS = {
     17 + "User-Agent": get_random_user_agent(),
     18 + }
    28 19   
    29  -SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]
     20 + SEPARATORS = "\"'"
    30 21   
    31  -HEADERS = {
    32  - "User-Agent": get_random_user_agent(),
    33  -}
     22 + RATIO = 0.6
     23 + TOP_FEATURES = 5
     24 + URL_RE = re.compile(r"https?://(www\.)?")
    34 25   
    35  -SEPARATORS = "\"'"
     26 + def __init__(self, db: MaigretDatabase, settings: Settings, logger):
     27 + self.settings = settings
     28 + self.db = db
     29 + self.logger = logger
    36 30   
    37  -RATIO = 0.6
    38  -TOP_FEATURES = 5
    39  -URL_RE = re.compile(r"https?://(www\.)?")
     31 + @staticmethod
     32 + def get_alexa_rank(site_url_main):
     33 + url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
     34 + xml_data = requests.get(url).text
     35 + root = ET.fromstring(xml_data)
     36 + alexa_rank = 0
    40 37   
     38 + try:
     39 + alexa_rank = int(root.find('.//REACH').attrib['RANK'])
     40 + except Exception:
     41 + pass
    41 42   
    42  -def get_match_ratio(x):
    43  - return round(
    44  - max(
    45  - [difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS]
    46  - ),
    47  - 2,
    48  - )
     43 + return alexa_rank
    49 44   
     45 + @staticmethod
     46 + def extract_mainpage_url(url):
     47 + return "/".join(url.split("/", 3)[:3])
    50 48   
    51  -def get_alexa_rank(site_url_main):
    52  - url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
    53  - xml_data = requests.get(url).text
    54  - root = ET.fromstring(xml_data)
    55  - alexa_rank = 0
     49 + async def site_self_check(self, site, semaphore, silent=False):
     50 + changes = {
     51 + "disabled": False,
     52 + }
    56 53   
    57  - try:
    58  - alexa_rank = int(root.find('.//REACH').attrib['RANK'])
    59  - except Exception:
    60  - pass
     54 + check_data = [
     55 + (site.username_claimed, QueryStatus.CLAIMED),
     56 + (site.username_unclaimed, QueryStatus.AVAILABLE),
     57 + ]
    61 58   
    62  - return alexa_rank
     59 + self.logger.info(f"Checking {site.name}...")
    63 60   
    64  - 
    65  -def extract_mainpage_url(url):
    66  - return "/".join(url.split("/", 3)[:3])
    67  - 
    68  - 
    69  -async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    70  - changes = {
    71  - "disabled": False,
    72  - }
    73  - 
    74  - check_data = [
    75  - (site.username_claimed, QueryStatus.CLAIMED),
    76  - (site.username_unclaimed, QueryStatus.AVAILABLE),
    77  - ]
    78  - 
    79  - logger.info(f"Checking {site.name}...")
    80  - 
    81  - for username, status in check_data:
    82  - results_dict = await maigret(
    83  - username=username,
    84  - site_dict={site.name: site},
    85  - logger=logger,
    86  - timeout=30,
    87  - id_type=site.type,
    88  - forced=True,
    89  - no_progressbar=True,
    90  - )
    91  - 
    92  - # don't disable entries with other ids types
    93  - # TODO: make normal checking
    94  - if site.name not in results_dict:
    95  - logger.info(results_dict)
    96  - changes["disabled"] = True
    97  - continue
    98  - 
    99  - result = results_dict[site.name]["status"]
    100  - 
    101  - site_status = result.status
     61 + for username, status in check_data:
     62 + results_dict = await maigret(
     63 + username=username,
     64 + site_dict={site.name: site},
     65 + logger=self.logger,
     66 + timeout=30,
     67 + id_type=site.type,
     68 + forced=True,
     69 + no_progressbar=True,
     70 + )
    102 71   
    103  - if site_status != status:
    104  - if site_status == QueryStatus.UNKNOWN:
    105  - msgs = site.absence_strs
    106  - etype = site.check_type
    107  - logger.warning(
    108  - "Error while searching '%s' in %s: %s, %s, check type %s",
    109  - username,
    110  - site.name,
    111  - result.context,
    112  - msgs,
    113  - etype,
    114  - )
    115  - # don't disable in case of available username
    116  - if status == QueryStatus.CLAIMED:
    117  - changes["disabled"] = True
    118  - elif status == QueryStatus.CLAIMED:
    119  - logger.warning(
    120  - f"Not found `{username}` in {site.name}, must be claimed"
    121  - )
    122  - logger.info(results_dict[site.name])
    123  - changes["disabled"] = True
    124  - else:
    125  - logger.warning(f"Found `{username}` in {site.name}, must be available")
    126  - logger.info(results_dict[site.name])
     72 + # don't disable entries with other ids types
     73 + # TODO: make normal checking
     74 + if site.name not in results_dict:
     75 + self.logger.info(results_dict)
    127 76   changes["disabled"] = True
     77 + continue
    128 78   
    129  - logger.info(f"Site {site.name} checking is finished")
     79 + result = results_dict[site.name]["status"]
    130 80   
    131  - return changes
     81 + site_status = result.status
    132 82   
     83 + if site_status != status:
     84 + if site_status == QueryStatus.UNKNOWN:
     85 + msgs = site.absence_strs
     86 + etype = site.check_type
     87 + self.logger.warning(
     88 + "Error while searching '%s' in %s: %s, %s, check type %s",
     89 + username,
     90 + site.name,
     91 + result.context,
     92 + msgs,
     93 + etype,
     94 + )
     95 + # don't disable in case of available username
     96 + if status == QueryStatus.CLAIMED:
     97 + changes["disabled"] = True
     98 + elif status == QueryStatus.CLAIMED:
     99 + self.logger.warning(
     100 + f"Not found `{username}` in {site.name}, must be claimed"
     101 + )
     102 + self.logger.info(results_dict[site.name])
     103 + changes["disabled"] = True
     104 + else:
     105 + self.logger.warning(
     106 + f"Found `{username}` in {site.name}, must be available"
     107 + )
     108 + self.logger.info(results_dict[site.name])
     109 + changes["disabled"] = True
    133 110   
    134  -def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
    135  - fields = {}
    136  - if 'urlSubpath' in engine.site.get('url', ''):
    137  - msg = (
    138  - 'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
    139  - 'Enter in manually if it exists: '
    140  - )
    141  - subpath = input(msg).strip('/')
    142  - if subpath:
    143  - fields['urlSubpath'] = f'/{subpath}'
    144  - return fields
     111 + self.logger.info(f"Site {site.name} checking is finished")
    145 112   
     113 + return changes
    146 114   
    147  -async def detect_known_engine(
    148  - db, url_exists, url_mainpage, logger
    149  -) -> List[MaigretSite]:
    150  - try:
    151  - r = requests.get(url_mainpage)
    152  - logger.debug(r.text)
    153  - except Exception as e:
    154  - logger.warning(e)
    155  - print("Some error while checking main page")
    156  - return []
     115 + def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
     116 + fields = {}
     117 + if 'urlSubpath' in engine.site.get('url', ''):
     118 + msg = (
     119 + 'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
     120 + 'Enter in manually if it exists: '
     121 + )
     122 + subpath = input(msg).strip('/')
     123 + if subpath:
     124 + fields['urlSubpath'] = f'/{subpath}'
     125 + return fields
    157 126   
    158  - for engine in db.engines:
    159  - strs_to_check = engine.__dict__.get("presenseStrs")
    160  - if strs_to_check and r and r.text:
    161  - all_strs_in_response = True
    162  - for s in strs_to_check:
    163  - if s not in r.text:
    164  - all_strs_in_response = False
    165  - sites = []
    166  - if all_strs_in_response:
    167  - engine_name = engine.__dict__.get("name")
     127 + async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
     128 + try:
     129 + r = requests.get(url_mainpage)
     130 + self.logger.debug(r.text)
     131 + except Exception as e:
     132 + self.logger.warning(e)
     133 + print("Some error while checking main page")
     134 + return []
    168 135   
    169  - print(f"Detected engine {engine_name} for site {url_mainpage}")
    170  - 
    171  - usernames_to_check = SUPPOSED_USERNAMES
    172  - supposed_username = extract_username_dialog(url_exists)
    173  - if supposed_username:
    174  - usernames_to_check = [supposed_username] + usernames_to_check
    175  - 
    176  - add_fields = generate_additional_fields_dialog(engine, url_exists)
     136 + for engine in self.db.engines:
     137 + strs_to_check = engine.__dict__.get("presenseStrs")
     138 + if strs_to_check and r and r.text:
     139 + all_strs_in_response = True
     140 + for s in strs_to_check:
     141 + if s not in r.text:
     142 + all_strs_in_response = False
     143 + sites = []
     144 + if all_strs_in_response:
     145 + engine_name = engine.__dict__.get("name")
    177 146   
    178  - for u in usernames_to_check:
    179  - site_data = {
    180  - "urlMain": url_mainpage,
    181  - "name": url_mainpage.split("//")[1],
    182  - "engine": engine_name,
    183  - "usernameClaimed": u,
    184  - "usernameUnclaimed": "noonewouldeverusethis7",
    185  - **add_fields,
    186  - }
    187  - logger.info(site_data)
     147 + print(f"Detected engine {engine_name} for site {url_mainpage}")
    188 148   
    189  - maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
    190  - maigret_site.update_from_engine(db.engines_dict[engine_name])
    191  - sites.append(maigret_site)
     149 + usernames_to_check = self.settings.supposed_usernames
     150 + supposed_username = self.extract_username_dialog(url_exists)
     151 + if supposed_username:
     152 + usernames_to_check = [supposed_username] + usernames_to_check
    192 153   
    193  - return sites
     154 + add_fields = self.generate_additional_fields_dialog(
     155 + engine, url_exists
     156 + )
    194 157   
    195  - return []
     158 + for u in usernames_to_check:
     159 + site_data = {
     160 + "urlMain": url_mainpage,
     161 + "name": url_mainpage.split("//")[1],
     162 + "engine": engine_name,
     163 + "usernameClaimed": u,
     164 + "usernameUnclaimed": "noonewouldeverusethis7",
     165 + **add_fields,
     166 + }
     167 + self.logger.info(site_data)
    196 168   
     169 + maigret_site = MaigretSite(
     170 + url_mainpage.split("/")[-1], site_data
     171 + )
     172 + maigret_site.update_from_engine(
     173 + self.db.engines_dict[engine_name]
     174 + )
     175 + sites.append(maigret_site)
    197 176   
    198  -def extract_username_dialog(url):
    199  - url_parts = url.rstrip("/").split("/")
    200  - supposed_username = url_parts[-1].strip('@')
    201  - entered_username = input(
    202  - f'Is "{supposed_username}" a valid username? If not, write it manually: '
    203  - )
    204  - return entered_username if entered_username else supposed_username
     177 + return sites
    205 178   
     179 + return []
    206 180   
    207  -async def check_features_manually(
    208  - db, url_exists, url_mainpage, cookie_file, logger, redirects=False
    209  -):
    210  - custom_headers = {}
    211  - while True:
    212  - header_key = input(
    213  - 'Specify custom header if you need or just press Enter to skip. Header name: '
     181 + def extract_username_dialog(self, url):
     182 + url_parts = url.rstrip("/").split("/")
     183 + supposed_username = url_parts[-1].strip('@')
     184 + entered_username = input(
     185 + f'Is "{supposed_username}" a valid username? If not, write it manually: '
    214 186   )
    215  - if not header_key:
    216  - break
    217  - header_value = input('Header value: ')
    218  - custom_headers[header_key.strip()] = header_value.strip()
     187 + return entered_username if entered_username else supposed_username
    219 188   
    220  - supposed_username = extract_username_dialog(url_exists)
    221  - non_exist_username = "noonewouldeverusethis7"
     189 + async def check_features_manually(
     190 + self, url_exists, url_mainpage, cookie_file, redirects=False
     191 + ):
     192 + custom_headers = {}
     193 + while True:
     194 + header_key = input(
     195 + 'Specify custom header if you need or just press Enter to skip. Header name: '
     196 + )
     197 + if not header_key:
     198 + break
     199 + header_value = input('Header value: ')
     200 + custom_headers[header_key.strip()] = header_value.strip()
    222 201   
    223  - url_user = url_exists.replace(supposed_username, "{username}")
    224  - url_not_exists = url_exists.replace(supposed_username, non_exist_username)
     202 + supposed_username = self.extract_username_dialog(url_exists)
     203 + non_exist_username = "noonewouldeverusethis7"
    225 204   
    226  - headers = dict(HEADERS)
    227  - headers.update(custom_headers)
     205 + url_user = url_exists.replace(supposed_username, "{username}")
     206 + url_not_exists = url_exists.replace(supposed_username, non_exist_username)
    228 207   
    229  - # cookies
    230  - cookie_dict = None
    231  - if cookie_file:
    232  - logger.info(f'Use {cookie_file} for cookies')
    233  - cookie_jar = import_aiohttp_cookies(cookie_file)
    234  - cookie_dict = {c.key: c.value for c in cookie_jar}
     208 + headers = dict(self.HEADERS)
     209 + headers.update(custom_headers)
    235 210   
    236  - exists_resp = requests.get(
    237  - url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
    238  - )
    239  - logger.debug(url_exists)
    240  - logger.debug(exists_resp.status_code)
    241  - logger.debug(exists_resp.text)
     211 + # cookies
     212 + cookie_dict = None
     213 + if cookie_file:
     214 + self.logger.info(f'Use {cookie_file} for cookies')
     215 + cookie_jar = import_aiohttp_cookies(cookie_file)
     216 + cookie_dict = {c.key: c.value for c in cookie_jar}
    242 217   
    243  - non_exists_resp = requests.get(
    244  - url_not_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
    245  - )
    246  - logger.debug(url_not_exists)
    247  - logger.debug(non_exists_resp.status_code)
    248  - logger.debug(non_exists_resp.text)
     218 + exists_resp = requests.get(
     219 + url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
     220 + )
     221 + self.logger.debug(url_exists)
     222 + self.logger.debug(exists_resp.status_code)
     223 + self.logger.debug(exists_resp.text)
    249 224   
    250  - a = exists_resp.text
    251  - b = non_exists_resp.text
     225 + non_exists_resp = requests.get(
     226 + url_not_exists,
     227 + cookies=cookie_dict,
     228 + headers=headers,
     229 + allow_redirects=redirects,
     230 + )
     231 + self.logger.debug(url_not_exists)
     232 + self.logger.debug(non_exists_resp.status_code)
     233 + self.logger.debug(non_exists_resp.text)
    252 234   
    253  - tokens_a = set(re.split(f'[{SEPARATORS}]', a))
    254  - tokens_b = set(re.split(f'[{SEPARATORS}]', b))
     235 + a = exists_resp.text
     236 + b = non_exists_resp.text
    255 237   
    256  - a_minus_b = tokens_a.difference(tokens_b)
    257  - b_minus_a = tokens_b.difference(tokens_a)
     238 + tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
     239 + tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
    258 240   
    259  - if len(a_minus_b) == len(b_minus_a) == 0:
    260  - print("The pages for existing and non-existing account are the same!")
     241 + a_minus_b = tokens_a.difference(tokens_b)
     242 + b_minus_a = tokens_b.difference(tokens_a)
    261 243   
    262  - top_features_count = int(
    263  - input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
    264  - or TOP_FEATURES
    265  - )
     244 + if len(a_minus_b) == len(b_minus_a) == 0:
     245 + print("The pages for existing and non-existing account are the same!")
    266 246   
    267  - presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
    268  - :top_features_count
    269  - ]
     247 + top_features_count = int(
     248 + input(
     249 + f"Specify count of features to extract [default {self.TOP_FEATURES}]: "
     250 + )
     251 + or self.TOP_FEATURES
     252 + )
    270 253   
    271  - print("Detected text features of existing account: " + ", ".join(presence_list))
    272  - features = input("If features was not detected correctly, write it manually: ")
     254 + match_fun = get_match_ratio(self.settings.presence_strings)
    273 255   
    274  - if features:
    275  - presence_list = list(map(str.strip, features.split(",")))
     256 + presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
     257 + :top_features_count
     258 + ]
    276 259   
    277  - absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
    278  - :top_features_count
    279  - ]
    280  - print("Detected text features of non-existing account: " + ", ".join(absence_list))
    281  - features = input("If features was not detected correctly, write it manually: ")
     260 + print("Detected text features of existing account: " + ", ".join(presence_list))
     261 + features = input("If features was not detected correctly, write it manually: ")
    282 262   
    283  - if features:
    284  - absence_list = list(map(str.strip, features.split(",")))
     263 + if features:
     264 + presence_list = list(map(str.strip, features.split(",")))
    285 265   
    286  - site_data = {
    287  - "absenceStrs": absence_list,
    288  - "presenseStrs": presence_list,
    289  - "url": url_user,
    290  - "urlMain": url_mainpage,
    291  - "usernameClaimed": supposed_username,
    292  - "usernameUnclaimed": non_exist_username,
    293  - "checkType": "message",
    294  - }
     266 + absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
     267 + :top_features_count
     268 + ]
     269 + print(
     270 + "Detected text features of non-existing account: " + ", ".join(absence_list)
     271 + )
     272 + features = input("If features was not detected correctly, write it manually: ")
    295 273   
    296  - if headers != HEADERS:
    297  - site_data['headers'] = headers
     274 + if features:
     275 + absence_list = list(map(str.strip, features.split(",")))
    298 276   
    299  - site = MaigretSite(url_mainpage.split("/")[-1], site_data)
    300  - return site
     277 + site_data = {
     278 + "absenceStrs": absence_list,
     279 + "presenseStrs": presence_list,
     280 + "url": url_user,
     281 + "urlMain": url_mainpage,
     282 + "usernameClaimed": supposed_username,
     283 + "usernameUnclaimed": non_exist_username,
     284 + "checkType": "message",
     285 + }
    301 286   
     287 + if headers != self.HEADERS:
     288 + site_data['headers'] = headers
    302 289   
    303  -async def submit_dialog(db, url_exists, cookie_file, logger):
    304  - domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
    305  - domain_raw = domain_raw.split("/")[0]
    306  - logger.info('Domain is %s', domain_raw)
     290 + site = MaigretSite(url_mainpage.split("/")[-1], site_data)
     291 + return site
    307 292   
    308  - # check for existence
    309  - matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
     293 + async def dialog(self, url_exists, cookie_file):
     294 + domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
     295 + domain_raw = domain_raw.split("/")[0]
     296 + self.logger.info('Domain is %s', domain_raw)
    310 297   
    311  - if matched_sites:
    312  - print(
    313  - f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
     298 + # check for existence
     299 + matched_sites = list(
     300 + filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
    314 301   )
    315  - status = lambda s: "(disabled)" if s.disabled else ""
    316  - url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
    317  - print(
    318  - "\n".join(
    319  - [
    320  - f"{site.name} {status(site)}{url_block(site)}"
    321  - for site in matched_sites
    322  - ]
     302 + 
     303 + if matched_sites:
     304 + print(
     305 + f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
     306 + )
     307 + status = lambda s: "(disabled)" if s.disabled else ""
     308 + url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
     309 + print(
     310 + "\n".join(
     311 + [
     312 + f"{site.name} {status(site)}{url_block(site)}"
     313 + for site in matched_sites
     314 + ]
     315 + )
    323 316   )
    324  - )
    325 317   
    326  - if input("Do you want to continue? [yN] ").lower() in "n":
    327  - return False
     318 + if input("Do you want to continue? [yN] ").lower() in "n":
     319 + return False
    328 320   
    329  - url_mainpage = extract_mainpage_url(url_exists)
     321 + url_mainpage = self.extract_mainpage_url(url_exists)
    330 322   
    331  - print('Detecting site engine, please wait...')
    332  - sites = []
    333  - try:
    334  - sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
    335  - except KeyboardInterrupt:
    336  - print('Engine detect process is interrupted.')
     323 + print('Detecting site engine, please wait...')
     324 + sites = []
     325 + try:
     326 + sites = await self.detect_known_engine(url_exists, url_mainpage)
     327 + except KeyboardInterrupt:
     328 + print('Engine detect process is interrupted.')
    337 329   
    338  - if not sites:
    339  - print("Unable to detect site engine, lets generate checking features")
    340  - sites = [
    341  - await check_features_manually(
    342  - db, url_exists, url_mainpage, cookie_file, logger
    343  - )
    344  - ]
     330 + if not sites:
     331 + print("Unable to detect site engine, lets generate checking features")
     332 + sites = [
     333 + await self.check_features_manually(
     334 + url_exists, url_mainpage, cookie_file
     335 + )
     336 + ]
    345 337   
    346  - logger.debug(sites[0].__dict__)
     338 + self.logger.debug(sites[0].__dict__)
    347 339   
    348  - sem = asyncio.Semaphore(1)
     340 + sem = asyncio.Semaphore(1)
    349 341   
    350  - print("Checking, please wait...")
    351  - found = False
    352  - chosen_site = None
    353  - for s in sites:
    354  - chosen_site = s
    355  - result = await site_self_check(s, logger, sem, db)
    356  - if not result["disabled"]:
    357  - found = True
    358  - break
     342 + print("Checking, please wait...")
     343 + found = False
     344 + chosen_site = None
     345 + for s in sites:
     346 + chosen_site = s
     347 + result = await self.site_self_check(s, sem)
     348 + if not result["disabled"]:
     349 + found = True
     350 + break
    359 351   
    360  - if not found:
    361  - print(
    362  - f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
    363  - )
    364  - print(
    365  - "Try to run this mode again and increase features count or choose others."
    366  - )
    367  - return False
    368  - else:
    369  - if (
    370  - input(
    371  - f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
     352 + if not found:
     353 + print(
     354 + f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
    372 355   )
    373  - .lower()
    374  - .strip("y")
    375  - ):
     356 + print(
     357 + "Try to run this mode again and increase features count or choose others."
     358 + )
    376 359   return False
     360 + else:
     361 + if (
     362 + input(
     363 + f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
     364 + )
     365 + .lower()
     366 + .strip("y")
     367 + ):
     368 + return False
    377 369   
    378  - chosen_site.name = input("Change site name if you want: ") or chosen_site.name
    379  - chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
    380  - rank = get_alexa_rank(chosen_site.url_main)
    381  - if rank:
    382  - print(f'New alexa rank: {rank}')
    383  - chosen_site.alexa_rank = rank
     370 + chosen_site.name = input("Change site name if you want: ") or chosen_site.name
     371 + chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
     372 + rank = Submitter.get_alexa_rank(chosen_site.url_main)
     373 + if rank:
     374 + print(f'New alexa rank: {rank}')
     375 + chosen_site.alexa_rank = rank
    384 376   
    385  - logger.debug(chosen_site.json)
    386  - site_data = chosen_site.strip_engine_data()
    387  - logger.debug(site_data.json)
    388  - db.update_site(site_data)
    389  - return True
     377 + self.logger.debug(chosen_site.json)
     378 + site_data = chosen_site.strip_engine_data()
     379 + self.logger.debug(site_data.json)
     380 + self.db.update_site(site_data)
     381 + return True
    390 382   
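    The old module-level helpers are now Submitter methods; the two pure ones are static, so they can be called without building a dialog session. A small sketch (the example URL is illustrative):

        from maigret.submit import Submitter

        main_url = Submitter.extract_mainpage_url("https://example.com/user/alice")
        # -> "https://example.com"

        # 0 when the Alexa XML response carries no REACH rank
        rank = Submitter.get_alexa_rank(main_url)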
  • maigret/utils.py
    1 1  import ast
     2 +import difflib
    2 3  import re
    3 4  import random
    4 5  from typing import Any
    skipped 91 lines
    96 97  def get_random_user_agent():
    97 98   return random.choice(DEFAULT_USER_AGENTS)
    98 99   
     100 + 
     101 +def get_match_ratio(base_strs: list):
     102 + def get_match_inner(s: str):
     103 + return round(
     104 + max(
     105 + [
     106 + difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
     107 + for s2 in base_strs
     108 + ]
     109 + ),
     110 + 2,
     111 + )
     112 + 
     113 + return get_match_inner
     114 + 
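    get_match_ratio is now a factory: it takes the base strings (previously the hardcoded DESIRED_STRINGS in submit.py, now settings.presence_strings) and returns a scorer usable as a sort key, as in check_features_manually above. A quick sketch:

        from maigret.utils import get_match_ratio

        match = get_match_ratio(["username", "profile"])

        match("username")    # 1.0: exact match against a base string
        match("Username:")   # 0.94: best case-insensitive SequenceMatcher ratio, rounded to 2 places
        sorted(["token", "Username:", "profile-box"], key=match, reverse=True)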
  • tests/test_data.py
    1 1  """Maigret data test functions"""
    2 2   
    3 3  from maigret.utils import is_country_tag
    4  -from maigret.sites import SUPPORTED_TAGS
    5 4   
    6 5   
    7 6  def test_tags_validity(default_db):
    8 7   unknown_tags = set()
    9 8   
     9 + tags = default_db._tags
     10 + 
    10 11   for site in default_db.sites:
    11 12   for tag in filter(lambda x: not is_country_tag(x), site.tags):
    12  - if tag not in SUPPORTED_TAGS:
     13 + if tag not in tags:
    13 14   unknown_tags.add(tag)
    14 15   
    15 16   assert unknown_tags == set()
    skipped 1 lines
  • tests/test_sites.py
    1 1  """Maigret Database test functions"""
    2 2  from maigret.sites import MaigretDatabase, MaigretSite
     3 +from maigret.utils import URLMatcher
    3 4   
    4 5  EXAMPLE_DB = {
    5 6   'engines': {
    skipped 174 lines
    180 181   assert len(db.ranked_sites_dict(id_type='username')) == 2
    181 182   assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
    182 183   
     184 + 
     185 +def test_get_url_template():
     186 + site = MaigretSite(
     187 + "test",
     188 + {
     189 + "urlMain": "https://ya.ru/",
     190 + "url": "{urlMain}{urlSubpath}/members/?username={username}",
     191 + },
     192 + )
     193 + assert (
     194 + site.get_url_template()
     195 + == "{urlMain}{urlSubpath}/members/?username={username} (no engine)"
     196 + )
     197 + 
     198 + site = MaigretSite(
     199 + "test",
     200 + {
     201 + "urlMain": "https://ya.ru/",
     202 + "url": "https://{username}.ya.ru",
     203 + },
     204 + )
     205 + assert site.get_url_template() == "SUBDOMAIN"
     206 + 
  • tests/test_utils.py
    skipped 7 lines
    8 8   enrich_link_str,
    9 9   URLMatcher,
    10 10   get_dict_ascii_tree,
     11 + get_match_ratio,
    11 12  )
    12 13   
    13 14   
    skipped 123 lines
    137 138   ┗╸twitter_username: Alexaimephotogr"""
    138 139   )
    139 140   
     141 + 
     142 +def test_get_match_ratio():
     143 + fun = get_match_ratio(["test", "maigret", "username"])
     144 + 
     145 + assert fun("test") == 1
     146 + 