maigret: commit 7676c053
    maigret/checking.py
     1 +import asyncio
     2 +import logging
     3 +import re
     4 +import ssl
     5 + 
     6 +import aiohttp
     7 +import tqdm.asyncio
     8 +from aiohttp_socks import ProxyConnector
     9 +from mock import Mock
     10 +from python_socks import _errors as proxy_errors
     11 +from socid_extractor import extract
     12 + 
     13 +from .activation import ParsingActivator, import_aiohttp_cookies
     14 +from .result import QueryResult, QueryStatus
     15 +from .sites import MaigretDatabase, MaigretSite
     16 + 
     17 +supported_recursive_search_ids = (
     18 + 'yandex_public_id',
     19 + 'gaia_id',
     20 + 'vk_id',
     21 + 'ok_id',
     22 + 'wikimapia_uid',
     23 +)
     24 + 
     25 +common_errors = {
     26 + '<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha',
     27 + 'Please stand by, while we are checking your browser': 'Cloudflare captcha',
     28 + '<title>Доступ ограничен</title>': 'Rostelecom censorship',
     29 + 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha',
     30 + 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection',
     31 + '404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': 'MegaFon 404 page',
     32 + 'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship',
     33 + 'Incapsula incident ID': 'Incapsula antibot protection',
     34 +}
     35 + 
     36 +unsupported_characters = '#'
     37 + 
     38 + 
     39 +async def get_response(request_future, site_name, logger):
     40 + html_text = None
     41 + status_code = 0
     42 + 
     43 + error_text = "General Unknown Error"
     44 + exception_text = None
     45 + 
     46 + try:
     47 + response = await request_future
     48 + 
     49 + status_code = response.status
     50 + response_content = await response.content.read()
     51 + charset = response.charset or 'utf-8'
     52 + decoded_content = response_content.decode(charset, 'ignore')
     53 + html_text = decoded_content
     54 + 
     55 + if status_code > 0:
     56 + error_text = None
     57 + 
     58 + logger.debug(html_text)
     59 + 
     60 + except asyncio.TimeoutError as errt:
     61 + error_text = "Timeout Error"
     62 + exception_text = str(errt)
     63 + except (ssl.SSLCertVerificationError, ssl.SSLError) as err:
     64 + error_text = "SSL Error"
     65 + exception_text = str(err)
     66 + except aiohttp.client_exceptions.ClientConnectorError as err:
     67 + error_text = "Error Connecting"
     68 + exception_text = str(err)
     69 + except aiohttp.http_exceptions.BadHttpMessage as err:
     70 + error_text = "HTTP Error"
     71 + exception_text = str(err)
     72 + except proxy_errors.ProxyError as err:
     73 + error_text = "Proxy Error"
     74 + exception_text = str(err)
     75 + except Exception as err:
     76 + logger.warning(f'Unhandled error while requesting {site_name}: {err}')
     77 + logger.debug(err, exc_info=True)
     78 + error_text = "Some Error"
     79 + exception_text = str(err)
     80 + 
     81 + # TODO: return only needed information
     82 + return html_text, status_code, error_text, exception_text
     83 + 
     84 + 
     85 +async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify):
     86 + async with semaphore:
     87 + site_obj = site_dict[sitename]
     88 + future = site_obj.request_future
     89 + if not future:
     90 + # ignore: search by incompatible id type
     91 + return
     92 + 
     93 + response = await get_response(request_future=future,
     94 + site_name=sitename,
     95 + logger=logger)
     96 + 
     97 + site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
     98 + 
     99 + 
     100 +# TODO: move into a separate module
     101 +def detect_error_page(html_text, status_code, fail_flags, ignore_403):
     102 + # Detect service restrictions such as a country restriction
     103 + for flag, msg in fail_flags.items():
     104 + if flag in html_text:
     105 + return 'Some site error', msg
     106 + 
     107 + # Detect common restrictions such as provider censorship and bot protection
     108 + for flag, msg in common_errors.items():
     109 + if flag in html_text:
     110 + return 'Error', msg
     111 + 
     112 + # Detect common site errors
     113 + if status_code == 403 and not ignore_403:
     114 + return 'Access denied', 'Access denied, use proxy/vpn'
     115 + elif status_code >= 500:
     116 + return f'Error {status_code}', f'Site error {status_code}'
     117 + 
     118 + return None, None
     119 + 
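    For illustration, this is how detect_error_page reacts to one of the
    common_errors markers above (a sketch with a made-up HTML snippet; any
    site-specific fail_flags are consulted first):

        # A Cloudflare challenge page trips the shared common_errors table
        error, context = detect_error_page(
            html_text='<title>Attention Required! | Cloudflare</title>',
            status_code=200,
            fail_flags={},      # no site-specific flags in this sketch
            ignore_403=False,
        )
        # error == 'Error', context == 'Cloudflare captcha'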
     120 + 
     121 +def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
     122 + if not response:
     123 + return results_info
     124 + 
     125 + fulltags = site.tags
     126 + 
     127 + # Retrieve other site information again
     128 + username = results_info['username']
     129 + is_parsing_enabled = results_info['parsing_enabled']
     130 + url = results_info.get("url_user")
     131 + logger.debug(url)
     132 + 
     133 + status = results_info.get("status")
     134 + if status is not None:
     135 + # We have already determined the user doesn't exist here
     136 + return results_info
     137 + 
     138 + # Get the expected check type
     139 + check_type = site.check_type
     140 + 
     141 + # Get the failure messages and comments
     142 + failure_errors = site.errors
     143 + 
     144 + # TODO: refactor
     145 + if not response:
     146 + logger.error(f'No response for {site.name}')
     147 + return results_info
     148 + 
     149 + html_text, status_code, error_text, exception_text = response
     150 + site_error_text = '?'
     151 + 
     152 + # TODO: add elapsed request time counting
     153 + response_time = None
     154 + 
     155 + if logger.level == logging.DEBUG:
     156 + with open('debug.txt', 'a') as f:
     157 + status = status_code or 'No response'
     158 + f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n')
     159 + if html_text:
     160 + f.write(f'code: {status}\nresponse: {str(html_text)}\n')
     161 + 
     162 + if status_code and not error_text:
     163 + error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
     164 + site.ignore_403)
     165 + 
     166 + if site.activation and html_text:
     167 + is_need_activation = any([s for s in site.activation['marks'] if s in html_text])
     168 + if is_need_activation:
     169 + method = site.activation['method']
     170 + try:
     171 + activate_fun = getattr(ParsingActivator(), method)
     172 + # TODO: async call
     173 + activate_fun(site, logger)
     174 + except AttributeError:
     175 + logger.warning(f'Activation method {method} for site {site.name} not found!')
     176 + 
     177 + # presence flags; presence is assumed to be detected by default
     178 + # when a site defines no flags
     179 + presense_flags = site.presense_strs
     180 + is_presense_detected = False
     181 + if html_text:
     182 + if not presense_flags:
     183 + is_presense_detected = True
     184 + site.stats['presense_flag'] = None
     185 + else:
     186 + for presense_flag in presense_flags:
     187 + if presense_flag in html_text:
     188 + is_presense_detected = True
     189 + site.stats['presense_flag'] = presense_flag
     190 + logger.info(presense_flag)
     191 + break
     192 + 
     193 + if error_text is not None:
     194 + logger.debug(error_text)
     195 + result = QueryResult(username,
     196 + site.name,
     197 + url,
     198 + QueryStatus.UNKNOWN,
     199 + query_time=response_time,
     200 + context=f'{error_text}: {site_error_text}', tags=fulltags)
     201 + elif check_type == "message":
     202 + absence_flags = site.absence_strs
     203 + is_absence_flags_list = isinstance(absence_flags, list)
     204 + absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags}
     205 + # Checks if the error message is in the HTML
     206 + is_absence_detected = any([(absence_flag in html_text) for absence_flag in absence_flags_set])
     207 + if not is_absence_detected and is_presense_detected:
     208 + result = QueryResult(username,
     209 + site.name,
     210 + url,
     211 + QueryStatus.CLAIMED,
     212 + query_time=response_time, tags=fulltags)
     213 + else:
     214 + result = QueryResult(username,
     215 + site.name,
     216 + url,
     217 + QueryStatus.AVAILABLE,
     218 + query_time=response_time, tags=fulltags)
     219 + elif check_type == "status_code":
     220 + # Checks if the status code of the response is 2XX
     221 + if 200 <= status_code < 300 and is_presense_detected:
     222 + result = QueryResult(username,
     223 + site.name,
     224 + url,
     225 + QueryStatus.CLAIMED,
     226 + query_time=response_time, tags=fulltags)
     227 + else:
     228 + result = QueryResult(username,
     229 + site.name,
     230 + url,
     231 + QueryStatus.AVAILABLE,
     232 + query_time=response_time, tags=fulltags)
     233 + elif check_type == "response_url":
     234 + # For this detection method, we have turned off the redirect.
     235 + # So, there is no need to check the response URL: it will always
     236 + # match the request. Instead, we will ensure that the response
     237 + # code indicates that the request was successful (i.e. no 404, or
     238 + # forward to some odd redirect).
     239 + if 200 <= status_code < 300 and is_presense_detected:
     240 + result = QueryResult(username,
     241 + site.name,
     242 + url,
     243 + QueryStatus.CLAIMED,
     244 + query_time=response_time, tags=fulltags)
     245 + else:
     246 + result = QueryResult(username,
     247 + site.name,
     248 + url,
     249 + QueryStatus.AVAILABLE,
     250 + query_time=response_time, tags=fulltags)
     251 + else:
     252 + # It should be impossible to ever get here...
     253 + raise ValueError(f"Unknown check type '{check_type}' for "
     254 + f"site '{site.name}'")
     255 + 
     256 + extracted_ids_data = {}
     257 + 
     258 + if is_parsing_enabled and result.status == QueryStatus.CLAIMED:
     259 + try:
     260 + extracted_ids_data = extract(html_text)
     261 + except Exception as e:
     262 + logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True)
     263 + 
     264 + if extracted_ids_data:
     265 + new_usernames = {}
     266 + for k, v in extracted_ids_data.items():
     267 + if 'username' in k:
     268 + new_usernames[v] = 'username'
     269 + if k in supported_recursive_search_ids:
     270 + new_usernames[v] = k
     271 + 
     272 + results_info['ids_usernames'] = new_usernames
     273 + result.ids_data = extracted_ids_data
     274 + 
     275 + # Notify caller about results of query.
     276 + query_notify.update(result, site.similar_search)
     277 + 
     278 + # Save status of request
     279 + results_info['status'] = result
     280 + 
     281 + # Save results from request
     282 + results_info['http_status'] = status_code
     283 + results_info['is_similar'] = site.similar_search
     284 + # results_site['response_text'] = html_text
     285 + results_info['rank'] = site.alexa_rank
     286 + return results_info
     287 + 
     288 + 
     289 +async def maigret(username, site_dict, query_notify, logger,
     290 + proxy=None, timeout=None, recursive_search=False,
     291 + id_type='username', debug=False, forced=False,
     292 + max_connections=100, no_progressbar=False,
     293 + cookies=None):
     294 + """Main search func
     295 + 
     296 + Checks for existence of username on various social media sites.
     297 + 
     298 + Keyword Arguments:
     299 + username -- String indicating username that report
     300 + should be created against.
     301 + site_dict -- Dictionary containing all of the site data.
     302 + query_notify -- Object with base type of QueryNotify().
     303 + This will be used to notify the caller about
     304 + query results.
     305 + proxy -- String indicating the proxy URL
     306 + timeout -- Time in seconds to wait before timing out request.
     307 + Default is no timeout.
     308 + recursive_search -- Extract other usernames from account pages and search by them recursively.
     309 + 
     310 + Return Value:
     311 + Dictionary containing results from report. Key of dictionary is the name
     312 + of the social network site, and the value is another dictionary with
     313 + the following keys:
     314 + url_main: URL of main site.
     315 + url_user: URL of user on site (if account exists).
     316 + status: QueryResult() object indicating results of test for
     317 + account existence.
     318 + http_status: HTTP status code of query which checked for existence on
     319 + site.
     320 + response_text: Text that came back from request. May be None if
     321 + there was an HTTP error when checking for existence.
     322 + """
     323 + 
     324 + # Notify caller that we are starting the query.
     325 + query_notify.start(username, id_type)
     326 + 
     327 + # TODO: connector
     328 + connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
     329 + # connector = aiohttp.TCPConnector(ssl=False)
     330 + connector.verify_ssl = False
     331 + 
     332 + cookie_jar = None
     333 + if cookies:
     334 + cookie_jar = await import_aiohttp_cookies(cookies)
     335 + 
     336 + session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar)
     337 + 
     338 + if logger.level == logging.DEBUG:
     339 + future = session.get(url='https://icanhazip.com')
     340 + ip, status, error, exception = await get_response(future, None, logger)
     341 + if ip:
     342 + logger.debug(f'My IP is: {ip.strip()}')
     343 + else:
     344 + logger.debug(f'IP request failed, {error}: {exception}')
     345 + 
     346 + # Results from analysis of all sites
     347 + results_total = {}
     348 + 
     349 + # First create futures for all requests. This allows for the requests to run in parallel
     350 + for site_name, site in site_dict.items():
     351 + 
     352 + if site.type != id_type:
     353 + continue
     354 + 
     355 + if site.disabled and not forced:
     356 + logger.debug(f'Site {site.name} is disabled, skipping...')
     357 + continue
     358 + 
     359 + # Results from analysis of this specific site
     360 + results_site = {}
     361 + 
     362 + # Record URL of main site and username
     363 + results_site['username'] = username
     364 + results_site['parsing_enabled'] = recursive_search
     365 + results_site['url_main'] = site.url_main
     366 + results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
     367 + 
     368 + headers = {
     369 + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
     370 + }
     371 + 
     372 + headers.update(site.headers)
     373 + 
     374 + if 'url' not in site.__dict__:
     375 + logger.error('No URL for site %s', site.name)
     376 + # URL of user on site (if it exists)
     377 + url = site.url.format(
     378 + urlMain=site.url_main,
     379 + urlSubpath=site.url_subpath,
     380 + username=username
     381 + )
     382 + # workaround to prevent slash errors
     383 + url = re.sub('(?<!:)/+', '/', url)
     384 + 
     385 + # Don't make a request if the username is invalid for the site
     386 + if site.regex_check and re.search(site.regex_check, username) is None:
     387 + # No need to check at the site: this username is not allowed.
     388 + results_site['status'] = QueryResult(username,
     389 + site_name,
     390 + url,
     391 + QueryStatus.ILLEGAL)
     392 + results_site["url_user"] = ""
     393 + results_site['http_status'] = ""
     394 + results_site['response_text'] = ""
     395 + query_notify.update(results_site['status'])
     396 + else:
     397 + # URL of user on site (if it exists)
     398 + results_site["url_user"] = url
     399 + url_probe = site.url_probe
     400 + if url_probe is None:
     401 + # The probe URL is the normal one seen by people on the web.
     402 + url_probe = url
     403 + else:
     404 + # There is a special URL for probing existence separate
     405 + # from where the user profile normally can be found.
     406 + url_probe = url_probe.format(
     407 + urlMain=site.url_main,
     408 + urlSubpath=site.url_subpath,
     409 + username=username,
     410 + )
     411 + 
     412 + for k, v in site.get_params.items():
     413 + url_probe += f'&{k}={v}'
     414 + 
     415 + if site.check_type == 'status_code' and site.request_head_only:
     416 + # In most cases when we are detecting by status code,
     417 + # it is not necessary to get the entire body: we can
     418 + # detect fine with just the HEAD response.
     419 + request_method = session.head
     420 + else:
     421 + # Either this detect method needs the content associated
     422 + # with the GET response, or this specific website will
     423 + # not respond properly unless we request the whole page.
     424 + request_method = session.get
     425 + 
     426 + if site.check_type == "response_url":
     427 + # Site forwards request to a different URL if username not
     428 + # found. Disallow the redirect so we can capture the
     429 + # http status from the original URL request.
     430 + allow_redirects = False
     431 + else:
     432 + # Allow whatever redirect that the site wants to do.
     433 + # The final result of the request will be what is available.
     434 + allow_redirects = True
     435 + 
     436 + future = request_method(url=url_probe, headers=headers,
     437 + allow_redirects=allow_redirects,
     438 + timeout=timeout,
     439 + )
     440 + 
     441 + # Store future in data for access later
     442 + # TODO: move to separate obj
     443 + site.request_future = future
     444 + 
     445 + # Add this site's results into final dictionary with all of the other results.
     446 + results_total[site_name] = results_site
     447 + 
     448 + # TODO: move into top-level function
     449 + 
     450 + sem = asyncio.Semaphore(max_connections)
     451 + 
     452 + tasks = []
     453 + for sitename, result_obj in results_total.items():
     454 + update_site_coro = update_site_dict_from_response(sitename, site_dict, result_obj, sem, logger, query_notify)
     455 + future = asyncio.ensure_future(update_site_coro)
     456 + tasks.append(future)
     457 + 
     458 + if no_progressbar:
     459 + await asyncio.gather(*tasks)
     460 + else:
     461 + for f in tqdm.asyncio.tqdm.as_completed(tasks):
     462 + await f
     463 + 
     464 + await session.close()
     465 + 
     466 + # Notify caller that all queries are finished.
     467 + query_notify.finish()
     468 + 
     469 + return results_total
     470 + 
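    A minimal usage sketch of the coroutine above (assumptions: `site` is a
    preloaded MaigretSite whose type is 'username', and a Mock stands in for
    the QueryNotify object, mirroring the site_self_check call below):

        import asyncio
        import logging

        from mock import Mock

        from maigret.checking import maigret

        async def demo(site):
            logger = logging.getLogger('demo')
            results = await maigret(
                'alice',              # username to check
                {site.name: site},    # site dict keyed by site name
                Mock(),               # stub QueryNotify object
                logger,
                timeout=10,
                no_progressbar=True,
            )
            # results[site.name]['status'] holds a QueryResult
            return results

        # asyncio.run(demo(site))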
     471 + 
     472 +def timeout_check(value):
     473 + """Check Timeout Argument.
     474 + 
     475 + Checks timeout for validity.
     476 + 
     477 + Keyword Arguments:
     478 + value -- Time in seconds to wait before timing out request.
     479 + 
     480 + Return Value:
     481 + Floating point number representing the time (in seconds) that should be
     482 + used for the timeout.
     483 + 
     484 + NOTE: Will raise an exception if the timeout is invalid.
     485 + """
     486 + from argparse import ArgumentTypeError
     487 + 
     488 + try:
     489 + timeout = float(value)
     490 + except ValueError:
     491 + raise ArgumentTypeError(f"Timeout '{value}' must be a number.")
     492 + if timeout <= 0:
     493 + raise ArgumentTypeError(f"Timeout '{value}' must be greater than 0.0s.")
     494 + return timeout
     495 + 
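    timeout_check is meant to be used as an argparse type= validator; for
    example:

        timeout_check('10')    # -> 10.0
        timeout_check('0')     # ArgumentTypeError: must be greater than 0.0s
        timeout_check('soon')  # ArgumentTypeError: must be a number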
     496 + 
     497 +async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
     498 + query_notify = Mock()
     499 + changes = {
     500 + 'disabled': False,
     501 + }
     502 + 
     503 + try:
     504 + check_data = [
     505 + (site.username_claimed, QueryStatus.CLAIMED),
     506 + (site.username_unclaimed, QueryStatus.AVAILABLE),
     507 + ]
     508 + except Exception as e:
     509 + logger.error(e)
     510 + logger.error(site.__dict__)
     511 + check_data = []
     512 + 
     513 + logger.info(f'Checking {site.name}...')
     514 + 
     515 + for username, status in check_data:
     516 + async with semaphore:
     517 + results_dict = await maigret(
     518 + username,
     519 + {site.name: site},
     520 + query_notify,
     521 + logger,
     522 + timeout=30,
     523 + id_type=site.type,
     524 + forced=True,
     525 + no_progressbar=True,
     526 + )
     527 + 
     528 + # don't disable entries with other id types
     529 + # TODO: implement proper checking
     530 + if site.name not in results_dict:
     531 + logger.info(results_dict)
     532 + changes['disabled'] = True
     533 + continue
     534 + 
     535 + result = results_dict[site.name]['status']
     536 + 
     537 + site_status = result.status
     538 + 
     539 + if site_status != status:
     540 + if site_status == QueryStatus.UNKNOWN:
     541 + msgs = site.absence_strs
     542 + etype = site.check_type
     543 + logger.warning(
     544 + f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
     545 + # don't disable in case of available username
     546 + if status == QueryStatus.CLAIMED:
     547 + changes['disabled'] = True
     548 + elif status == QueryStatus.CLAIMED:
     549 + logger.warning(f'`{username}` not found in {site.name}, but should be claimed')
     550 + logger.info(results_dict[site.name])
     551 + changes['disabled'] = True
     552 + else:
     553 + logger.warning(f'`{username}` found in {site.name}, but should be available')
     554 + logger.info(results_dict[site.name])
     555 + changes['disabled'] = True
     556 + 
     557 + logger.info(f'Finished checking site {site.name}')
     558 + 
     559 + if changes['disabled'] != site.disabled:
     560 + site.disabled = changes['disabled']
     561 + db.update_site(site)
     562 + if not silent:
     563 + action = 'Disabled' if site.disabled else 'Enabled'
     564 + print(f'{action} site {site.name}...')
     565 + 
     566 + return changes
     567 + 
     568 + 
     569 +async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False,
     570 + max_connections=10) -> bool:
     571 + sem = asyncio.Semaphore(max_connections)
     572 + tasks = []
     573 + all_sites = site_data
     574 + 
     575 + def disabled_count(lst):
     576 + return len(list(filter(lambda x: x.disabled, lst)))
     577 + 
     578 + disabled_old_count = disabled_count(all_sites.values())
     579 + 
     580 + for _, site in all_sites.items():
     581 + check_coro = site_self_check(site, logger, sem, db, silent)
     582 + future = asyncio.ensure_future(check_coro)
     583 + tasks.append(future)
     584 + 
     585 + for f in tqdm.asyncio.tqdm.as_completed(tasks):
     586 + await f
     587 + 
     588 + disabled_new_count = disabled_count(all_sites.values())
     589 + total_disabled = disabled_new_count - disabled_old_count
     590 + 
     591 + if total_disabled >= 0:
     592 + message = 'Disabled'
     593 + else:
     594 + message = 'Enabled'
     595 + total_disabled *= -1
     596 + 
     597 + if not silent:
     598 + print(
     599 + f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information')
     600 + 
     601 + return total_disabled != 0
     602 + 
    maigret/maigret.py
    skipped 1 line
    2 2  Maigret main module
    3 3  """
    4 4   
    5  -import asyncio
    6  -import logging
    7 5  import os
    8 6  import platform
    9  -import re
    10  -import ssl
    11 7  import sys
    12 8  from argparse import ArgumentParser, RawDescriptionHelpFormatter
    13 9   
    14  -import aiohttp
    15 10  import requests
    16  -import tqdm.asyncio
    17  -from aiohttp_socks import ProxyConnector
    18  -from mock import Mock
    19  -from python_socks import _errors as proxy_errors
    20  -from socid_extractor import parse, extract, __version__ as socid_version
     11 +from socid_extractor import parse, __version__ as socid_version
    21 12   
    22  -from .activation import ParsingActivator, import_aiohttp_cookies
     13 +from .checking import *
    23 14  from .notify import QueryNotifyPrint
    24 15  from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
    25 16   generate_report_context, save_txt_report
    26  -from .result import QueryResult, QueryStatus
    27  -from .sites import MaigretDatabase, MaigretSite
     17 +from .submit import submit_dialog
    28 18   
    29 19  __version__ = '0.1.13'
    30 20   
    31  -supported_recursive_search_ids = (
    32  - 'yandex_public_id',
    33  - 'gaia_id',
    34  - 'vk_id',
    35  - 'ok_id',
    36  - 'wikimapia_uid',
    37  -)
    38  - 
    39  -common_errors = {
    40  - '<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha',
    41  - 'Please stand by, while we are checking your browser': 'Cloudflare captcha',
    42  - '<title>Доступ ограничен</title>': 'Rostelecom censorship',
    43  - 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha',
    44  - 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection',
    45  - '404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': 'MegaFon 404 page',
    46  - 'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship',
    47  - 'Incapsula incident ID': 'Incapsula antibot protection',
    48  -}
    49  - 
    50  -unsupported_characters = '#'
    51  - 
    52  -async def get_response(request_future, site_name, logger):
    53  - html_text = None
    54  - status_code = 0
    55  - 
    56  - error_text = "General Unknown Error"
    57  - expection_text = None
    58  - 
    59  - try:
    60  - response = await request_future
    61  - 
    62  - status_code = response.status
    63  - response_content = await response.content.read()
    64  - charset = response.charset or 'utf-8'
    65  - decoded_content = response_content.decode(charset, 'ignore')
    66  - html_text = decoded_content
    67  - 
    68  - if status_code > 0:
    69  - error_text = None
    70  - 
    71  - logger.debug(html_text)
    72  - 
    73  - except asyncio.TimeoutError as errt:
    74  - error_text = "Timeout Error"
    75  - expection_text = str(errt)
    76  - except (ssl.SSLCertVerificationError, ssl.SSLError) as err:
    77  - error_text = "SSL Error"
    78  - expection_text = str(err)
    79  - except aiohttp.client_exceptions.ClientConnectorError as err:
    80  - error_text = "Error Connecting"
    81  - expection_text = str(err)
    82  - except aiohttp.http_exceptions.BadHttpMessage as err:
    83  - error_text = "HTTP Error"
    84  - expection_text = str(err)
    85  - except proxy_errors.ProxyError as err:
    86  - error_text = "Proxy Error"
    87  - expection_text = str(err)
    88  - except Exception as err:
    89  - logger.warning(f'Unhandled error while requesting {site_name}: {err}')
    90  - logger.debug(err, exc_info=True)
    91  - error_text = "Some Error"
    92  - expection_text = str(err)
    93  - 
    94  - # TODO: return only needed information
    95  - return html_text, status_code, error_text, expection_text
    96  - 
    97  - 
    98  -async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify):
    99  - async with semaphore:
    100  - site_obj = site_dict[sitename]
    101  - future = site_obj.request_future
    102  - if not future:
    103  - # ignore: search by incompatible id type
    104  - return
    105  - 
    106  - response = await get_response(request_future=future,
    107  - site_name=sitename,
    108  - logger=logger)
    109  - 
    110  - site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
    111  - 
    112  -# TODO: move info separate module
    113  -def detect_error_page(html_text, status_code, fail_flags, ignore_403):
    114  - # Detect service restrictions such as a country restriction
    115  - for flag, msg in fail_flags.items():
    116  - if flag in html_text:
    117  - return 'Some site error', msg
    118  - 
    119  - # Detect common restrictions such as provider censorship and bot protection
    120  - for flag, msg in common_errors.items():
    121  - if flag in html_text:
    122  - return 'Error', msg
    123  - 
    124  - # Detect common site errors
    125  - if status_code == 403 and not ignore_403:
    126  - return 'Access denied', 'Access denied, use proxy/vpn'
    127  - elif status_code >= 500:
    128  - return f'Error {status_code}', f'Site error {status_code}'
    129  - 
    130  - return None, None
    131  - 
    132  - 
    133  -def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
    134  - if not response:
    135  - return results_info
    136  - 
    137  - fulltags = site.tags
    138  - 
    139  - # Retrieve other site information again
    140  - username = results_info['username']
    141  - is_parsing_enabled = results_info['parsing_enabled']
    142  - url = results_info.get("url_user")
    143  - logger.debug(url)
    144  - 
    145  - status = results_info.get("status")
    146  - if status is not None:
    147  - # We have already determined the user doesn't exist here
    148  - return results_info
    149  - 
    150  - # Get the expected check type
    151  - check_type = site.check_type
    152  - 
    153  - # Get the failure messages and comments
    154  - failure_errors = site.errors
    155  - 
    156  - # TODO: refactor
    157  - if not response:
    158  - logger.error(f'No response for {site.name}')
    159  - return results_info
    160  - 
    161  - html_text, status_code, error_text, expection_text = response
    162  - site_error_text = '?'
    163  - 
    164  - # TODO: add elapsed request time counting
    165  - response_time = None
    166  - 
    167  - if logger.level == logging.DEBUG:
    168  - with open('debug.txt', 'a') as f:
    169  - status = status_code or 'No response'
    170  - f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n')
    171  - if html_text:
    172  - f.write(f'code: {status}\nresponse: {str(html_text)}\n')
    173  - 
    174  - if status_code and not error_text:
    175  - error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
    176  - site.ignore_403)
    177  - 
    178  - if site.activation and html_text:
    179  - is_need_activation = any([s for s in site.activation['marks'] if s in html_text])
    180  - if is_need_activation:
    181  - method = site.activation['method']
    182  - try:
    183  - activate_fun = getattr(ParsingActivator(), method)
    184  - # TODO: async call
    185  - activate_fun(site, logger)
    186  - except AttributeError:
    187  - logger.warning(f'Activation method {method} for site {site.name} not found!')
    188  - 
    189  - # presense flags
    190  - # True by default
    191  - presense_flags = site.presense_strs
    192  - is_presense_detected = False
    193  - if html_text:
    194  - if not presense_flags:
    195  - is_presense_detected = True
    196  - site.stats['presense_flag'] = None
    197  - else:
    198  - for presense_flag in presense_flags:
    199  - if presense_flag in html_text:
    200  - is_presense_detected = True
    201  - site.stats['presense_flag'] = presense_flag
    202  - logger.info(presense_flag)
    203  - break
    204  - 
    205  - if error_text is not None:
    206  - logger.debug(error_text)
    207  - result = QueryResult(username,
    208  - site.name,
    209  - url,
    210  - QueryStatus.UNKNOWN,
    211  - query_time=response_time,
    212  - context=f'{error_text}: {site_error_text}', tags=fulltags)
    213  - elif check_type == "message":
    214  - absence_flags = site.absence_strs
    215  - is_absence_flags_list = isinstance(absence_flags, list)
    216  - absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags}
    217  - # Checks if the error message is in the HTML
    218  - is_absence_detected = any([(absence_flag in html_text) for absence_flag in absence_flags_set])
    219  - if not is_absence_detected and is_presense_detected:
    220  - result = QueryResult(username,
    221  - site.name,
    222  - url,
    223  - QueryStatus.CLAIMED,
    224  - query_time=response_time, tags=fulltags)
    225  - else:
    226  - result = QueryResult(username,
    227  - site.name,
    228  - url,
    229  - QueryStatus.AVAILABLE,
    230  - query_time=response_time, tags=fulltags)
    231  - elif check_type == "status_code":
    232  - # Checks if the status code of the response is 2XX
    233  - if (not status_code >= 300 or status_code < 200) and is_presense_detected:
    234  - result = QueryResult(username,
    235  - site.name,
    236  - url,
    237  - QueryStatus.CLAIMED,
    238  - query_time=response_time, tags=fulltags)
    239  - else:
    240  - result = QueryResult(username,
    241  - site.name,
    242  - url,
    243  - QueryStatus.AVAILABLE,
    244  - query_time=response_time, tags=fulltags)
    245  - elif check_type == "response_url":
    246  - # For this detection method, we have turned off the redirect.
    247  - # So, there is no need to check the response URL: it will always
    248  - # match the request. Instead, we will ensure that the response
    249  - # code indicates that the request was successful (i.e. no 404, or
    250  - # forward to some odd redirect).
    251  - if 200 <= status_code < 300 and is_presense_detected:
    252  - result = QueryResult(username,
    253  - site.name,
    254  - url,
    255  - QueryStatus.CLAIMED,
    256  - query_time=response_time, tags=fulltags)
    257  - else:
    258  - result = QueryResult(username,
    259  - site.name,
    260  - url,
    261  - QueryStatus.AVAILABLE,
    262  - query_time=response_time, tags=fulltags)
    263  - else:
    264  - # It should be impossible to ever get here...
    265  - raise ValueError(f"Unknown check type '{check_type}' for "
    266  - f"site '{site.name}'")
    267  - 
    268  - extracted_ids_data = {}
    269  - 
    270  - if is_parsing_enabled and result.status == QueryStatus.CLAIMED:
    271  - try:
    272  - extracted_ids_data = extract(html_text)
    273  - except Exception as e:
    274  - logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True)
    275  - 
    276  - if extracted_ids_data:
    277  - new_usernames = {}
    278  - for k, v in extracted_ids_data.items():
    279  - if 'username' in k:
    280  - new_usernames[v] = 'username'
    281  - if k in supported_recursive_search_ids:
    282  - new_usernames[v] = k
    283  - 
    284  - results_info['ids_usernames'] = new_usernames
    285  - result.ids_data = extracted_ids_data
    286  - 
    287  - # Notify caller about results of query.
    288  - query_notify.update(result, site.similar_search)
    289  - 
    290  - # Save status of request
    291  - results_info['status'] = result
    292  - 
    293  - # Save results from request
    294  - results_info['http_status'] = status_code
    295  - results_info['is_similar'] = site.similar_search
    296  - # results_site['response_text'] = html_text
    297  - results_info['rank'] = site.alexa_rank
    298  - return results_info
    299  - 
    300  - 
    301  - 
    302  - 
    303  -async def maigret(username, site_dict, query_notify, logger,
    304  - proxy=None, timeout=None, recursive_search=False,
    305  - id_type='username', debug=False, forced=False,
    306  - max_connections=100, no_progressbar=False,
    307  - cookies=None):
    308  - """Main search func
    309  - 
    310  - Checks for existence of username on various social media sites.
    311  - 
    312  - Keyword Arguments:
    313  - username -- String indicating username that report
    314  - should be created against.
    315  - site_dict -- Dictionary containing all of the site data.
    316  - query_notify -- Object with base type of QueryNotify().
    317  - This will be used to notify the caller about
    318  - query results.
    319  - proxy -- String indicating the proxy URL
    320  - timeout -- Time in seconds to wait before timing out request.
    321  - Default is no timeout.
    322  - recursive_search -- Search for other usernames in website pages & recursive search by them.
    323  - 
    324  - Return Value:
    325  - Dictionary containing results from report. Key of dictionary is the name
    326  - of the social network site, and the value is another dictionary with
    327  - the following keys:
    328  - url_main: URL of main site.
    329  - url_user: URL of user on site (if account exists).
    330  - status: QueryResult() object indicating results of test for
    331  - account existence.
    332  - http_status: HTTP status code of query which checked for existence on
    333  - site.
    334  - response_text: Text that came back from request. May be None if
    335  - there was an HTTP error when checking for existence.
    336  - """
    337  - 
    338  - # Notify caller that we are starting the query.
    339  - query_notify.start(username, id_type)
    340  - 
    341  - # TODO: connector
    342  - connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
    343  - # connector = aiohttp.TCPConnector(ssl=False)
    344  - connector.verify_ssl=False
    345  - 
    346  - cookie_jar = None
    347  - if cookies:
    348  - cookie_jar = await import_aiohttp_cookies(cookies)
    349  - 
    350  - session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar)
    351  - 
    352  - if logger.level == logging.DEBUG:
    353  - future = session.get(url='https://icanhazip.com')
    354  - ip, status, error, expection = await get_response(future, None, logger)
    355  - if ip:
    356  - logger.debug(f'My IP is: {ip.strip()}')
    357  - else:
    358  - logger.debug(f'IP requesting {error}: {expection}')
    359  - 
    360  - 
    361  - # Results from analysis of all sites
    362  - results_total = {}
    363  - 
    364  - # First create futures for all requests. This allows for the requests to run in parallel
    365  - for site_name, site in site_dict.items():
    366  - 
    367  - if site.type != id_type:
    368  - continue
    369  - 
    370  - if site.disabled and not forced:
    371  - logger.debug(f'Site {site.name} is disabled, skipping...')
    372  - continue
    373  - 
    374  - # Results from analysis of this specific site
    375  - results_site = {}
    376  - 
    377  - # Record URL of main site and username
    378  - results_site['username'] = username
    379  - results_site['parsing_enabled'] = recursive_search
    380  - results_site['url_main'] = site.url_main
    381  - results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
    382  - 
    383  - headers = {
    384  - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
    385  - }
    386  - 
    387  - headers.update(site.headers)
    388  - 
    389  - if not 'url' in site.__dict__:
    390  - logger.error('No URL for site %s', site.name)
    391  - # URL of user on site (if it exists)
    392  - url = site.url.format(
    393  - urlMain=site.url_main,
    394  - urlSubpath=site.url_subpath,
    395  - username=username
    396  - )
    397  - # workaround to prevent slash errors
    398  - url = re.sub('(?<!:)/+', '/', url)
    399  - 
    400  - # Don't make request if username is invalid for the site
    401  - if site.regex_check and re.search(site.regex_check, username) is None:
    402  - # No need to do the check at the site: this user name is not allowed.
    403  - results_site['status'] = QueryResult(username,
    404  - site_name,
    405  - url,
    406  - QueryStatus.ILLEGAL)
    407  - results_site["url_user"] = ""
    408  - results_site['http_status'] = ""
    409  - results_site['response_text'] = ""
    410  - query_notify.update(results_site['status'])
    411  - else:
    412  - # URL of user on site (if it exists)
    413  - results_site["url_user"] = url
    414  - url_probe = site.url_probe
    415  - if url_probe is None:
    416  - # Probe URL is normal one seen by people out on the web.
    417  - url_probe = url
    418  - else:
    419  - # There is a special URL for probing existence separate
    420  - # from where the user profile normally can be found.
    421  - url_probe = url_probe.format(
    422  - urlMain=site.url_main,
    423  - urlSubpath=site.url_subpath,
    424  - username=username,
    425  - )
    426  - 
    427  - for k, v in site.get_params.items():
    428  - url_probe += f'&{k}={v}'
    429  - 
    430  - if site.check_type == 'status_code' and site.request_head_only:
    431  - # In most cases when we are detecting by status code,
    432  - # it is not necessary to get the entire body: we can
    433  - # detect fine with just the HEAD response.
    434  - request_method = session.head
    435  - else:
    436  - # Either this detect method needs the content associated
    437  - # with the GET response, or this specific website will
    438  - # not respond properly unless we request the whole page.
    439  - request_method = session.get
    440  - 
    441  - if site.check_type == "response_url":
    442  - # Site forwards request to a different URL if username not
    443  - # found. Disallow the redirect so we can capture the
    444  - # http status from the original URL request.
    445  - allow_redirects = False
    446  - else:
    447  - # Allow whatever redirect that the site wants to do.
    448  - # The final result of the request will be what is available.
    449  - allow_redirects = True
    450  - 
    451  - future = request_method(url=url_probe, headers=headers,
    452  - allow_redirects=allow_redirects,
    453  - timeout=timeout,
    454  - )
    455  - 
    456  - # Store future in data for access later
    457  - # TODO: move to separate obj
    458  - site.request_future = future
    459  - 
    460  - # Add this site's results into final dictionary with all of the other results.
    461  - results_total[site_name] = results_site
    462  - 
    463  - # TODO: move into top-level function
    464  - 
    465  - sem = asyncio.Semaphore(max_connections)
    466  - 
    467  - tasks = []
    468  - for sitename, result_obj in results_total.items():
    469  - update_site_coro = update_site_dict_from_response(sitename, site_dict, result_obj, sem, logger, query_notify)
    470  - future = asyncio.ensure_future(update_site_coro)
    471  - tasks.append(future)
    472  - 
    473  - if no_progressbar:
    474  - await asyncio.gather(*tasks)
    475  - else:
    476  - for f in tqdm.asyncio.tqdm.as_completed(tasks):
    477  - await f
    478  - 
    479  - await session.close()
    480  - 
    481  - # Notify caller that all queries are finished.
    482  - query_notify.finish()
    483  - 
    484  - return results_total
    485  - 
    486  - 
    487  -def timeout_check(value):
    488  - """Check Timeout Argument.
    489  - 
    490  - Checks timeout for validity.
    491  - 
    492  - Keyword Arguments:
    493  - value -- Time in seconds to wait before timing out request.
    494  - 
    495  - Return Value:
    496  - Floating point number representing the time (in seconds) that should be
    497  - used for the timeout.
    498  - 
    499  - NOTE: Will raise an exception if the timeout in invalid.
    500  - """
    501  - from argparse import ArgumentTypeError
    502  - 
    503  - try:
    504  - timeout = float(value)
    505  - except ValueError:
    506  - raise ArgumentTypeError(f"Timeout '{value}' must be a number.")
    507  - if timeout <= 0:
    508  - raise ArgumentTypeError(f"Timeout '{value}' must be greater than 0.0s.")
    509  - return timeout
    510  - 
    511  - 
    512  -async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    513  - query_notify = Mock()
    514  - changes = {
    515  - 'disabled': False,
    516  - }
    517  - 
    518  - try:
    519  - check_data = [
    520  - (site.username_claimed, QueryStatus.CLAIMED),
    521  - (site.username_unclaimed, QueryStatus.AVAILABLE),
    522  - ]
    523  - except:
    524  - print(site.__dict__)
    525  - 
    526  - logger.info(f'Checking {site.name}...')
    527  - 
    528  - for username, status in check_data:
    529  - async with semaphore:
    530  - results_dict = await maigret(
    531  - username,
    532  - {site.name: site},
    533  - query_notify,
    534  - logger,
    535  - timeout=30,
    536  - id_type=site.type,
    537  - forced=True,
    538  - no_progressbar=True,
    539  - )
    540  - 
    541  - # don't disable entries with other ids types
    542  - # TODO: make normal checking
    543  - if site.name not in results_dict:
    544  - logger.info(results_dict)
    545  - changes['disabled'] = True
    546  - continue
    547  - 
    548  - result = results_dict[site.name]['status']
    549  - 
    550  - 
    551  - site_status = result.status
    552  - 
    553  - if site_status != status:
    554  - if site_status == QueryStatus.UNKNOWN:
    555  - msgs = site.absence_strs
    556  - etype = site.check_type
    557  - logger.warning(f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
    558  - # don't disable in case of available username
    559  - if status == QueryStatus.CLAIMED:
    560  - changes['disabled'] = True
    561  - elif status == QueryStatus.CLAIMED:
    562  - logger.warning(f'Not found `{username}` in {site.name}, must be claimed')
    563  - logger.info(results_dict[site.name])
    564  - changes['disabled'] = True
    565  - else:
    566  - logger.warning(f'Found `{username}` in {site.name}, must be available')
    567  - logger.info(results_dict[site.name])
    568  - changes['disabled'] = True
    569  - 
    570  - logger.info(f'Site {site.name} checking is finished')
    571  - 
    572  - if changes['disabled'] != site.disabled:
    573  - site.disabled = changes['disabled']
    574  - db.update_site(site)
    575  - if not silent:
    576  - action = 'Disabled' if site.disabled else 'Enabled'
    577  - print(f'{action} site {site.name}...')
    578  - 
    579  - return changes
    580  - 
    581  - 
    582  -async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False,
    583  - max_connections=10) -> bool:
    584  - sem = asyncio.Semaphore(max_connections)
    585  - tasks = []
    586  - all_sites = site_data
    587  - 
    588  - def disabled_count(lst):
    589  - return len(list(filter(lambda x: x.disabled, lst)))
    590  - 
    591  - disabled_old_count = disabled_count(all_sites.values())
    592  - 
    593  - for _, site in all_sites.items():
    594  - check_coro = site_self_check(site, logger, sem, db, silent)
    595  - future = asyncio.ensure_future(check_coro)
    596  - tasks.append(future)
    597  - 
    598  - for f in tqdm.asyncio.tqdm.as_completed(tasks):
    599  - await f
    600  - 
    601  - disabled_new_count = disabled_count(all_sites.values())
    602  - total_disabled = disabled_new_count - disabled_old_count
    603  - 
    604  - if total_disabled >= 0:
    605  - message = 'Disabled'
    606  - else:
    607  - message = 'Enabled'
    608  - total_disabled *= -1
    609  - 
    610  - if not silent:
    611  - print(f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information')
    612  - 
    613  - return total_disabled != 0
    614  - 
    615 21   
    616 22  async def main():
    617 23   version_string = '\n'.join([
    skipped 67 lines
    685 91   action="store_true", dest="print_check_errors", default=False,
    686 92   help="Print errors messages: connection, captcha, site country ban, etc."
    687 93   )
     94 + parser.add_argument("--submit",
     95 + type=str, dest="new_site_to_submit", default=False,
     96 + help="URL of existing profile in new site to submit."
     97 + )
    688 98   parser.add_argument("--no-color",
    689 99   action="store_true", dest="no_color", default=False,
    690 100   help="Don't color terminal output"
    skipped 47 lines
    738 148   action="store_true", dest="html", default=False,
    739 149   help="Create an HTML report file (general report on all usernames)."
    740 150   )
    741  - parser.add_argument("-X","--xmind",
     151 + parser.add_argument("-X", "--xmind",
    742 152   action="store_true",
    743 153   dest="xmind", default=False,
    744 154   help="Generate an XMind 8 mindmap report (one report per username)."
    skipped 75 lines
    820 230   
    821 231   site_data = get_top_sites_for_id(args.id_type)
    822 232   
     233 + if args.new_site_to_submit:
     234 + is_submitted = await submit_dialog(db, args.new_site_to_submit)
     235 + if is_submitted:
     236 + db.save_to_file(args.json_file)
     237 + 
    823 238   # Database self-checking
    824 239   if args.self_check:
    825 240   print('Maigret sites database self-checking...')
    skipped 48 lines
    874 289   
    875 290   if found_unsupported_chars:
    876 291   pretty_chars_str = ','.join(map(lambda s: f'"{s}"', found_unsupported_chars))
    877  - query_notify.warning(f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"')
     292 + query_notify.warning(
     293 + f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"')
    878 294   continue
    879 295   
    880 296   sites_to_check = get_top_sites_for_id(id_type)
    skipped 71 lines
    952 368   print('Maigret is interrupted.')
    953 369   sys.exit(1)
    954 370   
     371 + 
    955 372  if __name__ == "__main__":
    956 373   run()
     374 + 
    maigret/resources/data.json
    skipped 13589 lines
    13590 13590   "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
    13591 13591   "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    13592 13592   "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    13593  - "x-guest-token": "1358064134064140290"
     13593 + "x-guest-token": "1358893858789208065"
    13594 13594   },
    13595 13595   "errors": {
    13596 13596   "Bad guest token": "x-guest-token update required"
    skipped 359 lines
    13956 13956   "video"
    13957 13957   ],
    13958 13958   "headers": {
    13959  - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTI2MjQ4NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.kgp8r380d1aDWcd-ROncr0Tqf8EdA-l35EeEY9is6TI"
     13959 + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTI4MjE0MjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.TXUhqilVT25xN4lZeoki6hEmbtcOiy7FKxTm5PWOMVs"
    13960 13960   },
    13961 13961   "activation": {
    13962 13962   "url": "https://vimeo.com/_rv/viewer",
    skipped 9106 lines
    23069 23069   "url": "https://protovary.style/user/{username}/",
    23070 23070   "urlMain": "https://protovary.style",
    23071 23071   "usernameClaimed": "alex",
     23072 + "usernameUnclaimed": "noonewouldeverusethis7"
     23073 + },
     23074 + "beacons.ai": {
     23075 + "checkType": "message",
     23076 + "presenseStrs": [
     23077 + "https://cdn.beacons.ai/profile_pictures"
     23078 + ],
     23079 + "absenceStrs": [
     23080 + "https://beacons.ai/bw_logo_full.png"
     23081 + ],
     23082 + "url": "https://beacons.ai/{username}",
     23083 + "urlMain": "https://beacons.ai",
     23084 + "usernameClaimed": "pasteljellies",
     23085 + "usernameUnclaimed": "noonewouldeverusethis7"
     23086 + },
     23087 + "are.na": {
     23088 + "checkType": "message",
     23089 + "presenseStrs": [
     23090 + "Profile--view"
     23091 + ],
     23092 + "absenceStrs": [
     23093 + "Are.na home"
     23094 + ],
     23095 + "url": "https://www.are.na/{username}",
     23096 + "urlMain": "https://www.are.na",
     23097 + "usernameClaimed": "nate-cassel",
    23072 23098   "usernameUnclaimed": "noonewouldeverusethis7"
    23073 23099   }
    23074 23100   },
    skipped 104 lines
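    Both new entries (beacons.ai, are.na) use the "message" check type. Per
    process_site_result in checking.py, an account counts as claimed only if
    a presenseStrs marker is found and no absenceStrs marker is; a rough
    sketch of that decision for the beacons.ai entry (html_text is a stub
    standing in for the fetched profile page):

        html_text = '... https://cdn.beacons.ai/profile_pictures ...'
        presence = ['https://cdn.beacons.ai/profile_pictures']   # presenseStrs
        absence = ['https://beacons.ai/bw_logo_full.png']        # absenceStrs

        is_present = any(p in html_text for p in presence)
        is_absent = any(a in html_text for a in absence)
        claimed = is_present and not is_absent   # True -> QueryStatus.CLAIMED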
    maigret/submit.py
     1 +import difflib
     2 + 
     3 +import requests
     4 +from mock import Mock
     5 + 
     6 +from .checking import *
     7 + 
     8 +DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
     9 + "birthday", "репутация", "информация", "e-mail"]
     10 + 
     11 +RATIO = 0.6
     12 +TOP_FEATURES = 5
     13 + 
     14 + 
     15 +def get_match_ratio(x):
     16 + return round(max([
     17 + difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
     18 + for y in DESIRED_STRINGS
     19 + ]), 2)
     20 + 
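    get_match_ratio scores a token by its best difflib similarity to any of
    the DESIRED_STRINGS, so profile-related markup rises to the top of the
    sorts in submit_dialog below; roughly:

        get_match_ratio('user-name')   # high, nearly matches "username"
        get_match_ratio('stylesheet')  # low, far from every desired string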
     21 + 
     22 +def extract_domain(url):
     23 + return '/'.join(url.split('/', 3)[:3])
     24 + 
     25 + 
     26 +async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
     27 + query_notify = Mock()
     28 + changes = {
     29 + 'disabled': False,
     30 + }
     31 + 
     32 + check_data = [
     33 + (site.username_claimed, QueryStatus.CLAIMED),
     34 + (site.username_unclaimed, QueryStatus.AVAILABLE),
     35 + ]
     36 + 
     37 + logger.info(f'Checking {site.name}...')
     38 + 
     39 + for username, status in check_data:
     40 + async with semaphore:
     41 + results_dict = await maigret(
     42 + username,
     43 + {site.name: site},
     44 + query_notify,
     45 + logger,
     46 + timeout=30,
     47 + id_type=site.type,
     48 + forced=True,
     49 + no_progressbar=True,
     50 + )
     51 + 
      52 + # don't disable entries with other id types
      53 + # TODO: implement proper checking
     54 + if site.name not in results_dict:
     55 + logger.info(results_dict)
     56 + changes['disabled'] = True
     57 + continue
     58 + 
     59 + result = results_dict[site.name]['status']
     60 + 
     61 + site_status = result.status
     62 + 
     63 + if site_status != status:
     64 + if site_status == QueryStatus.UNKNOWN:
     65 + msgs = site.absence_strs
     66 + etype = site.check_type
     67 + logger.warning(
     68 + f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
     69 + # don't disable in case of available username
     70 + if status == QueryStatus.CLAIMED:
     71 + changes['disabled'] = True
     72 + elif status == QueryStatus.CLAIMED:
      73 + logger.warning(f'`{username}` not found in {site.name}, but should be claimed')
     74 + logger.info(results_dict[site.name])
     75 + changes['disabled'] = True
     76 + else:
      77 + logger.warning(f'`{username}` found in {site.name}, but should be available')
     78 + logger.info(results_dict[site.name])
     79 + changes['disabled'] = True
     80 + 
      81 + logger.info(f'Finished checking site {site.name}')
     82 + 
     83 + return changes
     84 + 
     85 + 
     86 +async def submit_dialog(db, url_exists):
     87 + url_parts = url_exists.split('/')
     88 + supposed_username = url_parts[-1]
     89 + new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
     90 + if new_name:
     91 + supposed_username = new_name
     92 + non_exist_username = 'noonewouldeverusethis7'
     93 + 
     94 + url_user = url_exists.replace(supposed_username, '{username}')
     95 + url_not_exists = url_exists.replace(supposed_username, non_exist_username)
     96 + 
     97 + a = requests.get(url_exists).text
     98 + b = requests.get(url_not_exists).text
     99 + 
     100 + tokens_a = set(a.split('"'))
     101 + tokens_b = set(b.split('"'))
     102 + 
     103 + a_minus_b = tokens_a.difference(tokens_b)
     104 + b_minus_a = tokens_b.difference(tokens_a)
     105 + 
      106 + raw_count = input(f'Specify the count of features to extract [default {TOP_FEATURES}]: ')
      107 + # empty input falls back to the default instead of crashing int()
      108 + top_features_count = int(raw_count) if raw_count else TOP_FEATURES
     109 + 
     110 + presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count]
     111 + 
      112 + print('Detected text features of an existing account: ' + ', '.join(presence_list))
      113 + features = input('If the features were not detected correctly, enter them manually (comma-separated): ')
     114 + 
     115 + if features:
     116 + presence_list = features.split(',')
     117 + 
     118 + absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[:top_features_count]
      119 + print('Detected text features of a non-existing account: ' + ', '.join(absence_list))
      120 + features = input('If the features were not detected correctly, enter them manually (comma-separated): ')
     121 + 
     122 + if features:
     123 + absence_list = features.split(',')
     124 + 
     125 + url_main = extract_domain(url_exists)
     126 + 
     127 + site_data = {
     128 + 'absenceStrs': absence_list,
     129 + 'presenseStrs': presence_list,
     130 + 'url': url_user,
     131 + 'urlMain': url_main,
     132 + 'usernameClaimed': supposed_username,
     133 + 'usernameUnclaimed': non_exist_username,
     134 + 'checkType': 'message',
     135 + }
     136 + 
     137 + site = MaigretSite(url_main.split('/')[-1], site_data)
     138 + 
     139 + print(site.__dict__)
     140 + 
     141 + sem = asyncio.Semaphore(1)
     142 + log_level = logging.INFO
     143 + logging.basicConfig(
     144 + format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
     145 + datefmt='%H:%M:%S',
     146 + level=log_level
     147 + )
     148 + logger = logging.getLogger('site-submit')
     149 + logger.setLevel(log_level)
     150 + 
     151 + result = await site_self_check(site, logger, sem, db)
     152 + 
     153 + if result['disabled']:
      154 + print(f'Sorry, we couldn\'t find params to detect account presence/absence for {site.name}.')
      155 + print('Try running this mode again with a higher feature count, or choose other features.')
     156 + else:
      157 + if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [yY] ') in ('y', 'Y'):
     158 + db.update_site(site)
     159 + return True
     160 + 
     161 + return False
     162 + 