maigret · commit 5ee91f66
  • Introduced `--retries` flag, performed a thorough refactoring, updated the sites list, added linting of test scripts

  • Soxoj committed 3 years ago
    5ee91f66
    1 parent 7fd4a2c5
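The headline change is the retry mechanism: the CLI gains a `--retries` flag (default 1), and `maigret.checking.maigret()` gains a matching `retries=` argument that restarts requests which failed with a temporary error (see `TEMPORARY_ERRORS_TYPES` in `maigret/errors.py`). Below is a minimal, hypothetical usage sketch of the Python API, built from the test fixtures in this commit; the example site database and username are illustrative, and the function's default timeout and identifier type are assumed to be acceptable.

```python
import asyncio
import logging

from maigret.checking import maigret as maigret_search
from maigret.sites import MaigretDatabase

# Tiny site database mirroring tests/test_maigret.py; a real run would
# load the full maigret/resources/data.json instead.
EXAMPLE_DB = {
    'engines': {},
    'sites': {
        "GooglePlayStore": {
            "tags": ["global", "us"],
            "disabled": False,
            "checkType": "status_code",
            "alexaRank": 1,
            "url": "https://play.google.com/store/apps/developer?id={username}",
            "urlMain": "https://play.google.com/store",
            "usernameClaimed": "Facebook_nosuchname",
            "usernameUnclaimed": "noonewouldeverusethis7",
        },
    },
}


async def run():
    logger = logging.getLogger("maigret-retries-example")
    db = MaigretDatabase()
    db.load_from_json(EXAMPLE_DB)
    site_dict = {site.name: site for site in db.sites}

    # retries=2 allows up to two extra rounds of checks for sites whose
    # requests failed with a temporary error (timeouts, lost connections,
    # proxy errors); permanent errors are not retried (errors.is_permanent).
    results = await maigret_search(
        username="Facebook_nosuchname",
        site_dict=site_dict,
        logger=logger,
        no_progressbar=True,
        retries=2,
    )

    for sitename, wrapper in results.items():
        print(sitename, wrapper.get("status"))


asyncio.run(run())
```

The equivalent CLI invocation would be something like `./maigret.py <username> --retries 2`.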
  • README.md
    skipped 25 lines
    26 26  * Search by tags (site categories, countries)
    27 27  * Censorship and captcha detection
    28 28  * Very few false positives
     29 +* Restarts of failed requests
    29 30   
    30 31  ## Installation
    31 32   
    skipped 17 lines
    49 50  git clone https://github.com/soxoj/maigret && cd maigret
    50 51  ```
    51 52   
    52  -You can use your a free virtual machine, the repo will be automatically cloned:
     53 +You can use a free virtual machine, the repo will be automatically cloned:
    53 54   
    54 55  [![Open in Cloud Shell](https://user-images.githubusercontent.com/27065646/92304704-8d146d80-ef80-11ea-8c29-0deaabb1c702.png)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [![Run on Repl.it](https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png)](https://repl.it/github/soxoj/maigret)
    55 56  <a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
    skipped 59 lines
  • format.sh
    1 1  #!/bin/sh
    2  -FILES="maigret wizard.py maigret.py"
     2 +FILES="maigret wizard.py maigret.py tests"
    3 3   
    4 4  echo 'black'
    5 5  black --skip-string-normalization $FILES
  • lint.sh
    1 1  #!/bin/sh
    2  -FILES="maigret wizard.py maigret.py"
     2 +FILES="maigret wizard.py maigret.py tests"
    3 3   
    4 4  echo 'syntax errors or undefined names'
    5 5  flake8 --count --select=E9,F63,F7,F82 --show-source --statistics $FILES
    skipped 2 lines
    8 8  flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 $FILES
    9 9   
    10 10  echo 'mypy'
    11  -mypy ./maigret
     11 +mypy ./maigret ./wizard.py ./tests
  • maigret/checking.py
    skipped 4 lines
    5 5  import ssl
    6 6  import sys
    7 7  import tqdm
    8  -from typing import Tuple, Optional
     8 +from typing import Tuple, Optional, Dict, List
    9 9   
    10 10  import aiohttp
    11 11  import tqdm.asyncio
    skipped 4 lines
    16 16  from .activation import ParsingActivator, import_aiohttp_cookies
    17 17  from . import errors
    18 18  from .errors import CheckError
    19  -from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
     19 +from .executors import (
     20 + AsyncExecutor,
     21 + AsyncioSimpleExecutor,
     22 + AsyncioProgressbarQueueExecutor,
     23 +)
    20 24  from .result import QueryResult, QueryStatus
    21 25  from .sites import MaigretDatabase, MaigretSite
     26 +from .types import QueryOptions, QueryResultWrapper
    22 27  from .utils import get_random_user_agent
    23 28   
    24 29   
    skipped 10 lines
    35 40  unsupported_characters = "#"
    36 41   
    37 42   
    38  -async def get_response(
    39  - request_future, site_name, logger
    40  -) -> Tuple[str, int, Optional[CheckError]]:
     43 +async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]:
    41 44   html_text = None
    42 45   status_code = 0
    43  - error: Optional[CheckError] = CheckError("Error")
     46 + error: Optional[CheckError] = CheckError("Unknown")
    44 47   
    45 48   try:
    46 49   response = await request_future
    skipped 29 lines
    76 79   ):
    77 80   error = CheckError("SSL", str(e))
    78 81   else:
    79  - logger.warning(f"Unhandled error while requesting {site_name}: {e}")
    80 82   logger.debug(e, exc_info=True)
    81  - error = CheckError("Error", str(e))
     83 + error = CheckError("Unexpected", str(e))
    82 84   
    83  - # TODO: return only needed information
    84 85   return str(html_text), status_code, error
    85 86   
    86 87   
    87  -async def update_site_dict_from_response(
    88  - sitename, site_dict, results_info, logger, query_notify
    89  -):
    90  - site_obj = site_dict[sitename]
    91  - future = site_obj.request_future
    92  - if not future:
    93  - # ignore: search by incompatible id type
    94  - return
    95  - 
    96  - response = await get_response(
    97  - request_future=future, site_name=sitename, logger=logger
    98  - )
    99  - 
    100  - return sitename, process_site_result(
    101  - response, query_notify, logger, results_info, site_obj
    102  - )
    103  - 
    104  - 
    105 88  # TODO: move to separate class
    106 89  def detect_error_page(
    107 90   html_text, status_code, fail_flags, ignore_403
    skipped 19 lines
    127 110   
    128 111   
    129 112  def process_site_result(
    130  - response, query_notify, logger, results_info, site: MaigretSite
     113 + response, query_notify, logger, results_info: QueryResultWrapper, site: MaigretSite
    131 114  ):
    132 115   if not response:
    133 116   return results_info
    skipped 71 lines
    205 188   logger.debug(presense_flag)
    206 189   break
    207 190   
     191 + def build_result(status, **kwargs):
     192 + return QueryResult(
     193 + username,
     194 + site_name,
     195 + url,
     196 + status,
     197 + query_time=response_time,
     198 + tags=fulltags,
     199 + **kwargs,
     200 + )
     201 + 
    208 202   if check_error:
    209 203   logger.debug(check_error)
    210 204   result = QueryResult(
    skipped 7 lines
    218 212   tags=fulltags,
    219 213   )
    220 214   elif check_type == "message":
    221  - absence_flags = site.absence_strs
    222  - is_absence_flags_list = isinstance(absence_flags, list)
    223  - absence_flags_set = (
    224  - set(absence_flags) if is_absence_flags_list else {absence_flags}
    225  - )
    226 215   # Checks if the error message is in the HTML
    227 216   is_absence_detected = any(
    228  - [(absence_flag in html_text) for absence_flag in absence_flags_set]
     217 + [(absence_flag in html_text) for absence_flag in site.absence_strs]
    229 218   )
    230 219   if not is_absence_detected and is_presense_detected:
    231  - result = QueryResult(
    232  - username,
    233  - site_name,
    234  - url,
    235  - QueryStatus.CLAIMED,
    236  - query_time=response_time,
    237  - tags=fulltags,
    238  - )
     220 + result = build_result(QueryStatus.CLAIMED)
    239 221   else:
    240  - result = QueryResult(
    241  - username,
    242  - site_name,
    243  - url,
    244  - QueryStatus.AVAILABLE,
    245  - query_time=response_time,
    246  - tags=fulltags,
    247  - )
     222 + result = build_result(QueryStatus.AVAILABLE)
    248 223   elif check_type == "status_code":
    249 224   # Checks if the status code of the response is 2XX
    250  - if (not status_code >= 300 or status_code < 200) and is_presense_detected:
    251  - result = QueryResult(
    252  - username,
    253  - site_name,
    254  - url,
    255  - QueryStatus.CLAIMED,
    256  - query_time=response_time,
    257  - tags=fulltags,
    258  - )
     225 + if is_presense_detected and (not status_code >= 300 or status_code < 200):
     226 + result = build_result(QueryStatus.CLAIMED)
    259 227   else:
    260  - result = QueryResult(
    261  - username,
    262  - site_name,
    263  - url,
    264  - QueryStatus.AVAILABLE,
    265  - query_time=response_time,
    266  - tags=fulltags,
    267  - )
     228 + result = build_result(QueryStatus.AVAILABLE)
    268 229   elif check_type == "response_url":
    269 230   # For this detection method, we have turned off the redirect.
    270 231   # So, there is no need to check the response URL: it will always
    skipped 1 lines
    272 233   # code indicates that the request was successful (i.e. no 404, or
    273 234   # forward to some odd redirect).
    274 235   if 200 <= status_code < 300 and is_presense_detected:
    275  - result = QueryResult(
    276  - username,
    277  - site_name,
    278  - url,
    279  - QueryStatus.CLAIMED,
    280  - query_time=response_time,
    281  - tags=fulltags,
    282  - )
     236 + result = build_result(QueryStatus.CLAIMED)
    283 237   else:
    284  - result = QueryResult(
    285  - username,
    286  - site_name,
    287  - url,
    288  - QueryStatus.AVAILABLE,
    289  - query_time=response_time,
    290  - tags=fulltags,
    291  - )
     238 + result = build_result(QueryStatus.AVAILABLE)
    292 239   else:
    293 240   # It should be impossible to ever get here...
    294 241   raise ValueError(
    skipped 34 lines
    329 276   return results_info
    330 277   
    331 278   
     279 +def make_site_result(
     280 + site: MaigretSite, username: str, options: QueryOptions, logger
     281 +) -> QueryResultWrapper:
     282 + results_site: QueryResultWrapper = {}
     283 + 
     284 + # Record URL of main site and username
     285 + results_site["site"] = site
     286 + results_site["username"] = username
     287 + results_site["parsing_enabled"] = options["parsing"]
     288 + results_site["url_main"] = site.url_main
     289 + results_site["cookies"] = (
     290 + options.get("cookie_jar")
     291 + and options["cookie_jar"].filter_cookies(site.url_main)
     292 + or None
     293 + )
     294 + 
     295 + headers = {
     296 + "User-Agent": get_random_user_agent(),
     297 + }
     298 + 
     299 + headers.update(site.headers)
     300 + 
     301 + if "url" not in site.__dict__:
     302 + logger.error("No URL for site %s", site.name)
     303 + 
     304 + # URL of user on site (if it exists)
     305 + url = site.url.format(
     306 + urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
     307 + )
     308 + 
     309 + # workaround to prevent slash errors
     310 + url = re.sub("(?<!:)/+", "/", url)
     311 + 
     312 + session = options['session']
     313 + 
     314 + # site check is disabled
     315 + if site.disabled and not options['forced']:
     316 + logger.debug(f"Site {site.name} is disabled, skipping...")
     317 + results_site["status"] = QueryResult(
     318 + username,
     319 + site.name,
     320 + url,
     321 + QueryStatus.ILLEGAL,
     322 + error=CheckError("Check is disabled"),
     323 + )
     324 + # current username type could not be applied
     325 + elif site.type != options["id_type"]:
     326 + results_site["status"] = QueryResult(
     327 + username,
     328 + site.name,
     329 + url,
     330 + QueryStatus.ILLEGAL,
     331 + error=CheckError('Unsupported identifier type', f'Want "{site.type}"'),
     332 + )
     333 + # username is not allowed.
     334 + elif site.regex_check and re.search(site.regex_check, username) is None:
     335 + results_site["status"] = QueryResult(
     336 + username,
     337 + site.name,
     338 + url,
     339 + QueryStatus.ILLEGAL,
     340 + error=CheckError(
     341 + 'Unsupported username format', f'Want "{site.regex_check}"'
     342 + ),
     343 + )
     344 + results_site["url_user"] = ""
     345 + results_site["http_status"] = ""
     346 + results_site["response_text"] = ""
     347 + # query_notify.update(results_site["status"])
     348 + else:
     349 + # URL of user on site (if it exists)
     350 + results_site["url_user"] = url
     351 + url_probe = site.url_probe
     352 + if url_probe is None:
     353 + # Probe URL is normal one seen by people out on the web.
     354 + url_probe = url
     355 + else:
     356 + # There is a special URL for probing existence separate
     357 + # from where the user profile normally can be found.
     358 + url_probe = url_probe.format(
     359 + urlMain=site.url_main,
     360 + urlSubpath=site.url_subpath,
     361 + username=username,
     362 + )
     363 + 
     364 + for k, v in site.get_params.items():
     365 + url_probe += f"&{k}={v}"
     366 + 
     367 + if site.check_type == "status_code" and site.request_head_only:
     368 + # In most cases when we are detecting by status code,
     369 + # it is not necessary to get the entire body: we can
     370 + # detect fine with just the HEAD response.
     371 + request_method = session.head
     372 + else:
     373 + # Either this detect method needs the content associated
     374 + # with the GET response, or this specific website will
     375 + # not respond properly unless we request the whole page.
     376 + request_method = session.get
     377 + 
     378 + if site.check_type == "response_url":
     379 + # Site forwards request to a different URL if username not
     380 + # found. Disallow the redirect so we can capture the
     381 + # http status from the original URL request.
     382 + allow_redirects = False
     383 + else:
     384 + # Allow whatever redirect that the site wants to do.
     385 + # The final result of the request will be what is available.
     386 + allow_redirects = True
     387 + 
     388 + future = request_method(
     389 + url=url_probe,
     390 + headers=headers,
     391 + allow_redirects=allow_redirects,
     392 + timeout=options['timeout'],
     393 + )
     394 + 
     395 + # Store future request object in the results object
     396 + results_site["future"] = future
     397 + 
     398 + return results_site
     399 + 
     400 + 
     401 +async def check_site_for_username(
     402 + site, username, options: QueryOptions, logger, query_notify, *args, **kwargs
     403 +) -> Tuple[str, QueryResultWrapper]:
     404 + default_result = make_site_result(site, username, options, logger)
     405 + future = default_result.get("future")
     406 + if not future:
     407 + return site.name, default_result
     408 + 
     409 + response = await get_response(request_future=future, logger=logger)
     410 + 
     411 + response_result = process_site_result(
     412 + response, query_notify, logger, default_result, site
     413 + )
     414 + 
     415 + return site.name, response_result
     416 + 
     417 + 
     418 +async def debug_ip_request(session, logger):
     419 + future = session.get(url="https://icanhazip.com")
     420 + ip, status, check_error = await get_response(future, logger)
     421 + if ip:
     422 + logger.debug(f"My IP is: {ip.strip()}")
     423 + else:
     424 + logger.debug(f"IP requesting {check_error.type}: {check_error.desc}")
     425 + 
     426 + 
     427 +def get_failed_sites(results: Dict[str, QueryResultWrapper]) -> List[str]:
     428 + sites = []
     429 + for sitename, r in results.items():
     430 + status = r.get('status', {})
     431 + if status and status.error:
     432 + if errors.is_permanent(status.error.type):
     433 + continue
     434 + sites.append(sitename)
     435 + return sites
     436 + 
     437 + 
    332 438  async def maigret(
    333  - username,
    334  - site_dict,
     439 + username: str,
     440 + site_dict: Dict[str, MaigretSite],
    335 441   logger,
    336 442   query_notify=None,
    337 443   proxy=None,
    skipped 5 lines
    343 449   max_connections=100,
    344 450   no_progressbar=False,
    345 451   cookies=None,
    346  -):
     452 + retries=0,
     453 +) -> QueryResultWrapper:
    347 454   """Main search func
    348 455   
    349 456   Checks for existence of username on certain sites.
    350 457   
    351 458   Keyword Arguments:
    352 459   username -- Username string will be used for search.
    353  - site_dict -- Dictionary containing sites data.
     460 + site_dict -- Dictionary containing sites data in MaigretSite objects.
    354 461   query_notify -- Object with base type of QueryNotify().
    355 462   This will be used to notify the caller about
    356 463   query results.
    skipped 23 lines
    380 487   there was an HTTP error when checking for existence.
    381 488   """
    382 489   
    383  - # Notify caller that we are starting the query.
     490 + # notify caller that we are starting the query.
    384 491   if not query_notify:
    385 492   query_notify = Mock()
    386 493   
    387 494   query_notify.start(username, id_type)
    388 495   
    389  - # TODO: connector
     496 + # make http client session
    390 497   connector = (
    391 498   ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
    392 499   )
    393  - # connector = aiohttp.TCPConnector(ssl=False)
    394 500   connector.verify_ssl = False
    395 501   
    396 502   cookie_jar = None
    skipped 6 lines
    403 509   )
    404 510   
    405 511   if logger.level == logging.DEBUG:
    406  - future = session.get(url="https://icanhazip.com")
    407  - ip, status, check_error = await get_response(future, None, logger)
    408  - if ip:
    409  - logger.debug(f"My IP is: {ip.strip()}")
    410  - else:
    411  - logger.debug(f"IP requesting {check_error[0]}: {check_error[1]}")
    412  - 
    413  - # Results from analysis of all sites
    414  - results_total = {}
     512 + await debug_ip_request(session, logger)
    415 513   
    416  - # First create futures for all requests. This allows for the requests to run in parallel
    417  - for site_name, site in site_dict.items():
     514 + # setup parallel executor
     515 + executor: Optional[AsyncExecutor] = None
     516 + if no_progressbar:
     517 + executor = AsyncioSimpleExecutor(logger=logger)
     518 + else:
     519 + executor = AsyncioProgressbarQueueExecutor(
     520 + logger=logger, in_parallel=max_connections, timeout=timeout + 0.5
     521 + )
    418 522   
    419  - if site.type != id_type:
    420  - continue
     523 + # make options objects for all the requests
     524 + options: QueryOptions = {}
     525 + options["cookies"] = cookie_jar
     526 + options["session"] = session
     527 + options["parsing"] = is_parsing_enabled
     528 + options["timeout"] = timeout
     529 + options["id_type"] = id_type
     530 + options["forced"] = forced
    421 531   
    422  - if site.disabled and not forced:
    423  - logger.debug(f"Site {site.name} is disabled, skipping...")
    424  - continue
     532 + # results from analysis of all sites
     533 + all_results: Dict[str, QueryResultWrapper] = {}
    425 534   
    426  - # Results from analysis of this specific site
    427  - results_site = {}
     535 + sites = list(site_dict.keys())
    428 536   
    429  - # Record URL of main site and username
    430  - results_site["username"] = username
    431  - results_site["parsing_enabled"] = is_parsing_enabled
    432  - results_site["url_main"] = site.url_main
    433  - results_site["cookies"] = (
    434  - cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
    435  - )
     537 + attempts = retries + 1
     538 + while attempts:
     539 + tasks_dict = {}
    436 540   
    437  - headers = {
    438  - "User-Agent": get_random_user_agent(),
    439  - }
    440  - 
    441  - headers.update(site.headers)
    442  - 
    443  - if "url" not in site.__dict__:
    444  - logger.error("No URL for site %s", site.name)
    445  - # URL of user on site (if it exists)
    446  - url = site.url.format(
    447  - urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
    448  - )
    449  - # workaround to prevent slash errors
    450  - url = re.sub("(?<!:)/+", "/", url)
    451  - 
    452  - # Don't make request if username is invalid for the site
    453  - if site.regex_check and re.search(site.regex_check, username) is None:
    454  - # No need to do the check at the site: this user name is not allowed.
    455  - results_site["status"] = QueryResult(
    456  - username, site_name, url, QueryStatus.ILLEGAL
     541 + for sitename, site in site_dict.items():
     542 + if sitename not in sites:
     543 + continue
     544 + default_result: QueryResultWrapper = {
     545 + 'site': site,
     546 + 'status': QueryResult(
     547 + username,
     548 + sitename,
     549 + '',
     550 + QueryStatus.UNKNOWN,
     551 + error=CheckError('Request failed'),
     552 + ),
     553 + }
     554 + tasks_dict[sitename] = (
     555 + check_site_for_username,
     556 + [site, username, options, logger, query_notify],
     557 + {'default': (sitename, default_result)},
    457 558   )
    458  - results_site["url_user"] = ""
    459  - results_site["http_status"] = ""
    460  - results_site["response_text"] = ""
    461  - query_notify.update(results_site["status"])
    462  - else:
    463  - # URL of user on site (if it exists)
    464  - results_site["url_user"] = url
    465  - url_probe = site.url_probe
    466  - if url_probe is None:
    467  - # Probe URL is normal one seen by people out on the web.
    468  - url_probe = url
    469  - else:
    470  - # There is a special URL for probing existence separate
    471  - # from where the user profile normally can be found.
    472  - url_probe = url_probe.format(
    473  - urlMain=site.url_main,
    474  - urlSubpath=site.url_subpath,
    475  - username=username,
    476  - )
    477 559   
    478  - for k, v in site.get_params.items():
    479  - url_probe += f"&{k}={v}"
     560 + cur_results = await executor.run(tasks_dict.values())
    480 561   
    481  - if site.check_type == "status_code" and site.request_head_only:
    482  - # In most cases when we are detecting by status code,
    483  - # it is not necessary to get the entire body: we can
    484  - # detect fine with just the HEAD response.
    485  - request_method = session.head
    486  - else:
    487  - # Either this detect method needs the content associated
    488  - # with the GET response, or this specific website will
    489  - # not respond properly unless we request the whole page.
    490  - request_method = session.get
     562 + # wait for executor timeout errors
     563 + await asyncio.sleep(1)
    491 564   
    492  - if site.check_type == "response_url":
    493  - # Site forwards request to a different URL if username not
    494  - # found. Disallow the redirect so we can capture the
    495  - # http status from the original URL request.
    496  - allow_redirects = False
    497  - else:
    498  - # Allow whatever redirect that the site wants to do.
    499  - # The final result of the request will be what is available.
    500  - allow_redirects = True
     565 + all_results.update(cur_results)
    501 566   
    502  - future = request_method(
    503  - url=url_probe,
    504  - headers=headers,
    505  - allow_redirects=allow_redirects,
    506  - timeout=timeout,
    507  - )
     567 + sites = get_failed_sites(dict(cur_results))
     568 + attempts -= 1
    508 569   
    509  - # Store future in data for access later
    510  - # TODO: move to separate obj
    511  - site.request_future = future
     570 + if not sites:
     571 + break
    512 572   
    513  - # Add this site's results into final dictionary with all of the other results.
    514  - results_total[site_name] = results_site
    515  - 
    516  - coroutines = []
    517  - for sitename, result_obj in results_total.items():
    518  - coroutines.append(
    519  - (
    520  - update_site_dict_from_response,
    521  - [sitename, site_dict, result_obj, logger, query_notify],
    522  - {},
     573 + if attempts:
     574 + query_notify.warning(
     575 + f'Restarting checks for {len(sites)} sites... ({attempts} attempts left)'
    523 576   )
    524  - )
    525 577   
    526  - if no_progressbar:
    527  - executor = AsyncioSimpleExecutor(logger=logger)
    528  - else:
    529  - executor = AsyncioProgressbarQueueExecutor(
    530  - logger=logger, in_parallel=max_connections, timeout=timeout + 0.5
    531  - )
    532  - 
    533  - results = await executor.run(coroutines)
    534  - 
     578 + # closing http client session
    535 579   await session.close()
    536 580   
    537  - # Notify caller that all queries are finished.
     581 + # notify caller that all queries are finished
    538 582   query_notify.finish()
    539 583   
    540  - data = {}
    541  - for result in results:
    542  - # TODO: still can be empty
    543  - if result:
    544  - try:
    545  - data[result[0]] = result[1]
    546  - except Exception as e:
    547  - logger.error(e, exc_info=True)
    548  - logger.info(result)
    549  - 
    550  - return data
     584 + return all_results
    551 585   
    552 586   
    553 587  def timeout_check(value):
    skipped 21 lines
    575 609   return timeout
    576 610   
    577 611   
    578  -async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
     612 +async def site_self_check(
     613 + site: MaigretSite, logger, semaphore, db: MaigretDatabase, silent=False
     614 +):
    579 615   changes = {
    580 616   "disabled": False,
    581 617   }
    skipped 20 lines
    602 638   id_type=site.type,
    603 639   forced=True,
    604 640   no_progressbar=True,
     641 + retries=1,
    605 642   )
    606 643   
    607 644   # don't disable entries with other ids types
    skipped 80 lines
  • maigret/errors.py
    skipped 56 lines
    57 57   'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
    58 58  }
    59 59   
     60 +TEMPORARY_ERRORS_TYPES = [
     61 + 'Request timeout',
     62 + 'Unknown',
     63 + 'Request failed',
     64 + 'Connecting failure',
     65 + 'HTTP',
     66 + 'Proxy',
     67 + 'Interrupted',
     68 + 'Connection lost',
     69 +]
     70 + 
    60 71  THRESHOLD = 3 # percent
    61 72   
    62 73   
    skipped 1 lines
    64 75   return err_data['perc'] >= THRESHOLD
    65 76   
    66 77   
    67  -def is_not_permanent(err_data):
    68  - return True
     78 +def is_permanent(err_type):
     79 + return err_type not in TEMPORARY_ERRORS_TYPES
    69 80   
    70 81   
    71 82  def detect(text):
    skipped 34 lines
  • maigret/executors.py
    skipped 92 lines
    93 93   try:
    94 94   result = await asyncio.wait_for(query_task, timeout=self.timeout)
    95 95   except asyncio.TimeoutError:
    96  - result = None
     96 + result = kwargs.get('default')
    97 97   
    98 98   self.results.append(result)
    99 99   self.progress.update(1)
    skipped 20 lines
  • maigret/maigret.py
    skipped 58 lines
    59 59   )
    60 60   
    61 61   
    62  -async def main():
     62 +def setup_arguments_parser():
    63 63   version_string = '\n'.join(
    64 64   [
    65 65   f'%(prog)s {__version__}',
    skipped 83 lines
    149 149   "On the other hand, this may cause a long delay to gather all results. ",
    150 150   )
    151 151   parser.add_argument(
     152 + "--retries",
     153 + action="store",
     154 + type=int,
     155 + metavar='RETRIES',
     156 + default=1,
     157 + help="Number of attempts to restart temporarily failed requests.",
     158 + )
     159 + parser.add_argument(
    152 160   "-n",
    153 161   "--max-connections",
    154 162   action="store",
    skipped 179 lines
    334 342   help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
    335 343   " (one report per username).",
    336 344   )
     345 + return parser
    337 346   
    338  - args = parser.parse_args()
     347 + 
     348 +async def main():
     349 + arg_parser = setup_arguments_parser()
     350 + args = arg_parser.parse_args()
    339 351   
    340 352   # Logging
    341 353   log_level = logging.ERROR
    skipped 186 lines
    528 540   forced=args.use_disabled_sites,
    529 541   max_connections=args.connections,
    530 542   no_progressbar=args.no_progressbar,
     543 + retries=args.retries,
    531 544   )
    532 545   
    533 546   notify_about_errors(results, query_notify)
    skipped 79 lines
  • maigret/resources/data.json
    Diff is too large to be displayed.
  • maigret/sites.py
    skipped 2 lines
    3 3  import copy
    4 4  import json
    5 5  import sys
    6  -from typing import Optional
     6 +from typing import Optional, List, Dict, Any
    7 7   
    8 8  import requests
    9 9   
    skipped 47 lines
    57 57   
    58 58   
    59 59  class MaigretEngine:
     60 + site: Dict[str, Any] = {}
     61 + 
    60 62   def __init__(self, name, data):
    61 63   self.name = name
    62  - self.site = {}
    63 64   self.__dict__.update(data)
    64 65   
    65 66   @property
    skipped 12 lines
    78 79   "urlRegexp",
    79 80   ]
    80 81   
    81  - def __init__(self, name, information):
    82  - self.name = name
     82 + username_claimed = ""
     83 + username_unclaimed = ""
     84 + url_subpath = ""
     85 + url_main = ""
     86 + url = ""
     87 + disabled = False
     88 + similar_search = False
     89 + ignore403 = False
     90 + tags: List[str] = []
    83 91   
    84  - self.disabled = False
    85  - self.similar_search = False
    86  - self.ignore403 = False
    87  - self.tags = []
     92 + type = "username"
     93 + headers: Dict[str, str] = {}
     94 + errors: Dict[str, str] = {}
     95 + activation: Dict[str, Any] = {}
     96 + regex_check = None
     97 + url_probe = None
     98 + check_type = ""
     99 + request_head_only = ""
     100 + get_params: Dict[str, Any] = {}
    88 101   
    89  - self.type = "username"
    90  - self.headers = {}
    91  - self.errors = {}
    92  - self.activation = {}
    93  - self.url_subpath = ""
    94  - self.regex_check = None
    95  - self.url_probe = None
    96  - self.check_type = ""
    97  - self.request_head_only = ""
    98  - self.get_params = {}
     102 + presense_strs: List[str] = []
     103 + absence_strs: List[str] = []
     104 + stats: Dict[str, Any] = {}
    99 105   
    100  - self.presense_strs = []
    101  - self.absence_strs = []
    102  - self.stats = {}
     106 + engine = None
     107 + engine_data: Dict[str, Any] = {}
     108 + engine_obj: Optional["MaigretEngine"] = None
     109 + request_future = None
     110 + alexa_rank = None
     111 + source = None
    103 112   
    104  - self.engine = None
    105  - self.engine_data = {}
    106  - self.engine_obj = None
    107  - self.request_future = None
    108  - self.alexa_rank = None
    109  - self.source = None
     113 + def __init__(self, name, information):
     114 + self.name = name
     115 + self.url_subpath = ""
    110 116   
    111 117   for k, v in information.items():
    112 118   self.__dict__[CaseConverter.camel_to_snake(k)] = v
    skipped 80 lines
    193 199   self.url_regexp = None
    194 200   
    195 201   self_copy = copy.deepcopy(self)
    196  - engine_data = self_copy.engine_obj.site
     202 + engine_data = self_copy.engine_obj and self_copy.engine_obj.site or {}
    197 203   site_data_keys = list(self_copy.__dict__.keys())
    198 204   
    199 205   for k in engine_data.keys():
    skipped 247 lines
  • ■ ■ ■ ■ ■
    maigret/types.py
    1  -from typing import Callable, Any, Tuple
     1 +from typing import Callable, List, Dict, Tuple, Any
    2 2   
    3 3   
    4 4  # search query
    5  -QueryDraft = Tuple[Callable, Any, Any]
     5 +QueryDraft = Tuple[Callable, List, Dict]
     6 + 
     7 +# options dict
     8 +QueryOptions = Dict[str, Any]
     9 + 
     10 +# TODO: throw out
     11 +QueryResultWrapper = Dict[str, Any]
    6 12   
  • sites.md
    Diff is too large to be displayed.
  • tests/conftest.py
    skipped 25 lines
    26 26   
    27 27  def remove_test_reports():
    28 28   reports_list = get_test_reports_filenames()
    29  - for f in reports_list: os.remove(f)
     29 + for f in reports_list:
     30 + os.remove(f)
    30 31   logging.error(f'Removed test reports {reports_list}')
    31 32   
    32 33   
    skipped 13 lines
  • tests/test_activation.py
    skipped 43 lines
    44 44   
    45 45   url = 'https://httpbin.org/cookies'
    46 46   connector = aiohttp.TCPConnector(ssl=False)
    47  - session = aiohttp.ClientSession(connector=connector, trust_env=True,
    48  - cookie_jar=cookie_jar)
     47 + session = aiohttp.ClientSession(
     48 + connector=connector, trust_env=True, cookie_jar=cookie_jar
     49 + )
    49 50   
    50 51   response = await session.get(url=url)
    51 52   result = json.loads(await response.content.read())
    skipped 4 lines
  • tests/test_executors.py
    skipped 1 lines
    2 2  import pytest
    3 3  import asyncio
    4 4  import logging
    5  -from maigret.executors import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, \
    6  - AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor
     5 +from maigret.executors import (
     6 + AsyncioSimpleExecutor,
     7 + AsyncioProgressbarExecutor,
     8 + AsyncioProgressbarSemaphoreExecutor,
     9 + AsyncioProgressbarQueueExecutor,
     10 +)
    7 11   
    8 12  logger = logging.getLogger(__name__)
     13 + 
    9 14   
    10 15  async def func(n):
    11 16   await asyncio.sleep(0.1 * (n % 3))
    skipped 7 lines
    19 24   assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    20 25   assert executor.execution_time > 0.2
    21 26   assert executor.execution_time < 0.3
     27 + 
    22 28   
    23 29  @pytest.mark.asyncio
    24 30  async def test_asyncio_progressbar_executor():
    skipped 40 lines
    65 71   assert await executor.run(tasks) == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
    66 72   assert executor.execution_time > 0.2
    67 73   assert executor.execution_time < 0.3
     74 + 
  • tests/test_maigret.py
    skipped 7 lines
    8 8  from maigret.sites import MaigretDatabase
    9 9   
    10 10  EXAMPLE_DB = {
    11  - 'engines': {
    12  - },
     11 + 'engines': {},
    13 12   'sites': {
    14 13   "GooglePlayStore": {
    15  - "tags": [
    16  - "global",
    17  - "us"
    18  - ],
     14 + "tags": ["global", "us"],
    19 15   "disabled": False,
    20 16   "checkType": "status_code",
    21 17   "alexaRank": 1,
    22 18   "url": "https://play.google.com/store/apps/developer?id={username}",
    23 19   "urlMain": "https://play.google.com/store",
    24 20   "usernameClaimed": "Facebook_nosuchname",
    25  - "usernameUnclaimed": "noonewouldeverusethis7"
     21 + "usernameUnclaimed": "noonewouldeverusethis7",
    26 22   },
    27 23   "Reddit": {
    28  - "tags": [
    29  - "news",
    30  - "social",
    31  - "us"
    32  - ],
     24 + "tags": ["news", "social", "us"],
    33 25   "checkType": "status_code",
    34  - "presenseStrs": [
    35  - "totalKarma"
    36  - ],
     26 + "presenseStrs": ["totalKarma"],
    37 27   "disabled": True,
    38 28   "alexaRank": 17,
    39 29   "url": "https://www.reddit.com/user/{username}",
    40 30   "urlMain": "https://www.reddit.com/",
    41 31   "usernameClaimed": "blue",
    42  - "usernameUnclaimed": "noonewouldeverusethis7"
     32 + "usernameUnclaimed": "noonewouldeverusethis7",
    43 33   },
    44  - }
     34 + },
    45 35  }
    46 36   
    47 37   
    skipped 60 lines
  • tests/test_report.py
    skipped 6 lines
    7 7  import xmind
    8 8  from jinja2 import Template
    9 9   
    10  -from maigret.report import generate_csv_report, generate_txt_report, save_xmind_report, save_html_report, \
    11  - save_pdf_report, generate_report_template, generate_report_context, generate_json_report
     10 +from maigret.report import (
     11 + generate_csv_report,
     12 + generate_txt_report,
     13 + save_xmind_report,
     14 + save_html_report,
     15 + save_pdf_report,
     16 + generate_report_template,
     17 + generate_report_context,
     18 + generate_json_report,
     19 +)
    12 20  from maigret.result import QueryResult, QueryStatus
    13 21   
    14 22  EXAMPLE_RESULTS = {
    skipped 2 lines
    17 25   'parsing_enabled': True,
    18 26   'url_main': 'https://www.github.com/',
    19 27   'url_user': 'https://www.github.com/test',
    20  - 'status': QueryResult('test',
    21  - 'GitHub',
    22  - 'https://www.github.com/test',
    23  - QueryStatus.CLAIMED,
    24  - tags=['test_tag']),
     28 + 'status': QueryResult(
     29 + 'test',
     30 + 'GitHub',
     31 + 'https://www.github.com/test',
     32 + QueryStatus.CLAIMED,
     33 + tags=['test_tag'],
     34 + ),
    25 35   'http_status': 200,
    26 36   'is_similar': False,
    27  - 'rank': 78
     37 + 'rank': 78,
    28 38   }
    29 39  }
    30 40   
    skipped 2 lines
    33 43   
    34 44  GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
    35 45  GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
    36  -GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415",
    37  - "username": "alexaimephotographycars", "name": "Alex Aim\u00e9",
    38  - "website": "www.flickr.com/photos/alexaimephotography/",
    39  - "facebook_link": " www.instagram.com/street.reality.photography/",
    40  - "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
     46 +GOOD_500PX_RESULT.ids_data = {
     47 + "uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==",
     48 + "legacy_id": "26403415",
     49 + "username": "alexaimephotographycars",
     50 + "name": "Alex Aim\u00e9",
     51 + "website": "www.flickr.com/photos/alexaimephotography/",
     52 + "facebook_link": " www.instagram.com/street.reality.photography/",
     53 + "instagram_username": "alexaimephotography",
     54 + "twitter_username": "Alexaimephotogr",
     55 +}
    41 56   
    42 57  GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
    43 58  GOOD_REDDIT_RESULT.tags = ['news', 'us']
    44  -GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography",
    45  - "fullname": "alexaimephotography",
    46  - "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
    47  - "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True",
    48  - "has_user_profile": "True", "hide_from_robots": "False",
    49  - "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
     59 +GOOD_REDDIT_RESULT.ids_data = {
     60 + "reddit_id": "t5_1nytpy",
     61 + "reddit_username": "alexaimephotography",
     62 + "fullname": "alexaimephotography",
     63 + "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
     64 + "is_employee": "False",
     65 + "is_nsfw": "False",
     66 + "is_mod": "True",
     67 + "is_following": "True",
     68 + "has_user_profile": "True",
     69 + "hide_from_robots": "False",
     70 + "created_at": "2019-07-10 12:20:03",
     71 + "total_karma": "53959",
     72 + "post_karma": "52738",
     73 +}
    50 74   
    51 75  GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
    52 76  GOOD_IG_RESULT.tags = ['photo', 'global']
    53  -GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography",
    54  - "id": "6828488620",
    55  - "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
    56  - "bio": "Photographer \nChild of fine street arts",
    57  - "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
     77 +GOOD_IG_RESULT.ids_data = {
     78 + "instagram_username": "alexaimephotography",
     79 + "fullname": "Alexaimephotography",
     80 + "id": "6828488620",
     81 + "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
     82 + "bio": "Photographer \nChild of fine street arts",
     83 + "external_url": "https://www.flickr.com/photos/alexaimephotography2020/",
     84 +}
    58 85   
    59 86  GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
    60 87  GOOD_TWITTER_RESULT.tags = ['social', 'us']
    61 88   
    62  -TEST = [('alexaimephotographycars', 'username', {
    63  - '500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
    64  - 'url_user': 'https://500px.com/p/alexaimephotographycars',
    65  - 'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username',
    66  - 'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200,
    67  - 'is_similar': False, 'rank': 2981},
    68  - 'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
    69  - 'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT,
    70  - 'http_status': 404, 'is_similar': False, 'rank': 17},
    71  - 'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
    72  - 'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400,
    73  - 'is_similar': False, 'rank': 55},
    74  - 'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True,
    75  - 'url_main': 'https://www.instagram.com/',
    76  - 'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT,
    77  - 'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {
    78  - '500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
    79  - 'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200,
    80  - 'is_similar': False, 'rank': 2981},
    81  - 'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
    82  - 'url_user': 'https://www.reddit.com/user/alexaimephotography',
    83  - 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200,
    84  - 'is_similar': False, 'rank': 17},
    85  - 'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
    86  - 'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400,
    87  - 'is_similar': False, 'rank': 55},
    88  - 'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
    89  - 'url_user': 'https://www.instagram.com/alexaimephotography',
    90  - 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200,
    91  - 'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {
    92  - '500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
    93  - 'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200,
    94  - 'is_similar': False, 'rank': 2981},
    95  - 'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
    96  - 'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
    97  - 'is_similar': False, 'rank': 17},
    98  - 'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
    99  - 'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400,
    100  - 'is_similar': False, 'rank': 55},
    101  - 'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
    102  - 'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
    103  - 'is_similar': False, 'rank': 29}})]
     89 +TEST = [
     90 + (
     91 + 'alexaimephotographycars',
     92 + 'username',
     93 + {
     94 + '500px': {
     95 + 'username': 'alexaimephotographycars',
     96 + 'parsing_enabled': True,
     97 + 'url_main': 'https://500px.com/',
     98 + 'url_user': 'https://500px.com/p/alexaimephotographycars',
     99 + 'ids_usernames': {
     100 + 'alexaimephotographycars': 'username',
     101 + 'alexaimephotography': 'username',
     102 + 'Alexaimephotogr': 'username',
     103 + },
     104 + 'status': GOOD_500PX_RESULT,
     105 + 'http_status': 200,
     106 + 'is_similar': False,
     107 + 'rank': 2981,
     108 + },
     109 + 'Reddit': {
     110 + 'username': 'alexaimephotographycars',
     111 + 'parsing_enabled': True,
     112 + 'url_main': 'https://www.reddit.com/',
     113 + 'url_user': 'https://www.reddit.com/user/alexaimephotographycars',
     114 + 'status': BAD_RESULT,
     115 + 'http_status': 404,
     116 + 'is_similar': False,
     117 + 'rank': 17,
     118 + },
     119 + 'Twitter': {
     120 + 'username': 'alexaimephotographycars',
     121 + 'parsing_enabled': True,
     122 + 'url_main': 'https://www.twitter.com/',
     123 + 'url_user': 'https://twitter.com/alexaimephotographycars',
     124 + 'status': BAD_RESULT,
     125 + 'http_status': 400,
     126 + 'is_similar': False,
     127 + 'rank': 55,
     128 + },
     129 + 'Instagram': {
     130 + 'username': 'alexaimephotographycars',
     131 + 'parsing_enabled': True,
     132 + 'url_main': 'https://www.instagram.com/',
     133 + 'url_user': 'https://www.instagram.com/alexaimephotographycars',
     134 + 'status': BAD_RESULT,
     135 + 'http_status': 404,
     136 + 'is_similar': False,
     137 + 'rank': 29,
     138 + },
     139 + },
     140 + ),
     141 + (
     142 + 'alexaimephotography',
     143 + 'username',
     144 + {
     145 + '500px': {
     146 + 'username': 'alexaimephotography',
     147 + 'parsing_enabled': True,
     148 + 'url_main': 'https://500px.com/',
     149 + 'url_user': 'https://500px.com/p/alexaimephotography',
     150 + 'status': BAD_RESULT,
     151 + 'http_status': 200,
     152 + 'is_similar': False,
     153 + 'rank': 2981,
     154 + },
     155 + 'Reddit': {
     156 + 'username': 'alexaimephotography',
     157 + 'parsing_enabled': True,
     158 + 'url_main': 'https://www.reddit.com/',
     159 + 'url_user': 'https://www.reddit.com/user/alexaimephotography',
     160 + 'ids_usernames': {'alexaimephotography': 'username'},
     161 + 'status': GOOD_REDDIT_RESULT,
     162 + 'http_status': 200,
     163 + 'is_similar': False,
     164 + 'rank': 17,
     165 + },
     166 + 'Twitter': {
     167 + 'username': 'alexaimephotography',
     168 + 'parsing_enabled': True,
     169 + 'url_main': 'https://www.twitter.com/',
     170 + 'url_user': 'https://twitter.com/alexaimephotography',
     171 + 'status': BAD_RESULT,
     172 + 'http_status': 400,
     173 + 'is_similar': False,
     174 + 'rank': 55,
     175 + },
     176 + 'Instagram': {
     177 + 'username': 'alexaimephotography',
     178 + 'parsing_enabled': True,
     179 + 'url_main': 'https://www.instagram.com/',
     180 + 'url_user': 'https://www.instagram.com/alexaimephotography',
     181 + 'ids_usernames': {'alexaimephotography': 'username'},
     182 + 'status': GOOD_IG_RESULT,
     183 + 'http_status': 200,
     184 + 'is_similar': False,
     185 + 'rank': 29,
     186 + },
     187 + },
     188 + ),
     189 + (
     190 + 'Alexaimephotogr',
     191 + 'username',
     192 + {
     193 + '500px': {
     194 + 'username': 'Alexaimephotogr',
     195 + 'parsing_enabled': True,
     196 + 'url_main': 'https://500px.com/',
     197 + 'url_user': 'https://500px.com/p/Alexaimephotogr',
     198 + 'status': BAD_RESULT,
     199 + 'http_status': 200,
     200 + 'is_similar': False,
     201 + 'rank': 2981,
     202 + },
     203 + 'Reddit': {
     204 + 'username': 'Alexaimephotogr',
     205 + 'parsing_enabled': True,
     206 + 'url_main': 'https://www.reddit.com/',
     207 + 'url_user': 'https://www.reddit.com/user/Alexaimephotogr',
     208 + 'status': BAD_RESULT,
     209 + 'http_status': 404,
     210 + 'is_similar': False,
     211 + 'rank': 17,
     212 + },
     213 + 'Twitter': {
     214 + 'username': 'Alexaimephotogr',
     215 + 'parsing_enabled': True,
     216 + 'url_main': 'https://www.twitter.com/',
     217 + 'url_user': 'https://twitter.com/Alexaimephotogr',
     218 + 'status': GOOD_TWITTER_RESULT,
     219 + 'http_status': 400,
     220 + 'is_similar': False,
     221 + 'rank': 55,
     222 + },
     223 + 'Instagram': {
     224 + 'username': 'Alexaimephotogr',
     225 + 'parsing_enabled': True,
     226 + 'url_main': 'https://www.instagram.com/',
     227 + 'url_user': 'https://www.instagram.com/Alexaimephotogr',
     228 + 'status': BAD_RESULT,
     229 + 'http_status': 404,
     230 + 'is_similar': False,
     231 + 'rank': 29,
     232 + },
     233 + },
     234 + ),
     235 +]
    104 236   
    105 237  SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
    106 238   
    skipped 80 lines
    187 319   assert data['topic']['topics'][0]['title'] == 'Undefined'
    188 320   assert data['topic']['topics'][1]['title'] == 'test_tag'
    189 321   assert len(data['topic']['topics'][1]['topics']) == 1
    190  - assert data['topic']['topics'][1]['topics'][0]['label'] == 'https://www.github.com/test'
     322 + assert (
     323 + data['topic']['topics'][1]['topics'][0]['label']
     324 + == 'https://www.github.com/test'
     325 + )
    191 326   
    192 327   
    193 328  def test_html_report():
    skipped 18 lines
  • tests/test_sites.py
    skipped 9 lines
    10 10   "The specified member cannot be found. Please enter a member's entire name.",
    11 11   ],
    12 12   "checkType": "message",
    13  - "errors": {
    14  - "You must be logged-in to do that.": "Login required"
    15  - },
    16  - "url": "{urlMain}{urlSubpath}/members/?username={username}"
    17  - }
     13 + "errors": {"You must be logged-in to do that.": "Login required"},
     14 + "url": "{urlMain}{urlSubpath}/members/?username={username}",
     15 + },
    18 16   },
    19 17   },
    20 18   'sites': {
    21 19   "Amperka": {
    22 20   "engine": "XenForo",
    23 21   "rank": 121613,
    24  - "tags": [
    25  - "ru"
    26  - ],
     22 + "tags": ["ru"],
    27 23   "urlMain": "http://forum.amperka.ru",
    28 24   "usernameClaimed": "adam",
    29  - "usernameUnclaimed": "noonewouldeverusethis7"
     25 + "usernameUnclaimed": "noonewouldeverusethis7",
    30 26   },
    31  - }
     27 + },
    32 28  }
    33 29   
    34 30   
    skipped 81 lines
    116 112   db = MaigretDatabase()
    117 113   db.load_from_json(EXAMPLE_DB)
    118 114   
    119  - assert db.sites[0].url_regexp.pattern == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
    120  - assert db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test') == 'test'
     115 + assert (
     116 + db.sites[0].url_regexp.pattern
     117 + == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
     118 + )
     119 + assert (
     120 + db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test')
     121 + == 'test'
     122 + )
    121 123   
    122 124   
    123 125  def test_ranked_sites_dict():
    skipped 56 lines
  • tests/test_utils.py
    skipped 1 lines
    2 2  import itertools
    3 3  import re
    4 4   
    5  -from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher, get_dict_ascii_tree
     5 +from maigret.utils import (
     6 + CaseConverter,
     7 + is_country_tag,
     8 + enrich_link_str,
     9 + URLMatcher,
     10 + get_dict_ascii_tree,
     11 +)
    6 12   
    7 13   
    8 14  def test_case_convert_camel_to_snake():
    skipped 36 lines
    45 51   
    46 52  def test_enrich_link_str():
    47 53   assert enrich_link_str('test') == 'test'
    48  - assert enrich_link_str(
    49  - ' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
     54 + assert (
     55 + enrich_link_str(' www.flickr.com/photos/alexaimephotography/')
     56 + == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
     57 + )
    50 58   
    51 59   
    52 60  def test_url_extract_main_part():
    skipped 25 lines
    78 86   
    79 87   for url_parts in itertools.product(*parts):
    80 88   url = ''.join(url_parts)
    81  - assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
     89 + assert (
     90 + URLMatcher.make_profile_url_regexp(url).pattern
     91 + == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
     92 + )
    82 93   
    83 94   
    84 95  def test_get_dict_ascii_tree():
    85  - data = {'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==', 'legacy_id': '26403415', 'username': 'alexaimephotographycars', 'name': 'Alex Aimé', 'created_at': '2018-05-04T10:17:01.000+0000', 'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b', 'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201', 'website': 'www.instagram.com/street.reality.photography/', 'facebook_link': ' www.instagram.com/street.reality.photography/', 'instagram_username': 'Street.Reality.Photography', 'twitter_username': 'Alexaimephotogr'}
     96 + data = {
     97 + 'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==',
     98 + 'legacy_id': '26403415',
     99 + 'username': 'alexaimephotographycars',
     100 + 'name': 'Alex Aimé',
     101 + 'created_at': '2018-05-04T10:17:01.000+0000',
     102 + 'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
     103 + 'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
     104 + 'website': 'www.instagram.com/street.reality.photography/',
     105 + 'facebook_link': ' www.instagram.com/street.reality.photography/',
     106 + 'instagram_username': 'Street.Reality.Photography',
     107 + 'twitter_username': 'Alexaimephotogr',
     108 + }
    86 109   
    87 110   ascii_tree = get_dict_ascii_tree(data.items())
    88 111   
    89  - assert ascii_tree == """
     112 + assert (
     113 + ascii_tree
     114 + == """
    90 115  ┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
    91 116  ┣╸legacy_id: 26403415
    92 117  ┣╸username: alexaimephotographycars
    skipped 5 lines
    98 123  ┣╸facebook_link: www.instagram.com/street.reality.photography/
    99 124  ┣╸instagram_username: Street.Reality.Photography
    100 125  ┗╸twitter_username: Alexaimephotogr"""
     126 + )
     127 + 