maigret: commit 7676c053
    maigret/checking.py
     1 +import asyncio
     2 +import logging
     3 +import re
     4 +import ssl
     5 + 
     6 +import aiohttp
     7 +import tqdm.asyncio
     8 +from aiohttp_socks import ProxyConnector
     9 +from mock import Mock
     10 +from python_socks import _errors as proxy_errors
     11 +from socid_extractor import extract
     12 + 
     13 +from .activation import ParsingActivator, import_aiohttp_cookies
     14 +from .result import QueryResult, QueryStatus
     15 +from .sites import MaigretDatabase, MaigretSite
     16 + 
     17 +supported_recursive_search_ids = (
     18 + 'yandex_public_id',
     19 + 'gaia_id',
     20 + 'vk_id',
     21 + 'ok_id',
     22 + 'wikimapia_uid',
     23 +)
     24 + 
     25 +common_errors = {
     26 + '<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha',
     27 + 'Please stand by, while we are checking your browser': 'Cloudflare captcha',
     28 + '<title>Доступ ограничен</title>': 'Rostelecom censorship',
     29 + 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha',
     30 + 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection',
     31 + '404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': 'MegaFon 404 page',
     32 + 'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship',
     33 + 'Incapsula incident ID': 'Incapsula antibot protection',
     34 +}
     35 + 
     36 +unsupported_characters = '#'
     37 + 
     38 + 
     39 +async def get_response(request_future, site_name, logger):
     40 + html_text = None
     41 + status_code = 0
     42 + 
     43 + error_text = "General Unknown Error"
     44 + exception_text = None
     45 + 
     46 + try:
     47 + response = await request_future
     48 + 
     49 + status_code = response.status
     50 + response_content = await response.content.read()
     51 + charset = response.charset or 'utf-8'
     52 + decoded_content = response_content.decode(charset, 'ignore')
     53 + html_text = decoded_content
     54 + 
     55 + if status_code > 0:
     56 + error_text = None
     57 + 
     58 + logger.debug(html_text)
     59 + 
     60 + except asyncio.TimeoutError as errt:
     61 + error_text = "Timeout Error"
     62 + exception_text = str(errt)
     63 + except (ssl.SSLCertVerificationError, ssl.SSLError) as err:
     64 + error_text = "SSL Error"
     65 + exception_text = str(err)
     66 + except aiohttp.client_exceptions.ClientConnectorError as err:
     67 + error_text = "Error Connecting"
     68 + exception_text = str(err)
     69 + except aiohttp.http_exceptions.BadHttpMessage as err:
     70 + error_text = "HTTP Error"
     71 + exception_text = str(err)
     72 + except proxy_errors.ProxyError as err:
     73 + error_text = "Proxy Error"
     74 + exception_text = str(err)
     75 + except Exception as err:
     76 + logger.warning(f'Unhandled error while requesting {site_name}: {err}')
     77 + logger.debug(err, exc_info=True)
     78 + error_text = "Some Error"
     79 + exception_text = str(err)
     80 + 
     81 + # TODO: return only needed information
     82 + return html_text, status_code, error_text, exception_text
     83 + 
     84 + 
     85 +async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify):
     86 + async with semaphore:
     87 + site_obj = site_dict[sitename]
     88 + future = site_obj.request_future
     89 + if not future:
     90 + # ignore: search by incompatible id type
     91 + return
     92 + 
     93 + response = await get_response(request_future=future,
     94 + site_name=sitename,
     95 + logger=logger)
     96 + 
     97 + site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
     98 + 
     99 + 
     100 +# TODO: move into a separate module
     101 +def detect_error_page(html_text, status_code, fail_flags, ignore_403):
     102 + # Detect service restrictions such as a country restriction
     103 + for flag, msg in fail_flags.items():
     104 + if flag in html_text:
     105 + return 'Some site error', msg
     106 + 
     107 + # Detect common restrictions such as provider censorship and bot protection
     108 + for flag, msg in common_errors.items():
     109 + if flag in html_text:
     110 + return 'Error', msg
     111 + 
     112 + # Detect common site errors
     113 + if status_code == 403 and not ignore_403:
     114 + return 'Access denied', 'Access denied, use proxy/vpn'
     115 + elif status_code >= 500:
     116 + return f'Error {status_code}', f'Site error {status_code}'
     117 + 
     118 + return None, None
     119 + 
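    For illustration, this is how detect_error_page reacts to one of the
    common_errors markers above (a sketch with a made-up HTML snippet; any
    site-specific fail_flags are consulted first):

        # A Cloudflare challenge page trips the shared common_errors table
        error, context = detect_error_page(
            html_text='<title>Attention Required! | Cloudflare</title>',
            status_code=200,
            fail_flags={},      # no site-specific flags in this sketch
            ignore_403=False,
        )
        # error == 'Error', context == 'Cloudflare captcha'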
     120 + 
     121 +def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
     122 + if not response:
     123 + return results_info
     124 + 
     125 + fulltags = site.tags
     126 + 
     127 + # Retrieve other site information again
     128 + username = results_info['username']
     129 + is_parsing_enabled = results_info['parsing_enabled']
     130 + url = results_info.get("url_user")
     131 + logger.debug(url)
     132 + 
     133 + status = results_info.get("status")
     134 + if status is not None:
     135 + # We have already determined the user doesn't exist here
     136 + return results_info
     137 + 
     138 + # Get the expected check type
     139 + check_type = site.check_type
     140 + 
     141 + # Get the failure messages and comments
     142 + failure_errors = site.errors
     143 + 
     144 + # TODO: refactor
     145 + if not response:
     146 + logger.error(f'No response for {site.name}')
     147 + return results_info
     148 + 
     149 + html_text, status_code, error_text, exception_text = response
     150 + site_error_text = '?'
     151 + 
     152 + # TODO: add elapsed request time counting
     153 + response_time = None
     154 + 
     155 + if logger.level == logging.DEBUG:
     156 + with open('debug.txt', 'a') as f:
     157 + status = status_code or 'No response'
     158 + f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n')
     159 + if html_text:
     160 + f.write(f'code: {status}\nresponse: {str(html_text)}\n')
     161 + 
     162 + if status_code and not error_text:
     163 + error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
     164 + site.ignore_403)
     165 + 
     166 + if site.activation and html_text:
     167 + is_need_activation = any([s for s in site.activation['marks'] if s in html_text])
     168 + if is_need_activation:
     169 + method = site.activation['method']
     170 + try:
     171 + activate_fun = getattr(ParsingActivator(), method)
     172 + # TODO: async call
     173 + activate_fun(site, logger)
     174 + except AttributeError:
     175 + logger.warning(f'Activation method {method} for site {site.name} not found!')
     176 + 
     177 + # presence flags; presence is assumed to be detected by default
     178 + # when a site defines no flags
     179 + presense_flags = site.presense_strs
     180 + is_presense_detected = False
     181 + if html_text:
     182 + if not presense_flags:
     183 + is_presense_detected = True
     184 + site.stats['presense_flag'] = None
     185 + else:
     186 + for presense_flag in presense_flags:
     187 + if presense_flag in html_text:
     188 + is_presense_detected = True
     189 + site.stats['presense_flag'] = presense_flag
     190 + logger.info(presense_flag)
     191 + break
     192 + 
     193 + if error_text is not None:
     194 + logger.debug(error_text)
     195 + result = QueryResult(username,
     196 + site.name,
     197 + url,
     198 + QueryStatus.UNKNOWN,
     199 + query_time=response_time,
     200 + context=f'{error_text}: {site_error_text}', tags=fulltags)
     201 + elif check_type == "message":
     202 + absence_flags = site.absence_strs
     203 + is_absence_flags_list = isinstance(absence_flags, list)
     204 + absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags}
     205 + # Checks if the error message is in the HTML
     206 + is_absence_detected = any([(absence_flag in html_text) for absence_flag in absence_flags_set])
     207 + if not is_absence_detected and is_presense_detected:
     208 + result = QueryResult(username,
     209 + site.name,
     210 + url,
     211 + QueryStatus.CLAIMED,
     212 + query_time=response_time, tags=fulltags)
     213 + else:
     214 + result = QueryResult(username,
     215 + site.name,
     216 + url,
     217 + QueryStatus.AVAILABLE,
     218 + query_time=response_time, tags=fulltags)
     219 + elif check_type == "status_code":
     220 + # Checks if the status code of the response is 2XX
     221 + if 200 <= status_code < 300 and is_presense_detected:
     222 + result = QueryResult(username,
     223 + site.name,
     224 + url,
     225 + QueryStatus.CLAIMED,
     226 + query_time=response_time, tags=fulltags)
     227 + else:
     228 + result = QueryResult(username,
     229 + site.name,
     230 + url,
     231 + QueryStatus.AVAILABLE,
     232 + query_time=response_time, tags=fulltags)
     233 + elif check_type == "response_url":
     234 + # For this detection method, we have turned off the redirect.
     235 + # So, there is no need to check the response URL: it will always
     236 + # match the request. Instead, we will ensure that the response
     237 + # code indicates that the request was successful (i.e. no 404, or
     238 + # forward to some odd redirect).
     239 + if 200 <= status_code < 300 and is_presense_detected:
     240 + result = QueryResult(username,
     241 + site.name,
     242 + url,
     243 + QueryStatus.CLAIMED,
     244 + query_time=response_time, tags=fulltags)
     245 + else:
     246 + result = QueryResult(username,
     247 + site.name,
     248 + url,
     249 + QueryStatus.AVAILABLE,
     250 + query_time=response_time, tags=fulltags)
     251 + else:
     252 + # It should be impossible to ever get here...
     253 + raise ValueError(f"Unknown check type '{check_type}' for "
     254 + f"site '{site.name}'")
     255 + 
     256 + extracted_ids_data = {}
     257 + 
     258 + if is_parsing_enabled and result.status == QueryStatus.CLAIMED:
     259 + try:
     260 + extracted_ids_data = extract(html_text)
     261 + except Exception as e:
     262 + logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True)
     263 + 
     264 + if extracted_ids_data:
     265 + new_usernames = {}
     266 + for k, v in extracted_ids_data.items():
     267 + if 'username' in k:
     268 + new_usernames[v] = 'username'
     269 + if k in supported_recursive_search_ids:
     270 + new_usernames[v] = k
     271 + 
     272 + results_info['ids_usernames'] = new_usernames
     273 + result.ids_data = extracted_ids_data
     274 + 
     275 + # Notify caller about results of query.
     276 + query_notify.update(result, site.similar_search)
     277 + 
     278 + # Save status of request
     279 + results_info['status'] = result
     280 + 
     281 + # Save results from request
     282 + results_info['http_status'] = status_code
     283 + results_info['is_similar'] = site.similar_search
     284 + # results_site['response_text'] = html_text
     285 + results_info['rank'] = site.alexa_rank
     286 + return results_info
     287 + 
     288 + 
     289 +async def maigret(username, site_dict, query_notify, logger,
     290 + proxy=None, timeout=None, recursive_search=False,
     291 + id_type='username', debug=False, forced=False,
     292 + max_connections=100, no_progressbar=False,
     293 + cookies=None):
     294 + """Main search func
     295 + 
     296 + Checks for existence of username on various social media sites.
     297 + 
     298 + Keyword Arguments:
     299 + username -- String indicating username that report
     300 + should be created against.
     301 + site_dict -- Dictionary containing all of the site data.
     302 + query_notify -- Object with base type of QueryNotify().
     303 + This will be used to notify the caller about
     304 + query results.
     305 + proxy -- String indicating the proxy URL
     306 + timeout -- Time in seconds to wait before timing out request.
     307 + Default is no timeout.
     308 + recursive_search -- Extract other usernames from account pages and search by them recursively.
     309 + 
     310 + Return Value:
     311 + Dictionary containing results from report. Key of dictionary is the name
     312 + of the social network site, and the value is another dictionary with
     313 + the following keys:
     314 + url_main: URL of main site.
     315 + url_user: URL of user on site (if account exists).
     316 + status: QueryResult() object indicating results of test for
     317 + account existence.
     318 + http_status: HTTP status code of query which checked for existence on
     319 + site.
     320 + response_text: Text that came back from request. May be None if
     321 + there was an HTTP error when checking for existence.
     322 + """
     323 + 
     324 + # Notify caller that we are starting the query.
     325 + query_notify.start(username, id_type)
     326 + 
     327 + # TODO: connector
     328 + connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
     329 + # connector = aiohttp.TCPConnector(ssl=False)
     330 + connector.verify_ssl = False
     331 + 
     332 + cookie_jar = None
     333 + if cookies:
     334 + cookie_jar = await import_aiohttp_cookies(cookies)
     335 + 
     336 + session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar)
     337 + 
     338 + if logger.level == logging.DEBUG:
     339 + future = session.get(url='https://icanhazip.com')
     340 + ip, status, error, exception = await get_response(future, None, logger)
     341 + if ip:
     342 + logger.debug(f'My IP is: {ip.strip()}')
     343 + else:
     344 + logger.debug(f'IP request failed, {error}: {exception}')
     345 + 
     346 + # Results from analysis of all sites
     347 + results_total = {}
     348 + 
     349 + # First create futures for all requests. This allows for the requests to run in parallel
     350 + for site_name, site in site_dict.items():
     351 + 
     352 + if site.type != id_type:
     353 + continue
     354 + 
     355 + if site.disabled and not forced:
     356 + logger.debug(f'Site {site.name} is disabled, skipping...')
     357 + continue
     358 + 
     359 + # Results from analysis of this specific site
     360 + results_site = {}
     361 + 
     362 + # Record URL of main site and username
     363 + results_site['username'] = username
     364 + results_site['parsing_enabled'] = recursive_search
     365 + results_site['url_main'] = site.url_main
     366 + results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
     367 + 
     368 + headers = {
     369 + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
     370 + }
     371 + 
     372 + headers.update(site.headers)
     373 + 
     374 + if 'url' not in site.__dict__:
     375 + logger.error('No URL for site %s', site.name)
     376 + # URL of user on site (if it exists)
     377 + url = site.url.format(
     378 + urlMain=site.url_main,
     379 + urlSubpath=site.url_subpath,
     380 + username=username
     381 + )
     382 + # workaround to prevent slash errors
     383 + url = re.sub('(?<!:)/+', '/', url)
     384 + 
     385 + # Don't make a request if the username is invalid for the site
     386 + if site.regex_check and re.search(site.regex_check, username) is None:
     387 + # No need to check at the site: this username is not allowed.
     388 + results_site['status'] = QueryResult(username,
     389 + site_name,
     390 + url,
     391 + QueryStatus.ILLEGAL)
     392 + results_site["url_user"] = ""
     393 + results_site['http_status'] = ""
     394 + results_site['response_text'] = ""
     395 + query_notify.update(results_site['status'])
     396 + else:
     397 + # URL of user on site (if it exists)
     398 + results_site["url_user"] = url
     399 + url_probe = site.url_probe
     400 + if url_probe is None:
     401 + # The probe URL is the normal one seen by people on the web.
     402 + url_probe = url
     403 + else:
     404 + # There is a special URL for probing existence separate
     405 + # from where the user profile normally can be found.
     406 + url_probe = url_probe.format(
     407 + urlMain=site.url_main,
     408 + urlSubpath=site.url_subpath,
     409 + username=username,
     410 + )
     411 + 
     412 + for k, v in site.get_params.items():
     413 + url_probe += f'&{k}={v}'
     414 + 
     415 + if site.check_type == 'status_code' and site.request_head_only:
     416 + # In most cases when we are detecting by status code,
     417 + # it is not necessary to get the entire body: we can
     418 + # detect fine with just the HEAD response.
     419 + request_method = session.head
     420 + else:
     421 + # Either this detect method needs the content associated
     422 + # with the GET response, or this specific website will
     423 + # not respond properly unless we request the whole page.
     424 + request_method = session.get
     425 + 
     426 + if site.check_type == "response_url":
     427 + # Site forwards request to a different URL if username not
     428 + # found. Disallow the redirect so we can capture the
     429 + # http status from the original URL request.
     430 + allow_redirects = False
     431 + else:
     432 + # Allow whatever redirect that the site wants to do.
     433 + # The final result of the request will be what is available.
     434 + allow_redirects = True
     435 + 
     436 + future = request_method(url=url_probe, headers=headers,
     437 + allow_redirects=allow_redirects,
     438 + timeout=timeout,
     439 + )
     440 + 
     441 + # Store future in data for access later
     442 + # TODO: move to separate obj
     443 + site.request_future = future
     444 + 
     445 + # Add this site's results into final dictionary with all of the other results.
     446 + results_total[site_name] = results_site
     447 + 
     448 + # TODO: move into top-level function
     449 + 
     450 + sem = asyncio.Semaphore(max_connections)
     451 + 
     452 + tasks = []
     453 + for sitename, result_obj in results_total.items():
     454 + update_site_coro = update_site_dict_from_response(sitename, site_dict, result_obj, sem, logger, query_notify)
     455 + future = asyncio.ensure_future(update_site_coro)
     456 + tasks.append(future)
     457 + 
     458 + if no_progressbar:
     459 + await asyncio.gather(*tasks)
     460 + else:
     461 + for f in tqdm.asyncio.tqdm.as_completed(tasks):
     462 + await f
     463 + 
     464 + await session.close()
     465 + 
     466 + # Notify caller that all queries are finished.
     467 + query_notify.finish()
     468 + 
     469 + return results_total
     470 + 
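    A minimal usage sketch of the coroutine above (assumptions: `site` is a
    preloaded MaigretSite whose type is 'username', and a Mock stands in for
    the QueryNotify object, mirroring the site_self_check call below):

        import asyncio
        import logging

        from mock import Mock

        from maigret.checking import maigret

        async def demo(site):
            logger = logging.getLogger('demo')
            results = await maigret(
                'alice',              # username to check
                {site.name: site},    # site dict keyed by site name
                Mock(),               # stub QueryNotify object
                logger,
                timeout=10,
                no_progressbar=True,
            )
            # results[site.name]['status'] holds a QueryResult
            return results

        # asyncio.run(demo(site))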
     471 + 
     472 +def timeout_check(value):
     473 + """Check Timeout Argument.
     474 + 
     475 + Checks timeout for validity.
     476 + 
     477 + Keyword Arguments:
     478 + value -- Time in seconds to wait before timing out request.
     479 + 
     480 + Return Value:
     481 + Floating point number representing the time (in seconds) that should be
     482 + used for the timeout.
     483 + 
     484 + NOTE: Will raise an exception if the timeout is invalid.
     485 + """
     486 + from argparse import ArgumentTypeError
     487 + 
     488 + try:
     489 + timeout = float(value)
     490 + except ValueError:
     491 + raise ArgumentTypeError(f"Timeout '{value}' must be a number.")
     492 + if timeout <= 0:
     493 + raise ArgumentTypeError(f"Timeout '{value}' must be greater than 0.0s.")
     494 + return timeout
     495 + 
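    timeout_check is meant to be used as an argparse type= validator; for
    example:

        timeout_check('10')    # -> 10.0
        timeout_check('0')     # ArgumentTypeError: must be greater than 0.0s
        timeout_check('soon')  # ArgumentTypeError: must be a number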
     496 + 
     497 +async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
     498 + query_notify = Mock()
     499 + changes = {
     500 + 'disabled': False,
     501 + }
     502 + 
     503 + try:
     504 + check_data = [
     505 + (site.username_claimed, QueryStatus.CLAIMED),
     506 + (site.username_unclaimed, QueryStatus.AVAILABLE),
     507 + ]
     508 + except Exception as e:
     509 + logger.error(e)
     510 + logger.error(site.__dict__)
     511 + check_data = []
     512 + 
     513 + logger.info(f'Checking {site.name}...')
     514 + 
     515 + for username, status in check_data:
     516 + async with semaphore:
     517 + results_dict = await maigret(
     518 + username,
     519 + {site.name: site},
     520 + query_notify,
     521 + logger,
     522 + timeout=30,
     523 + id_type=site.type,
     524 + forced=True,
     525 + no_progressbar=True,
     526 + )
     527 + 
     528 + # don't disable entries with other id types
     529 + # TODO: implement proper checking
     530 + if site.name not in results_dict:
     531 + logger.info(results_dict)
     532 + changes['disabled'] = True
     533 + continue
     534 + 
     535 + result = results_dict[site.name]['status']
     536 + 
     537 + site_status = result.status
     538 + 
     539 + if site_status != status:
     540 + if site_status == QueryStatus.UNKNOWN:
     541 + msgs = site.absence_strs
     542 + etype = site.check_type
     543 + logger.warning(
     544 + f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
     545 + # don't disable in case of available username
     546 + if status == QueryStatus.CLAIMED:
     547 + changes['disabled'] = True
     548 + elif status == QueryStatus.CLAIMED:
     549 + logger.warning(f'`{username}` not found in {site.name}, but should be claimed')
     550 + logger.info(results_dict[site.name])
     551 + changes['disabled'] = True
     552 + else:
     553 + logger.warning(f'`{username}` found in {site.name}, but should be available')
     554 + logger.info(results_dict[site.name])
     555 + changes['disabled'] = True
     556 + 
     557 + logger.info(f'Finished checking site {site.name}')
     558 + 
     559 + if changes['disabled'] != site.disabled:
     560 + site.disabled = changes['disabled']
     561 + db.update_site(site)
     562 + if not silent:
     563 + action = 'Disabled' if site.disabled else 'Enabled'
     564 + print(f'{action} site {site.name}...')
     565 + 
     566 + return changes
     567 + 
     568 + 
     569 +async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False,
     570 + max_connections=10) -> bool:
     571 + sem = asyncio.Semaphore(max_connections)
     572 + tasks = []
     573 + all_sites = site_data
     574 + 
     575 + def disabled_count(lst):
     576 + return len(list(filter(lambda x: x.disabled, lst)))
     577 + 
     578 + disabled_old_count = disabled_count(all_sites.values())
     579 + 
     580 + for _, site in all_sites.items():
     581 + check_coro = site_self_check(site, logger, sem, db, silent)
     582 + future = asyncio.ensure_future(check_coro)
     583 + tasks.append(future)
     584 + 
     585 + for f in tqdm.asyncio.tqdm.as_completed(tasks):
     586 + await f
     587 + 
     588 + disabled_new_count = disabled_count(all_sites.values())
     589 + total_disabled = disabled_new_count - disabled_old_count
     590 + 
     591 + if total_disabled >= 0:
     592 + message = 'Disabled'
     593 + else:
     594 + message = 'Enabled'
     595 + total_disabled *= -1
     596 + 
     597 + if not silent:
     598 + print(
     599 + f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information')
     600 + 
     601 + return total_disabled != 0
     602 + 
    maigret/maigret.py
    skipped 1 line
    2 2  Maigret main module
    3 3  """
    4 4   
    5  -import asyncio
    6  -import logging
    7 5  import os
    8 6  import platform
    9  -import re
    10  -import ssl
    11 7  import sys
    12 8  from argparse import ArgumentParser, RawDescriptionHelpFormatter
    13 9   
    14  -import aiohttp
    15 10  import requests
    16  -import tqdm.asyncio
    17  -from aiohttp_socks import ProxyConnector
    18  -from mock import Mock
    19  -from python_socks import _errors as proxy_errors
    20  -from socid_extractor import parse, extract, __version__ as socid_version
     11 +from socid_extractor import parse, __version__ as socid_version
    21 12   
    22  -from .activation import ParsingActivator, import_aiohttp_cookies
     13 +from .checking import *
    23 14  from .notify import QueryNotifyPrint
    24 15  from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
    25 16   generate_report_context, save_txt_report
    26  -from .result import QueryResult, QueryStatus
    27  -from .sites import MaigretDatabase, MaigretSite
     17 +from .submit import submit_dialog
    28 18   
    29 19  __version__ = '0.1.13'
    30 20   
    31  -supported_recursive_search_ids = (
    32  - 'yandex_public_id',
    33  - 'gaia_id',
    34  - 'vk_id',
    35  - 'ok_id',
    36  - 'wikimapia_uid',
    37  -)
    38  - 
    39  -common_errors = {
    40  - '<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha',
    41  - 'Please stand by, while we are checking your browser': 'Cloudflare captcha',
    42  - '<title>Доступ ограничен</title>': 'Rostelecom censorship',
    43  - 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha',
    44  - 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection',
    45  - '404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': 'MegaFon 404 page',
    46  - 'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship',
    47  - 'Incapsula incident ID': 'Incapsula antibot protection',
    48  -}
    49  - 
    50  -unsupported_characters = '#'
    51  - 
    52  -async def get_response(request_future, site_name, logger):
    53  - html_text = None
    54  - status_code = 0
    55  - 
    56  - error_text = "General Unknown Error"
    57  - expection_text = None
    58  - 
    59  - try:
    60  - response = await request_future
    61  - 
    62  - status_code = response.status
    63  - response_content = await response.content.read()
    64  - charset = response.charset or 'utf-8'
    65  - decoded_content = response_content.decode(charset, 'ignore')
    66  - html_text = decoded_content
    67  - 
    68  - if status_code > 0:
    69  - error_text = None
    70  - 
    71  - logger.debug(html_text)
    72  - 
    73  - except asyncio.TimeoutError as errt:
    74  - error_text = "Timeout Error"
    75  - expection_text = str(errt)
    76  - except (ssl.SSLCertVerificationError, ssl.SSLError) as err:
    77  - error_text = "SSL Error"
    78  - expection_text = str(err)
    79  - except aiohttp.client_exceptions.ClientConnectorError as err:
    80  - error_text = "Error Connecting"
    81  - expection_text = str(err)
    82  - except aiohttp.http_exceptions.BadHttpMessage as err:
    83  - error_text = "HTTP Error"
    84  - expection_text = str(err)
    85  - except proxy_errors.ProxyError as err:
    86  - error_text = "Proxy Error"
    87  - expection_text = str(err)
    88  - except Exception as err:
    89  - logger.warning(f'Unhandled error while requesting {site_name}: {err}')
    90  - logger.debug(err, exc_info=True)
    91  - error_text = "Some Error"
    92  - expection_text = str(err)
    93  - 
    94  - # TODO: return only needed information
    95  - return html_text, status_code, error_text, expection_text
    96  - 
    97  - 
    98  -async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify):
    99  - async with semaphore:
    100  - site_obj = site_dict[sitename]
    101  - future = site_obj.request_future
    102  - if not future:
    103  - # ignore: search by incompatible id type
    104  - return
    105  - 
    106  - response = await get_response(request_future=future,
    107  - site_name=sitename,
    108  - logger=logger)
    109  - 
    110  - site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
    111  - 
    112  -# TODO: move info separate module
    113  -def detect_error_page(html_text, status_code, fail_flags, ignore_403):
    114  - # Detect service restrictions such as a country restriction
    115  - for flag, msg in fail_flags.items():
    116  - if flag in html_text:
    117  - return 'Some site error', msg
    118  - 
    119  - # Detect common restrictions such as provider censorship and bot protection
    120  - for flag, msg in common_errors.items():
    121  - if flag in html_text:
    122  - return 'Error', msg
    123  - 
    124  - # Detect common site errors
    125  - if status_code == 403 and not ignore_403:
    126  - return 'Access denied', 'Access denied, use proxy/vpn'
    127  - elif status_code >= 500:
    128  - return f'Error {status_code}', f'Site error {status_code}'
    129  - 
    130  - return None, None
    131  - 
    132  - 
    133  -def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
    134  - if not response:
    135  - return results_info
    136  - 
    137  - fulltags = site.tags
    138  - 
    139  - # Retrieve other site information again
    140  - username = results_info['username']
    141  - is_parsing_enabled = results_info['parsing_enabled']
    142  - url = results_info.get("url_user")
    143  - logger.debug(url)
    144  - 
    145  - status = results_info.get("status")
    146  - if status is not None:
    147  - # We have already determined the user doesn't exist here
    148  - return results_info
    149  - 
    150  - # Get the expected check type
    151  - check_type = site.check_type
    152  - 
    153  - # Get the failure messages and comments
    154  - failure_errors = site.errors
    155  - 
    156  - # TODO: refactor
    157  - if not response:
    158  - logger.error(f'No response for {site.name}')
    159  - return results_info
    160  - 
    161  - html_text, status_code, error_text, expection_text = response
    162  - site_error_text = '?'
    163  - 
    164  - # TODO: add elapsed request time counting
    165  - response_time = None
    166  - 
    167  - if logger.level == logging.DEBUG:
    168  - with open('debug.txt', 'a') as f:
    169  - status = status_code or 'No response'
    170  - f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n')
    171  - if html_text:
    172  - f.write(f'code: {status}\nresponse: {str(html_text)}\n')
    173  - 
    174  - if status_code and not error_text:
    175  - error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
    176  - site.ignore_403)
    177  - 
    178  - if site.activation and html_text:
    179  - is_need_activation = any([s for s in site.activation['marks'] if s in html_text])
    180  - if is_need_activation:
    181  - method = site.activation['method']
    182  - try:
    183  - activate_fun = getattr(ParsingActivator(), method)
    184  - # TODO: async call
    185  - activate_fun(site, logger)
    186  - except AttributeError:
    187  - logger.warning(f'Activation method {method} for site {site.name} not found!')
    188  - 
    189  - # presense flags
    190  - # True by default
    191  - presense_flags = site.presense_strs
    192  - is_presense_detected = False
    193  - if html_text:
    194  - if not presense_flags:
    195  - is_presense_detected = True
    196  - site.stats['presense_flag'] = None
    197  - else:
    198  - for presense_flag in presense_flags:
    199  - if presense_flag in html_text:
    200  - is_presense_detected = True
    201  - site.stats['presense_flag'] = presense_flag
    202  - logger.info(presense_flag)
    203  - break
    204  - 
    205  - if error_text is not None:
    206  - logger.debug(error_text)
    207  - result = QueryResult(username,
    208  - site.name,
    209  - url,
    210  - QueryStatus.UNKNOWN,
    211  - query_time=response_time,
    212  - context=f'{error_text}: {site_error_text}', tags=fulltags)
    213  - elif check_type == "message":
    214  - absence_flags = site.absence_strs
    215  - is_absence_flags_list = isinstance(absence_flags, list)
    216  - absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags}
    217  - # Checks if the error message is in the HTML
    218  - is_absence_detected = any([(absence_flag in html_text) for absence_flag in absence_flags_set])
    219  - if not is_absence_detected and is_presense_detected:
    220  - result = QueryResult(username,
    221  - site.name,
    222  - url,
    223  - QueryStatus.CLAIMED,
    224  - query_time=response_time, tags=fulltags)
    225  - else:
    226  - result = QueryResult(username,
    227  - site.name,
    228  - url,
    229  - QueryStatus.AVAILABLE,
    230  - query_time=response_time, tags=fulltags)
    231  - elif check_type == "status_code":
    232  - # Checks if the status code of the response is 2XX
    233  - if (not status_code >= 300 or status_code < 200) and is_presense_detected:
    234  - result = QueryResult(username,
    235  - site.name,
    236  - url,
    237  - QueryStatus.CLAIMED,
    238  - query_time=response_time, tags=fulltags)
    239  - else:
    240  - result = QueryResult(username,
    241  - site.name,
    242  - url,
    243  - QueryStatus.AVAILABLE,
    244  - query_time=response_time, tags=fulltags)
    245  - elif check_type == "response_url":
    246  - # For this detection method, we have turned off the redirect.
    247  - # So, there is no need to check the response URL: it will always
    248  - # match the request. Instead, we will ensure that the response
    249  - # code indicates that the request was successful (i.e. no 404, or
    250  - # forward to some odd redirect).
    251  - if 200 <= status_code < 300 and is_presense_detected:
    252  - result = QueryResult(username,
    253  - site.name,
    254  - url,
    255  - QueryStatus.CLAIMED,
    256  - query_time=response_time, tags=fulltags)
    257  - else:
    258  - result = QueryResult(username,
    259  - site.name,
    260  - url,
    261  - QueryStatus.AVAILABLE,
    262  - query_time=response_time, tags=fulltags)
    263  - else:
    264  - # It should be impossible to ever get here...
    265  - raise ValueError(f"Unknown check type '{check_type}' for "
    266  - f"site '{site.name}'")
    267  - 
    268  - extracted_ids_data = {}
    269  - 
    270  - if is_parsing_enabled and result.status == QueryStatus.CLAIMED:
    271  - try:
    272  - extracted_ids_data = extract(html_text)
    273  - except Exception as e:
    274  - logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True)
    275  - 
    276  - if extracted_ids_data:
    277  - new_usernames = {}
    278  - for k, v in extracted_ids_data.items():
    279  - if 'username' in k:
    280  - new_usernames[v] = 'username'
    281  - if k in supported_recursive_search_ids:
    282  - new_usernames[v] = k
    283  - 
    284  - results_info['ids_usernames'] = new_usernames
    285  - result.ids_data = extracted_ids_data
    286  - 
    287  - # Notify caller about results of query.
    288  - query_notify.update(result, site.similar_search)
    289  - 
    290  - # Save status of request
    291  - results_info['status'] = result
    292  - 
    293  - # Save results from request
    294  - results_info['http_status'] = status_code
    295  - results_info['is_similar'] = site.similar_search
    296  - # results_site['response_text'] = html_text
    297  - results_info['rank'] = site.alexa_rank
    298  - return results_info
    299  - 
    300  - 
    301  - 
    302  - 
    303  -async def maigret(username, site_dict, query_notify, logger,
    304  - proxy=None, timeout=None, recursive_search=False,
    305  - id_type='username', debug=False, forced=False,
    306  - max_connections=100, no_progressbar=False,
    307  - cookies=None):
    308  - """Main search func
    309  - 
    310  - Checks for existence of username on various social media sites.
    311  - 
    312  - Keyword Arguments:
    313  - username -- String indicating username that report
    314  - should be created against.
    315  - site_dict -- Dictionary containing all of the site data.
    316  - query_notify -- Object with base type of QueryNotify().
    317  - This will be used to notify the caller about
    318  - query results.
    319  - proxy -- String indicating the proxy URL
    320  - timeout -- Time in seconds to wait before timing out request.
    321  - Default is no timeout.
    322  - recursive_search -- Search for other usernames in website pages & recursive search by them.
    323  - 
    324  - Return Value:
    325  - Dictionary containing results from report. Key of dictionary is the name
    326  - of the social network site, and the value is another dictionary with
    327  - the following keys:
    328  - url_main: URL of main site.
    329  - url_user: URL of user on site (if account exists).
    330  - status: QueryResult() object indicating results of test for
    331  - account existence.
    332  - http_status: HTTP status code of query which checked for existence on
    333  - site.
    334  - response_text: Text that came back from request. May be None if
    335  - there was an HTTP error when checking for existence.
    336  - """
    337  - 
    338  - # Notify caller that we are starting the query.
    339  - query_notify.start(username, id_type)
    340  - 
    341  - # TODO: connector
    342  - connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
    343  - # connector = aiohttp.TCPConnector(ssl=False)
    344  - connector.verify_ssl=False
    345  - 
    346  - cookie_jar = None
    347  - if cookies:
    348  - cookie_jar = await import_aiohttp_cookies(cookies)
    349  - 
    350  - session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar)
    351  - 
    352  - if logger.level == logging.DEBUG:
    353  - future = session.get(url='https://icanhazip.com')
    354  - ip, status, error, expection = await get_response(future, None, logger)
    355  - if ip:
    356  - logger.debug(f'My IP is: {ip.strip()}')
    357  - else:
    358  - logger.debug(f'IP requesting {error}: {expection}')
    359  - 
    360  - 
    361  - # Results from analysis of all sites
    362  - results_total = {}
    363  - 
    364  - # First create futures for all requests. This allows for the requests to run in parallel
    365  - for site_name, site in site_dict.items():
    366  - 
    367  - if site.type != id_type:
    368  - continue
    369  - 
    370  - if site.disabled and not forced:
    371  - logger.debug(f'Site {site.name} is disabled, skipping...')
    372  - continue
    373  - 
    374  - # Results from analysis of this specific site
    375  - results_site = {}
    376  - 
    377  - # Record URL of main site and username
    378  - results_site['username'] = username
    379  - results_site['parsing_enabled'] = recursive_search
    380  - results_site['url_main'] = site.url_main
    381  - results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
    382  - 
    383  - headers = {
    384  - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
    385  - }
    386  - 
    387  - headers.update(site.headers)
    388  - 
    389  - if not 'url' in site.__dict__:
    390  - logger.error('No URL for site %s', site.name)
    391  - # URL of user on site (if it exists)
    392  - url = site.url.format(
    393  - urlMain=site.url_main,
    394  - urlSubpath=site.url_subpath,
    395  - username=username
    396  - )
    397  - # workaround to prevent slash errors
    398  - url = re.sub('(?<!:)/+', '/', url)
    399  - 
    400  - # Don't make request if username is invalid for the site
    401  - if site.regex_check and re.search(site.regex_check, username) is None:
    402  - # No need to do the check at the site: this user name is not allowed.
    403  - results_site['status'] = QueryResult(username,
    404  - site_name,
    405  - url,
    406  - QueryStatus.ILLEGAL)
    407  - results_site["url_user"] = ""
    408  - results_site['http_status'] = ""
    409  - results_site['response_text'] = ""
    410  - query_notify.update(results_site['status'])
    411  - else:
    412  - # URL of user on site (if it exists)
    413  - results_site["url_user"] = url
    414  - url_probe = site.url_probe
    415  - if url_probe is None:
    416  - # Probe URL is normal one seen by people out on the web.
    417  - url_probe = url
    418  - else:
    419  - # There is a special URL for probing existence separate
    420  - # from where the user profile normally can be found.
    421  - url_probe = url_probe.format(
    422  - urlMain=site.url_main,
    423  - urlSubpath=site.url_subpath,
    424  - username=username,
    425  - )
    426  - 
    427  - for k, v in site.get_params.items():
    428  - url_probe += f'&{k}={v}'
    429  - 
    430  - if site.check_type == 'status_code' and site.request_head_only:
    431  - # In most cases when we are detecting by status code,
    432  - # it is not necessary to get the entire body: we can
    433  - # detect fine with just the HEAD response.
    434  - request_method = session.head
    435  - else:
    436  - # Either this detect method needs the content associated
    437  - # with the GET response, or this specific website will
    438  - # not respond properly unless we request the whole page.
    439  - request_method = session.get
    440  - 
    441  - if site.check_type == "response_url":
    442  - # Site forwards request to a different URL if username not
    443  - # found. Disallow the redirect so we can capture the
    444  - # http status from the original URL request.
    445  - allow_redirects = False
    446  - else:
    447  - # Allow whatever redirect that the site wants to do.
    448  - # The final result of the request will be what is available.
    449  - allow_redirects = True
    450  - 
    451  - future = request_method(url=url_probe, headers=headers,
    452  - allow_redirects=allow_redirects,
    453  - timeout=timeout,
    454  - )
    455  - 
    456  - # Store future in data for access later
    457  - # TODO: move to separate obj
    458  - site.request_future = future
    459  - 
    460  - # Add this site's results into final dictionary with all of the other results.
    461  - results_total[site_name] = results_site
    462  - 
    463  - # TODO: move into top-level function
    464  - 
    465  - sem = asyncio.Semaphore(max_connections)
    466  - 
    467  - tasks = []
    468  - for sitename, result_obj in results_total.items():
    469  - update_site_coro = update_site_dict_from_response(sitename, site_dict, result_obj, sem, logger, query_notify)
    470  - future = asyncio.ensure_future(update_site_coro)
    471  - tasks.append(future)
    472  - 
    473  - if no_progressbar:
    474  - await asyncio.gather(*tasks)
    475  - else:
    476  - for f in tqdm.asyncio.tqdm.as_completed(tasks):
    477  - await f
    478  - 
    479  - await session.close()
    480  - 
    481  - # Notify caller that all queries are finished.
    482  - query_notify.finish()
    483  - 
    484  - return results_total
    485  - 
    486  - 
    487  -def timeout_check(value):
    488  - """Check Timeout Argument.
    489  - 
    490  - Checks timeout for validity.
    491  - 
    492  - Keyword Arguments:
    493  - value -- Time in seconds to wait before timing out request.
    494  - 
    495  - Return Value:
    496  - Floating point number representing the time (in seconds) that should be
    497  - used for the timeout.
    498  - 
    499  - NOTE: Will raise an exception if the timeout in invalid.
    500  - """
    501  - from argparse import ArgumentTypeError
    502  - 
    503  - try:
    504  - timeout = float(value)
    505  - except ValueError:
    506  - raise ArgumentTypeError(f"Timeout '{value}' must be a number.")
    507  - if timeout <= 0:
    508  - raise ArgumentTypeError(f"Timeout '{value}' must be greater than 0.0s.")
    509  - return timeout
    510  - 
    511  - 
    512  -async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    513  - query_notify = Mock()
    514  - changes = {
    515  - 'disabled': False,
    516  - }
    517  - 
    518  - try:
    519  - check_data = [
    520  - (site.username_claimed, QueryStatus.CLAIMED),
    521  - (site.username_unclaimed, QueryStatus.AVAILABLE),
    522  - ]
    523  - except:
    524  - print(site.__dict__)
    525  - 
    526  - logger.info(f'Checking {site.name}...')
    527  - 
    528  - for username, status in check_data:
    529  - async with semaphore:
    530  - results_dict = await maigret(
    531  - username,
    532  - {site.name: site},
    533  - query_notify,
    534  - logger,
    535  - timeout=30,
    536  - id_type=site.type,
    537  - forced=True,
    538  - no_progressbar=True,
    539  - )
    540  - 
    541  - # don't disable entries with other ids types
    542  - # TODO: make normal checking
    543  - if site.name not in results_dict:
    544  - logger.info(results_dict)
    545  - changes['disabled'] = True
    546  - continue
    547  - 
    548  - result = results_dict[site.name]['status']
    549  - 
    550  - 
    551  - site_status = result.status
    552  - 
    553  - if site_status != status:
    554  - if site_status == QueryStatus.UNKNOWN:
    555  - msgs = site.absence_strs
    556  - etype = site.check_type
    557  - logger.warning(f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
    558  - # don't disable in case of available username
    559  - if status == QueryStatus.CLAIMED:
    560  - changes['disabled'] = True
    561  - elif status == QueryStatus.CLAIMED:
    562  - logger.warning(f'Not found `{username}` in {site.name}, must be claimed')
    563  - logger.info(results_dict[site.name])
    564  - changes['disabled'] = True
    565  - else:
    566  - logger.warning(f'Found `{username}` in {site.name}, must be available')
    567  - logger.info(results_dict[site.name])
    568  - changes['disabled'] = True
    569  - 
    570  - logger.info(f'Site {site.name} checking is finished')
    571  - 
    572  - if changes['disabled'] != site.disabled:
    573  - site.disabled = changes['disabled']
    574  - db.update_site(site)
    575  - if not silent:
    576  - action = 'Disabled' if site.disabled else 'Enabled'
    577  - print(f'{action} site {site.name}...')
    578  - 
    579  - return changes
    580  - 
    581  - 
    582  -async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False,
    583  - max_connections=10) -> bool:
    584  - sem = asyncio.Semaphore(max_connections)
    585  - tasks = []
    586  - all_sites = site_data
    587  - 
    588  - def disabled_count(lst):
    589  - return len(list(filter(lambda x: x.disabled, lst)))
    590  - 
    591  - disabled_old_count = disabled_count(all_sites.values())
    592  - 
    593  - for _, site in all_sites.items():
    594  - check_coro = site_self_check(site, logger, sem, db, silent)
    595  - future = asyncio.ensure_future(check_coro)
    596  - tasks.append(future)
    597  - 
    598  - for f in tqdm.asyncio.tqdm.as_completed(tasks):
    599  - await f
    600  - 
    601  - disabled_new_count = disabled_count(all_sites.values())
    602  - total_disabled = disabled_new_count - disabled_old_count
    603  - 
    604  - if total_disabled >= 0:
    605  - message = 'Disabled'
    606  - else:
    607  - message = 'Enabled'
    608  - total_disabled *= -1
    609  - 
    610  - if not silent:
    611  - print(f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information')
    612  - 
    613  - return total_disabled != 0
    614  - 
    615 21   
    616 22  async def main():
    617 23   version_string = '\n'.join([
    skipped 67 lines
    685 91   action="store_true", dest="print_check_errors", default=False,
    686 92   help="Print errors messages: connection, captcha, site country ban, etc."
    687 93   )
     94 + parser.add_argument("--submit",
     95 + type=str, dest="new_site_to_submit", default=False,
     96 + help="URL of existing profile in new site to submit."
     97 + )
    688 98   parser.add_argument("--no-color",
    689 99   action="store_true", dest="no_color", default=False,
    690 100   help="Don't color terminal output"
    skipped 47 lines
    738 148   action="store_true", dest="html", default=False,
    739 149   help="Create an HTML report file (general report on all usernames)."
    740 150   )
    741  - parser.add_argument("-X","--xmind",
     151 + parser.add_argument("-X", "--xmind",
    742 152   action="store_true",
    743 153   dest="xmind", default=False,
    744 154   help="Generate an XMind 8 mindmap report (one report per username)."
    skipped 75 lines
    820 230   
    821 231   site_data = get_top_sites_for_id(args.id_type)
    822 232   
     233 + if args.new_site_to_submit:
     234 + is_submitted = await submit_dialog(db, args.new_site_to_submit)
     235 + if is_submitted:
     236 + db.save_to_file(args.json_file)
     237 + 
    823 238   # Database self-checking
    824 239   if args.self_check:
    825 240   print('Maigret sites database self-checking...')
    skipped 48 lines
    874 289   
    875 290   if found_unsupported_chars:
    876 291   pretty_chars_str = ','.join(map(lambda s: f'"{s}"', found_unsupported_chars))
    877  - query_notify.warning(f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"')
     292 + query_notify.warning(
     293 + f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"')
    878 294   continue
    879 295   
    880 296   sites_to_check = get_top_sites_for_id(id_type)
    skipped 71 lines
    952 368   print('Maigret is interrupted.')
    953 369   sys.exit(1)
    954 370   
     371 + 
    955 372  if __name__ == "__main__":
    956 373   run()
     374 + 
    maigret/resources/data.json
    skipped 13589 lines
    13590 13590   "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
    13591 13591   "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    13592 13592   "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    13593  - "x-guest-token": "1358064134064140290"
     13593 + "x-guest-token": "1358893858789208065"
    13594 13594   },
    13595 13595   "errors": {
    13596 13596   "Bad guest token": "x-guest-token update required"
    skipped 359 lines
    13956 13956   "video"
    13957 13957   ],
    13958 13958   "headers": {
    13959  - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTI2MjQ4NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.kgp8r380d1aDWcd-ROncr0Tqf8EdA-l35EeEY9is6TI"
     13959 + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTI4MjE0MjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.TXUhqilVT25xN4lZeoki6hEmbtcOiy7FKxTm5PWOMVs"
    13960 13960   },
    13961 13961   "activation": {
    13962 13962   "url": "https://vimeo.com/_rv/viewer",
    skipped 9106 lines
    23069 23069   "url": "https://protovary.style/user/{username}/",
    23070 23070   "urlMain": "https://protovary.style",
    23071 23071   "usernameClaimed": "alex",
     23072 + "usernameUnclaimed": "noonewouldeverusethis7"
     23073 + },
     23074 + "beacons.ai": {
     23075 + "checkType": "message",
     23076 + "presenseStrs": [
     23077 + "https://cdn.beacons.ai/profile_pictures"
     23078 + ],
     23079 + "absenceStrs": [
     23080 + "https://beacons.ai/bw_logo_full.png"
     23081 + ],
     23082 + "url": "https://beacons.ai/{username}",
     23083 + "urlMain": "https://beacons.ai",
     23084 + "usernameClaimed": "pasteljellies",
     23085 + "usernameUnclaimed": "noonewouldeverusethis7"
     23086 + },
     23087 + "are.na": {
     23088 + "checkType": "message",
     23089 + "presenseStrs": [
     23090 + "Profile--view"
     23091 + ],
     23092 + "absenceStrs": [
     23093 + "Are.na home"
     23094 + ],
     23095 + "url": "https://www.are.na/{username}",
     23096 + "urlMain": "https://www.are.na",
     23097 + "usernameClaimed": "nate-cassel",
    23072 23098   "usernameUnclaimed": "noonewouldeverusethis7"
    23073 23099   }
    23074 23100   },
    skipped 104 lines
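    Both new entries (beacons.ai, are.na) use the "message" check type. Per
    process_site_result in checking.py, an account counts as claimed only if
    a presenseStrs marker is found and no absenceStrs marker is; a rough
    sketch of that decision for the beacons.ai entry (html_text is a stub
    standing in for the fetched profile page):

        html_text = '... https://cdn.beacons.ai/profile_pictures ...'
        presence = ['https://cdn.beacons.ai/profile_pictures']   # presenseStrs
        absence = ['https://beacons.ai/bw_logo_full.png']        # absenceStrs

        is_present = any(p in html_text for p in presence)
        is_absent = any(a in html_text for a in absence)
        claimed = is_present and not is_absent   # True -> QueryStatus.CLAIMED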
    maigret/submit.py
     1 +import difflib
     2 + 
     3 +import requests
     4 +from mock import Mock
     5 + 
     6 +from .checking import *
     7 + 
     8 +DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
     9 + "birthday", "репутация", "информация", "e-mail"]
     10 + 
     11 +RATIO = 0.6
     12 +TOP_FEATURES = 5
     13 + 
     14 + 
     15 +def get_match_ratio(x):
     16 + return round(max([
     17 + difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
     18 + for y in DESIRED_STRINGS
     19 + ]), 2)
     20 + 
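    get_match_ratio scores a token by its best difflib similarity to any of
    the DESIRED_STRINGS, so profile-related markup rises to the top of the
    sorts in submit_dialog below; roughly:

        get_match_ratio('user-name')   # high, nearly matches "username"
        get_match_ratio('stylesheet')  # low, far from every desired string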
     21 + 
     22 +def extract_domain(url):
     23 + return '/'.join(url.split('/', 3)[:3])
     24 + 
     25 + 
     26 +async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
     27 + query_notify = Mock()
     28 + changes = {
     29 + 'disabled': False,
     30 + }
     31 + 
     32 + check_data = [
     33 + (site.username_claimed, QueryStatus.CLAIMED),
     34 + (site.username_unclaimed, QueryStatus.AVAILABLE),
     35 + ]
     36 + 
     37 + logger.info(f'Checking {site.name}...')
     38 + 
     39 + for username, status in check_data:
     40 + async with semaphore:
     41 + results_dict = await maigret(
     42 + username,
     43 + {site.name: site},
     44 + query_notify,
     45 + logger,
     46 + timeout=30,
     47 + id_type=site.type,
     48 + forced=True,
     49 + no_progressbar=True,
     50 + )
     51 + 
      52 + # don't disable entries with other id types
      53 + # TODO: implement proper checking
     54 + if site.name not in results_dict:
     55 + logger.info(results_dict)
     56 + changes['disabled'] = True
     57 + continue
     58 + 
     59 + result = results_dict[site.name]['status']
     60 + 
     61 + site_status = result.status
     62 + 
     63 + if site_status != status:
     64 + if site_status == QueryStatus.UNKNOWN:
     65 + msgs = site.absence_strs
     66 + etype = site.check_type
     67 + logger.warning(
     68 + f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
     69 + # don't disable in case of available username
     70 + if status == QueryStatus.CLAIMED:
     71 + changes['disabled'] = True
     72 + elif status == QueryStatus.CLAIMED:
      73 + logger.warning(f'`{username}` not found in {site.name}, but should be claimed')
     74 + logger.info(results_dict[site.name])
     75 + changes['disabled'] = True
     76 + else:
      77 + logger.warning(f'`{username}` found in {site.name}, but should be available')
     78 + logger.info(results_dict[site.name])
     79 + changes['disabled'] = True
     80 + 
      81 + logger.info(f'Finished checking site {site.name}')
     82 + 
     83 + return changes
     84 + 
     85 + 
     86 +async def submit_dialog(db, url_exists):
     87 + url_parts = url_exists.split('/')
     88 + supposed_username = url_parts[-1]
     89 + new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
     90 + if new_name:
     91 + supposed_username = new_name
     92 + non_exist_username = 'noonewouldeverusethis7'
     93 + 
     94 + url_user = url_exists.replace(supposed_username, '{username}')
     95 + url_not_exists = url_exists.replace(supposed_username, non_exist_username)
     96 + 
     97 + a = requests.get(url_exists).text
     98 + b = requests.get(url_not_exists).text
     99 + 
     100 + tokens_a = set(a.split('"'))
     101 + tokens_b = set(b.split('"'))
     102 + 
     103 + a_minus_b = tokens_a.difference(tokens_b)
     104 + b_minus_a = tokens_b.difference(tokens_a)
     105 + 
      106 + raw_count = input(f'Specify the count of features to extract [default {TOP_FEATURES}]: ')
      107 + # empty input falls back to the default instead of crashing int()
      108 + top_features_count = int(raw_count) if raw_count else TOP_FEATURES
     109 + 
     110 + presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count]
     111 + 
      112 + print('Detected text features of an existing account: ' + ', '.join(presence_list))
      113 + features = input('If the features were not detected correctly, enter them manually (comma-separated): ')
     114 + 
     115 + if features:
     116 + presence_list = features.split(',')
     117 + 
     118 + absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[:top_features_count]
      119 + print('Detected text features of a non-existing account: ' + ', '.join(absence_list))
      120 + features = input('If the features were not detected correctly, enter them manually (comma-separated): ')
     121 + 
     122 + if features:
     123 + absence_list = features.split(',')
     124 + 
     125 + url_main = extract_domain(url_exists)
     126 + 
     127 + site_data = {
     128 + 'absenceStrs': absence_list,
     129 + 'presenseStrs': presence_list,
     130 + 'url': url_user,
     131 + 'urlMain': url_main,
     132 + 'usernameClaimed': supposed_username,
     133 + 'usernameUnclaimed': non_exist_username,
     134 + 'checkType': 'message',
     135 + }
     136 + 
     137 + site = MaigretSite(url_main.split('/')[-1], site_data)
     138 + 
     139 + print(site.__dict__)
     140 + 
     141 + sem = asyncio.Semaphore(1)
     142 + log_level = logging.INFO
     143 + logging.basicConfig(
     144 + format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
     145 + datefmt='%H:%M:%S',
     146 + level=log_level
     147 + )
     148 + logger = logging.getLogger('site-submit')
     149 + logger.setLevel(log_level)
     150 + 
     151 + result = await site_self_check(site, logger, sem, db)
     152 + 
     153 + if result['disabled']:
      154 + print(f'Sorry, we couldn\'t find params to detect account presence/absence for {site.name}.')
      155 + print('Try running this mode again with a higher feature count, or choose other features.')
     156 + else:
      157 + if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [yY] ') in ('y', 'Y'):
     158 + db.update_site(site)
     159 + return True
     160 + 
     161 + return False
     162 + 