Projects STRLCPY maigret Commits f43dc5bd
🤬
  • ■ ■ ■ ■ ■ ■
    maigret/checking.py
    skipped 5 lines
    6 6  import sys
    7 7  import tqdm
    8 8  import time
    9  -from typing import Callable, Any, Iterable, Tuple
    10 9   
    11 10  import aiohttp
    12 11  import tqdm.asyncio
    skipped 3 lines
    16 15  from socid_extractor import extract
    17 16   
    18 17  from .activation import ParsingActivator, import_aiohttp_cookies
     18 +from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
    19 19  from .result import QueryResult, QueryStatus
    20 20  from .sites import MaigretDatabase, MaigretSite
     21 +from .types import CheckError
     22 + 
    21 23   
    22 24  supported_recursive_search_ids = (
    23 25   'yandex_public_id',
    skipped 6 lines
    30 32  )
    31 33   
    32 34  common_errors = {
    33  - '<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha',
    34  - 'Please stand by, while we are checking your browser': 'Cloudflare captcha',
    35  - '<title>Доступ ограничен</title>': 'Rostelecom censorship',
    36  - 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha',
    37  - 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection',
    38  - '404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': 'MegaFon 404 page',
    39  - 'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship',
    40  - 'Incapsula incident ID': 'Incapsula antibot protection',
     35 + '<title>Attention Required! | Cloudflare</title>': CheckError('Captcha', 'Cloudflare'),
     36 + 'Please stand by, while we are checking your browser': CheckError('Bot protection', 'Cloudflare'),
     37 + '<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'),
     38 + 'document.getElementById(\'validate_form_submit\').disabled=true': CheckError('Captcha', 'Mail.ru'),
     39 + 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': CheckError('Bot protection', 'Blazingfast'),
     40 + '404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': CheckError('Resolving', 'MegaFon 404 page'),
     41 + 'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError('Censorship', 'MGTS'),
     42 + 'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
    41 43  }
    42 44   
    43 45  unsupported_characters = '#'
    44 46   
    45  -QueryDraft = Tuple[Callable, Any, Any]
    46  -QueriesDraft = Iterable[QueryDraft]
    47  - 
    48  - 
    49  -def create_task_func():
    50  - if sys.version_info.minor > 6:
    51  - create_asyncio_task = asyncio.create_task
    52  - else:
    53  - loop = asyncio.get_event_loop()
    54  - create_asyncio_task = loop.create_task
    55  - return create_asyncio_task
    56  - 
    57  -class AsyncExecutor:
    58  - def __init__(self, *args, **kwargs):
    59  - self.logger = kwargs['logger']
    60  - 
    61  - async def run(self, tasks: QueriesDraft):
    62  - start_time = time.time()
    63  - results = await self._run(tasks)
    64  - self.execution_time = time.time() - start_time
    65  - self.logger.debug(f'Spent time: {self.execution_time}')
    66  - return results
    67  - 
    68  - async def _run(self, tasks: QueriesDraft):
    69  - await asyncio.sleep(0)
    70  - 
    71  - 
    72  -class AsyncioSimpleExecutor(AsyncExecutor):
    73  - def __init__(self, *args, **kwargs):
    74  - super().__init__(*args, **kwargs)
    75  - 
    76  - async def _run(self, tasks: QueriesDraft):
    77  - futures = [f(*args, **kwargs) for f, args, kwargs in tasks]
    78  - return await asyncio.gather(*futures)
    79  - 
    80  - 
    81  -class AsyncioProgressbarExecutor(AsyncExecutor):
    82  - def __init__(self, *args, **kwargs):
    83  - super().__init__(*args, **kwargs)
    84  - 
    85  - async def _run(self, tasks: QueriesDraft):
    86  - futures = [f(*args, **kwargs) for f, args, kwargs in tasks]
    87  - results = []
    88  - for f in tqdm.asyncio.tqdm.as_completed(futures):
    89  - results.append(await f)
    90  - return results
    91  - 
    92  - 
    93  -class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor):
    94  - def __init__(self, *args, **kwargs):
    95  - super().__init__(*args, **kwargs)
    96  - self.semaphore = asyncio.Semaphore(kwargs.get('in_parallel', 1))
    97  - 
    98  - async def _run(self, tasks: QueriesDraft):
    99  - async def _wrap_query(q: QueryDraft):
    100  - async with self.semaphore:
    101  - f, args, kwargs = q
    102  - return await f(*args, **kwargs)
    103  - 
    104  - async def semaphore_gather(tasks: QueriesDraft):
    105  - coros = [_wrap_query(q) for q in tasks]
    106  - results = []
    107  - for f in tqdm.asyncio.tqdm.as_completed(coros):
    108  - results.append(await f)
    109  - return results
    110  - 
    111  - return await semaphore_gather(tasks)
    112  - 
    113  - 
    114  -class AsyncioProgressbarQueueExecutor(AsyncExecutor):
    115  - def __init__(self, *args, **kwargs):
    116  - super().__init__(*args, **kwargs)
    117  - self.workers_count = kwargs.get('in_parallel', 10)
    118  - self.progress_func = kwargs.get('progress_func', tqdm.tqdm)
    119  - self.queue = asyncio.Queue(self.workers_count)
    120  - self.timeout = kwargs.get('timeout')
    121  - 
    122  - async def worker(self):
    123  - while True:
    124  - try:
    125  - f, args, kwargs = self.queue.get_nowait()
    126  - except asyncio.QueueEmpty:
    127  - return
    128  - 
    129  - query_future = f(*args, **kwargs)
    130  - query_task = create_task_func()(query_future)
    131  - try:
    132  - result = await asyncio.wait_for(query_task, timeout=self.timeout)
    133  - except asyncio.TimeoutError:
    134  - result = None
    135  - 
    136  - self.results.append(result)
    137  - self.progress.update(1)
    138  - self.queue.task_done()
    139  - 
    140  - async def _run(self, queries: QueriesDraft):
    141  - self.results = []
    142  - 
    143  - queries_list = list(queries)
    144  - 
    145  - min_workers = min(len(queries_list), self.workers_count)
    146 47   
    147  - workers = [create_task_func()(self.worker())
    148  - for _ in range(min_workers)]
    149  - 
    150  - self.progress = self.progress_func(total=len(queries_list))
    151  - for t in queries_list:
    152  - await self.queue.put(t)
    153  - await self.queue.join()
    154  - for w in workers:
    155  - w.cancel()
    156  - self.progress.close()
    157  - return self.results
    158  - 
    159  - 
    160  -async def get_response(request_future, site_name, logger):
     48 +async def get_response(request_future, site_name, logger) -> (str, int, CheckError):
    161 49   html_text = None
    162 50   status_code = 0
    163  - 
    164  - error_text = "General Unknown Error"
    165  - expection_text = None
     51 + error = CheckError('Error')
    166 52   
    167 53   try:
    168 54   response = await request_future
    skipped 4 lines
    173 59   decoded_content = response_content.decode(charset, 'ignore')
    174 60   html_text = decoded_content
    175 61   
    176  - if status_code > 0:
    177  - error_text = None
     62 + if status_code == 0:
     63 + error = CheckError('Connection lost')
     64 + else:
     65 + error = None
    178 66   
    179 67   logger.debug(html_text)
    180 68   
    181  - except asyncio.TimeoutError as errt:
    182  - error_text = "Timeout Error"
    183  - expection_text = str(errt)
    184  - except aiohttp.client_exceptions.ClientConnectorError as err:
    185  - error_text = "Error Connecting"
    186  - expection_text = str(err)
    187  - except aiohttp.http_exceptions.BadHttpMessage as err:
    188  - error_text = "HTTP Error"
    189  - expection_text = str(err)
    190  - except proxy_errors.ProxyError as err:
    191  - error_text = "Proxy Error"
    192  - expection_text = str(err)
    193  - except Exception as err:
     69 + except asyncio.TimeoutError as e:
     70 + error = CheckError('Request timeout', str(e))
     71 + except aiohttp.client_exceptions.ClientConnectorError as e:
     72 + error = CheckError('Connecting failure', str(e))
     73 + except aiohttp.http_exceptions.BadHttpMessage as e:
     74 + error = CheckError('HTTP', str(e))
     75 + except proxy_errors.ProxyError as e:
     76 + error = CheckError('Proxy', str(e))
     77 + except Exception as e:
    194 78   # python-specific exceptions
    195 79   if sys.version_info.minor > 6:
    196  - if isinstance(err, ssl.SSLCertVerificationError) or isinstance(err, ssl.SSLError):
    197  - error_text = "SSL Error"
    198  - expection_text = str(err)
     80 + if isinstance(e, ssl.SSLCertVerificationError) or isinstance(e, ssl.SSLError):
     81 + error = CheckError('SSL', str(e))
    199 82   else:
    200  - logger.warning(f'Unhandled error while requesting {site_name}: {err}')
    201  - logger.debug(err, exc_info=True)
    202  - error_text = "Some Error"
    203  - expection_text = str(err)
     83 + logger.warning(f'Unhandled error while requesting {site_name}: {e}')
     84 + logger.debug(e, exc_info=True)
     85 + error = CheckError('Error', str(e))
    204 86   
    205 87   # TODO: return only needed information
    206  - return html_text, status_code, error_text, expection_text
     88 + return html_text, status_code, error
    207 89   
    208 90   
    209 91  async def update_site_dict_from_response(sitename, site_dict, results_info, logger, query_notify):
    skipped 11 lines
    221 103   
    222 104   
    223 105  # TODO: move to separate class
    224  -def detect_error_page(html_text, status_code, fail_flags, ignore_403):
     106 +def detect_error_page(html_text, status_code, fail_flags, ignore_403) -> CheckError:
    225 107   # Detect service restrictions such as a country restriction
    226 108   for flag, msg in fail_flags.items():
    227 109   if flag in html_text:
    228  - return 'Some site error', msg
     110 + return CheckError('Site-specific', msg)
    229 111   
    230 112   # Detect common restrictions such as provider censorship and bot protection
    231  - for flag, msg in common_errors.items():
     113 + for flag, err in common_errors.items():
    232 114   if flag in html_text:
    233  - return 'Error', msg
     115 + return err
    234 116   
    235 117   # Detect common site errors
    236 118   if status_code == 403 and not ignore_403:
    237  - return 'Access denied', 'Access denied, use proxy/vpn'
     119 + return CheckError('Access denied', '403 status code, use proxy/vpn')
     120 + 
    238 121   elif status_code >= 500:
    239  - return f'Error {status_code}', f'Site error {status_code}'
     122 + return CheckError(f'Server', f'{status_code} status code')
    240 123   
    241  - return None, None
     124 + return None
    242 125   
    243 126   
    244 127  def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
    skipped 16 lines
    261 144   # Get the expected check type
    262 145   check_type = site.check_type
    263 146   
    264  - # Get the failure messages and comments
    265  - failure_errors = site.errors
    266  - 
    267 147   # TODO: refactor
    268 148   if not response:
    269 149   logger.error(f'No response for {site.name}')
    270 150   return results_info
    271 151   
    272  - html_text, status_code, error_text, expection_text = response
    273  - site_error_text = '?'
     152 + html_text, status_code, check_error = response
    274 153   
    275 154   # TODO: add elapsed request time counting
    276 155   response_time = None
    skipped 1 lines
    278 157   if logger.level == logging.DEBUG:
    279 158   with open('debug.txt', 'a') as f:
    280 159   status = status_code or 'No response'
    281  - f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n')
     160 + f.write(f'url: {url}\nerror: {check_error}\nr: {status}\n')
    282 161   if html_text:
    283 162   f.write(f'code: {status}\nresponse: {str(html_text)}\n')
    284 163   
    285  - if status_code and not error_text:
    286  - error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
    287  - site.ignore403)
     164 + # additional check for errors
     165 + if status_code and not check_error:
     166 + check_error = detect_error_page(html_text, status_code, site.errors, site.ignore403)
    288 167   
    289 168   if site.activation and html_text:
    290 169   is_need_activation = any([s for s in site.activation['marks'] if s in html_text])
    skipped 21 lines
    312 191   if presense_flag in html_text:
    313 192   is_presense_detected = True
    314 193   site.stats['presense_flag'] = presense_flag
    315  - logger.info(presense_flag)
     194 + logger.debug(presense_flag)
    316 195   break
    317 196   
    318  - if error_text is not None:
    319  - logger.debug(error_text)
     197 + if check_error:
     198 + logger.debug(check_error)
    320 199   result = QueryResult(username,
    321 200   site.name,
    322 201   url,
    323 202   QueryStatus.UNKNOWN,
    324 203   query_time=response_time,
    325  - context=f'{error_text}: {site_error_text}', tags=fulltags)
     204 + error=check_error,
     205 + context=str(CheckError), tags=fulltags)
    326 206   elif check_type == "message":
    327 207   absence_flags = site.absence_strs
    328 208   is_absence_flags_list = isinstance(absence_flags, list)
    skipped 144 lines
    473 353   
    474 354   if logger.level == logging.DEBUG:
    475 355   future = session.get(url='https://icanhazip.com')
    476  - ip, status, error, expection = await get_response(future, None, logger)
     356 + ip, status, check_error = await get_response(future, None, logger)
    477 357   if ip:
    478 358   logger.debug(f'My IP is: {ip.strip()}')
    479 359   else:
    480  - logger.debug(f'IP requesting {error}: {expection}')
     360 + logger.debug(f'IP requesting {check_error[0]}: {check_error[1]}')
    481 361   
    482 362   # Results from analysis of all sites
    483 363   results_total = {}
    skipped 109 lines
    593 473   results = await executor.run(coroutines)
    594 474   
    595 475   await session.close()
     476 + 
     477 + # TODO: move to separate function
     478 + errors = {}
     479 + for el in results:
     480 + if not el:
     481 + continue
     482 + _, r = el
     483 + if r and isinstance(r, dict) and r.get('status'):
     484 + if not isinstance(r['status'], QueryResult):
     485 + continue
     486 + 
     487 + err = r['status'].error
     488 + if not err:
     489 + continue
     490 + errors[err.type] = errors.get(err.type, 0) + 1
     491 + 
     492 + for err, count in sorted(errors.items(), key=lambda x: x[1], reverse=True):
     493 + logger.warning(f'Errors of type "{err}": {count}')
    596 494   
    597 495   # Notify caller that all queries are finished.
    598 496   query_notify.finish()
    skipped 143 lines
  • ■ ■ ■ ■
    maigret/notify.py
    skipped 240 lines
    241 241   self.color,
    242 242   '?', result.site_name,
    243 243   Fore.RED, Fore.RED,
    244  - self.result.context + ids_data_text
     244 + str(self.result.error) + ids_data_text
    245 245   )
    246 246   elif result.status == QueryStatus.ILLEGAL:
    247 247   if not self.print_found_only:
    skipped 31 lines
  • ■ ■ ■ ■ ■ ■
    maigret/resources/data.json
    skipped 293 lines
    294 294   ],
    295 295   "errors": {
    296 296   "INTERNAL_SERVER_ERROR": "Site error",
    297  - "Something just went wrong": "Site error"
     297 + "Something just went wrong": "Site error",
     298 + "PersistedQueryNotFound": "Site error"
    298 299   },
    299  - "urlProbe": "https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22{username}%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22105058632482dd2786fd5775745908dc928f537b28e28356b076522757d65c19%22%7D%7D",
     300 + "urlProbe": "https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22{username}%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22fcecc7028c308115b0defebc63acec3fe3c12df86a602c3e1785ba5cfb8fff47%22%7D%7D",
    300 301   "checkType": "message",
    301 302   "absenceStrs": "No message available",
    302 303   "alexaRank": 3029,
    skipped 1616 lines
    1919 1920   "usernameUnclaimed": "noonewouldeverusethis7"
    1920 1921   },
    1921 1922   "Blu-ray": {
     1923 + "disabled": true,
    1922 1924   "tags": [
    1923 1925   "us"
    1924 1926   ],
    skipped 10224 lines
    12149 12151   "us"
    12150 12152   ],
    12151 12153   "headers": {
    12152  - "authorization": "Bearer BQC38O_gP1qY7X8Ui7RsgyutAuZ2QissgeFgsDEX6siaE_dAFmzV0mWMSziGB_dLErQwtfJZa7qM9IsmNHI"
     12154 + "authorization": "Bearer BQAjb32z4TLh0t19LDuYfk2BV3gUXCpqyUuy2gBOyJTN_2xoZlN4AW1B6ZVmdKMDcI3Hc8agrrQsKbQZE90"
    12153 12155   },
    12154 12156   "errors": {
    12155 12157   "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
    skipped 1297 lines
    13453 13455   "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
    13454 13456   "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    13455 13457   "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    13456  - "x-guest-token": "1383863637358432258"
     13458 + "x-guest-token": "1386060728566681601"
    13457 13459   },
    13458 13460   "errors": {
    13459 13461   "Bad guest token": "x-guest-token update required"
    skipped 370 lines
    13830 13832   "video"
    13831 13833   ],
    13832 13834   "headers": {
    13833  - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTg3NzQ2ODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.q3cuo2lPiamc4wj5NgWOl3JPr_d33y5C06epHB92thk"
     13835 + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTkzMDI0NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.fN8PQIEkzQjfu7znGoIaLEP9Qr6bV8JbA2ZwpBSFI5E"
    13834 13836   },
    13835 13837   "activation": {
    13836 13838   "url": "https://vimeo.com/_rv/viewer",
    skipped 9925 lines
  • ■ ■ ■ ■ ■
    maigret/result.py
    skipped 33 lines
    34 34   """
    35 35   
    36 36   def __init__(self, username, site_name, site_url_user, status, ids_data=None,
    37  - query_time=None, context=None, tags=[]):
     37 + query_time=None, context=None, error=None, tags=[]):
    38 38   """Create Query Result Object.
    39 39   
    40 40   Contains information about a specific method of detecting usernames on
    skipped 32 lines
    73 73   self.context = context
    74 74   self.ids_data = ids_data
    75 75   self.tags = tags
     76 + self.error = error
    76 77   
    77 78   def json(self):
    78 79   return {
    skipped 28 lines
  • ■ ■ ■ ■ ■
    tests/test_checking.py tests/test_executors.py
    skipped 1 lines
    2 2  import pytest
    3 3  import asyncio
    4 4  import logging
    5  -from maigret.checking import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor
     5 +from maigret.executors import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, \
     6 + AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor
    6 7   
    7 8  logger = logging.getLogger(__name__)
    8 9   
    skipped 58 lines
Please wait...
Page is in error, reload to recover