| skipped 5 lines |
6 | 6 | | import sys |
7 | 7 | | import tqdm |
8 | 8 | | import time |
9 | | - | from typing import Callable, Any, Iterable, Tuple |
10 | 9 | | |
11 | 10 | | import aiohttp |
12 | 11 | | import tqdm.asyncio |
| skipped 3 lines |
16 | 15 | | from socid_extractor import extract |
17 | 16 | | |
18 | 17 | | from .activation import ParsingActivator, import_aiohttp_cookies |
| 18 | + | from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor |
19 | 19 | | from .result import QueryResult, QueryStatus |
20 | 20 | | from .sites import MaigretDatabase, MaigretSite |
| 21 | + | from .types import CheckError |
| 22 | + | |
21 | 23 | | |
22 | 24 | | supported_recursive_search_ids = ( |
23 | 25 | | 'yandex_public_id', |
| skipped 6 lines |
30 | 32 | | ) |
31 | 33 | | |
32 | 34 | | common_errors = { |
33 | | - | '<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha', |
34 | | - | 'Please stand by, while we are checking your browser': 'Cloudflare captcha', |
35 | | - | '<title>Доступ ограничен</title>': 'Rostelecom censorship', |
36 | | - | 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha', |
37 | | - | 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection', |
38 | | - | '404</h1><p class="error-card__description">Мы не нашли страницу': 'MegaFon 404 page', |
39 | | - | 'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship', |
40 | | - | 'Incapsula incident ID': 'Incapsula antibot protection', |
| 35 | + | '<title>Attention Required! | Cloudflare</title>': CheckError('Captcha', 'Cloudflare'), |
| 36 | + | 'Please stand by, while we are checking your browser': CheckError('Bot protection', 'Cloudflare'), |
| 37 | + | '<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'), |
| 38 | + | 'document.getElementById(\'validate_form_submit\').disabled=true': CheckError('Captcha', 'Mail.ru'), |
| 39 | + | 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': CheckError('Bot protection', 'Blazingfast'), |
| 40 | + | '404</h1><p class="error-card__description">Мы не нашли страницу': CheckError('Resolving', 'MegaFon 404 page'), |
| 41 | + | 'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError('Censorship', 'MGTS'), |
| 42 | + | 'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'), |
41 | 43 | | } |
42 | 44 | | |
43 | 45 | | unsupported_characters = '#' |
44 | 46 | | |
45 | | - | QueryDraft = Tuple[Callable, Any, Any] |
46 | | - | QueriesDraft = Iterable[QueryDraft] |
47 | | - | |
48 | | - | |
49 | | - | def create_task_func(): |
50 | | - | if sys.version_info.minor > 6: |
51 | | - | create_asyncio_task = asyncio.create_task |
52 | | - | else: |
53 | | - | loop = asyncio.get_event_loop() |
54 | | - | create_asyncio_task = loop.create_task |
55 | | - | return create_asyncio_task |
56 | | - | |
57 | | - | class AsyncExecutor: |
58 | | - | def __init__(self, *args, **kwargs): |
59 | | - | self.logger = kwargs['logger'] |
60 | | - | |
61 | | - | async def run(self, tasks: QueriesDraft): |
62 | | - | start_time = time.time() |
63 | | - | results = await self._run(tasks) |
64 | | - | self.execution_time = time.time() - start_time |
65 | | - | self.logger.debug(f'Spent time: {self.execution_time}') |
66 | | - | return results |
67 | | - | |
68 | | - | async def _run(self, tasks: QueriesDraft): |
69 | | - | await asyncio.sleep(0) |
70 | | - | |
71 | | - | |
72 | | - | class AsyncioSimpleExecutor(AsyncExecutor): |
73 | | - | def __init__(self, *args, **kwargs): |
74 | | - | super().__init__(*args, **kwargs) |
75 | | - | |
76 | | - | async def _run(self, tasks: QueriesDraft): |
77 | | - | futures = [f(*args, **kwargs) for f, args, kwargs in tasks] |
78 | | - | return await asyncio.gather(*futures) |
79 | | - | |
80 | | - | |
81 | | - | class AsyncioProgressbarExecutor(AsyncExecutor): |
82 | | - | def __init__(self, *args, **kwargs): |
83 | | - | super().__init__(*args, **kwargs) |
84 | | - | |
85 | | - | async def _run(self, tasks: QueriesDraft): |
86 | | - | futures = [f(*args, **kwargs) for f, args, kwargs in tasks] |
87 | | - | results = [] |
88 | | - | for f in tqdm.asyncio.tqdm.as_completed(futures): |
89 | | - | results.append(await f) |
90 | | - | return results |
91 | | - | |
92 | | - | |
93 | | - | class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor): |
94 | | - | def __init__(self, *args, **kwargs): |
95 | | - | super().__init__(*args, **kwargs) |
96 | | - | self.semaphore = asyncio.Semaphore(kwargs.get('in_parallel', 1)) |
97 | | - | |
98 | | - | async def _run(self, tasks: QueriesDraft): |
99 | | - | async def _wrap_query(q: QueryDraft): |
100 | | - | async with self.semaphore: |
101 | | - | f, args, kwargs = q |
102 | | - | return await f(*args, **kwargs) |
103 | | - | |
104 | | - | async def semaphore_gather(tasks: QueriesDraft): |
105 | | - | coros = [_wrap_query(q) for q in tasks] |
106 | | - | results = [] |
107 | | - | for f in tqdm.asyncio.tqdm.as_completed(coros): |
108 | | - | results.append(await f) |
109 | | - | return results |
110 | | - | |
111 | | - | return await semaphore_gather(tasks) |
112 | | - | |
113 | | - | |
114 | | - | class AsyncioProgressbarQueueExecutor(AsyncExecutor): |
115 | | - | def __init__(self, *args, **kwargs): |
116 | | - | super().__init__(*args, **kwargs) |
117 | | - | self.workers_count = kwargs.get('in_parallel', 10) |
118 | | - | self.progress_func = kwargs.get('progress_func', tqdm.tqdm) |
119 | | - | self.queue = asyncio.Queue(self.workers_count) |
120 | | - | self.timeout = kwargs.get('timeout') |
121 | | - | |
122 | | - | async def worker(self): |
123 | | - | while True: |
124 | | - | try: |
125 | | - | f, args, kwargs = self.queue.get_nowait() |
126 | | - | except asyncio.QueueEmpty: |
127 | | - | return |
128 | | - | |
129 | | - | query_future = f(*args, **kwargs) |
130 | | - | query_task = create_task_func()(query_future) |
131 | | - | try: |
132 | | - | result = await asyncio.wait_for(query_task, timeout=self.timeout) |
133 | | - | except asyncio.TimeoutError: |
134 | | - | result = None |
135 | | - | |
136 | | - | self.results.append(result) |
137 | | - | self.progress.update(1) |
138 | | - | self.queue.task_done() |
139 | | - | |
140 | | - | async def _run(self, queries: QueriesDraft): |
141 | | - | self.results = [] |
142 | | - | |
143 | | - | queries_list = list(queries) |
144 | | - | |
145 | | - | min_workers = min(len(queries_list), self.workers_count) |
146 | 47 | | |
147 | | - | workers = [create_task_func()(self.worker()) |
148 | | - | for _ in range(min_workers)] |
149 | | - | |
150 | | - | self.progress = self.progress_func(total=len(queries_list)) |
151 | | - | for t in queries_list: |
152 | | - | await self.queue.put(t) |
153 | | - | await self.queue.join() |
154 | | - | for w in workers: |
155 | | - | w.cancel() |
156 | | - | self.progress.close() |
157 | | - | return self.results |
158 | | - | |
159 | | - | |
160 | | - | async def get_response(request_future, site_name, logger): |
| 48 | + | async def get_response(request_future, site_name, logger) -> (str, int, CheckError): |
161 | 49 | | html_text = None |
162 | 50 | | status_code = 0 |
163 | | - | |
164 | | - | error_text = "General Unknown Error" |
165 | | - | expection_text = None |
| 51 | + | error = CheckError('Error') |
166 | 52 | | |
167 | 53 | | try: |
168 | 54 | | response = await request_future |
| skipped 4 lines |
173 | 59 | | decoded_content = response_content.decode(charset, 'ignore') |
174 | 60 | | html_text = decoded_content |
175 | 61 | | |
176 | | - | if status_code > 0: |
177 | | - | error_text = None |
| 62 | + | if status_code == 0: |
| 63 | + | error = CheckError('Connection lost') |
| 64 | + | else: |
| 65 | + | error = None |
178 | 66 | | |
179 | 67 | | logger.debug(html_text) |
180 | 68 | | |
181 | | - | except asyncio.TimeoutError as errt: |
182 | | - | error_text = "Timeout Error" |
183 | | - | expection_text = str(errt) |
184 | | - | except aiohttp.client_exceptions.ClientConnectorError as err: |
185 | | - | error_text = "Error Connecting" |
186 | | - | expection_text = str(err) |
187 | | - | except aiohttp.http_exceptions.BadHttpMessage as err: |
188 | | - | error_text = "HTTP Error" |
189 | | - | expection_text = str(err) |
190 | | - | except proxy_errors.ProxyError as err: |
191 | | - | error_text = "Proxy Error" |
192 | | - | expection_text = str(err) |
193 | | - | except Exception as err: |
| 69 | + | except asyncio.TimeoutError as e: |
| 70 | + | error = CheckError('Request timeout', str(e)) |
| 71 | + | except aiohttp.client_exceptions.ClientConnectorError as e: |
| 72 | + | error = CheckError('Connecting failure', str(e)) |
| 73 | + | except aiohttp.http_exceptions.BadHttpMessage as e: |
| 74 | + | error = CheckError('HTTP', str(e)) |
| 75 | + | except proxy_errors.ProxyError as e: |
| 76 | + | error = CheckError('Proxy', str(e)) |
| 77 | + | except Exception as e: |
194 | 78 | | # python-specific exceptions |
195 | 79 | | if sys.version_info.minor > 6: |
196 | | - | if isinstance(err, ssl.SSLCertVerificationError) or isinstance(err, ssl.SSLError): |
197 | | - | error_text = "SSL Error" |
198 | | - | expection_text = str(err) |
| 80 | + | if isinstance(e, ssl.SSLCertVerificationError) or isinstance(e, ssl.SSLError): |
| 81 | + | error = CheckError('SSL', str(e)) |
199 | 82 | | else: |
200 | | - | logger.warning(f'Unhandled error while requesting {site_name}: {err}') |
201 | | - | logger.debug(err, exc_info=True) |
202 | | - | error_text = "Some Error" |
203 | | - | expection_text = str(err) |
| 83 | + | logger.warning(f'Unhandled error while requesting {site_name}: {e}') |
| 84 | + | logger.debug(e, exc_info=True) |
| 85 | + | error = CheckError('Error', str(e)) |
204 | 86 | | |
205 | 87 | | # TODO: return only needed information |
206 | | - | return html_text, status_code, error_text, expection_text |
| 88 | + | return html_text, status_code, error |
207 | 89 | | |
208 | 90 | | |
209 | 91 | | async def update_site_dict_from_response(sitename, site_dict, results_info, logger, query_notify): |
| skipped 11 lines |
221 | 103 | | |
222 | 104 | | |
223 | 105 | | # TODO: move to separate class |
224 | | - | def detect_error_page(html_text, status_code, fail_flags, ignore_403): |
| 106 | + | def detect_error_page(html_text, status_code, fail_flags, ignore_403) -> CheckError: |
225 | 107 | | # Detect service restrictions such as a country restriction |
226 | 108 | | for flag, msg in fail_flags.items(): |
227 | 109 | | if flag in html_text: |
228 | | - | return 'Some site error', msg |
| 110 | + | return CheckError('Site-specific', msg) |
229 | 111 | | |
230 | 112 | | # Detect common restrictions such as provider censorship and bot protection |
231 | | - | for flag, msg in common_errors.items(): |
| 113 | + | for flag, err in common_errors.items(): |
232 | 114 | | if flag in html_text: |
233 | | - | return 'Error', msg |
| 115 | + | return err |
234 | 116 | | |
235 | 117 | | # Detect common site errors |
236 | 118 | | if status_code == 403 and not ignore_403: |
237 | | - | return 'Access denied', 'Access denied, use proxy/vpn' |
| 119 | + | return CheckError('Access denied', '403 status code, use proxy/vpn') |
| 120 | + | |
238 | 121 | | elif status_code >= 500: |
239 | | - | return f'Error {status_code}', f'Site error {status_code}' |
| 122 | + | return CheckError('Server', f'{status_code} status code') |
240 | 123 | | |
241 | | - | return None, None |
| 124 | + | return None |
242 | 125 | | |
243 | 126 | | |
244 | 127 | | def process_site_result(response, query_notify, logger, results_info, site: MaigretSite): |
| skipped 16 lines |
261 | 144 | | # Get the expected check type |
262 | 145 | | check_type = site.check_type |
263 | 146 | | |
264 | | - | # Get the failure messages and comments |
265 | | - | failure_errors = site.errors |
266 | | - | |
267 | 147 | | # TODO: refactor |
268 | 148 | | if not response: |
269 | 149 | | logger.error(f'No response for {site.name}') |
270 | 150 | | return results_info |
271 | 151 | | |
272 | | - | html_text, status_code, error_text, expection_text = response |
273 | | - | site_error_text = '?' |
| 152 | + | html_text, status_code, check_error = response |
274 | 153 | | |
275 | 154 | | # TODO: add elapsed request time counting |
276 | 155 | | response_time = None |
| skipped 1 lines |
278 | 157 | | if logger.level == logging.DEBUG: |
279 | 158 | | with open('debug.txt', 'a') as f: |
280 | 159 | | status = status_code or 'No response' |
281 | | - | f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n') |
| 160 | + | f.write(f'url: {url}\nerror: {check_error}\nr: {status}\n') |
282 | 161 | | if html_text: |
283 | 162 | | f.write(f'code: {status}\nresponse: {str(html_text)}\n') |
284 | 163 | | |
285 | | - | if status_code and not error_text: |
286 | | - | error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors, |
287 | | - | site.ignore403) |
| 164 | + | # additional check for errors |
| 165 | + | if status_code and not check_error: |
| 166 | + | check_error = detect_error_page(html_text, status_code, site.errors, site.ignore403) |
288 | 167 | | |
289 | 168 | | if site.activation and html_text: |
290 | 169 | | is_need_activation = any([s for s in site.activation['marks'] if s in html_text]) |
| skipped 21 lines |
312 | 191 | | if presense_flag in html_text: |
313 | 192 | | is_presense_detected = True |
314 | 193 | | site.stats['presense_flag'] = presense_flag |
315 | | - | logger.info(presense_flag) |
| 194 | + | logger.debug(presense_flag) |
316 | 195 | | break |
317 | 196 | | |
318 | | - | if error_text is not None: |
319 | | - | logger.debug(error_text) |
| 197 | + | if check_error: |
| 198 | + | logger.debug(check_error) |
320 | 199 | | result = QueryResult(username, |
321 | 200 | | site.name, |
322 | 201 | | url, |
323 | 202 | | QueryStatus.UNKNOWN, |
324 | 203 | | query_time=response_time, |
325 | | - | context=f'{error_text}: {site_error_text}', tags=fulltags) |
| 204 | + | error=check_error, |
| 205 | + | context=str(check_error), tags=fulltags) |
326 | 206 | | elif check_type == "message": |
327 | 207 | | absence_flags = site.absence_strs |
328 | 208 | | is_absence_flags_list = isinstance(absence_flags, list) |
| skipped 144 lines |
473 | 353 | | |
474 | 354 | | if logger.level == logging.DEBUG: |
475 | 355 | | future = session.get(url='https://icanhazip.com') |
476 | | - | ip, status, error, expection = await get_response(future, None, logger) |
| 356 | + | ip, status, check_error = await get_response(future, None, logger) |
477 | 357 | | if ip: |
478 | 358 | | logger.debug(f'My IP is: {ip.strip()}') |
479 | 359 | | else: |
480 | | - | logger.debug(f'IP requesting {error}: {expection}') |
| 360 | + | logger.debug(f'IP requesting {check_error.type}: {check_error.desc}') |
481 | 361 | | |
482 | 362 | | # Results from analysis of all sites |
483 | 363 | | results_total = {} |
| skipped 109 lines |
593 | 473 | | results = await executor.run(coroutines) |
594 | 474 | | |
595 | 475 | | await session.close() |
| 476 | + | |
| 477 | + | # TODO: move to separate function |
| 478 | + | errors = {} |
| 479 | + | for el in results: |
| 480 | + | if not el: |
| 481 | + | continue |
| 482 | + | _, r = el |
| 483 | + | if r and isinstance(r, dict) and r.get('status'): |
| 484 | + | if not isinstance(r['status'], QueryResult): |
| 485 | + | continue |
| 486 | + | |
| 487 | + | err = r['status'].error |
| 488 | + | if not err: |
| 489 | + | continue |
| 490 | + | errors[err.type] = errors.get(err.type, 0) + 1 |
| 491 | + | |
| 492 | + | for err, count in sorted(errors.items(), key=lambda x: x[1], reverse=True): |
| 493 | + | logger.warning(f'Errors of type "{err}": {count}') |
596 | 494 | | |
597 | 495 | | # Notify caller that all queries are finished. |
598 | 496 | | query_notify.finish() |
| skipped 143 lines |