| skipped 4 lines |
5 | 5 | | import ssl |
6 | 6 | | import sys |
7 | 7 | | import tqdm |
8 | | - | import time |
| 8 | + | from typing import Tuple, Optional |
9 | 9 | | |
10 | 10 | | import aiohttp |
11 | 11 | | import tqdm.asyncio |
12 | 12 | | from aiohttp_socks import ProxyConnector |
13 | | - | from mock import Mock |
14 | 13 | | from python_socks import _errors as proxy_errors |
15 | 14 | | from socid_extractor import extract |
16 | 15 | | |
17 | 16 | | from .activation import ParsingActivator, import_aiohttp_cookies |
| 17 | + | from . import errors |
| 18 | + | from .errors import CheckError |
18 | 19 | | from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor |
19 | 20 | | from .result import QueryResult, QueryStatus |
20 | 21 | | from .sites import MaigretDatabase, MaigretSite |
21 | | - | from .types import CheckError |
22 | 22 | | from .utils import get_random_user_agent |
23 | 23 | | |
24 | 24 | | |
25 | 25 | | supported_recursive_search_ids = ( |
26 | | - | 'yandex_public_id', |
27 | | - | 'gaia_id', |
28 | | - | 'vk_id', |
29 | | - | 'ok_id', |
30 | | - | 'wikimapia_uid', |
31 | | - | 'steam_id', |
32 | | - | 'uidme_uguid', |
| 26 | + | "yandex_public_id", |
| 27 | + | "gaia_id", |
| 28 | + | "vk_id", |
| 29 | + | "ok_id", |
| 30 | + | "wikimapia_uid", |
| 31 | + | "steam_id", |
| 32 | + | "uidme_uguid", |
33 | 33 | | ) |
34 | 34 | | |
35 | | - | common_errors = { |
36 | | - | '<title>Attention Required! | Cloudflare</title>': CheckError('Captcha', 'Cloudflare'), |
37 | | - | 'Please stand by, while we are checking your browser': CheckError('Bot protection', 'Cloudflare'), |
38 | | - | '<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'), |
39 | | - | 'document.getElementById(\'validate_form_submit\').disabled=true': CheckError('Captcha', 'Mail.ru'), |
40 | | - | 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': CheckError('Bot protection', 'Blazingfast'), |
41 | | - | '404</h1><p class="error-card__description">Мы не нашли страницу': CheckError('Resolving', 'MegaFon 404 page'), |
42 | | - | 'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError('Censorship', 'MGTS'), |
43 | | - | 'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'), |
44 | | - | } |
| 35 | + | unsupported_characters = "#" |
45 | 36 | | |
46 | | - | unsupported_characters = '#' |
47 | 37 | | |
48 | | - | |
49 | | - | async def get_response(request_future, site_name, logger) -> (str, int, CheckError): |
| 38 | + | async def get_response( |
| 39 | + | request_future, site_name, logger |
| 40 | + | ) -> Tuple[str, int, Optional[CheckError]]: |
50 | 41 | | html_text = None |
51 | 42 | | status_code = 0 |
52 | | - | error = CheckError('Error') |
| 43 | + | error: Optional[CheckError] = CheckError("Error") |
53 | 44 | | |
54 | 45 | | try: |
55 | 46 | | response = await request_future |
56 | 47 | | |
57 | 48 | | status_code = response.status |
58 | 49 | | response_content = await response.content.read() |
59 | | - | charset = response.charset or 'utf-8' |
60 | | - | decoded_content = response_content.decode(charset, 'ignore') |
| 50 | + | charset = response.charset or "utf-8" |
| 51 | + | decoded_content = response_content.decode(charset, "ignore") |
61 | 52 | | html_text = decoded_content |
62 | 53 | | |
63 | 54 | | if status_code == 0: |
64 | | - | error = CheckError('Connection lost') |
| 55 | + | error = CheckError("Connection lost") |
65 | 56 | | else: |
66 | 57 | | error = None |
67 | 58 | | |
68 | 59 | | logger.debug(html_text) |
69 | 60 | | |
70 | 61 | | except asyncio.TimeoutError as e: |
71 | | - | error = CheckError('Request timeout', str(e)) |
| 62 | + | error = CheckError("Request timeout", str(e)) |
72 | 63 | | except aiohttp.client_exceptions.ClientConnectorError as e: |
73 | | - | error = CheckError('Connecting failure', str(e)) |
| 64 | + | error = CheckError("Connecting failure", str(e)) |
74 | 65 | | except aiohttp.http_exceptions.BadHttpMessage as e: |
75 | | - | error = CheckError('HTTP', str(e)) |
| 66 | + | error = CheckError("HTTP", str(e)) |
76 | 67 | | except proxy_errors.ProxyError as e: |
77 | | - | error = CheckError('Proxy', str(e)) |
| 68 | + | error = CheckError("Proxy", str(e)) |
| 69 | + | except KeyboardInterrupt: |
| 70 | + | error = CheckError("Interrupted") |
78 | 71 | | except Exception as e: |
79 | 72 | | # python-specific exceptions |
80 | 73 | | if sys.version_info.minor > 6: |
81 | | - | if isinstance(e, ssl.SSLCertVerificationError) or isinstance(e, ssl.SSLError): |
82 | | - | error = CheckError('SSL', str(e)) |
| 74 | + | if isinstance(e, ssl.SSLCertVerificationError) or isinstance( |
| 75 | + | e, ssl.SSLError |
| 76 | + | ): |
| 77 | + | error = CheckError("SSL", str(e)) |
83 | 78 | | else: |
84 | | - | logger.warning(f'Unhandled error while requesting {site_name}: {e}') |
| 79 | + | logger.warning(f"Unhandled error while requesting {site_name}: {e}") |
85 | 80 | | logger.debug(e, exc_info=True) |
86 | | - | error = CheckError('Error', str(e)) |
| 81 | + | error = CheckError("Error", str(e)) |
87 | 82 | | |
88 | 83 | | # TODO: return only needed information |
89 | | - | return html_text, status_code, error |
| 84 | + | return str(html_text), status_code, error |
90 | 85 | | |
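
The widened return annotation Tuple[str, int, Optional[CheckError]] makes the contract explicit: the error slot is None exactly when a usable response came back. One subtlety carried over from `return str(html_text), ...` is that a missing body is coerced to the literal string "None", so callers should branch on the error slot rather than on the text. A minimal consumption sketch; the fetch_page coroutine and the stub CheckError are hypothetical stand-ins for this module's real pieces:

    import asyncio
    from typing import Optional, Tuple

    class CheckError:  # hypothetical stub for maigret's errors.CheckError
        def __init__(self, type_: str, desc: str = ""):
            self.type = type_
            self.desc = desc

    async def fetch_page() -> Tuple[str, int, Optional[CheckError]]:
        # stand-in for `await get_response(...)`; returns the same triple shape
        return "<html>ok</html>", 200, None

    async def main() -> None:
        html_text, status_code, error = await fetch_page()
        if error is not None:
            print(f"check failed: {error.type} ({error.desc})")
        else:
            print(f"HTTP {status_code}, body of {len(html_text)} chars")

    asyncio.run(main())
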
91 | 86 | | |
92 | | - | async def update_site_dict_from_response(sitename, site_dict, results_info, logger, query_notify): |
| 87 | + | async def update_site_dict_from_response( |
| 88 | + | sitename, site_dict, results_info, logger, query_notify |
| 89 | + | ): |
93 | 90 | | site_obj = site_dict[sitename] |
94 | 91 | | future = site_obj.request_future |
95 | 92 | | if not future: |
96 | 93 | | # ignore: search by incompatible id type |
97 | 94 | | return |
98 | 95 | | |
99 | | - | response = await get_response(request_future=future, |
100 | | - | site_name=sitename, |
101 | | - | logger=logger) |
| 96 | + | response = await get_response( |
| 97 | + | request_future=future, site_name=sitename, logger=logger |
| 98 | + | ) |
102 | 99 | | |
103 | | - | return sitename, process_site_result(response, query_notify, logger, results_info, site_obj) |
| 100 | + | return sitename, process_site_result( |
| 101 | + | response, query_notify, logger, results_info, site_obj |
| 102 | + | ) |
104 | 103 | | |
105 | 104 | | |
106 | 105 | | # TODO: move to separate class |
107 | | - | def detect_error_page(html_text, status_code, fail_flags, ignore_403) -> CheckError: |
| 106 | + | def detect_error_page( |
| 107 | + | html_text, status_code, fail_flags, ignore_403 |
| 108 | + | ) -> Optional[CheckError]: |
108 | 109 | | # Detect service restrictions such as a country restriction |
109 | 110 | | for flag, msg in fail_flags.items(): |
110 | 111 | | if flag in html_text: |
111 | | - | return CheckError('Site-specific', msg) |
| 112 | + | return CheckError("Site-specific", msg) |
112 | 113 | | |
113 | 114 | | # Detect common restrictions such as provider censorship and bot protection |
114 | | - | for flag, err in common_errors.items(): |
115 | | - | if flag in html_text: |
116 | | - | return err |
| 115 | + | err = errors.detect(html_text) |
| 116 | + | if err: |
| 117 | + | return err |
117 | 118 | | |
118 | 119 | | # Detect common site errors |
119 | 120 | | if status_code == 403 and not ignore_403: |
120 | | - | return CheckError('Access denied', '403 status code, use proxy/vpn') |
| 121 | + | return CheckError("Access denied", "403 status code, use proxy/vpn") |
121 | 122 | | |
122 | 123 | | elif status_code >= 500: |
123 | | - | return CheckError(f'Server', f'{status_code} status code') |
| 124 | + | return CheckError("Server", f"{status_code} status code") |
124 | 125 | | |
125 | 126 | | return None |
126 | 127 | | |
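
The common_errors table deleted above now sits behind errors.detect (imported at the top of this diff); the new module itself is not shown here. A sketch of a detect() consistent with this call site, reconstructed from the removed dict; the module layout and every name besides `detect` are assumptions:

    from typing import Optional

    class CheckError:  # stub; the real class is imported from .errors above
        def __init__(self, type_: str, desc: str = ""):
            self.type = type_
            self.desc = desc

    # reconstructed (abbreviated) from the dict removed in this diff
    COMMON_ERRORS = {
        "<title>Attention Required! | Cloudflare</title>": CheckError("Captcha", "Cloudflare"),
        "Please stand by, while we are checking your browser": CheckError("Bot protection", "Cloudflare"),
        "Incapsula incident ID": CheckError("Bot protection", "Incapsula"),
    }

    def detect(html_text: str) -> Optional[CheckError]:
        # return the first known restriction marker found in the page, else None
        for flag, err in COMMON_ERRORS.items():
            if flag in html_text:
                return err
        return None
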
127 | 128 | | |
128 | | - | def process_site_result(response, query_notify, logger, results_info, site: MaigretSite): |
| 129 | + | def process_site_result( |
| 130 | + | response, query_notify, logger, results_info, site: MaigretSite |
| 131 | + | ): |
129 | 132 | | if not response: |
130 | 133 | | return results_info |
131 | 134 | | |
132 | 135 | | fulltags = site.tags |
133 | 136 | | |
134 | 137 | | # Retrieve other site information again |
135 | | - | username = results_info['username'] |
136 | | - | is_parsing_enabled = results_info['parsing_enabled'] |
| 138 | + | username = results_info["username"] |
| 139 | + | is_parsing_enabled = results_info["parsing_enabled"] |
137 | 140 | | url = results_info.get("url_user") |
138 | 141 | | logger.debug(url) |
139 | 142 | | |
| skipped 7 lines |
147 | 150 | | |
148 | 151 | | # TODO: refactor |
149 | 152 | | if not response: |
150 | | - | logger.error(f'No response for {site.name}') |
| 153 | + | logger.error(f"No response for {site.name}") |
151 | 154 | | return results_info |
152 | 155 | | |
153 | 156 | | html_text, status_code, check_error = response |
| skipped 2 lines |
156 | 159 | | response_time = None |
157 | 160 | | |
158 | 161 | | if logger.level == logging.DEBUG: |
159 | | - | with open('debug.txt', 'a') as f: |
160 | | - | status = status_code or 'No response' |
161 | | - | f.write(f'url: {url}\nerror: {check_error}\nr: {status}\n') |
| 162 | + | with open("debug.txt", "a") as f: |
| 163 | + | status = status_code or "No response" |
| 164 | + | f.write(f"url: {url}\nerror: {check_error}\nr: {status}\n") |
162 | 165 | | if html_text: |
163 | | - | f.write(f'code: {status}\nresponse: {str(html_text)}\n') |
| 166 | + | f.write(f"code: {status}\nresponse: {str(html_text)}\n") |
164 | 167 | | |
165 | 168 | | # additional check for errors |
166 | 169 | | if status_code and not check_error: |
167 | | - | check_error = detect_error_page(html_text, status_code, site.errors, site.ignore403) |
| 170 | + | check_error = detect_error_page( |
| 171 | + | html_text, status_code, site.errors, site.ignore403 |
| 172 | + | ) |
168 | 173 | | |
169 | 174 | | if site.activation and html_text: |
170 | | - | is_need_activation = any([s for s in site.activation['marks'] if s in html_text]) |
| 175 | + | is_need_activation = any( |
| 176 | + | [s for s in site.activation["marks"] if s in html_text] |
| 177 | + | ) |
171 | 178 | | if is_need_activation: |
172 | | - | method = site.activation['method'] |
| 179 | + | method = site.activation["method"] |
173 | 180 | | try: |
174 | 181 | | activate_fun = getattr(ParsingActivator(), method) |
175 | 182 | | # TODO: async call |
176 | 183 | | activate_fun(site, logger) |
177 | 184 | | except AttributeError: |
178 | | - | logger.warning(f'Activation method {method} for site {site.name} not found!') |
| 185 | + | logger.warning( |
| 186 | + | f"Activation method {method} for site {site.name} not found!" |
| 187 | + | ) |
179 | 188 | | except Exception as e: |
180 | | - | logger.warning(f'Failed activation {method} for site {site.name}: {e}') |
| 189 | + | logger.warning(f"Failed activation {method} for site {site.name}: {e}") |
181 | 190 | | |
182 | 191 | | site_name = site.pretty_name |
183 | | - | # presense flags
| 192 | + | # presence flags
| skipped 3 lines |
187 | 196 | | if html_text: |
188 | 197 | | if not presense_flags: |
189 | 198 | | is_presense_detected = True |
190 | | - | site.stats['presense_flag'] = None |
| 199 | + | site.stats["presense_flag"] = None |
191 | 200 | | else: |
192 | 201 | | for presense_flag in presense_flags: |
193 | 202 | | if presense_flag in html_text: |
194 | 203 | | is_presense_detected = True |
195 | | - | site.stats['presense_flag'] = presense_flag |
| 204 | + | site.stats["presense_flag"] = presense_flag |
196 | 205 | | logger.debug(presense_flag) |
197 | 206 | | break |
198 | 207 | | |
199 | 208 | | if check_error: |
200 | 209 | | logger.debug(check_error) |
201 | | - | result = QueryResult(username, |
202 | | - | site_name, |
203 | | - | url, |
204 | | - | QueryStatus.UNKNOWN, |
205 | | - | query_time=response_time, |
206 | | - | error=check_error, |
207 | | - | context=str(CheckError), tags=fulltags) |
| 210 | + | result = QueryResult( |
| 211 | + | username, |
| 212 | + | site_name, |
| 213 | + | url, |
| 214 | + | QueryStatus.UNKNOWN, |
| 215 | + | query_time=response_time, |
| 216 | + | error=check_error, |
| 217 | + | context=str(check_error),
| 218 | + | tags=fulltags, |
| 219 | + | ) |
208 | 220 | | elif check_type == "message": |
209 | 221 | | absence_flags = site.absence_strs |
210 | 222 | | is_absence_flags_list = isinstance(absence_flags, list) |
211 | | - | absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags} |
| 223 | + | absence_flags_set = ( |
| 224 | + | set(absence_flags) if is_absence_flags_list else {absence_flags} |
| 225 | + | ) |
212 | 226 | | # Checks if the error message is in the HTML |
213 | | - | is_absence_detected = any([(absence_flag in html_text) for absence_flag in absence_flags_set]) |
| 227 | + | is_absence_detected = any( |
| 228 | + | [(absence_flag in html_text) for absence_flag in absence_flags_set] |
| 229 | + | ) |
214 | 230 | | if not is_absence_detected and is_presense_detected: |
215 | | - | result = QueryResult(username, |
216 | | - | site_name, |
217 | | - | url, |
218 | | - | QueryStatus.CLAIMED, |
219 | | - | query_time=response_time, tags=fulltags) |
| 231 | + | result = QueryResult( |
| 232 | + | username, |
| 233 | + | site_name, |
| 234 | + | url, |
| 235 | + | QueryStatus.CLAIMED, |
| 236 | + | query_time=response_time, |
| 237 | + | tags=fulltags, |
| 238 | + | ) |
220 | 239 | | else: |
221 | | - | result = QueryResult(username, |
222 | | - | site_name, |
223 | | - | url, |
224 | | - | QueryStatus.AVAILABLE, |
225 | | - | query_time=response_time, tags=fulltags) |
| 240 | + | result = QueryResult( |
| 241 | + | username, |
| 242 | + | site_name, |
| 243 | + | url, |
| 244 | + | QueryStatus.AVAILABLE, |
| 245 | + | query_time=response_time, |
| 246 | + | tags=fulltags, |
| 247 | + | ) |
226 | 248 | | elif check_type == "status_code": |
227 | 249 | | # Checks if the status code of the response is 2XX |
228 | | - | if (not status_code >= 300 or status_code < 200) and is_presense_detected:
| 250 | + | if 200 <= status_code < 300 and is_presense_detected:
229 | | - | result = QueryResult(username, |
230 | | - | site_name, |
231 | | - | url, |
232 | | - | QueryStatus.CLAIMED, |
233 | | - | query_time=response_time, tags=fulltags) |
| 251 | + | result = QueryResult( |
| 252 | + | username, |
| 253 | + | site_name, |
| 254 | + | url, |
| 255 | + | QueryStatus.CLAIMED, |
| 256 | + | query_time=response_time, |
| 257 | + | tags=fulltags, |
| 258 | + | ) |
234 | 259 | | else: |
235 | | - | result = QueryResult(username, |
236 | | - | site_name, |
237 | | - | url, |
238 | | - | QueryStatus.AVAILABLE, |
239 | | - | query_time=response_time, tags=fulltags) |
| 260 | + | result = QueryResult( |
| 261 | + | username, |
| 262 | + | site_name, |
| 263 | + | url, |
| 264 | + | QueryStatus.AVAILABLE, |
| 265 | + | query_time=response_time, |
| 266 | + | tags=fulltags, |
| 267 | + | ) |
240 | 268 | | elif check_type == "response_url": |
241 | 269 | | # For this detection method, we have turned off the redirect. |
242 | 270 | | # So, there is no need to check the response URL: it will always |
| skipped 1 lines |
244 | 272 | | # code indicates that the request was successful (i.e. no 404, or |
245 | 273 | | # forward to some odd redirect). |
246 | 274 | | if 200 <= status_code < 300 and is_presense_detected: |
247 | | - | result = QueryResult(username, |
248 | | - | site_name, |
249 | | - | url, |
250 | | - | QueryStatus.CLAIMED, |
251 | | - | query_time=response_time, tags=fulltags) |
| 275 | + | result = QueryResult( |
| 276 | + | username, |
| 277 | + | site_name, |
| 278 | + | url, |
| 279 | + | QueryStatus.CLAIMED, |
| 280 | + | query_time=response_time, |
| 281 | + | tags=fulltags, |
| 282 | + | ) |
252 | 283 | | else: |
253 | | - | result = QueryResult(username, |
254 | | - | site_name, |
255 | | - | url, |
256 | | - | QueryStatus.AVAILABLE, |
257 | | - | query_time=response_time, tags=fulltags) |
| 284 | + | result = QueryResult( |
| 285 | + | username, |
| 286 | + | site_name, |
| 287 | + | url, |
| 288 | + | QueryStatus.AVAILABLE, |
| 289 | + | query_time=response_time, |
| 290 | + | tags=fulltags, |
| 291 | + | ) |
258 | 292 | | else: |
259 | 293 | | # It should be impossible to ever get here... |
260 | | - | raise ValueError(f"Unknown check type '{check_type}' for " |
261 | | - | f"site '{site.name}'") |
| 294 | + | raise ValueError( |
| 295 | + | f"Unknown check type '{check_type}' for " f"site '{site.name}'" |
| 296 | + | ) |
262 | 297 | | |
263 | 298 | | extracted_ids_data = {} |
264 | 299 | | |
| skipped 1 lines |
266 | 301 | | try: |
267 | 302 | | extracted_ids_data = extract(html_text) |
268 | 303 | | except Exception as e: |
269 | | - | logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True) |
| 304 | + | logger.warning(f"Error while parsing {site.name}: {e}", exc_info=True) |
270 | 305 | | |
271 | 306 | | if extracted_ids_data: |
272 | 307 | | new_usernames = {} |
273 | 308 | | for k, v in extracted_ids_data.items(): |
274 | | - | if 'username' in k: |
275 | | - | new_usernames[v] = 'username' |
| 309 | + | if "username" in k: |
| 310 | + | new_usernames[v] = "username" |
276 | 311 | | if k in supported_recursive_search_ids: |
277 | 312 | | new_usernames[v] = k |
278 | 313 | | |
279 | | - | results_info['ids_usernames'] = new_usernames |
280 | | - | results_info['ids_links'] = eval(extracted_ids_data.get('links', '[]')) |
| 314 | + | results_info["ids_usernames"] = new_usernames |
| 315 | + | results_info["ids_links"] = eval(extracted_ids_data.get("links", "[]")) |
281 | 316 | | result.ids_data = extracted_ids_data |
282 | 317 | | |
283 | 318 | | # Notify caller about results of query. |
284 | 319 | | query_notify.update(result, site.similar_search) |
285 | 320 | | |
286 | 321 | | # Save status of request |
287 | | - | results_info['status'] = result |
| 322 | + | results_info["status"] = result |
288 | 323 | | |
289 | 324 | | # Save results from request |
290 | | - | results_info['http_status'] = status_code |
291 | | - | results_info['is_similar'] = site.similar_search |
| 325 | + | results_info["http_status"] = status_code |
| 326 | + | results_info["is_similar"] = site.similar_search |
292 | 327 | | # results_site['response_text'] = html_text |
293 | | - | results_info['rank'] = site.alexa_rank |
| 328 | + | results_info["rank"] = site.alexa_rank |
294 | 329 | | return results_info |
295 | 330 | | |
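
One carry-over worth flagging: ids_links is built with eval() on a string returned by socid_extractor, which will execute arbitrary expressions embedded in a hostile page. A safer parse using the standard library's ast.literal_eval; parse_links is a hypothetical helper, not part of this diff:

    import ast

    def parse_links(raw: str) -> list:
        # literal_eval accepts only Python literals, so arbitrary
        # expressions raise instead of executing
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return []
        return parsed if isinstance(parsed, list) else []

    assert parse_links("['https://a.example', 'https://b.example']") == ["https://a.example", "https://b.example"]
    assert parse_links("__import__('os').system('id')") == []
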
296 | 331 | | |
297 | | - | async def maigret(username, site_dict, logger, query_notify=None, |
298 | | - | proxy=None, timeout=None, is_parsing_enabled=False, |
299 | | - | id_type='username', debug=False, forced=False, |
300 | | - | max_connections=100, no_progressbar=False, |
301 | | - | cookies=None): |
| 332 | + | async def maigret( |
| 333 | + | username, |
| 334 | + | site_dict, |
| 335 | + | logger, |
| 336 | + | query_notify=None, |
| 337 | + | proxy=None, |
| 338 | + | timeout=None, |
| 339 | + | is_parsing_enabled=False, |
| 340 | + | id_type="username", |
| 341 | + | debug=False, |
| 342 | + | forced=False, |
| 343 | + | max_connections=100, |
| 344 | + | no_progressbar=False, |
| 345 | + | cookies=None, |
| 346 | + | ): |
302 | 347 | | """Main search func |
303 | 348 | | |
304 | 349 | | Checks for existence of username on certain sites. |
| skipped 37 lines |
342 | 387 | | query_notify.start(username, id_type) |
343 | 388 | | |
344 | 389 | | # TODO: connector |
345 | | - | connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False) |
| 390 | + | connector = ( |
| 391 | + | ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False) |
| 392 | + | ) |
346 | 393 | | # connector = aiohttp.TCPConnector(ssl=False) |
347 | 394 | | connector.verify_ssl = False |
348 | 395 | | |
349 | 396 | | cookie_jar = None |
350 | 397 | | if cookies: |
351 | | - | logger.debug(f'Using cookies jar file {cookies}') |
| 398 | + | logger.debug(f"Using cookies jar file {cookies}") |
352 | 399 | | cookie_jar = await import_aiohttp_cookies(cookies) |
353 | 400 | | |
354 | | - | session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar) |
| 401 | + | session = aiohttp.ClientSession( |
| 402 | + | connector=connector, trust_env=True, cookie_jar=cookie_jar |
| 403 | + | ) |
355 | 404 | | |
356 | 405 | | if logger.level == logging.DEBUG: |
357 | | - | future = session.get(url='https://icanhazip.com') |
| 406 | + | future = session.get(url="https://icanhazip.com") |
358 | 407 | | ip, status, check_error = await get_response(future, None, logger) |
359 | 408 | | if ip: |
360 | | - | logger.debug(f'My IP is: {ip.strip()}') |
| 409 | + | logger.debug(f"My IP is: {ip.strip()}") |
361 | 410 | | else: |
362 | | - | logger.debug(f'IP requesting {check_error[0]}: {check_error[1]}') |
| 411 | + | logger.debug(f"IP requesting {check_error[0]}: {check_error[1]}") |
363 | 412 | | |
364 | 413 | | # Results from analysis of all sites |
365 | 414 | | results_total = {} |
| skipped 5 lines |
371 | 420 | | continue |
372 | 421 | | |
373 | 422 | | if site.disabled and not forced: |
374 | | - | logger.debug(f'Site {site.name} is disabled, skipping...') |
| 423 | + | logger.debug(f"Site {site.name} is disabled, skipping...") |
375 | 424 | | continue |
376 | 425 | | |
377 | 426 | | # Results from analysis of this specific site |
378 | 427 | | results_site = {} |
379 | 428 | | |
380 | 429 | | # Record URL of main site and username |
381 | | - | results_site['username'] = username |
382 | | - | results_site['parsing_enabled'] = is_parsing_enabled |
383 | | - | results_site['url_main'] = site.url_main |
384 | | - | results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None |
| 430 | + | results_site["username"] = username |
| 431 | + | results_site["parsing_enabled"] = is_parsing_enabled |
| 432 | + | results_site["url_main"] = site.url_main |
| 433 | + | results_site["cookies"] = ( |
| 434 | + | cookie_jar and cookie_jar.filter_cookies(site.url_main) or None |
| 435 | + | ) |
385 | 436 | | |
386 | 437 | | headers = { |
387 | | - | 'User-Agent': get_random_user_agent(), |
| 438 | + | "User-Agent": get_random_user_agent(), |
388 | 439 | | } |
389 | 440 | | |
390 | 441 | | headers.update(site.headers) |
391 | 442 | | |
392 | | - | if 'url' not in site.__dict__: |
393 | | - | logger.error('No URL for site %s', site.name) |
| 443 | + | if "url" not in site.__dict__: |
| 444 | + | logger.error("No URL for site %s", site.name) |
394 | 445 | | # URL of user on site (if it exists) |
395 | 446 | | url = site.url.format( |
396 | | - | urlMain=site.url_main, |
397 | | - | urlSubpath=site.url_subpath, |
398 | | - | username=username |
| 447 | + | urlMain=site.url_main, urlSubpath=site.url_subpath, username=username |
399 | 448 | | ) |
400 | 449 | | # workaround to prevent slash errors |
401 | | - | url = re.sub('(?<!:)/+', '/', url) |
| 450 | + | url = re.sub("(?<!:)/+", "/", url) |
402 | 451 | | |
403 | 452 | | # Don't make request if username is invalid for the site |
404 | 453 | | if site.regex_check and re.search(site.regex_check, username) is None: |
405 | 454 | | # No need to do the check at the site: this user name is not allowed. |
406 | | - | results_site['status'] = QueryResult(username, |
407 | | - | site_name, |
408 | | - | url, |
409 | | - | QueryStatus.ILLEGAL) |
| 455 | + | results_site["status"] = QueryResult( |
| 456 | + | username, site_name, url, QueryStatus.ILLEGAL |
| 457 | + | ) |
410 | 458 | | results_site["url_user"] = "" |
411 | | - | results_site['http_status'] = "" |
412 | | - | results_site['response_text'] = "" |
413 | | - | query_notify.update(results_site['status']) |
| 459 | + | results_site["http_status"] = "" |
| 460 | + | results_site["response_text"] = "" |
| 461 | + | query_notify.update(results_site["status"]) |
414 | 462 | | else: |
415 | 463 | | # URL of user on site (if it exists) |
416 | 464 | | results_site["url_user"] = url |
| skipped 11 lines |
428 | 476 | | ) |
429 | 477 | | |
430 | 478 | | for k, v in site.get_params.items(): |
431 | | - | url_probe += f'&{k}={v}' |
| 479 | + | url_probe += f"&{k}={v}" |
432 | 480 | | |
433 | | - | if site.check_type == 'status_code' and site.request_head_only: |
| 481 | + | if site.check_type == "status_code" and site.request_head_only: |
434 | 482 | | # In most cases when we are detecting by status code, |
435 | 483 | | # it is not necessary to get the entire body: we can |
436 | 484 | | # detect fine with just the HEAD response. |
| skipped 14 lines |
451 | 499 | | # The final result of the request will be what is available. |
452 | 500 | | allow_redirects = True |
453 | 501 | | |
454 | | - | future = request_method(url=url_probe, headers=headers, |
455 | | - | allow_redirects=allow_redirects, |
456 | | - | timeout=timeout, |
457 | | - | ) |
| 502 | + | future = request_method( |
| 503 | + | url=url_probe, |
| 504 | + | headers=headers, |
| 505 | + | allow_redirects=allow_redirects, |
| 506 | + | timeout=timeout, |
| 507 | + | ) |
458 | 508 | | |
459 | 509 | | # Store future in data for access later |
460 | 510 | | # TODO: move to separate obj |
| skipped 4 lines |
465 | 515 | | |
466 | 516 | | coroutines = [] |
467 | 517 | | for sitename, result_obj in results_total.items(): |
468 | | - | coroutines.append((update_site_dict_from_response, [sitename, site_dict, result_obj, logger, query_notify], {})) |
| 518 | + | coroutines.append( |
| 519 | + | ( |
| 520 | + | update_site_dict_from_response, |
| 521 | + | [sitename, site_dict, result_obj, logger, query_notify], |
| 522 | + | {}, |
| 523 | + | ) |
| 524 | + | ) |
469 | 525 | | |
470 | 526 | | if no_progressbar: |
471 | 527 | | executor = AsyncioSimpleExecutor(logger=logger) |
472 | 528 | | else: |
473 | | - | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=max_connections, timeout=timeout+0.5) |
| 529 | + | executor = AsyncioProgressbarQueueExecutor( |
| 530 | + | logger=logger, in_parallel=max_connections, timeout=timeout + 0.5 |
| 531 | + | ) |
474 | 532 | | |
475 | 533 | | results = await executor.run(coroutines) |
476 | 534 | | |
477 | 535 | | await session.close() |
478 | 536 | | |
479 | | - | # TODO: move to separate function |
480 | | - | errors = {} |
481 | | - | for el in results: |
482 | | - | if not el: |
483 | | - | continue |
484 | | - | _, r = el |
485 | | - | if r and isinstance(r, dict) and r.get('status'): |
486 | | - | if not isinstance(r['status'], QueryResult): |
487 | | - | continue |
488 | | - | |
489 | | - | err = r['status'].error |
490 | | - | if not err: |
491 | | - | continue |
492 | | - | errors[err.type] = errors.get(err.type, 0) + 1 |
493 | | - | |
494 | | - | for err, count in sorted(errors.items(), key=lambda x: x[1], reverse=True): |
495 | | - | logger.warning(f'Errors of type "{err}": {count}') |
496 | | - | |
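
The per-error-type tally is removed here with no replacement visible in this hunk; presumably it moved out alongside the new errors module, though the diff does not show where. For reference, the same logic as a standalone helper (the name and its final home are assumptions):

    from collections import Counter

    def log_error_statistics(results, logger):
        # results: iterable of (sitename, results_info) pairs as produced
        # by update_site_dict_from_response above
        counter = Counter()
        for el in results:
            if not el:
                continue
            _, r = el
            if not isinstance(r, dict):
                continue
            err = getattr(r.get("status"), "error", None)
            if err:
                counter[err.type] += 1
        # most frequent first, mirroring the removed sorted(..., reverse=True) loop
        for err_type, count in counter.most_common():
            logger.warning(f'Errors of type "{err_type}": {count}')
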
497 | 537 | | # Notify caller that all queries are finished. |
498 | 538 | | query_notify.finish() |
499 | 539 | | |
| skipped 37 lines |
537 | 577 | | |
538 | 578 | | async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False): |
539 | 579 | | changes = { |
540 | | - | 'disabled': False, |
| 580 | + | "disabled": False, |
541 | 581 | | } |
542 | 582 | | |
543 | 583 | | try: |
| skipped 6 lines |
550 | 590 | | logger.error(site.__dict__) |
551 | 591 | | check_data = [] |
552 | 592 | | |
553 | | - | logger.info(f'Checking {site.name}...') |
| 593 | + | logger.info(f"Checking {site.name}...") |
554 | 594 | | |
555 | 595 | | for username, status in check_data: |
556 | 596 | | async with semaphore: |
| skipped 11 lines |
568 | 608 | | # TODO: make normal checking |
569 | 609 | | if site.name not in results_dict: |
570 | 610 | | logger.info(results_dict) |
571 | | - | changes['disabled'] = True |
| 611 | + | changes["disabled"] = True |
572 | 612 | | continue |
573 | 613 | | |
574 | | - | result = results_dict[site.name]['status'] |
| 614 | + | result = results_dict[site.name]["status"] |
575 | 615 | | |
576 | 616 | | site_status = result.status |
577 | 617 | | |
| skipped 2 lines |
580 | 620 | | msgs = site.absence_strs |
581 | 621 | | etype = site.check_type |
582 | 622 | | logger.warning( |
583 | | - | f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}') |
| 623 | + | f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}" |
| 624 | + | ) |
584 | 625 | | # don't disable in case of available username |
585 | 626 | | if status == QueryStatus.CLAIMED: |
586 | | - | changes['disabled'] = True |
| 627 | + | changes["disabled"] = True |
587 | 628 | | elif status == QueryStatus.CLAIMED: |
588 | | - | logger.warning(f'Not found `{username}` in {site.name}, must be claimed') |
| 629 | + | logger.warning( |
| 630 | + | f"Not found `{username}` in {site.name}, must be claimed" |
| 631 | + | ) |
589 | 632 | | logger.info(results_dict[site.name]) |
590 | | - | changes['disabled'] = True |
| 633 | + | changes["disabled"] = True |
591 | 634 | | else: |
592 | | - | logger.warning(f'Found `{username}` in {site.name}, must be available') |
| 635 | + | logger.warning(f"Found `{username}` in {site.name}, must be available") |
593 | 636 | | logger.info(results_dict[site.name]) |
594 | | - | changes['disabled'] = True |
| 637 | + | changes["disabled"] = True |
595 | 638 | | |
596 | | - | logger.info(f'Site {site.name} checking is finished') |
| 639 | + | logger.info(f"Site {site.name} checking is finished") |
597 | 640 | | |
598 | | - | if changes['disabled'] != site.disabled: |
599 | | - | site.disabled = changes['disabled'] |
| 641 | + | if changes["disabled"] != site.disabled: |
| 642 | + | site.disabled = changes["disabled"] |
600 | 643 | | db.update_site(site) |
601 | 644 | | if not silent: |
602 | | - | action = 'Disabled' if site.disabled else 'Enabled' |
603 | | - | print(f'{action} site {site.name}...') |
| 645 | + | action = "Disabled" if site.disabled else "Enabled" |
| 646 | + | print(f"{action} site {site.name}...") |
604 | 647 | | |
605 | 648 | | return changes |
606 | 649 | | |
607 | 650 | | |
608 | | - | async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False, |
609 | | - | max_connections=10) -> bool: |
| 651 | + | async def self_check( |
| 652 | + | db: MaigretDatabase, site_data: dict, logger, silent=False, max_connections=10 |
| 653 | + | ) -> bool: |
610 | 654 | | sem = asyncio.Semaphore(max_connections) |
611 | 655 | | tasks = [] |
612 | 656 | | all_sites = site_data |
| skipped 15 lines |
628 | 672 | | total_disabled = disabled_new_count - disabled_old_count |
629 | 673 | | |
630 | 674 | | if total_disabled >= 0: |
631 | | - | message = 'Disabled' |
| 675 | + | message = "Disabled" |
632 | 676 | | else: |
633 | | - | message = 'Enabled' |
| 677 | + | message = "Enabled" |
634 | 678 | | total_disabled *= -1 |
635 | 679 | | |
636 | 680 | | if not silent: |
637 | 681 | | print( |
638 | | - | f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information') |
| 682 | + | f"{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. " |
| 683 | + | "Run with `--info` flag to get more information" |
| 684 | + | ) |
639 | 685 | | |
640 | 686 | | return total_disabled != 0 |
641 | 687 | | |
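
self_check fans out one site_self_check task per site while the shared asyncio.Semaphore caps concurrency at max_connections. The pattern in isolation, with all names illustrative:

    import asyncio

    async def check_one(name: str, sem: asyncio.Semaphore) -> str:
        async with sem:  # at most N checks run concurrently
            await asyncio.sleep(0.1)  # stand-in for the real network round-trip
            return f"{name}: ok"

    async def main() -> None:
        sem = asyncio.Semaphore(10)  # mirrors max_connections=10 above
        sites = [f"site{i}" for i in range(25)]
        results = await asyncio.gather(*(check_one(s, sem) for s in sites))
        print(len(results), "checks done")

    asyncio.run(main())
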