skipped 4 lines 5 5 import ssl 6 6 import sys 7 7 import tqdm 8 - from typing import Tuple, Optional 8 + from typing import Tuple, Optional, Dict , List 9 9 10 10 import aiohttp 11 11 import tqdm.asyncio skipped 4 lines 16 16 from .activation import ParsingActivator, import_aiohttp_cookies 17 17 from . import errors 18 18 from .errors import CheckError 19 - from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor 19 + from .executors import ( 20 + AsyncExecutor, 21 + AsyncioSimpleExecutor, 22 + AsyncioProgressbarQueueExecutor, 23 + ) 20 24 from .result import QueryResult, QueryStatus 21 25 from .sites import MaigretDatabase, MaigretSite 26 + from .types import QueryOptions, QueryResultWrapper 22 27 from .utils import get_random_user_agent 23 28 24 29 skipped 10 lines 35 40 unsupported_characters = "#" 36 41 37 42 38 - async def get_response( 39 - request_future, site_name, logger 40 - ) -> Tuple[str, int, Optional[CheckError]]: 43 + async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]: 41 44 html_text = None 42 45 status_code = 0 43 - error: Optional[CheckError] = CheckError("Error ") 46 + error: Optional[CheckError] = CheckError("Unknown ") 44 47 45 48 try: 46 49 response = await request_future skipped 29 lines 76 79 ): 77 80 error = CheckError("SSL", str(e)) 78 81 else: 79 - logger.warning(f"Unhandled error while requesting {site_name}: {e}") 80 82 logger.debug(e, exc_info=True) 81 - error = CheckError("Error ", str(e)) 83 + error = CheckError("Unexpected ", str(e)) 82 84 83 - # TODO: return only needed information 84 85 return str(html_text), status_code, error 85 86 86 87 87 - async def update_site_dict_from_response( 88 - sitename, site_dict, results_info, logger, query_notify 89 - ): 90 - site_obj = site_dict[sitename] 91 - future = site_obj.request_future 92 - if not future: 93 - # ignore: search by incompatible id type 94 - return 95 - 96 - response = await get_response( 97 - request_future=future, site_name=sitename, logger=logger 98 - ) 99 - 100 - return sitename, process_site_result( 101 - response, query_notify, logger, results_info, site_obj 102 - ) 103 - 104 - 105 88 # TODO: move to separate class 106 89 def detect_error_page( 107 90 html_text, status_code, fail_flags, ignore_403 skipped 19 lines 127 110 128 111 129 112 def process_site_result( 130 - response, query_notify, logger, results_info, site: MaigretSite 113 + response, query_notify, logger, results_info: QueryResultWrapper , site: MaigretSite 131 114 ): 132 115 if not response: 133 116 return results_info skipped 71 lines 205 188 logger.debug(presense_flag) 206 189 break 207 190 191 + def build_result(status, **kwargs): 192 + return QueryResult( 193 + username, 194 + site_name, 195 + url, 196 + status, 197 + query_time=response_time, 198 + tags=fulltags, 199 + **kwargs, 200 + ) 201 + 208 202 if check_error: 209 203 logger.debug(check_error) 210 204 result = QueryResult( skipped 7 lines 218 212 tags=fulltags, 219 213 ) 220 214 elif check_type == "message": 221 - absence_flags = site.absence_strs 222 - is_absence_flags_list = isinstance(absence_flags, list) 223 - absence_flags_set = ( 224 - set(absence_flags) if is_absence_flags_list else {absence_flags} 225 - ) 226 215 # Checks if the error message is in the HTML 227 216 is_absence_detected = any( 228 - [(absence_flag in html_text) for absence_flag in absence_flags_set ] 217 + [(absence_flag in html_text) for absence_flag in site . absence_strs ] 229 218 ) 230 219 if not is_absence_detected and is_presense_detected: 231 - result = QueryResult( 232 - username, 233 - site_name, 234 - url, 235 - QueryStatus.CLAIMED, 236 - query_time=response_time, 237 - tags=fulltags, 238 - ) 220 + result = build_result(QueryStatus.CLAIMED) 239 221 else: 240 - result = QueryResult( 241 - username, 242 - site_name, 243 - url, 244 - QueryStatus.AVAILABLE, 245 - query_time=response_time, 246 - tags=fulltags, 247 - ) 222 + result = build_result(QueryStatus.AVAILABLE) 248 223 elif check_type == "status_code": 249 224 # Checks if the status code of the response is 2XX 250 - if (not status_code >= 300 or status_code < 200) and is_presense_detected : 251 - result = QueryResult( 252 - username, 253 - site_name, 254 - url, 255 - QueryStatus.CLAIMED, 256 - query_time=response_time, 257 - tags=fulltags, 258 - ) 225 + if is_presense_detected and (not status_code >= 300 or status_code < 200): 226 + result = build_result(QueryStatus.CLAIMED) 259 227 else: 260 - result = QueryResult( 261 - username, 262 - site_name, 263 - url, 264 - QueryStatus.AVAILABLE, 265 - query_time=response_time, 266 - tags=fulltags, 267 - ) 228 + result = build_result(QueryStatus.AVAILABLE) 268 229 elif check_type == "response_url": 269 230 # For this detection method, we have turned off the redirect. 270 231 # So, there is no need to check the response URL: it will always skipped 1 lines 272 233 # code indicates that the request was successful (i.e. no 404, or 273 234 # forward to some odd redirect). 274 235 if 200 <= status_code < 300 and is_presense_detected: 275 - result = QueryResult( 276 - username, 277 - site_name, 278 - url, 279 - QueryStatus.CLAIMED, 280 - query_time=response_time, 281 - tags=fulltags, 282 - ) 236 + result = build_result(QueryStatus.CLAIMED) 283 237 else: 284 - result = QueryResult( 285 - username, 286 - site_name, 287 - url, 288 - QueryStatus.AVAILABLE, 289 - query_time=response_time, 290 - tags=fulltags, 291 - ) 238 + result = build_result(QueryStatus.AVAILABLE) 292 239 else: 293 240 # It should be impossible to ever get here... 294 241 raise ValueError( skipped 34 lines 329 276 return results_info 330 277 331 278 279 + def make_site_result( 280 + site: MaigretSite, username: str, options: QueryOptions, logger 281 + ) -> QueryResultWrapper: 282 + results_site: QueryResultWrapper = {} 283 + 284 + # Record URL of main site and username 285 + results_site["site"] = site 286 + results_site["username"] = username 287 + results_site["parsing_enabled"] = options["parsing"] 288 + results_site["url_main"] = site.url_main 289 + results_site["cookies"] = ( 290 + options.get("cookie_jar") 291 + and options["cookie_jar"].filter_cookies(site.url_main) 292 + or None 293 + ) 294 + 295 + headers = { 296 + "User-Agent": get_random_user_agent(), 297 + } 298 + 299 + headers.update(site.headers) 300 + 301 + if "url" not in site.__dict__: 302 + logger.error("No URL for site %s", site.name) 303 + 304 + # URL of user on site (if it exists) 305 + url = site.url.format( 306 + urlMain=site.url_main, urlSubpath=site.url_subpath, username=username 307 + ) 308 + 309 + # workaround to prevent slash errors 310 + url = re.sub("(?<!:)/+", "/", url) 311 + 312 + session = options['session'] 313 + 314 + # site check is disabled 315 + if site.disabled and not options['forced']: 316 + logger.debug(f"Site {site.name} is disabled, skipping...") 317 + results_site["status"] = QueryResult( 318 + username, 319 + site.name, 320 + url, 321 + QueryStatus.ILLEGAL, 322 + error=CheckError("Check is disabled"), 323 + ) 324 + # current username type could not be applied 325 + elif site.type != options["id_type"]: 326 + results_site["status"] = QueryResult( 327 + username, 328 + site.name, 329 + url, 330 + QueryStatus.ILLEGAL, 331 + error=CheckError('Unsupported identifier type', f'Want "{site.type}"'), 332 + ) 333 + # username is not allowed. 334 + elif site.regex_check and re.search(site.regex_check, username) is None: 335 + results_site["status"] = QueryResult( 336 + username, 337 + site.name, 338 + url, 339 + QueryStatus.ILLEGAL, 340 + error=CheckError( 341 + 'Unsupported username format', f'Want "{site.regex_check}"' 342 + ), 343 + ) 344 + results_site["url_user"] = "" 345 + results_site["http_status"] = "" 346 + results_site["response_text"] = "" 347 + # query_notify.update(results_site["status"]) 348 + else: 349 + # URL of user on site (if it exists) 350 + results_site["url_user"] = url 351 + url_probe = site.url_probe 352 + if url_probe is None: 353 + # Probe URL is normal one seen by people out on the web. 354 + url_probe = url 355 + else: 356 + # There is a special URL for probing existence separate 357 + # from where the user profile normally can be found. 358 + url_probe = url_probe.format( 359 + urlMain=site.url_main, 360 + urlSubpath=site.url_subpath, 361 + username=username, 362 + ) 363 + 364 + for k, v in site.get_params.items(): 365 + url_probe += f"&{k}={v}" 366 + 367 + if site.check_type == "status_code" and site.request_head_only: 368 + # In most cases when we are detecting by status code, 369 + # it is not necessary to get the entire body: we can 370 + # detect fine with just the HEAD response. 371 + request_method = session.head 372 + else: 373 + # Either this detect method needs the content associated 374 + # with the GET response, or this specific website will 375 + # not respond properly unless we request the whole page. 376 + request_method = session.get 377 + 378 + if site.check_type == "response_url": 379 + # Site forwards request to a different URL if username not 380 + # found. Disallow the redirect so we can capture the 381 + # http status from the original URL request. 382 + allow_redirects = False 383 + else: 384 + # Allow whatever redirect that the site wants to do. 385 + # The final result of the request will be what is available. 386 + allow_redirects = True 387 + 388 + future = request_method( 389 + url=url_probe, 390 + headers=headers, 391 + allow_redirects=allow_redirects, 392 + timeout=options['timeout'], 393 + ) 394 + 395 + # Store future request object in the results object 396 + results_site["future"] = future 397 + 398 + return results_site 399 + 400 + 401 + async def check_site_for_username( 402 + site, username, options: QueryOptions, logger, query_notify, *args, **kwargs 403 + ) -> Tuple[str, QueryResultWrapper]: 404 + default_result = make_site_result(site, username, options, logger) 405 + future = default_result.get("future") 406 + if not future: 407 + return site.name, default_result 408 + 409 + response = await get_response(request_future=future, logger=logger) 410 + 411 + response_result = process_site_result( 412 + response, query_notify, logger, default_result, site 413 + ) 414 + 415 + return site.name, response_result 416 + 417 + 418 + async def debug_ip_request(session, logger): 419 + future = session.get(url="https://icanhazip.com") 420 + ip, status, check_error = await get_response(future, logger) 421 + if ip: 422 + logger.debug(f"My IP is: {ip.strip()}") 423 + else: 424 + logger.debug(f"IP requesting {check_error.type}: {check_error.desc}") 425 + 426 + 427 + def get_failed_sites(results: Dict[str, QueryResultWrapper]) -> List[str]: 428 + sites = [] 429 + for sitename, r in results.items(): 430 + status = r.get('status', {}) 431 + if status and status.error: 432 + if errors.is_permanent(status.error.type): 433 + continue 434 + sites.append(sitename) 435 + return sites 436 + 437 + 332 438 async def maigret( 333 - username, 334 - site_dict, 439 + username: str , 440 + site_dict: Dict[str, MaigretSite], 335 441 logger, 336 442 query_notify=None, 337 443 proxy=None, skipped 5 lines 343 449 max_connections=100, 344 450 no_progressbar=False, 345 451 cookies=None, 346 - ): 452 + retries=0, 453 + ) -> QueryResultWrapper: 347 454 """Main search func 348 455 349 456 Checks for existence of username on certain sites. 350 457 351 458 Keyword Arguments: 352 459 username -- Username string will be used for search. 353 - site_dict -- Dictionary containing sites data. 460 + site_dict -- Dictionary containing sites data in MaigretSite objects . 354 461 query_notify -- Object with base type of QueryNotify(). 355 462 This will be used to notify the caller about 356 463 query results. skipped 23 lines 380 487 there was an HTTP error when checking for existence. 381 488 """ 382 489 383 - # Notify caller that we are starting the query. 490 + # notify caller that we are starting the query. 384 491 if not query_notify: 385 492 query_notify = Mock() 386 493 387 494 query_notify.start(username, id_type) 388 495 389 - # TODO: connector 496 + # make http client session 390 497 connector = ( 391 498 ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False) 392 499 ) 393 - # connector = aiohttp.TCPConnector(ssl=False) 394 500 connector.verify_ssl = False 395 501 396 502 cookie_jar = None skipped 6 lines 403 509 ) 404 510 405 511 if logger.level == logging.DEBUG: 406 - future = session.get(url="https://icanhazip.com") 407 - ip, status, check_error = await get_response(future, None, logger) 408 - if ip: 409 - logger.debug(f"My IP is: {ip.strip()}") 410 - else: 411 - logger.debug(f"IP requesting {check_error[0]}: {check_error[1]}") 412 - 413 - # Results from analysis of all sites 414 - results_total = {} 512 + await debug_ip_request(session, logger) 415 513 416 - # First create futures for all requests. This allows for the requests to run in parallel 417 - for site_name, site in site_dict.items(): 514 + # setup parallel executor 515 + executor: Optional[AsyncExecutor] = None 516 + if no_progressbar: 517 + executor = AsyncioSimpleExecutor(logger=logger) 518 + else: 519 + executor = AsyncioProgressbarQueueExecutor( 520 + logger=logger, in_parallel=max_connections, timeout=timeout + 0.5 521 + ) 418 522 419 - if site.type != id_type: 420 - continue 523 + # make options objects for all the requests 524 + options: QueryOptions = {} 525 + options["cookies"] = cookie_jar 526 + options["session"] = session 527 + options["parsing"] = is_parsing_enabled 528 + options["timeout"] = timeout 529 + options["id_type"] = id_type 530 + options["forced"] = forced 421 531 422 - if site.disabled and not forced: 423 - logger.debug(f"Site {site.name} is disabled, skipping...") 424 - continue 532 + # results from analysis of all sites 533 + all_results: Dict[str, QueryResultWrapper] = {} 425 534 426 - # Results from analysis of this specific site 427 - results_site = {} 535 + sites = list(site_dict.keys()) 428 536 429 - # Record URL of main site and username 430 - results_site["username"] = username 431 - results_site["parsing_enabled"] = is_parsing_enabled 432 - results_site["url_main"] = site.url_main 433 - results_site["cookies"] = ( 434 - cookie_jar and cookie_jar.filter_cookies(site.url_main) or None 435 - ) 537 + attempts = retries + 1 538 + while attempts: 539 + tasks_dict = {} 436 540 437 - headers = { 438 - "User-Agent": get_random_user_agent(), 439 - } 440 - 441 - headers.update(site.headers) 442 - 443 - if "url" not in site.__dict__: 444 - logger.error("No URL for site %s", site.name) 445 - # URL of user on site (if it exists) 446 - url = site.url.format( 447 - urlMain=site.url_main, urlSubpath=site.url_subpath, username=username 448 - ) 449 - # workaround to prevent slash errors 450 - url = re.sub("(?<!:)/+", "/", url) 451 - 452 - # Don't make request if username is invalid for the site 453 - if site.regex_check and re.search(site.regex_check, username) is None: 454 - # No need to do the check at the site: this user name is not allowed. 455 - results_site["status"] = QueryResult( 456 - username, site_name, url, QueryStatus.ILLEGAL 541 + for sitename, site in site_dict.items(): 542 + if sitename not in sites: 543 + continue 544 + default_result: QueryResultWrapper = { 545 + 'site': site, 546 + 'status': QueryResult( 547 + username, 548 + sitename, 549 + '', 550 + QueryStatus.UNKNOWN, 551 + error=CheckError('Request failed'), 552 + ), 553 + } 554 + tasks_dict[sitename] = ( 555 + check_site_for_username, 556 + [site, username, options, logger, query_notify], 557 + {'default': (sitename, default_result)}, 457 558 ) 458 - results_site["url_user"] = "" 459 - results_site["http_status"] = "" 460 - results_site["response_text"] = "" 461 - query_notify.update(results_site["status"]) 462 - else: 463 - # URL of user on site (if it exists) 464 - results_site["url_user"] = url 465 - url_probe = site.url_probe 466 - if url_probe is None: 467 - # Probe URL is normal one seen by people out on the web. 468 - url_probe = url 469 - else: 470 - # There is a special URL for probing existence separate 471 - # from where the user profile normally can be found. 472 - url_probe = url_probe.format( 473 - urlMain=site.url_main, 474 - urlSubpath=site.url_subpath, 475 - username=username, 476 - ) 477 559 478 - for k, v in site.get_params.items(): 479 - url_probe += f"&{k}={v}" 560 + cur_results = await executor.run(tasks_dict.values()) 480 561 481 - if site.check_type == "status_code" and site.request_head_only: 482 - # In most cases when we are detecting by status code, 483 - # it is not necessary to get the entire body: we can 484 - # detect fine with just the HEAD response. 485 - request_method = session.head 486 - else: 487 - # Either this detect method needs the content associated 488 - # with the GET response, or this specific website will 489 - # not respond properly unless we request the whole page. 490 - request_method = session.get 562 + # wait for executor timeout errors 563 + await asyncio.sleep(1) 491 564 492 - if site.check_type == "response_url": 493 - # Site forwards request to a different URL if username not 494 - # found. Disallow the redirect so we can capture the 495 - # http status from the original URL request. 496 - allow_redirects = False 497 - else: 498 - # Allow whatever redirect that the site wants to do. 499 - # The final result of the request will be what is available. 500 - allow_redirects = True 565 + all_results.update(cur_results) 501 566 502 - future = request_method( 503 - url=url_probe, 504 - headers=headers, 505 - allow_redirects=allow_redirects, 506 - timeout=timeout, 507 - ) 567 + sites = get_failed_sites(dict(cur_results)) 568 + attempts -= 1 508 569 509 - # Store future in data for access later 510 - # TODO: move to separate obj 511 - site.request_future = future 570 + if not sites: 571 + break 512 572 513 - # Add this site's results into final dictionary with all of the other results. 514 - results_total[site_name] = results_site 515 - 516 - coroutines = [] 517 - for sitename, result_obj in results_total.items(): 518 - coroutines.append( 519 - ( 520 - update_site_dict_from_response, 521 - [sitename, site_dict, result_obj, logger, query_notify], 522 - {}, 573 + if attempts: 574 + query_notify.warning( 575 + f'Restarting checks for {len(sites)} sites... ({attempts} attempts left)' 523 576 ) 524 - ) 525 577 526 - if no_progressbar: 527 - executor = AsyncioSimpleExecutor(logger=logger) 528 - else: 529 - executor = AsyncioProgressbarQueueExecutor( 530 - logger=logger, in_parallel=max_connections, timeout=timeout + 0.5 531 - ) 532 - 533 - results = await executor.run(coroutines) 534 - 578 + # closing http client session 535 579 await session.close() 536 580 537 - # Notify caller that all queries are finished. 581 + # notify caller that all queries are finished 538 582 query_notify.finish() 539 583 540 - data = {} 541 - for result in results: 542 - # TODO: still can be empty 543 - if result: 544 - try: 545 - data[result[0]] = result[1] 546 - except Exception as e: 547 - logger.error(e, exc_info=True) 548 - logger.info(result) 549 - 550 - return data 584 + return all_results 551 585 552 586 553 587 def timeout_check(value): skipped 21 lines 575 609 return timeout 576 610 577 611 578 - async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False): 612 + async def site_self_check( 613 + site: MaigretSite, logger, semaphore, db: MaigretDatabase, silent=False 614 + ): 579 615 changes = { 580 616 "disabled": False, 581 617 } skipped 20 lines 602 638 id_type=site.type, 603 639 forced=True, 604 640 no_progressbar=True, 641 + retries=1, 605 642 ) 606 643 607 644 # don't disable entries with other ids types skipped 80 lines