maigret · commit 9b0acc09
  • Refactoring of submit module, some fixes

  • Soxoj committed 3 years ago
    9b0acc09
    1 parent eb721dc7
  • Makefile
    skipped 24 lines
    25 25  pull:
    26 26   git stash
    27 27   git checkout main
    28  - git pull origin head
     28 + git pull origin main
    29 29   git stash pop
    30 30   
    31 31  clean:
    skipped 5 lines
  • maigret/maigret.py
    skipped 35 lines
    36 36   sort_report_by_data_points,
    37 37  )
    38 38  from .sites import MaigretDatabase
    39  -from .submit import submit_dialog
     39 +from .submit import Submitter
    40 40  from .types import QueryResultWrapper
    41 41  from .utils import get_dict_ascii_tree
     42 +from .settings import Settings
    42 43   
    43 44   
    44 45  def notify_about_errors(search_results: QueryResultWrapper, query_notify):
    skipped 451 lines
    496 497   if args.tags:
    497 498   args.tags = list(set(str(args.tags).split(',')))
    498 499   
     500 + settings = Settings(
     501 + os.path.join(
     502 + os.path.dirname(os.path.realpath(__file__)), "resources/settings.json"
     503 + )
     504 + )
     505 + 
    499 506   if args.db_file is None:
    500 507   args.db_file = os.path.join(
    501 508   os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
    skipped 24 lines
    526 533   site_data = get_top_sites_for_id(args.id_type)
    527 534   
    528 535   if args.new_site_to_submit:
    529  - is_submitted = await submit_dialog(
    530  - db, args.new_site_to_submit, args.cookie_file, logger
    531  - )
     536 + submitter = Submitter(db=db, logger=logger, settings=settings)
     537 + is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
    532 538   if is_submitted:
    533 539   db.save_to_file(args.db_file)
    534 540   
    skipped 176 lines
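    Note: the hunks above replace the old submit_dialog(db, url, cookie_file, logger) call with the class-based API introduced in this commit. A minimal sketch of how the pieces now compose, assuming the MaigretDatabase.load_from_file loader used elsewhere in maigret; the function name and literal paths are illustrative:

        import logging

        from maigret.settings import Settings
        from maigret.sites import MaigretDatabase
        from maigret.submit import Submitter


        async def submit_new_site(url: str, db_file: str, cookie_file: str = None) -> bool:
            logger = logging.getLogger("maigret")

            # settings.json now carries strings previously hardcoded in submit.py
            settings = Settings("maigret/resources/settings.json")

            db = MaigretDatabase().load_from_file(db_file)
            submitter = Submitter(db=db, logger=logger, settings=settings)

            # dialog() drives the interactive checks; True means the user saved the site
            is_submitted = await submitter.dialog(url, cookie_file)
            if is_submitted:
                db.save_to_file(db_file)
            return is_submitted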
  • maigret/resources/data.json
    skipped 13035 lines
    13036 13036   "us"
    13037 13037   ],
    13038 13038   "headers": {
    13039  - "authorization": "Bearer BQCypIuUtz7zDFov8xN86mj1BelLf7Apf9WBaC5yYfNkmGe4r7Hz4Awp6dqPuCAP9K9F5yYtjbyZX_vlr4I"
     13039 + "authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
    13040 13040   },
    13041 13041   "errors": {
    13042 13042   "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
    skipped 947 lines
    13990 13990   "us"
    13991 13991   ],
    13992 13992   "errors": {
    13993  - "Website unavailable": "Site error"
     13993 + "Website unavailable": "Site error",
     13994 + "is currently offline": "Site error"
    13994 13995   },
    13995 13996   "checkType": "message",
    13996 13997   "absenceStrs": [
    skipped 465 lines
    14462 14463   "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
    14463 14464   "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    14464 14465   "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    14465  - "x-guest-token": "1400174453577900043"
     14466 + "x-guest-token": "1403829602053771266"
    14466 14467   },
    14467 14468   "errors": {
    14468 14469   "Bad guest token": "x-guest-token update required"
    skipped 400 lines
    14869 14870   "video"
    14870 14871   ],
    14871 14872   "headers": {
    14872  - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjI2NjcxMjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.V4VVbLzNwPU21rNP5moSxrPcPw--C7_Qz9VHgcJc1CA"
     14873 + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
    14873 14874   },
    14874 14875   "activation": {
    14875 14876   "url": "https://vimeo.com/_rv/viewer",
    skipped 13581 lines
    28457 28458   ]
    28458 28459   }
    28459 28460   }
    28460  - }
     28461 + },
     28462 + "tags": [
     28463 + "gaming",
     28464 + "coding",
     28465 + "photo",
     28466 + "music",
     28467 + "blog",
     28468 + "finance",
     28469 + "freelance",
     28470 + "dating",
     28471 + "tech",
     28472 + "forum",
     28473 + "porn",
     28474 + "erotic",
     28475 + "webcam",
     28476 + "video",
     28477 + "movies",
     28478 + "hacking",
     28479 + "art",
     28480 + "discussion",
     28481 + "sharing",
     28482 + "writing",
     28483 + "wiki",
     28484 + "business",
     28485 + "shopping",
     28486 + "sport",
     28487 + "books",
     28488 + "news",
     28489 + "documents",
     28490 + "travel",
     28491 + "maps",
     28492 + "hobby",
     28493 + "apps",
     28494 + "classified",
     28495 + "career",
     28496 + "geosocial",
     28497 + "streaming",
     28498 + "education",
     28499 + "networking",
     28500 + "torrent",
     28501 + "science",
     28502 + "medicine",
     28503 + "reading",
     28504 + "stock",
     28505 + "messaging",
     28506 + "trading",
     28507 + "links",
     28508 + "fashion",
     28509 + "tasks",
     28510 + "military",
     28511 + "auto",
     28512 + "gambling",
     28513 + "cybercriminal",
     28514 + "review",
     28515 + "bookmarks",
     28516 + "design",
     28517 + "tor",
     28518 + "i2p"
     28519 + ]
    28461 28520  }
  • maigret/resources/settings.json
     1 +{
     2 + "presence_strings": [
     3 + "username",
     4 + "not found",
     5 + "пользователь",
     6 + "profile",
     7 + "lastname",
     8 + "firstname",
     9 + "biography",
     10 + "birthday",
     11 + "репутация",
     12 + "информация",
     13 + "e-mail"
     14 + ],
     15 + "supposed_usernames": [
     16 + "alex", "god", "admin", "red", "blue", "john"]
     17 +}
  • maigret/settings.py
     1 +import json
     2 + 
     3 + 
     4 +class Settings:
     5 + presence_strings: list
     6 + supposed_usernames: list
     7 + 
     8 + def __init__(self, filename):
     9 + data = {}
     10 + 
     11 + try:
     12 + with open(filename, "r", encoding="utf-8") as file:
     13 + try:
     14 + data = json.load(file)
     15 + except Exception as error:
     16 + raise ValueError(
     17 + f"Problem with parsing json contents of "
     18 + f"settings file '{filename}': {str(error)}."
     19 + )
     20 + except FileNotFoundError as error:
     21 + raise FileNotFoundError(
     22 + f"Problem while attempting to access settings file '{filename}'."
     23 + ) from error
     24 + 
     25 + self.__dict__.update(data)
     26 + 
     27 + @property
     28 + def json(self):
     29 + return self.__dict__
     30 + 
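    A short usage sketch for the new Settings class: the constructor reads the JSON file and publishes its keys as instance attributes via self.__dict__.update(data), so a missing file surfaces as FileNotFoundError and malformed JSON as ValueError. The path below is the settings file added in this commit:

        from maigret.settings import Settings

        settings = Settings("maigret/resources/settings.json")

        print(settings.presence_strings[0])   # "username"
        print(settings.supposed_usernames)    # ["alex", "god", "admin", "red", "blue", "john"]
        print(settings.json)                  # the full parsed dict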
  • maigret/sites.py
    skipped 8 lines
    9 9   
    10 10  from .utils import CaseConverter, URLMatcher, is_country_tag
    11 11   
    12  -# TODO: move to data.json
    13  -SUPPORTED_TAGS = [
    14  - "gaming",
    15  - "coding",
    16  - "photo",
    17  - "music",
    18  - "blog",
    19  - "finance",
    20  - "freelance",
    21  - "dating",
    22  - "tech",
    23  - "forum",
    24  - "porn",
    25  - "erotic",
    26  - "webcam",
    27  - "video",
    28  - "movies",
    29  - "hacking",
    30  - "art",
    31  - "discussion",
    32  - "sharing",
    33  - "writing",
    34  - "wiki",
    35  - "business",
    36  - "shopping",
    37  - "sport",
    38  - "books",
    39  - "news",
    40  - "documents",
    41  - "travel",
    42  - "maps",
    43  - "hobby",
    44  - "apps",
    45  - "classified",
    46  - "career",
    47  - "geosocial",
    48  - "streaming",
    49  - "education",
    50  - "networking",
    51  - "torrent",
    52  - "science",
    53  - "medicine",
    54  - "reading",
    55  - "stock",
    56  - "messaging",
    57  - "trading",
    58  - "links",
    59  - "fashion",
    60  - "tasks",
    61  - "military",
    62  - "auto",
    63  - "gambling",
    64  - "cybercriminal",
    65  - "review",
    66  - "bookmarks",
    67  - "design",
    68  - "tor",
    69  - "i2p",
    70  -]
    71  - 
    72 12   
    73 13  class MaigretEngine:
    74 14   site: Dict[str, Any] = {}
    skipped 129 lines
    204 144   errors.update(self.errors)
    205 145   return errors
    206 146   
    207  - def get_url_type(self) -> str:
     147 + def get_url_template(self) -> str:
    208 148   url = URLMatcher.extract_main_part(self.url)
    209 149   if url.startswith("{username}"):
    210 150   url = "SUBDOMAIN"
    211 151   elif url == "":
    212  - url = f"{self.url} ({self.engine})"
     152 + url = f"{self.url} ({self.engine or 'no engine'})"
    213 153   else:
    214 154   parts = url.split("/")
    215 155   url = "/" + "/".join(parts[1:])
    skipped 57 lines
    273 213   
    274 214  class MaigretDatabase:
    275 215   def __init__(self):
    276  - self._sites = []
    277  - self._engines = []
     216 + self._tags: list = []
     217 + self._sites: list = []
     218 + self._engines: list = []
    278 219   
    279 220   @property
    280 221   def sites(self):
    skipped 73 lines
    354 295   db_data = {
    355 296   "sites": {site.name: site.strip_engine_data().json for site in self._sites},
    356 297   "engines": {engine.name: engine.json for engine in self._engines},
     298 + "tags": self._tags,
    357 299   }
    358 300   
    359 301   json_data = json.dumps(db_data, indent=4)
    skipped 7 lines
    367 309   # Add all of site information from the json file to internal site list.
    368 310   site_data = json_data.get("sites", {})
    369 311   engines_data = json_data.get("engines", {})
     312 + tags = json_data.get("tags", [])
     313 + 
     314 + self._tags += tags
    370 315   
    371 316   for engine_name in engines_data:
    372 317   self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
    skipped 96 lines
    469 414   if site.disabled:
    470 415   disabled_count += 1
    471 416   
    472  - url_type = site.get_url_type()
     417 + url_type = site.get_url_template()
    473 418   urls[url_type] = urls.get(url_type, 0) + 1
    474 419   
    475 420   if not site.tags:
    skipped 12 lines
    488 433   output += "Top tags:\n"
    489 434   for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
    490 435   mark = ""
    491  - if tag not in SUPPORTED_TAGS:
     436 + if tag not in self._tags:
    492 437   mark = " (non-standard)"
    493 438   output += f"{count}\t{tag}{mark}\n"
    494 439   
    skipped 2 lines
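    With the hardcoded SUPPORTED_TAGS list removed, the canonical tag list now travels inside data.json (the new top-level "tags" key above) and is merged into MaigretDatabase._tags on load. A standalone sketch of the round-trip the new save/load code performs, using plain json rather than the real classes:

        import json

        # save side: tags are serialized next to sites and engines in one document
        db_data = {"sites": {}, "engines": {}, "tags": ["gaming", "coding"]}
        raw = json.dumps(db_data, indent=4)

        # load side: same fallback as json_data.get("tags", []) in sites.py
        tags = json.loads(raw).get("tags", [])
        assert tags == ["gaming", "coding"]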
  • maigret/submit.py
    1 1  import asyncio
    2  -import difflib
    3 2  import re
    4 3  from typing import List
    5 4  import xml.etree.ElementTree as ET
    skipped 2 lines
    8 7  from .activation import import_aiohttp_cookies
    9 8  from .checking import maigret
    10 9  from .result import QueryStatus
     10 +from .settings import Settings
    11 11  from .sites import MaigretDatabase, MaigretSite, MaigretEngine
    12  -from .utils import get_random_user_agent
     12 +from .utils import get_random_user_agent, get_match_ratio
    13 13   
    14 14   
    15  -DESIRED_STRINGS = [
    16  - "username",
    17  - "not found",
    18  - "пользователь",
    19  - "profile",
    20  - "lastname",
    21  - "firstname",
    22  - "biography",
    23  - "birthday",
    24  - "репутация",
    25  - "информация",
    26  - "e-mail",
    27  -]
     15 +class Submitter:
     16 + HEADERS = {
     17 + "User-Agent": get_random_user_agent(),
     18 + }
    28 19   
    29  -SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]
     20 + SEPARATORS = "\"'"
    30 21   
    31  -HEADERS = {
    32  - "User-Agent": get_random_user_agent(),
    33  -}
     22 + RATIO = 0.6
     23 + TOP_FEATURES = 5
     24 + URL_RE = re.compile(r"https?://(www\.)?")
    34 25   
    35  -SEPARATORS = "\"'"
     26 + def __init__(self, db: MaigretDatabase, settings: Settings, logger):
     27 + self.settings = settings
     28 + self.db = db
     29 + self.logger = logger
    36 30   
    37  -RATIO = 0.6
    38  -TOP_FEATURES = 5
    39  -URL_RE = re.compile(r"https?://(www\.)?")
     31 + @staticmethod
     32 + def get_alexa_rank(site_url_main):
     33 + url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
     34 + xml_data = requests.get(url).text
     35 + root = ET.fromstring(xml_data)
     36 + alexa_rank = 0
    40 37   
     38 + try:
     39 + alexa_rank = int(root.find('.//REACH').attrib['RANK'])
     40 + except Exception:
     41 + pass
    41 42   
    42  -def get_match_ratio(x):
    43  - return round(
    44  - max(
    45  - [difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS]
    46  - ),
    47  - 2,
    48  - )
     43 + return alexa_rank
    49 44   
     45 + @staticmethod
     46 + def extract_mainpage_url(url):
     47 + return "/".join(url.split("/", 3)[:3])
    50 48   
    51  -def get_alexa_rank(site_url_main):
    52  - url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
    53  - xml_data = requests.get(url).text
    54  - root = ET.fromstring(xml_data)
    55  - alexa_rank = 0
     49 + async def site_self_check(self, site, semaphore, silent=False):
     50 + changes = {
     51 + "disabled": False,
     52 + }
    56 53   
    57  - try:
    58  - alexa_rank = int(root.find('.//REACH').attrib['RANK'])
    59  - except Exception:
    60  - pass
     54 + check_data = [
     55 + (site.username_claimed, QueryStatus.CLAIMED),
     56 + (site.username_unclaimed, QueryStatus.AVAILABLE),
     57 + ]
    61 58   
    62  - return alexa_rank
     59 + self.logger.info(f"Checking {site.name}...")
    63 60   
    64  - 
    65  -def extract_mainpage_url(url):
    66  - return "/".join(url.split("/", 3)[:3])
    67  - 
    68  - 
    69  -async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    70  - changes = {
    71  - "disabled": False,
    72  - }
    73  - 
    74  - check_data = [
    75  - (site.username_claimed, QueryStatus.CLAIMED),
    76  - (site.username_unclaimed, QueryStatus.AVAILABLE),
    77  - ]
    78  - 
    79  - logger.info(f"Checking {site.name}...")
    80  - 
    81  - for username, status in check_data:
    82  - results_dict = await maigret(
    83  - username=username,
    84  - site_dict={site.name: site},
    85  - logger=logger,
    86  - timeout=30,
    87  - id_type=site.type,
    88  - forced=True,
    89  - no_progressbar=True,
    90  - )
    91  - 
    92  - # don't disable entries with other ids types
    93  - # TODO: make normal checking
    94  - if site.name not in results_dict:
    95  - logger.info(results_dict)
    96  - changes["disabled"] = True
    97  - continue
    98  - 
    99  - result = results_dict[site.name]["status"]
    100  - 
    101  - site_status = result.status
     61 + for username, status in check_data:
     62 + results_dict = await maigret(
     63 + username=username,
     64 + site_dict={site.name: site},
     65 + logger=self.logger,
     66 + timeout=30,
     67 + id_type=site.type,
     68 + forced=True,
     69 + no_progressbar=True,
     70 + )
    102 71   
    103  - if site_status != status:
    104  - if site_status == QueryStatus.UNKNOWN:
    105  - msgs = site.absence_strs
    106  - etype = site.check_type
    107  - logger.warning(
    108  - "Error while searching '%s' in %s: %s, %s, check type %s",
    109  - username,
    110  - site.name,
    111  - result.context,
    112  - msgs,
    113  - etype,
    114  - )
    115  - # don't disable in case of available username
    116  - if status == QueryStatus.CLAIMED:
    117  - changes["disabled"] = True
    118  - elif status == QueryStatus.CLAIMED:
    119  - logger.warning(
    120  - f"Not found `{username}` in {site.name}, must be claimed"
    121  - )
    122  - logger.info(results_dict[site.name])
    123  - changes["disabled"] = True
    124  - else:
    125  - logger.warning(f"Found `{username}` in {site.name}, must be available")
    126  - logger.info(results_dict[site.name])
     72 + # don't disable entries with other ids types
     73 + # TODO: make normal checking
     74 + if site.name not in results_dict:
     75 + self.logger.info(results_dict)
    127 76   changes["disabled"] = True
     77 + continue
    128 78   
    129  - logger.info(f"Site {site.name} checking is finished")
     79 + result = results_dict[site.name]["status"]
    130 80   
    131  - return changes
     81 + site_status = result.status
    132 82   
     83 + if site_status != status:
     84 + if site_status == QueryStatus.UNKNOWN:
     85 + msgs = site.absence_strs
     86 + etype = site.check_type
     87 + self.logger.warning(
     88 + "Error while searching '%s' in %s: %s, %s, check type %s",
     89 + username,
     90 + site.name,
     91 + result.context,
     92 + msgs,
     93 + etype,
     94 + )
     95 + # don't disable in case of available username
     96 + if status == QueryStatus.CLAIMED:
     97 + changes["disabled"] = True
     98 + elif status == QueryStatus.CLAIMED:
     99 + self.logger.warning(
     100 + f"Not found `{username}` in {site.name}, must be claimed"
     101 + )
     102 + self.logger.info(results_dict[site.name])
     103 + changes["disabled"] = True
     104 + else:
     105 + self.logger.warning(
     106 + f"Found `{username}` in {site.name}, must be available"
     107 + )
     108 + self.logger.info(results_dict[site.name])
     109 + changes["disabled"] = True
    133 110   
    134  -def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
    135  - fields = {}
    136  - if 'urlSubpath' in engine.site.get('url', ''):
    137  - msg = (
    138  - 'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
    139  - 'Enter in manually if it exists: '
    140  - )
    141  - subpath = input(msg).strip('/')
    142  - if subpath:
    143  - fields['urlSubpath'] = f'/{subpath}'
    144  - return fields
     111 + self.logger.info(f"Site {site.name} checking is finished")
    145 112   
     113 + return changes
    146 114   
    147  -async def detect_known_engine(
    148  - db, url_exists, url_mainpage, logger
    149  -) -> List[MaigretSite]:
    150  - try:
    151  - r = requests.get(url_mainpage)
    152  - logger.debug(r.text)
    153  - except Exception as e:
    154  - logger.warning(e)
    155  - print("Some error while checking main page")
    156  - return []
     115 + def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
     116 + fields = {}
     117 + if 'urlSubpath' in engine.site.get('url', ''):
     118 + msg = (
     119 + 'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
     120 + 'Enter in manually if it exists: '
     121 + )
     122 + subpath = input(msg).strip('/')
     123 + if subpath:
     124 + fields['urlSubpath'] = f'/{subpath}'
     125 + return fields
    157 126   
    158  - for engine in db.engines:
    159  - strs_to_check = engine.__dict__.get("presenseStrs")
    160  - if strs_to_check and r and r.text:
    161  - all_strs_in_response = True
    162  - for s in strs_to_check:
    163  - if s not in r.text:
    164  - all_strs_in_response = False
    165  - sites = []
    166  - if all_strs_in_response:
    167  - engine_name = engine.__dict__.get("name")
     127 + async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
     128 + try:
     129 + r = requests.get(url_mainpage)
     130 + self.logger.debug(r.text)
     131 + except Exception as e:
     132 + self.logger.warning(e)
     133 + print("Some error while checking main page")
     134 + return []
    168 135   
    169  - print(f"Detected engine {engine_name} for site {url_mainpage}")
    170  - 
    171  - usernames_to_check = SUPPOSED_USERNAMES
    172  - supposed_username = extract_username_dialog(url_exists)
    173  - if supposed_username:
    174  - usernames_to_check = [supposed_username] + usernames_to_check
    175  - 
    176  - add_fields = generate_additional_fields_dialog(engine, url_exists)
     136 + for engine in self.db.engines:
     137 + strs_to_check = engine.__dict__.get("presenseStrs")
     138 + if strs_to_check and r and r.text:
     139 + all_strs_in_response = True
     140 + for s in strs_to_check:
     141 + if s not in r.text:
     142 + all_strs_in_response = False
     143 + sites = []
     144 + if all_strs_in_response:
     145 + engine_name = engine.__dict__.get("name")
    177 146   
    178  - for u in usernames_to_check:
    179  - site_data = {
    180  - "urlMain": url_mainpage,
    181  - "name": url_mainpage.split("//")[1],
    182  - "engine": engine_name,
    183  - "usernameClaimed": u,
    184  - "usernameUnclaimed": "noonewouldeverusethis7",
    185  - **add_fields,
    186  - }
    187  - logger.info(site_data)
     147 + print(f"Detected engine {engine_name} for site {url_mainpage}")
    188 148   
    189  - maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
    190  - maigret_site.update_from_engine(db.engines_dict[engine_name])
    191  - sites.append(maigret_site)
     149 + usernames_to_check = self.settings.supposed_usernames
     150 + supposed_username = self.extract_username_dialog(url_exists)
     151 + if supposed_username:
     152 + usernames_to_check = [supposed_username] + usernames_to_check
    192 153   
    193  - return sites
     154 + add_fields = self.generate_additional_fields_dialog(
     155 + engine, url_exists
     156 + )
    194 157   
    195  - return []
     158 + for u in usernames_to_check:
     159 + site_data = {
     160 + "urlMain": url_mainpage,
     161 + "name": url_mainpage.split("//")[1],
     162 + "engine": engine_name,
     163 + "usernameClaimed": u,
     164 + "usernameUnclaimed": "noonewouldeverusethis7",
     165 + **add_fields,
     166 + }
     167 + self.logger.info(site_data)
    196 168   
     169 + maigret_site = MaigretSite(
     170 + url_mainpage.split("/")[-1], site_data
     171 + )
     172 + maigret_site.update_from_engine(
     173 + self.db.engines_dict[engine_name]
     174 + )
     175 + sites.append(maigret_site)
    197 176   
    198  -def extract_username_dialog(url):
    199  - url_parts = url.rstrip("/").split("/")
    200  - supposed_username = url_parts[-1].strip('@')
    201  - entered_username = input(
    202  - f'Is "{supposed_username}" a valid username? If not, write it manually: '
    203  - )
    204  - return entered_username if entered_username else supposed_username
     177 + return sites
    205 178   
     179 + return []
    206 180   
    207  -async def check_features_manually(
    208  - db, url_exists, url_mainpage, cookie_file, logger, redirects=False
    209  -):
    210  - custom_headers = {}
    211  - while True:
    212  - header_key = input(
    213  - 'Specify custom header if you need or just press Enter to skip. Header name: '
     181 + def extract_username_dialog(self, url):
     182 + url_parts = url.rstrip("/").split("/")
     183 + supposed_username = url_parts[-1].strip('@')
     184 + entered_username = input(
     185 + f'Is "{supposed_username}" a valid username? If not, write it manually: '
    214 186   )
    215  - if not header_key:
    216  - break
    217  - header_value = input('Header value: ')
    218  - custom_headers[header_key.strip()] = header_value.strip()
     187 + return entered_username if entered_username else supposed_username
    219 188   
    220  - supposed_username = extract_username_dialog(url_exists)
    221  - non_exist_username = "noonewouldeverusethis7"
     189 + async def check_features_manually(
     190 + self, url_exists, url_mainpage, cookie_file, redirects=False
     191 + ):
     192 + custom_headers = {}
     193 + while True:
     194 + header_key = input(
     195 + 'Specify custom header if you need or just press Enter to skip. Header name: '
     196 + )
     197 + if not header_key:
     198 + break
     199 + header_value = input('Header value: ')
     200 + custom_headers[header_key.strip()] = header_value.strip()
    222 201   
    223  - url_user = url_exists.replace(supposed_username, "{username}")
    224  - url_not_exists = url_exists.replace(supposed_username, non_exist_username)
     202 + supposed_username = self.extract_username_dialog(url_exists)
     203 + non_exist_username = "noonewouldeverusethis7"
    225 204   
    226  - headers = dict(HEADERS)
    227  - headers.update(custom_headers)
     205 + url_user = url_exists.replace(supposed_username, "{username}")
     206 + url_not_exists = url_exists.replace(supposed_username, non_exist_username)
    228 207   
    229  - # cookies
    230  - cookie_dict = None
    231  - if cookie_file:
    232  - logger.info(f'Use {cookie_file} for cookies')
    233  - cookie_jar = import_aiohttp_cookies(cookie_file)
    234  - cookie_dict = {c.key: c.value for c in cookie_jar}
     208 + headers = dict(self.HEADERS)
     209 + headers.update(custom_headers)
    235 210   
    236  - exists_resp = requests.get(
    237  - url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
    238  - )
    239  - logger.debug(url_exists)
    240  - logger.debug(exists_resp.status_code)
    241  - logger.debug(exists_resp.text)
     211 + # cookies
     212 + cookie_dict = None
     213 + if cookie_file:
     214 + self.logger.info(f'Use {cookie_file} for cookies')
     215 + cookie_jar = import_aiohttp_cookies(cookie_file)
     216 + cookie_dict = {c.key: c.value for c in cookie_jar}
    242 217   
    243  - non_exists_resp = requests.get(
    244  - url_not_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
    245  - )
    246  - logger.debug(url_not_exists)
    247  - logger.debug(non_exists_resp.status_code)
    248  - logger.debug(non_exists_resp.text)
     218 + exists_resp = requests.get(
     219 + url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
     220 + )
     221 + self.logger.debug(url_exists)
     222 + self.logger.debug(exists_resp.status_code)
     223 + self.logger.debug(exists_resp.text)
    249 224   
    250  - a = exists_resp.text
    251  - b = non_exists_resp.text
     225 + non_exists_resp = requests.get(
     226 + url_not_exists,
     227 + cookies=cookie_dict,
     228 + headers=headers,
     229 + allow_redirects=redirects,
     230 + )
     231 + self.logger.debug(url_not_exists)
     232 + self.logger.debug(non_exists_resp.status_code)
     233 + self.logger.debug(non_exists_resp.text)
    252 234   
    253  - tokens_a = set(re.split(f'[{SEPARATORS}]', a))
    254  - tokens_b = set(re.split(f'[{SEPARATORS}]', b))
     235 + a = exists_resp.text
     236 + b = non_exists_resp.text
    255 237   
    256  - a_minus_b = tokens_a.difference(tokens_b)
    257  - b_minus_a = tokens_b.difference(tokens_a)
     238 + tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
     239 + tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
    258 240   
    259  - if len(a_minus_b) == len(b_minus_a) == 0:
    260  - print("The pages for existing and non-existing account are the same!")
     241 + a_minus_b = tokens_a.difference(tokens_b)
     242 + b_minus_a = tokens_b.difference(tokens_a)
    261 243   
    262  - top_features_count = int(
    263  - input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
    264  - or TOP_FEATURES
    265  - )
     244 + if len(a_minus_b) == len(b_minus_a) == 0:
     245 + print("The pages for existing and non-existing account are the same!")
    266 246   
    267  - presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
    268  - :top_features_count
    269  - ]
     247 + top_features_count = int(
     248 + input(
     249 + f"Specify count of features to extract [default {self.TOP_FEATURES}]: "
     250 + )
     251 + or self.TOP_FEATURES
     252 + )
    270 253   
    271  - print("Detected text features of existing account: " + ", ".join(presence_list))
    272  - features = input("If features was not detected correctly, write it manually: ")
     254 + match_fun = get_match_ratio(self.settings.presence_strings)
    273 255   
    274  - if features:
    275  - presence_list = list(map(str.strip, features.split(",")))
     256 + presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
     257 + :top_features_count
     258 + ]
    276 259   
    277  - absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
    278  - :top_features_count
    279  - ]
    280  - print("Detected text features of non-existing account: " + ", ".join(absence_list))
    281  - features = input("If features was not detected correctly, write it manually: ")
     260 + print("Detected text features of existing account: " + ", ".join(presence_list))
     261 + features = input("If features was not detected correctly, write it manually: ")
    282 262   
    283  - if features:
    284  - absence_list = list(map(str.strip, features.split(",")))
     263 + if features:
     264 + presence_list = list(map(str.strip, features.split(",")))
    285 265   
    286  - site_data = {
    287  - "absenceStrs": absence_list,
    288  - "presenseStrs": presence_list,
    289  - "url": url_user,
    290  - "urlMain": url_mainpage,
    291  - "usernameClaimed": supposed_username,
    292  - "usernameUnclaimed": non_exist_username,
    293  - "checkType": "message",
    294  - }
     266 + absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
     267 + :top_features_count
     268 + ]
     269 + print(
     270 + "Detected text features of non-existing account: " + ", ".join(absence_list)
     271 + )
     272 + features = input("If features was not detected correctly, write it manually: ")
    295 273   
    296  - if headers != HEADERS:
    297  - site_data['headers'] = headers
     274 + if features:
     275 + absence_list = list(map(str.strip, features.split(",")))
    298 276   
    299  - site = MaigretSite(url_mainpage.split("/")[-1], site_data)
    300  - return site
     277 + site_data = {
     278 + "absenceStrs": absence_list,
     279 + "presenseStrs": presence_list,
     280 + "url": url_user,
     281 + "urlMain": url_mainpage,
     282 + "usernameClaimed": supposed_username,
     283 + "usernameUnclaimed": non_exist_username,
     284 + "checkType": "message",
     285 + }
    301 286   
     287 + if headers != self.HEADERS:
     288 + site_data['headers'] = headers
    302 289   
    303  -async def submit_dialog(db, url_exists, cookie_file, logger):
    304  - domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
    305  - domain_raw = domain_raw.split("/")[0]
    306  - logger.info('Domain is %s', domain_raw)
     290 + site = MaigretSite(url_mainpage.split("/")[-1], site_data)
     291 + return site
    307 292   
    308  - # check for existence
    309  - matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
     293 + async def dialog(self, url_exists, cookie_file):
     294 + domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
     295 + domain_raw = domain_raw.split("/")[0]
     296 + self.logger.info('Domain is %s', domain_raw)
    310 297   
    311  - if matched_sites:
    312  - print(
    313  - f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
     298 + # check for existence
     299 + matched_sites = list(
     300 + filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
    314 301   )
    315  - status = lambda s: "(disabled)" if s.disabled else ""
    316  - url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
    317  - print(
    318  - "\n".join(
    319  - [
    320  - f"{site.name} {status(site)}{url_block(site)}"
    321  - for site in matched_sites
    322  - ]
     302 + 
     303 + if matched_sites:
     304 + print(
     305 + f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
     306 + )
     307 + status = lambda s: "(disabled)" if s.disabled else ""
     308 + url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
     309 + print(
     310 + "\n".join(
     311 + [
     312 + f"{site.name} {status(site)}{url_block(site)}"
     313 + for site in matched_sites
     314 + ]
     315 + )
    323 316   )
    324  - )
    325 317   
    326  - if input("Do you want to continue? [yN] ").lower() in "n":
    327  - return False
     318 + if input("Do you want to continue? [yN] ").lower() in "n":
     319 + return False
    328 320   
    329  - url_mainpage = extract_mainpage_url(url_exists)
     321 + url_mainpage = self.extract_mainpage_url(url_exists)
    330 322   
    331  - print('Detecting site engine, please wait...')
    332  - sites = []
    333  - try:
    334  - sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
    335  - except KeyboardInterrupt:
    336  - print('Engine detect process is interrupted.')
     323 + print('Detecting site engine, please wait...')
     324 + sites = []
     325 + try:
     326 + sites = await self.detect_known_engine(url_exists, url_mainpage)
     327 + except KeyboardInterrupt:
     328 + print('Engine detect process is interrupted.')
    337 329   
    338  - if not sites:
    339  - print("Unable to detect site engine, lets generate checking features")
    340  - sites = [
    341  - await check_features_manually(
    342  - db, url_exists, url_mainpage, cookie_file, logger
    343  - )
    344  - ]
     330 + if not sites:
     331 + print("Unable to detect site engine, lets generate checking features")
     332 + sites = [
     333 + await self.check_features_manually(
     334 + url_exists, url_mainpage, cookie_file
     335 + )
     336 + ]
    345 337   
    346  - logger.debug(sites[0].__dict__)
     338 + self.logger.debug(sites[0].__dict__)
    347 339   
    348  - sem = asyncio.Semaphore(1)
     340 + sem = asyncio.Semaphore(1)
    349 341   
    350  - print("Checking, please wait...")
    351  - found = False
    352  - chosen_site = None
    353  - for s in sites:
    354  - chosen_site = s
    355  - result = await site_self_check(s, logger, sem, db)
    356  - if not result["disabled"]:
    357  - found = True
    358  - break
     342 + print("Checking, please wait...")
     343 + found = False
     344 + chosen_site = None
     345 + for s in sites:
     346 + chosen_site = s
     347 + result = await self.site_self_check(s, sem)
     348 + if not result["disabled"]:
     349 + found = True
     350 + break
    359 351   
    360  - if not found:
    361  - print(
    362  - f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
    363  - )
    364  - print(
    365  - "Try to run this mode again and increase features count or choose others."
    366  - )
    367  - return False
    368  - else:
    369  - if (
    370  - input(
    371  - f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
     352 + if not found:
     353 + print(
     354 + f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
    372 355   )
    373  - .lower()
    374  - .strip("y")
    375  - ):
     356 + print(
     357 + "Try to run this mode again and increase features count or choose others."
     358 + )
    376 359   return False
     360 + else:
     361 + if (
     362 + input(
     363 + f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
     364 + )
     365 + .lower()
     366 + .strip("y")
     367 + ):
     368 + return False
    377 369   
    378  - chosen_site.name = input("Change site name if you want: ") or chosen_site.name
    379  - chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
    380  - rank = get_alexa_rank(chosen_site.url_main)
    381  - if rank:
    382  - print(f'New alexa rank: {rank}')
    383  - chosen_site.alexa_rank = rank
     370 + chosen_site.name = input("Change site name if you want: ") or chosen_site.name
     371 + chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
     372 + rank = Submitter.get_alexa_rank(chosen_site.url_main)
     373 + if rank:
     374 + print(f'New alexa rank: {rank}')
     375 + chosen_site.alexa_rank = rank
    384 376   
    385  - logger.debug(chosen_site.json)
    386  - site_data = chosen_site.strip_engine_data()
    387  - logger.debug(site_data.json)
    388  - db.update_site(site_data)
    389  - return True
     377 + self.logger.debug(chosen_site.json)
     378 + site_data = chosen_site.strip_engine_data()
     379 + self.logger.debug(site_data.json)
     380 + self.db.update_site(site_data)
     381 + return True
    390 382   
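    The old module-level helpers are now Submitter methods; the two pure ones are static, so they can be called without building a dialog session. A small sketch (the example URL is illustrative):

        from maigret.submit import Submitter

        main_url = Submitter.extract_mainpage_url("https://example.com/user/alice")
        # -> "https://example.com"

        # 0 when the Alexa XML response carries no REACH rank
        rank = Submitter.get_alexa_rank(main_url)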
  • maigret/utils.py
    1 1  import ast
     2 +import difflib
    2 3  import re
    3 4  import random
    4 5  from typing import Any
    skipped 91 lines
    96 97  def get_random_user_agent():
    97 98   return random.choice(DEFAULT_USER_AGENTS)
    98 99   
     100 + 
     101 +def get_match_ratio(base_strs: list):
     102 + def get_match_inner(s: str):
     103 + return round(
     104 + max(
     105 + [
     106 + difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
     107 + for s2 in base_strs
     108 + ]
     109 + ),
     110 + 2,
     111 + )
     112 + 
     113 + return get_match_inner
     114 + 
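    get_match_ratio is now a factory: it takes the base strings (previously the hardcoded DESIRED_STRINGS in submit.py, now settings.presence_strings) and returns a scorer usable as a sort key, as in check_features_manually above. A quick sketch:

        from maigret.utils import get_match_ratio

        match = get_match_ratio(["username", "profile"])

        match("username")    # 1.0: exact match against a base string
        match("Username:")   # 0.94: best case-insensitive SequenceMatcher ratio, rounded to 2 places
        sorted(["token", "Username:", "profile-box"], key=match, reverse=True)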
  • tests/test_data.py
    1 1  """Maigret data test functions"""
    2 2   
    3 3  from maigret.utils import is_country_tag
    4  -from maigret.sites import SUPPORTED_TAGS
    5 4   
    6 5   
    7 6  def test_tags_validity(default_db):
    8 7   unknown_tags = set()
    9 8   
     9 + tags = default_db._tags
     10 + 
    10 11   for site in default_db.sites:
    11 12   for tag in filter(lambda x: not is_country_tag(x), site.tags):
    12  - if tag not in SUPPORTED_TAGS:
     13 + if tag not in tags:
    13 14   unknown_tags.add(tag)
    14 15   
    15 16   assert unknown_tags == set()
    skipped 1 lines
  • tests/test_sites.py
    1 1  """Maigret Database test functions"""
    2 2  from maigret.sites import MaigretDatabase, MaigretSite
     3 +from maigret.utils import URLMatcher
    3 4   
    4 5  EXAMPLE_DB = {
    5 6   'engines': {
    skipped 174 lines
    180 181   assert len(db.ranked_sites_dict(id_type='username')) == 2
    181 182   assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
    182 183   
     184 + 
     185 +def test_get_url_template():
     186 + site = MaigretSite(
     187 + "test",
     188 + {
     189 + "urlMain": "https://ya.ru/",
     190 + "url": "{urlMain}{urlSubpath}/members/?username={username}",
     191 + },
     192 + )
     193 + assert (
     194 + site.get_url_template()
     195 + == "{urlMain}{urlSubpath}/members/?username={username} (no engine)"
     196 + )
     197 + 
     198 + site = MaigretSite(
     199 + "test",
     200 + {
     201 + "urlMain": "https://ya.ru/",
     202 + "url": "https://{username}.ya.ru",
     203 + },
     204 + )
     205 + assert site.get_url_template() == "SUBDOMAIN"
     206 + 
  • tests/test_utils.py
    skipped 7 lines
    8 8   enrich_link_str,
    9 9   URLMatcher,
    10 10   get_dict_ascii_tree,
     11 + get_match_ratio,
    11 12  )
    12 13   
    13 14   
    skipped 123 lines
    137 138   ┗╸twitter_username: Alexaimephotogr"""
    138 139   )
    139 140   
     141 + 
     142 +def test_get_match_ratio():
     143 + fun = get_match_ratio(["test", "maigret", "username"])
     144 + 
     145 + assert fun("test") == 1
     146 + 