maigret: commit 79f872c7
    utils/__init__.py
(new empty file)
    utils/check_engines.py
#!/usr/bin/env python3
"""Maigret: engine detection for supported sites.

This module checks sites from the Maigret database that have no engine
assigned, detects their engine by searching for the engine name in the
main page HTML, updates the matched sites from the engine data, runs a
self-check on them, and saves the results back to the base JSON file.
"""
import asyncio
import json
import logging
from argparse import ArgumentParser, RawDescriptionHelpFormatter

import aiohttp
import tqdm.asyncio

from maigret.maigret import get_response, site_self_check
from maigret.sites import MaigretDatabase, MaigretEngine


async def check_engine_of_site(site_name, sites_with_engines, future, engine_name, semaphore, logger):
    async with semaphore:
        response = await get_response(request_future=future,
                                      site_name=site_name,
                                      logger=logger)

        html_text, status_code, error_text, exception_text = response

        # a site is attributed to the engine if the engine name
        # occurs verbatim in its main page HTML
        if html_text and engine_name in html_text:
            sites_with_engines.append(site_name)
            return True
        return False


if __name__ == '__main__':
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    parser.add_argument('--engine', '-e', help='check only the selected engine', type=str)

    args = parser.parse_args()

    log_level = logging.INFO
    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level,
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}

    with open(args.base_file, "r", encoding="utf-8") as data_file:
        sites_info = json.load(data_file)
        engines = sites_info['engines']

    for engine_name, engine_data in engines.items():
        if args.engine and args.engine != engine_name:
            continue

        if 'presenseStrs' not in engine_data:
            print(f'No features to automatically detect sites on engine {engine_name}')
            continue

        engine_obj = MaigretEngine(engine_name, engine_data)

        # set up connections for checking both the engine and usernames
        # (ssl=False already disables certificate verification)
        connector = aiohttp.TCPConnector(ssl=False)
        session = aiohttp.ClientSession(connector=connector)

        sem = asyncio.Semaphore(100)
        loop = asyncio.get_event_loop()
        tasks = []

        # check whether sites without an engine look like sites on this engine
        new_engine_sites = []
        for site_name, site_data in sites.items():
            if site_data.engine:
                continue

            future = session.get(url=site_data.url_main,
                                 allow_redirects=True,
                                 timeout=10)

            check_engine_coro = check_engine_of_site(site_name, new_engine_sites, future,
                                                     engine_name, sem, logger)
            tasks.append(asyncio.ensure_future(check_engine_coro))

        # progress bar
        for f in tqdm.asyncio.tqdm.as_completed(tasks):
            loop.run_until_complete(f)

        print(f'Total detected {len(new_engine_sites)} sites on engine {engine_name}')
        # dict with the newly found engine sites
        new_sites = {site_name: sites[site_name] for site_name in new_engine_sites}

        # update site objects from the engine
        for site_name, site in new_sites.items():
            site.request_future = None
            site.engine = engine_name
            site.update_from_engine(engine_obj)

        async def update_site_data(site_name, site_data, all_sites, logger, no_progressbar):
            updates = await site_self_check(site_name, site_data, logger, no_progressbar)
            all_sites[site_name].update(updates)

        for new_site_name, new_site_data in new_sites.items():
            coro = update_site_data(new_site_name, new_site_data, new_sites, logger,
                                    no_progressbar=True)
            loop.run_until_complete(coro)

        updated_sites_count = 0

        for s in new_sites:
            site = new_sites[s]
            site.request_future = None

            if site.disabled:
                print(f'{site.name} failed username checking of engine {engine_name}')
                continue

            site = site.strip_engine_data()

            db.update_site(site)
            updated_sites_count += 1
            db.save_to_file(args.base_file)

            print(f'Site "{s}": ' + json.dumps(site.json, indent=4))

        print(f'Updated total {updated_sites_count} sites!')
        print(f'Checking all sites on engine {engine_name}')

        loop.run_until_complete(session.close())

    print("\nFinished updating the supported site listing!")
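
For reference, the core rule in check_engine_of_site reduces to a substring test on the main page HTML. Below is a minimal synchronous sketch of that rule; the URL and engine name are placeholders, not entries from data.json:

import requests

def looks_like_engine(url_main: str, engine_name: str) -> bool:
    """Return True if the engine name occurs verbatim in the page body."""
    try:
        html_text = requests.get(url_main, timeout=10).text
    except requests.RequestException:
        return False
    return engine_name in html_text

# hypothetical usage
print(looks_like_engine("https://example.com", "uCoz"))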
    utils/import_sites.py
#!/usr/bin/env python3
import asyncio
import json
import logging
import random
import re
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from unittest.mock import Mock

import requests
import tqdm.asyncio

from maigret.maigret import *
from maigret.result import QueryStatus
from maigret.sites import MaigretDatabase, MaigretSite

URL_RE = re.compile(r"https?://(www\.)?")
TIMEOUT = 200


async def maigret_check(site, site_data, username, status, logger):
    query_notify = Mock()
    logger.debug(f'Checking {site}...')

    for username, status in [(username, status)]:
        results = await maigret(
            username,
            {site: site_data},
            logger,
            query_notify,
            timeout=TIMEOUT,
            forced=True,
            no_progressbar=True,
        )

        if results[site]['status'].status != status:
            if results[site]['status'].status == QueryStatus.UNKNOWN:
                context = results[site]['status'].context
                logger.debug(f'Error while searching {username} in {site}, must be claimed. Context: {context}')
                return False

            if status == QueryStatus.CLAIMED:
                logger.debug(f'Not found {username} in {site}, must be claimed')
            else:
                logger.debug(f'Found {username} in {site}, must be available')
            logger.debug(results[site])
            return False

    return site_data


async def check_and_add_maigret_site(site_data, semaphore, logger, ok_usernames, bad_usernames):
    async with semaphore:
        sitename = site_data.name
        positive = False
        negative = False

        # positive case: at least one known username must be detected as claimed
        for ok_username in ok_usernames:
            site_data.username_claimed = ok_username
            status = QueryStatus.CLAIMED
            if await maigret_check(sitename, site_data, ok_username, status, logger):
                positive = True
                break

        # negative case: a nonexistent username must be detected as available
        for bad_username in bad_usernames:
            site_data.username_unclaimed = bad_username
            status = QueryStatus.AVAILABLE
            if await maigret_check(sitename, site_data, bad_username, status, logger):
                negative = True
                break

        # a site is saved only if both checks pass
        if positive and negative:
            site_data = site_data.strip_engine_data()

            db.update_site(site_data)
            print(site_data.json)
            try:
                db.save_to_file(args.base_file)
            except Exception as e:
                logging.error(e, exc_info=True)
            print(f'Saved new site {sitename}...')
            ok_sites.append(site_data)


if __name__ == '__main__':
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    parser.add_argument("--add-engine", dest="add_engine",
                        help="additional engine to check")
    parser.add_argument("--only-engine", dest="only_engine",
                        help="use only this engine from the detected ones to check")
    parser.add_argument('--check', help='only check sites in the database', action='store_true')
    parser.add_argument('--random', help='shuffle the list of URLs', action='store_true', default=False)
    parser.add_argument('--top', help='top count of records in the file', type=int, default=10000)
    parser.add_argument('--filter', help='substring to filter input URLs', type=str, default='')
    parser.add_argument('--username', help='preferable username to check with', type=str)
    parser.add_argument(
        "--info",
        "-vv",
        action="store_true",
        dest="info",
        default=False,
        help="Display service information.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        dest="verbose",
        default=False,
        help="Display extra information and metrics.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        "-vvv",
        action="store_true",
        dest="debug",
        default=False,
        help="Save debugging information and site responses in debug.txt.",
    )
    parser.add_argument("urls_file",
                        metavar='URLS_FILE',
                        action="store",
                        help="File with base site URLs")

    args = parser.parse_args()

    log_level = logging.ERROR
    if args.debug:
        log_level = logging.DEBUG
    elif args.info:
        log_level = logging.INFO
    elif args.verbose:
        log_level = logging.WARNING

    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level,
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}
    engines = db.engines

    # TODO: username extractors
    ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
    if args.username:
        ok_usernames = [args.username] + ok_usernames

    bad_usernames = ['noonewouldeverusethis7']

    with open(args.urls_file, 'r') as urls_file:
        urls = urls_file.read().splitlines()
        if args.random:
            random.shuffle(urls)
        urls = urls[:args.top]

    raw_maigret_data = json.dumps({site.name: site.json for site in sites_subset})

    new_sites = []
    for site in tqdm.asyncio.tqdm(urls):
        site_lowercase = site.lower()

        domain_raw = URL_RE.sub('', site_lowercase).strip().strip('/')
        domain_raw = domain_raw.split('/')[0]

        if args.filter and args.filter not in domain_raw:
            logger.debug('Site %s skipped due to filtering by "%s"', domain_raw, args.filter)
            continue

        if domain_raw in raw_maigret_data:
            logger.debug(f'Site {domain_raw} already exists in the Maigret database!')
            continue

        if '"' in domain_raw:
            logger.debug(f'Invalid site {domain_raw}')
            continue

        # scheme plus host, e.g. https://example.com from https://example.com/forum
        main_page_url = '/'.join(site.split('/', 3)[:3])

        site_data = {
            'url': site,
            'urlMain': main_page_url,
            'name': domain_raw,
        }

        try:
            r = requests.get(main_page_url, timeout=5)
        except Exception:
            r = None

        # an engine is detected when all of its presenseStrs markers
        # occur in the main page body
        detected_engines = []
        for e in engines:
            strs_to_check = e.__dict__.get('presenseStrs')
            if strs_to_check and r and r.text:
                all_strs_in_response = True
                for s in strs_to_check:
                    if s not in r.text:
                        all_strs_in_response = False
                if all_strs_in_response:
                    engine_name = e.__dict__.get('name')
                    detected_engines.append(engine_name)
                    logger.info(f'Detected engine {engine_name} for site {main_page_url}')

        if args.only_engine and args.only_engine in detected_engines:
            detected_engines = [args.only_engine]
        elif not detected_engines and args.add_engine:
            logging.debug('Could not detect any engine, applying default engine %s...', args.add_engine)
            detected_engines = [args.add_engine]

        def create_site_from_engine(sitename, data, e):
            site = MaigretSite(sitename, data)
            site.update_from_engine(db.engines_dict[e])
            site.engine = e
            return site

        for engine_name in detected_engines:
            site = create_site_from_engine(domain_raw, site_data, engine_name)
            new_sites.append(site)
            logger.debug(site.json)

            # if engine_name == "phpBB":
            #     site_data_with_subpath = dict(site_data)
            #     site_data_with_subpath["urlSubpath"] = "/forum"
            #     site = create_site_from_engine(domain_raw, site_data_with_subpath, engine_name)
            #     new_sites.append(site)

    print(f'Found {len(new_sites)}/{len(urls)} new sites')

    if args.check:
        for s in new_sites:
            print(s.url_main)
        sys.exit(0)

    sem = asyncio.Semaphore(20)
    loop = asyncio.get_event_loop()

    ok_sites = []
    tasks = []
    for site in new_sites:
        check_coro = check_and_add_maigret_site(site, sem, logger, ok_usernames, bad_usernames)
        tasks.append(asyncio.ensure_future(check_coro))

    for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=TIMEOUT):
        try:
            loop.run_until_complete(f)
        except asyncio.exceptions.TimeoutError:
            pass

    print(f'Found and saved {len(ok_sites)} sites!')
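
The engine fingerprinting step above can be isolated into a few lines. A minimal sketch of that rule, with a made-up engine table (the engine name and marker strings are hypothetical, not real data.json content):

import requests

ENGINES = {
    # hypothetical engine with hypothetical presence markers
    "ExampleEngine": ["Powered by ExampleEngine", "example-engine.js"],
}

def detect_engines(main_page_url: str) -> list:
    """Return the engines whose markers all occur in the page body."""
    try:
        body = requests.get(main_page_url, timeout=5).text
    except requests.RequestException:
        return []
    return [
        name for name, markers in ENGINES.items()
        if all(marker in body for marker in markers)
    ]

print(detect_engines("https://example.com"))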
    utils/sites_diff.py
import difflib
import sys

import requests

# fetch the two pages to compare, passed as command-line arguments
a = requests.get(sys.argv[1]).text
b = requests.get(sys.argv[2]).text

# crude tokenization by double quotes
tokens_a = set(a.split('"'))
tokens_b = set(b.split('"'))

a_minus_b = tokens_a.difference(tokens_b)
b_minus_a = tokens_b.difference(tokens_a)

print(a_minus_b)
print(b_minus_a)

print(len(a_minus_b))
print(len(b_minus_a))

# strings that typically mark profile pages ("пользователь" is Russian for
# "user", "репутация" for "reputation", "информация" for "information")
desired_strings = ["username", "not found", "пользователь", "profile", "lastname",
                   "firstname", "biography", "birthday", "репутация", "информация",
                   "e-mail"]


def get_match_ratio(x):
    return round(max([
        difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
        for y in desired_strings
    ]), 2)


RATIO = 0.6

# show the tokens most similar to the desired strings first: these are
# candidates for presence/absence detection strings
print(sorted(a_minus_b, key=get_match_ratio, reverse=True)[:10])
print(sorted(b_minus_a, key=get_match_ratio, reverse=True)[:10])
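
sites_diff.py diffs two pages (e.g. the profile of an existing account against the response for a nonexistent one) and ranks the differing tokens by fuzzy similarity to profile-related strings. A small demonstration of the ranking rule; the token list is invented:

import difflib

desired_strings = ["username", "not found", "profile"]

def get_match_ratio(x):
    # best fuzzy-match ratio of the token against any desired string
    return round(max(
        difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
        for y in desired_strings
    ), 2)

# invented tokens, as if they came from a page diff
tokens = ["user-profile-card", "footer", "Username not found", "jquery.min.js"]
print(sorted(tokens, key=get_match_ratio, reverse=True))
# likely order: 'Username not found', 'user-profile-card', then the rest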