#!/usr/bin/env python3
import asyncio
import json
import logging
import random
import re
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from unittest.mock import Mock

import requests
import tqdm.asyncio

from maigret.maigret import *
from maigret.result import QueryStatus
from maigret.sites import MaigretDatabase, MaigretSite

URL_RE = re.compile(r"https?://(www\.)?")
TIMEOUT = 200
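# TIMEOUT is reused below: as the per-site timeout passed to maigret()
# and as the overall timeout when awaiting all check tasks.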


async def maigret_check(site, site_data, username, status, logger):
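    """Run a maigret search for `username` against a single site and compare the
    detected account status with the expected `status`.

    Returns `site_data` on a match and False otherwise.
    """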
    query_notify = Mock()
    logger.debug(f'Checking {site}...')

    results = await maigret(
        username,
        {site: site_data},
        logger,
        query_notify,
        timeout=TIMEOUT,
        forced=True,
        no_progressbar=True,
    )

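    # A detected status different from the expected one means the check failed.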
    if results[site]['status'].status != status:
        if results[site]['status'].status == QueryStatus.UNKNOWN:
            context = results[site]['status'].context
            logger.debug(f'Error while searching {username} in {site}. Context: {context}')
            return False

        if status == QueryStatus.CLAIMED:
            logger.debug(f'Not found {username} in {site}, must be claimed')
        else:
            logger.debug(f'Found {username} in {site}, must be available')
        logger.debug(results[site])
        return False

    return site_data


async def check_and_add_maigret_site(site_data, semaphore, logger, ok_usernames, bad_usernames):
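    """Verify a candidate site against known-taken and made-up usernames and,
    if both checks pass, save it to the database.

    Relies on the module-level `db`, `args` and `ok_sites` defined in __main__.
    """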
    async with semaphore:
        sitename = site_data.name
        positive = False
        negative = False

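        # The site must detect an existing account for at least one known-taken username...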
        for ok_username in ok_usernames:
            site_data.username_claimed = ok_username
            status = QueryStatus.CLAIMED
            if await maigret_check(sitename, site_data, ok_username, status, logger):
                positive = True
                break

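        # ...and report no account for a username that should not exist anywhere.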
        for bad_username in bad_usernames:
            site_data.username_unclaimed = bad_username
            status = QueryStatus.AVAILABLE
            if await maigret_check(sitename, site_data, bad_username, status, logger):
                negative = True
                break

        if positive and negative:
            site_data = site_data.strip_engine_data()

            db.update_site(site_data)
            print(site_data.json)
            try:
                db.save_to_file(args.base_file)
            except Exception as e:
                logger.error(e, exc_info=True)
            print(f'Saved new site {sitename}...')
            ok_sites.append(site_data)


if __name__ == '__main__':
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")

    parser.add_argument("--add-engine", dest="add_engine", help="Additional engine to check")

    parser.add_argument("--only-engine", dest="only_engine", help="Use only this engine from the detected ones")

    parser.add_argument('--check', help='only check sites in the database', action='store_true')

    parser.add_argument('--random', help='shuffle the list of URLs', action='store_true', default=False)

    parser.add_argument('--top', help='take only the top count of records in the file', type=int, default=10000)

    parser.add_argument('--filter', help='substring to filter input URLs', type=str, default='')

    parser.add_argument('--username', help='preferred username to check with', type=str)

    parser.add_argument(
        "--info",
        "-vv",
        action="store_true",
        dest="info",
        default=False,
        help="Display service information.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        dest="verbose",
        default=False,
        help="Display extra information and metrics.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        "-vvv",
        action="store_true",
        dest="debug",
        default=False,
        help="Save debugging information and site responses to debug.txt.",
    )

    parser.add_argument("urls_file",
                        metavar='URLS_FILE',
                        action="store",
                        help="File with base site URLs")

    args = parser.parse_args()

    log_level = logging.ERROR
    if args.debug:
        log_level = logging.DEBUG
    elif args.info:
        log_level = logging.INFO
    elif args.verbose:
        log_level = logging.WARNING

    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level,
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}
    engines = db.engines

    # TODO: usernames extractors
    ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
    if args.username:
        ok_usernames = [args.username] + ok_usernames

    bad_usernames = ['noonewouldeverusethis7']

    with open(args.urls_file, 'r') as urls_file:
        urls = urls_file.read().splitlines()
        if args.random:
            random.shuffle(urls)
        urls = urls[:args.top]

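    # Serialized database used below for a cheap substring check of already known domains.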
    raw_maigret_data = json.dumps({site.name: site.json for site in sites_subset})

    def create_site_from_engine(sitename, data, e):
        site = MaigretSite(sitename, data)
        site.update_from_engine(db.engines_dict[e])
        site.engine = e
        return site

    new_sites = []
    for site in tqdm.asyncio.tqdm(urls):
        site_lowercase = site.lower()

        domain_raw = URL_RE.sub('', site_lowercase).strip().strip('/')
        domain_raw = domain_raw.split('/')[0]

        if args.filter and args.filter not in domain_raw:
            logger.debug('Site %s skipped due to filtering by "%s"', domain_raw, args.filter)
            continue

        if domain_raw in raw_maigret_data:
            logger.debug(f'Site {domain_raw} already exists in the Maigret database!')
            continue

        if '"' in domain_raw:
            logger.debug(f'Invalid site {domain_raw}')
            continue

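        # Keep only the scheme://host part of the URL as the site main page.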
        main_page_url = '/'.join(site.split('/', 3)[:3])

        site_data = {
            'url': site,
            'urlMain': main_page_url,
            'name': domain_raw,
        }

        r = None
        try:
            r = requests.get(main_page_url, timeout=5)
        except Exception:
            pass

        detected_engines = []

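        # An engine is detected when all of its fingerprint strings
        # ("presenseStrs", sic, as spelled in the engines data) occur in the main page HTML.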
        for e in engines:
            strs_to_check = e.__dict__.get('presenseStrs')
            if strs_to_check and r and r.text:
                all_strs_in_response = True
                for s in strs_to_check:
                    if s not in r.text:
                        all_strs_in_response = False
                        break
                if all_strs_in_response:
                    engine_name = e.__dict__.get('name')
                    detected_engines.append(engine_name)
                    logger.info(f'Detected engine {engine_name} for site {main_page_url}')

        if args.only_engine and args.only_engine in detected_engines:
            detected_engines = [args.only_engine]
        elif not detected_engines and args.add_engine:
            logger.debug('Could not detect any engine, applying default engine %s...', args.add_engine)
            detected_engines = [args.add_engine]

        for engine_name in detected_engines:
            site = create_site_from_engine(domain_raw, site_data, engine_name)
            new_sites.append(site)
            logger.debug(site.json)

            # if engine_name == "phpBB":
            #     site_data_with_subpath = dict(site_data)
            #     site_data_with_subpath["urlSubpath"] = "/forum"
            #     site = create_site_from_engine(domain_raw, site_data_with_subpath, engine_name)
            #     new_sites.append(site)

    print(f'Found {len(new_sites)}/{len(urls)} new sites')

    if args.check:
        for s in new_sites:
            print(s.url_main)
        sys.exit(0)

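    # Check the candidate sites concurrently, at most 20 at a time.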
    sem = asyncio.Semaphore(20)
    loop = asyncio.get_event_loop()

    ok_sites = []
    tasks = []
    for site in new_sites:
        check_coro = check_and_add_maigret_site(site, sem, logger, ok_usernames, bad_usernames)
        future = asyncio.ensure_future(check_coro)
        tasks.append(future)

    for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=TIMEOUT):
        try:
            loop.run_until_complete(f)
        except asyncio.TimeoutError:
            pass

    print(f'Found and saved {len(ok_sites)} sites!')