1 | 1 | | import asyncio |
2 | | - | import difflib |
3 | 2 | | import re |
4 | 3 | | from typing import List |
5 | 4 | | import xml.etree.ElementTree as ET |
| skipped 2 lines |
8 | 7 | | from .activation import import_aiohttp_cookies |
9 | 8 | | from .checking import maigret |
10 | 9 | | from .result import QueryStatus |
| 10 | + | from .settings import Settings |
11 | 11 | | from .sites import MaigretDatabase, MaigretSite, MaigretEngine |
12 | | - | from .utils import get_random_user_agent |
| 12 | + | from .utils import get_random_user_agent, get_match_ratio |
13 | 13 | | |
14 | 14 | | |
15 | | - | DESIRED_STRINGS = [ |
16 | | - | "username", |
17 | | - | "not found", |
18 | | - | "пользователь", |
19 | | - | "profile", |
20 | | - | "lastname", |
21 | | - | "firstname", |
22 | | - | "biography", |
23 | | - | "birthday", |
24 | | - | "репутация", |
25 | | - | "информация", |
26 | | - | "e-mail", |
27 | | - | ] |
| 15 | + | class Submitter: |
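| | + | # Interactive helper that turns a profile URL into a Maigret site entry:
| | + | # detect the engine or extract check features, self-check, then save to the DB.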
| 16 | + | HEADERS = { |
| 17 | + | "User-Agent": get_random_user_agent(), |
| 18 | + | } |
28 | 19 | | |
29 | | - | SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"] |
| 20 | + | SEPARATORS = "\"'" |
30 | 21 | | |
31 | | - | HEADERS = { |
32 | | - | "User-Agent": get_random_user_agent(), |
33 | | - | } |
| 22 | + | RATIO = 0.6 |
| 23 | + | TOP_FEATURES = 5 |
| 24 | + | URL_RE = re.compile(r"https?://(www\.)?") |
34 | 25 | | |
35 | | - | SEPARATORS = "\"'" |
| 26 | + | def __init__(self, db: MaigretDatabase, settings: Settings, logger): |
| 27 | + | self.settings = settings |
| 28 | + | self.db = db |
| 29 | + | self.logger = logger |
36 | 30 | | |
37 | | - | RATIO = 0.6 |
38 | | - | TOP_FEATURES = 5 |
39 | | - | URL_RE = re.compile(r"https?://(www\.)?") |
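| | + | # Fetch the site's Alexa traffic rank from the XML data endpoint;
| | + | # returns 0 when the rank can't be parsed from the response.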
| 31 | + | @staticmethod |
| 32 | + | def get_alexa_rank(site_url_main): |
| 33 | + | url = f"http://data.alexa.com/data?cli=10&url={site_url_main}" |
| 34 | + | xml_data = requests.get(url).text |
| 35 | + | root = ET.fromstring(xml_data) |
| 36 | + | alexa_rank = 0 |
40 | 37 | | |
| 38 | + | try: |
| 39 | + | alexa_rank = int(root.find('.//REACH').attrib['RANK']) |
| 40 | + | except Exception: |
| 41 | + | pass |
41 | 42 | | |
42 | | - | def get_match_ratio(x): |
43 | | - | return round( |
44 | | - | max( |
45 | | - | [difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS] |
46 | | - | ), |
47 | | - | 2, |
48 | | - | ) |
| 43 | + | return alexa_rank |
49 | 44 | | |
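| | + | # Reduce a profile URL to scheme and host,
| | + | # e.g. "https://example.com/user/alex" -> "https://example.com".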
| 45 | + | @staticmethod |
| 46 | + | def extract_mainpage_url(url): |
| 47 | + | return "/".join(url.split("/", 3)[:3]) |
50 | 48 | | |
51 | | - | def get_alexa_rank(site_url_main): |
52 | | - | url = f"http://data.alexa.com/data?cli=10&url={site_url_main}" |
53 | | - | xml_data = requests.get(url).text |
54 | | - | root = ET.fromstring(xml_data) |
55 | | - | alexa_rank = 0 |
| 49 | + | async def site_self_check(self, site, semaphore, silent=False): |
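| | + | # Search for a username known to exist and one known not to;
| | + | # mark the site as disabled when results contradict the expected statuses.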
| 50 | + | changes = { |
| 51 | + | "disabled": False, |
| 52 | + | } |
56 | 53 | | |
57 | | - | try: |
58 | | - | alexa_rank = int(root.find('.//REACH').attrib['RANK']) |
59 | | - | except Exception: |
60 | | - | pass |
| 54 | + | check_data = [ |
| 55 | + | (site.username_claimed, QueryStatus.CLAIMED), |
| 56 | + | (site.username_unclaimed, QueryStatus.AVAILABLE), |
| 57 | + | ] |
61 | 58 | | |
62 | | - | return alexa_rank |
| 59 | + | self.logger.info(f"Checking {site.name}...") |
63 | 60 | | |
64 | | - | |
65 | | - | def extract_mainpage_url(url): |
66 | | - | return "/".join(url.split("/", 3)[:3]) |
67 | | - | |
68 | | - | |
69 | | - | async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False): |
70 | | - | changes = { |
71 | | - | "disabled": False, |
72 | | - | } |
73 | | - | |
74 | | - | check_data = [ |
75 | | - | (site.username_claimed, QueryStatus.CLAIMED), |
76 | | - | (site.username_unclaimed, QueryStatus.AVAILABLE), |
77 | | - | ] |
78 | | - | |
79 | | - | logger.info(f"Checking {site.name}...") |
80 | | - | |
81 | | - | for username, status in check_data: |
82 | | - | results_dict = await maigret( |
83 | | - | username=username, |
84 | | - | site_dict={site.name: site}, |
85 | | - | logger=logger, |
86 | | - | timeout=30, |
87 | | - | id_type=site.type, |
88 | | - | forced=True, |
89 | | - | no_progressbar=True, |
90 | | - | ) |
91 | | - | |
92 | | - | # don't disable entries with other ids types |
93 | | - | # TODO: make normal checking |
94 | | - | if site.name not in results_dict: |
95 | | - | logger.info(results_dict) |
96 | | - | changes["disabled"] = True |
97 | | - | continue |
98 | | - | |
99 | | - | result = results_dict[site.name]["status"] |
100 | | - | |
101 | | - | site_status = result.status |
| 61 | + | for username, status in check_data: |
| 62 | + | results_dict = await maigret( |
| 63 | + | username=username, |
| 64 | + | site_dict={site.name: site}, |
| 65 | + | logger=self.logger, |
| 66 | + | timeout=30, |
| 67 | + | id_type=site.type, |
| 68 | + | forced=True, |
| 69 | + | no_progressbar=True, |
| 70 | + | ) |
102 | 71 | | |
103 | | - | if site_status != status: |
104 | | - | if site_status == QueryStatus.UNKNOWN: |
105 | | - | msgs = site.absence_strs |
106 | | - | etype = site.check_type |
107 | | - | logger.warning( |
108 | | - | "Error while searching '%s' in %s: %s, %s, check type %s", |
109 | | - | username, |
110 | | - | site.name, |
111 | | - | result.context, |
112 | | - | msgs, |
113 | | - | etype, |
114 | | - | ) |
115 | | - | # don't disable in case of available username |
116 | | - | if status == QueryStatus.CLAIMED: |
117 | | - | changes["disabled"] = True |
118 | | - | elif status == QueryStatus.CLAIMED: |
119 | | - | logger.warning( |
120 | | - | f"Not found `{username}` in {site.name}, must be claimed" |
121 | | - | ) |
122 | | - | logger.info(results_dict[site.name]) |
123 | | - | changes["disabled"] = True |
124 | | - | else: |
125 | | - | logger.warning(f"Found `{username}` in {site.name}, must be available") |
126 | | - | logger.info(results_dict[site.name]) |
| 72 | + | # don't disable entries with other id types
| 73 | + | # TODO: implement proper checking for those
| 74 | + | if site.name not in results_dict: |
| 75 | + | self.logger.info(results_dict) |
127 | 76 | | changes["disabled"] = True |
| 77 | + | continue |
128 | 78 | | |
129 | | - | logger.info(f"Site {site.name} checking is finished") |
| 79 | + | result = results_dict[site.name]["status"] |
130 | 80 | | |
131 | | - | return changes |
| 81 | + | site_status = result.status |
132 | 82 | | |
| 83 | + | if site_status != status: |
| 84 | + | if site_status == QueryStatus.UNKNOWN: |
| 85 | + | msgs = site.absence_strs |
| 86 | + | etype = site.check_type |
| 87 | + | self.logger.warning( |
| 88 | + | "Error while searching '%s' in %s: %s, %s, check type %s", |
| 89 | + | username, |
| 90 | + | site.name, |
| 91 | + | result.context, |
| 92 | + | msgs, |
| 93 | + | etype, |
| 94 | + | ) |
| 95 | + | # don't disable in case of available username |
| 96 | + | if status == QueryStatus.CLAIMED: |
| 97 | + | changes["disabled"] = True |
| 98 | + | elif status == QueryStatus.CLAIMED: |
| 99 | + | self.logger.warning( |
| 100 | + | f"Not found `{username}` in {site.name}, must be claimed" |
| 101 | + | ) |
| 102 | + | self.logger.info(results_dict[site.name]) |
| 103 | + | changes["disabled"] = True |
| 104 | + | else: |
| 105 | + | self.logger.warning( |
| 106 | + | f"Found `{username}` in {site.name}, must be available" |
| 107 | + | ) |
| 108 | + | self.logger.info(results_dict[site.name]) |
| 109 | + | changes["disabled"] = True |
133 | 110 | | |
134 | | - | def generate_additional_fields_dialog(engine: MaigretEngine, dialog): |
135 | | - | fields = {} |
136 | | - | if 'urlSubpath' in engine.site.get('url', ''): |
137 | | - | msg = ( |
138 | | - | 'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). ' |
139 | | - | 'Enter in manually if it exists: ' |
140 | | - | ) |
141 | | - | subpath = input(msg).strip('/') |
142 | | - | if subpath: |
143 | | - | fields['urlSubpath'] = f'/{subpath}' |
144 | | - | return fields |
| 111 | + | self.logger.info(f"Site {site.name} checking is finished") |
145 | 112 | | |
| 113 | + | return changes |
146 | 114 | | |
147 | | - | async def detect_known_engine( |
148 | | - | db, url_exists, url_mainpage, logger |
149 | | - | ) -> List[MaigretSite]: |
150 | | - | try: |
151 | | - | r = requests.get(url_mainpage) |
152 | | - | logger.debug(r.text) |
153 | | - | except Exception as e: |
154 | | - | logger.warning(e) |
155 | | - | print("Some error while checking main page") |
156 | | - | return [] |
| 115 | + | def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog): |
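| | + | # Some engines host profiles under a subpath (e.g. /forum/, /blog/);
| | + | # ask the user to provide it when the engine's URL template expects one.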
| 116 | + | fields = {} |
| 117 | + | if 'urlSubpath' in engine.site.get('url', ''): |
| 118 | + | msg = ( |
| 119 | + | 'The detected engine supposes an additional URL subpath (/forum/, /blog/, etc). '
| 120 | + | 'Enter it manually if it exists: '
| 121 | + | ) |
| 122 | + | subpath = input(msg).strip('/') |
| 123 | + | if subpath: |
| 124 | + | fields['urlSubpath'] = f'/{subpath}' |
| 125 | + | return fields |
157 | 126 | | |
158 | | - | for engine in db.engines: |
159 | | - | strs_to_check = engine.__dict__.get("presenseStrs") |
160 | | - | if strs_to_check and r and r.text: |
161 | | - | all_strs_in_response = True |
162 | | - | for s in strs_to_check: |
163 | | - | if s not in r.text: |
164 | | - | all_strs_in_response = False |
165 | | - | sites = [] |
166 | | - | if all_strs_in_response: |
167 | | - | engine_name = engine.__dict__.get("name") |
| 127 | + | async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]: |
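| | + | # Fetch the main page and compare it with engine fingerprints from the DB;
| | + | # on a match, build candidate site entries for several supposed usernames.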
| 128 | + | try: |
| 129 | + | r = requests.get(url_mainpage) |
| 130 | + | self.logger.debug(r.text) |
| 131 | + | except Exception as e: |
| 132 | + | self.logger.warning(e) |
| 133 | + | print("Some error while checking main page") |
| 134 | + | return [] |
168 | 135 | | |
169 | | - | print(f"Detected engine {engine_name} for site {url_mainpage}") |
170 | | - | |
171 | | - | usernames_to_check = SUPPOSED_USERNAMES |
172 | | - | supposed_username = extract_username_dialog(url_exists) |
173 | | - | if supposed_username: |
174 | | - | usernames_to_check = [supposed_username] + usernames_to_check |
175 | | - | |
176 | | - | add_fields = generate_additional_fields_dialog(engine, url_exists) |
| 136 | + | for engine in self.db.engines: |
| 137 | + | strs_to_check = engine.__dict__.get("presenseStrs") |
| 138 | + | if strs_to_check and r and r.text: |
| 139 | + | all_strs_in_response = all(
| 140 | + | s in r.text
| 141 | + | for s in strs_to_check
| 142 | + | )
| 143 | + | sites = []
| 144 | + | if all_strs_in_response: |
| 145 | + | engine_name = engine.__dict__.get("name") |
177 | 146 | | |
178 | | - | for u in usernames_to_check: |
179 | | - | site_data = { |
180 | | - | "urlMain": url_mainpage, |
181 | | - | "name": url_mainpage.split("//")[1], |
182 | | - | "engine": engine_name, |
183 | | - | "usernameClaimed": u, |
184 | | - | "usernameUnclaimed": "noonewouldeverusethis7", |
185 | | - | **add_fields, |
186 | | - | } |
187 | | - | logger.info(site_data) |
| 147 | + | print(f"Detected engine {engine_name} for site {url_mainpage}") |
188 | 148 | | |
189 | | - | maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data) |
190 | | - | maigret_site.update_from_engine(db.engines_dict[engine_name]) |
191 | | - | sites.append(maigret_site) |
| 149 | + | usernames_to_check = self.settings.supposed_usernames |
| 150 | + | supposed_username = self.extract_username_dialog(url_exists) |
| 151 | + | if supposed_username: |
| 152 | + | usernames_to_check = [supposed_username] + usernames_to_check |
192 | 153 | | |
193 | | - | return sites |
| 154 | + | add_fields = self.generate_additional_fields_dialog( |
| 155 | + | engine, url_exists |
| 156 | + | ) |
194 | 157 | | |
195 | | - | return [] |
| 158 | + | for u in usernames_to_check: |
| 159 | + | site_data = { |
| 160 | + | "urlMain": url_mainpage, |
| 161 | + | "name": url_mainpage.split("//")[1], |
| 162 | + | "engine": engine_name, |
| 163 | + | "usernameClaimed": u, |
| 164 | + | "usernameUnclaimed": "noonewouldeverusethis7", |
| 165 | + | **add_fields, |
| 166 | + | } |
| 167 | + | self.logger.info(site_data) |
196 | 168 | | |
| 169 | + | maigret_site = MaigretSite( |
| 170 | + | url_mainpage.split("/")[-1], site_data |
| 171 | + | ) |
| 172 | + | maigret_site.update_from_engine( |
| 173 | + | self.db.engines_dict[engine_name] |
| 174 | + | ) |
| 175 | + | sites.append(maigret_site) |
197 | 176 | | |
198 | | - | def extract_username_dialog(url): |
199 | | - | url_parts = url.rstrip("/").split("/") |
200 | | - | supposed_username = url_parts[-1].strip('@') |
201 | | - | entered_username = input( |
202 | | - | f'Is "{supposed_username}" a valid username? If not, write it manually: ' |
203 | | - | ) |
204 | | - | return entered_username if entered_username else supposed_username |
| 177 | + | return sites |
205 | 178 | | |
| 179 | + | return [] |
206 | 180 | | |
207 | | - | async def check_features_manually( |
208 | | - | db, url_exists, url_mainpage, cookie_file, logger, redirects=False |
209 | | - | ): |
210 | | - | custom_headers = {} |
211 | | - | while True: |
212 | | - | header_key = input( |
213 | | - | 'Specify custom header if you need or just press Enter to skip. Header name: ' |
| 181 | + | def extract_username_dialog(self, url): |
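| | + | # Guess the username from the last URL path segment; the user can override it.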
| 182 | + | url_parts = url.rstrip("/").split("/") |
| 183 | + | supposed_username = url_parts[-1].strip('@') |
| 184 | + | entered_username = input( |
| 185 | + | f'Is "{supposed_username}" a valid username? If not, enter it manually: '
214 | 186 | | ) |
215 | | - | if not header_key: |
216 | | - | break |
217 | | - | header_value = input('Header value: ') |
218 | | - | custom_headers[header_key.strip()] = header_value.strip() |
| 187 | + | return entered_username if entered_username else supposed_username |
219 | 188 | | |
220 | | - | supposed_username = extract_username_dialog(url_exists) |
221 | | - | non_exist_username = "noonewouldeverusethis7" |
| 189 | + | async def check_features_manually( |
| 190 | + | self, url_exists, url_mainpage, cookie_file, redirects=False |
| 191 | + | ): |
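| | + | # Engine-agnostic fallback: request the pages of an existing and a
| | + | # non-existing account and diff them to find distinguishing text features.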
| 192 | + | custom_headers = {} |
| 193 | + | while True: |
| 194 | + | header_key = input( |
| 195 | + | 'Specify a custom header if you need one, or press Enter to skip. Header name: '
| 196 | + | ) |
| 197 | + | if not header_key: |
| 198 | + | break |
| 199 | + | header_value = input('Header value: ') |
| 200 | + | custom_headers[header_key.strip()] = header_value.strip() |
222 | 201 | | |
223 | | - | url_user = url_exists.replace(supposed_username, "{username}") |
224 | | - | url_not_exists = url_exists.replace(supposed_username, non_exist_username) |
| 202 | + | supposed_username = self.extract_username_dialog(url_exists) |
| 203 | + | non_exist_username = "noonewouldeverusethis7" |
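| | + | # a sentinel username that is overwhelmingly unlikely to be registered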
225 | 204 | | |
226 | | - | headers = dict(HEADERS) |
227 | | - | headers.update(custom_headers) |
| 205 | + | url_user = url_exists.replace(supposed_username, "{username}") |
| 206 | + | url_not_exists = url_exists.replace(supposed_username, non_exist_username) |
228 | 207 | | |
229 | | - | # cookies |
230 | | - | cookie_dict = None |
231 | | - | if cookie_file: |
232 | | - | logger.info(f'Use {cookie_file} for cookies') |
233 | | - | cookie_jar = import_aiohttp_cookies(cookie_file) |
234 | | - | cookie_dict = {c.key: c.value for c in cookie_jar} |
| 208 | + | headers = dict(self.HEADERS) |
| 209 | + | headers.update(custom_headers) |
235 | 210 | | |
236 | | - | exists_resp = requests.get( |
237 | | - | url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects |
238 | | - | ) |
239 | | - | logger.debug(url_exists) |
240 | | - | logger.debug(exists_resp.status_code) |
241 | | - | logger.debug(exists_resp.text) |
| 211 | + | # cookies |
| 212 | + | cookie_dict = None |
| 213 | + | if cookie_file: |
| 214 | + | self.logger.info(f'Using {cookie_file} for cookies')
| 215 | + | cookie_jar = import_aiohttp_cookies(cookie_file) |
| 216 | + | cookie_dict = {c.key: c.value for c in cookie_jar} |
242 | 217 | | |
243 | | - | non_exists_resp = requests.get( |
244 | | - | url_not_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects |
245 | | - | ) |
246 | | - | logger.debug(url_not_exists) |
247 | | - | logger.debug(non_exists_resp.status_code) |
248 | | - | logger.debug(non_exists_resp.text) |
| 218 | + | exists_resp = requests.get( |
| 219 | + | url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects |
| 220 | + | ) |
| 221 | + | self.logger.debug(url_exists) |
| 222 | + | self.logger.debug(exists_resp.status_code) |
| 223 | + | self.logger.debug(exists_resp.text) |
249 | 224 | | |
250 | | - | a = exists_resp.text |
251 | | - | b = non_exists_resp.text |
| 225 | + | non_exists_resp = requests.get( |
| 226 | + | url_not_exists, |
| 227 | + | cookies=cookie_dict, |
| 228 | + | headers=headers, |
| 229 | + | allow_redirects=redirects, |
| 230 | + | ) |
| 231 | + | self.logger.debug(url_not_exists) |
| 232 | + | self.logger.debug(non_exists_resp.status_code) |
| 233 | + | self.logger.debug(non_exists_resp.text) |
252 | 234 | | |
253 | | - | tokens_a = set(re.split(f'[{SEPARATORS}]', a)) |
254 | | - | tokens_b = set(re.split(f'[{SEPARATORS}]', b)) |
| 235 | + | a = exists_resp.text |
| 236 | + | b = non_exists_resp.text |
255 | 237 | | |
256 | | - | a_minus_b = tokens_a.difference(tokens_b) |
257 | | - | b_minus_a = tokens_b.difference(tokens_a) |
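| | + | # split each page on quote characters to get attribute-sized tokens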
| 238 | + | tokens_a = set(re.split(f'[{self.SEPARATORS}]', a)) |
| 239 | + | tokens_b = set(re.split(f'[{self.SEPARATORS}]', b)) |
258 | 240 | | |
259 | | - | if len(a_minus_b) == len(b_minus_a) == 0: |
260 | | - | print("The pages for existing and non-existing account are the same!") |
| 241 | + | a_minus_b = tokens_a.difference(tokens_b) |
| 242 | + | b_minus_a = tokens_b.difference(tokens_a) |
261 | 243 | | |
262 | | - | top_features_count = int( |
263 | | - | input(f"Specify count of features to extract [default {TOP_FEATURES}]: ") |
264 | | - | or TOP_FEATURES |
265 | | - | ) |
| 244 | + | if len(a_minus_b) == len(b_minus_a) == 0: |
| 245 | + | print("The pages for existing and non-existing account are the same!") |
266 | 246 | | |
267 | | - | presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[ |
268 | | - | :top_features_count |
269 | | - | ] |
| 247 | + | top_features_count = int( |
| 248 | + | input( |
| 249 | + | f"Specify count of features to extract [default {self.TOP_FEATURES}]: " |
| 250 | + | ) |
| 251 | + | or self.TOP_FEATURES |
| 252 | + | ) |
270 | 253 | | |
271 | | - | print("Detected text features of existing account: " + ", ".join(presence_list)) |
272 | | - | features = input("If features was not detected correctly, write it manually: ") |
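| | + | # get_match_ratio() builds a scoring function from the configured presence strings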
| 254 | + | match_fun = get_match_ratio(self.settings.presence_strings) |
273 | 255 | | |
274 | | - | if features: |
275 | | - | presence_list = list(map(str.strip, features.split(","))) |
| 256 | + | presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[ |
| 257 | + | :top_features_count |
| 258 | + | ] |
276 | 259 | | |
277 | | - | absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[ |
278 | | - | :top_features_count |
279 | | - | ] |
280 | | - | print("Detected text features of non-existing account: " + ", ".join(absence_list)) |
281 | | - | features = input("If features was not detected correctly, write it manually: ") |
| 260 | + | print("Detected text features of existing account: " + ", ".join(presence_list)) |
| 261 | + | features = input("If the features were not detected correctly, enter them manually (comma-separated): ")
282 | 262 | | |
283 | | - | if features: |
284 | | - | absence_list = list(map(str.strip, features.split(","))) |
| 263 | + | if features: |
| 264 | + | presence_list = list(map(str.strip, features.split(","))) |
285 | 265 | | |
286 | | - | site_data = { |
287 | | - | "absenceStrs": absence_list, |
288 | | - | "presenseStrs": presence_list, |
289 | | - | "url": url_user, |
290 | | - | "urlMain": url_mainpage, |
291 | | - | "usernameClaimed": supposed_username, |
292 | | - | "usernameUnclaimed": non_exist_username, |
293 | | - | "checkType": "message", |
294 | | - | } |
| 266 | + | absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[ |
| 267 | + | :top_features_count |
| 268 | + | ] |
| 269 | + | print( |
| 270 | + | "Detected text features of non-existing account: " + ", ".join(absence_list) |
| 271 | + | ) |
| 272 | + | features = input("If the features were not detected correctly, enter them manually (comma-separated): ")
295 | 273 | | |
296 | | - | if headers != HEADERS: |
297 | | - | site_data['headers'] = headers |
| 274 | + | if features: |
| 275 | + | absence_list = list(map(str.strip, features.split(","))) |
298 | 276 | | |
299 | | - | site = MaigretSite(url_mainpage.split("/")[-1], site_data) |
300 | | - | return site |
| 277 | + | site_data = { |
| 278 | + | "absenceStrs": absence_list, |
| 279 | + | "presenseStrs": presence_list, |
| 280 | + | "url": url_user, |
| 281 | + | "urlMain": url_mainpage, |
| 282 | + | "usernameClaimed": supposed_username, |
| 283 | + | "usernameUnclaimed": non_exist_username, |
| 284 | + | "checkType": "message", |
| 285 | + | } |
301 | 286 | | |
| 287 | + | if headers != self.HEADERS: |
| 288 | + | site_data['headers'] = headers |
302 | 289 | | |
303 | | - | async def submit_dialog(db, url_exists, cookie_file, logger): |
304 | | - | domain_raw = URL_RE.sub("", url_exists).strip().strip("/") |
305 | | - | domain_raw = domain_raw.split("/")[0] |
306 | | - | logger.info('Domain is %s', domain_raw) |
| 290 | + | site = MaigretSite(url_mainpage.split("/")[-1], site_data) |
| 291 | + | return site |
307 | 292 | | |
308 | | - | # check for existence |
309 | | - | matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites)) |
| 293 | + | async def dialog(self, url_exists, cookie_file): |
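| | + | # Full submission flow: warn about duplicates already in the DB, try engine
| | + | # detection, fall back to manual feature extraction, self-check, then save.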
| 294 | + | domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/") |
| 295 | + | domain_raw = domain_raw.split("/")[0] |
| 296 | + | self.logger.info('Domain is %s', domain_raw) |
310 | 297 | | |
311 | | - | if matched_sites: |
312 | | - | print( |
313 | | - | f'Sites with domain "{domain_raw}" already exists in the Maigret database!' |
| 298 | + | # check for existence |
| 299 | + | matched_sites = list( |
| 300 | + | filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites) |
314 | 301 | | ) |
315 | | - | status = lambda s: "(disabled)" if s.disabled else "" |
316 | | - | url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}" |
317 | | - | print( |
318 | | - | "\n".join( |
319 | | - | [ |
320 | | - | f"{site.name} {status(site)}{url_block(site)}" |
321 | | - | for site in matched_sites |
322 | | - | ] |
| 302 | + | |
| 303 | + | if matched_sites: |
| 304 | + | print( |
| 305 | + | f'Sites with the domain "{domain_raw}" already exist in the Maigret database!'
| 306 | + | ) |
| 307 | + | status = lambda s: "(disabled)" if s.disabled else "" |
| 308 | + | url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}" |
| 309 | + | print( |
| 310 | + | "\n".join( |
| 311 | + | [ |
| 312 | + | f"{site.name} {status(site)}{url_block(site)}" |
| 313 | + | for site in matched_sites |
| 314 | + | ] |
| 315 | + | ) |
323 | 316 | | ) |
324 | | - | ) |
325 | 317 | | |
326 | | - | if input("Do you want to continue? [yN] ").lower() in "n": |
327 | | - | return False |
| 318 | + | if not input("Do you want to continue? [yN] ").lower().startswith("y"):
| 319 | + | return False |
328 | 320 | | |
329 | | - | url_mainpage = extract_mainpage_url(url_exists) |
| 321 | + | url_mainpage = self.extract_mainpage_url(url_exists) |
330 | 322 | | |
331 | | - | print('Detecting site engine, please wait...') |
332 | | - | sites = [] |
333 | | - | try: |
334 | | - | sites = await detect_known_engine(db, url_exists, url_mainpage, logger) |
335 | | - | except KeyboardInterrupt: |
336 | | - | print('Engine detect process is interrupted.') |
| 323 | + | print('Detecting site engine, please wait...') |
| 324 | + | sites = [] |
| 325 | + | try: |
| 326 | + | sites = await self.detect_known_engine(url_exists, url_mainpage) |
| 327 | + | except KeyboardInterrupt: |
| 328 | + | print('Engine detection was interrupted.')
337 | 329 | | |
338 | | - | if not sites: |
339 | | - | print("Unable to detect site engine, lets generate checking features") |
340 | | - | sites = [ |
341 | | - | await check_features_manually( |
342 | | - | db, url_exists, url_mainpage, cookie_file, logger |
343 | | - | ) |
344 | | - | ] |
| 330 | + | if not sites: |
| 331 | + | print("Unable to detect site engine, lets generate checking features") |
| 332 | + | sites = [ |
| 333 | + | await self.check_features_manually( |
| 334 | + | url_exists, url_mainpage, cookie_file |
| 335 | + | ) |
| 336 | + | ] |
345 | 337 | | |
346 | | - | logger.debug(sites[0].__dict__) |
| 338 | + | self.logger.debug(sites[0].__dict__) |
347 | 339 | | |
348 | | - | sem = asyncio.Semaphore(1) |
| 340 | + | sem = asyncio.Semaphore(1) |
349 | 341 | | |
350 | | - | print("Checking, please wait...") |
351 | | - | found = False |
352 | | - | chosen_site = None |
353 | | - | for s in sites: |
354 | | - | chosen_site = s |
355 | | - | result = await site_self_check(s, logger, sem, db) |
356 | | - | if not result["disabled"]: |
357 | | - | found = True |
358 | | - | break |
| 342 | + | print("Checking, please wait...") |
| 343 | + | found = False |
| 344 | + | chosen_site = None |
| 345 | + | for s in sites: |
| 346 | + | chosen_site = s |
| 347 | + | result = await self.site_self_check(s, sem) |
| 348 | + | if not result["disabled"]: |
| 349 | + | found = True |
| 350 | + | break |
359 | 351 | | |
360 | | - | if not found: |
361 | | - | print( |
362 | | - | f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}." |
363 | | - | ) |
364 | | - | print( |
365 | | - | "Try to run this mode again and increase features count or choose others." |
366 | | - | ) |
367 | | - | return False |
368 | | - | else: |
369 | | - | if ( |
370 | | - | input( |
371 | | - | f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] " |
| 352 | + | if not found: |
| 353 | + | print( |
| 354 | + | f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}." |
372 | 355 | | ) |
373 | | - | .lower() |
374 | | - | .strip("y") |
375 | | - | ): |
| 356 | + | print( |
| 357 | + | "Try to run this mode again and increase features count or choose others." |
| 358 | + | ) |
376 | 359 | | return False |
| 360 | + | else: |
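| | + | # default answer is Yes: only an explicit "n" aborts saving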
| 361 | + | answer = input(
| 362 | + | f"Site {chosen_site.name} successfully checked. "
| 363 | + | "Do you want to save it in the Maigret DB? [Yn] "
| 364 | + | )
| 365 | + | if answer.lower().startswith("n"):
| 366 | + | return False
377 | 369 | | |
378 | | - | chosen_site.name = input("Change site name if you want: ") or chosen_site.name |
379 | | - | chosen_site.tags = list(map(str.strip, input("Site tags: ").split(','))) |
380 | | - | rank = get_alexa_rank(chosen_site.url_main) |
381 | | - | if rank: |
382 | | - | print(f'New alexa rank: {rank}') |
383 | | - | chosen_site.alexa_rank = rank |
| 370 | + | chosen_site.name = input("Change site name if you want: ") or chosen_site.name |
| 371 | + | chosen_site.tags = list(map(str.strip, input("Site tags: ").split(','))) |
| 372 | + | rank = self.get_alexa_rank(chosen_site.url_main)
| 373 | + | if rank: |
| 374 | + | print(f'New alexa rank: {rank}') |
| 375 | + | chosen_site.alexa_rank = rank |
384 | 376 | | |
385 | | - | logger.debug(chosen_site.json) |
386 | | - | site_data = chosen_site.strip_engine_data() |
387 | | - | logger.debug(site_data.json) |
388 | | - | db.update_site(site_data) |
389 | | - | return True |
| 377 | + | self.logger.debug(chosen_site.json) |
| 378 | + | site_data = chosen_site.strip_engine_data() |
| 379 | + | self.logger.debug(site_data.json) |
| 380 | + | self.db.update_site(site_data) |
| 381 | + | return True |
390 | 382 | | |