| skipped 2 lines |
3 | 3 | | import re |
4 | 4 | | from typing import List |
5 | 5 | | import xml.etree.ElementTree as ET |
| 6 | + | from aiohttp import TCPConnector, ClientSession |
6 | 7 | | import requests |
7 | 8 | | |
8 | 9 | | from .activation import import_aiohttp_cookies |
| skipped 15 lines |
24 | 25 | | TOP_FEATURES = 5 |
25 | 26 | | URL_RE = re.compile(r"https?://(www\.)?") |
26 | 27 | | |
def __init__(self, db: MaigretDatabase, settings: Settings, logger, args):
    """Store the collaborators and open the shared aiohttp session.

    Args:
        db: Sites database; kept on ``self.db``.
        settings: Settings object; kept on ``self.settings``.
        logger: Logger used by the other methods of this class.
        args: Object that exposes at least ``proxy`` (proxy URL or a
            falsy value) and ``cookie_file`` (path or a falsy value).

    The session is stored on ``self.session`` and reused for every HTTP
    request made through this object.
    """
    # Kept as a local import, as in the original: aiohttp_socks is only
    # needed to build the connector below.
    from aiohttp_socks import ProxyConnector

    self.settings = settings
    self.args = args
    self.db = db
    self.logger = logger

    proxy = args.proxy
    cookie_jar = None
    if args.cookie_file:
        cookie_jar = import_aiohttp_cookies(args.cookie_file)

    # Certificate verification is turned off through the constructor
    # argument on BOTH paths.  Assigning ``connector.verify_ssl = False``
    # after construction only creates an attribute aiohttp never reads,
    # so the proxy path used to keep verification enabled while the
    # direct path had it disabled.
    if proxy:
        connector = ProxyConnector.from_url(proxy, ssl=False)
    else:
        connector = TCPConnector(ssl=False)

    # NOTE(review): nothing in the visible code closes this session;
    # the owner of this object presumably has to ``await
    # self.session.close()`` when done - confirm against the caller.
    self.session = ClientSession(
        connector=connector, trust_env=True, cookie_jar=cookie_jar
    )
| 45 | + | |
32 | 46 | | @staticmethod |
33 | 47 | | def get_alexa_rank(site_url_main): |
34 | 48 | | url = f"http://data.alexa.com/data?cli=10&url={site_url_main}" |
| skipped 28 lines |
63 | 77 | | results_dict = await maigret( |
64 | 78 | | username=username, |
65 | 79 | | site_dict={site.name: site}, |
| 80 | + | proxy=self.args.proxy, |
66 | 81 | | logger=self.logger, |
67 | 82 | | timeout=30, |
68 | 83 | | id_type=site.type, |
| skipped 57 lines |
126 | 141 | | return fields |
127 | 142 | | |
128 | 143 | | async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]: |
| 144 | + | resp_text = '' |
129 | 145 | | try: |
130 | | - | r = requests.get(url_mainpage) |
131 | | - | self.logger.debug(r.text) |
| 146 | + | r = await self.session.get(url_mainpage) |
| 147 | + | resp_text = await r.text() |
| 148 | + | self.logger.debug(resp_text) |
132 | 149 | | except Exception as e: |
133 | 150 | | self.logger.warning(e) |
134 | 151 | | print("Some error while checking main page") |
| skipped 1 lines |
136 | 153 | | |
137 | 154 | | for engine in self.db.engines: |
138 | 155 | | strs_to_check = engine.__dict__.get("presenseStrs") |
139 | | - | if strs_to_check and r and r.text: |
| 156 | + | if strs_to_check and resp_text: |
140 | 157 | | all_strs_in_response = True |
141 | 158 | | for s in strs_to_check: |
142 | | - | if s not in r.text: |
| 159 | + | if s not in resp_text: |
143 | 160 | | all_strs_in_response = False |
144 | 161 | | sites = [] |
145 | 162 | | if all_strs_in_response: |
| skipped 63 lines |
209 | 226 | | headers = dict(self.HEADERS) |
210 | 227 | | headers.update(custom_headers) |
211 | 228 | | |
212 | | - | # cookies |
213 | | - | cookie_dict = None |
214 | | - | if cookie_file: |
215 | | - | self.logger.info(f'Use {cookie_file} for cookies') |
216 | | - | cookie_jar = import_aiohttp_cookies(cookie_file) |
217 | | - | cookie_dict = {c.key: c.value for c in cookie_jar} |
218 | | - | |
219 | | - | exists_resp = requests.get( |
220 | | - | url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects |
| 229 | + | exists_resp = await self.session.get( |
| 230 | + | url_exists, |
| 231 | + | headers=headers, |
| 232 | + | allow_redirects=redirects, |
221 | 233 | | ) |
| 234 | + | exists_resp_text = await exists_resp.text() |
222 | 235 | | self.logger.debug(url_exists) |
223 | | - | self.logger.debug(exists_resp.status_code) |
224 | | - | self.logger.debug(exists_resp.text) |
| 236 | + | self.logger.debug(exists_resp.status) |
| 237 | + | self.logger.debug(exists_resp_text) |
225 | 238 | | |
226 | | - | non_exists_resp = requests.get( |
| 239 | + | non_exists_resp = await self.session.get( |
227 | 240 | | url_not_exists, |
228 | | - | cookies=cookie_dict, |
229 | 241 | | headers=headers, |
230 | 242 | | allow_redirects=redirects, |
231 | 243 | | ) |
| 244 | + | non_exists_resp_text = await non_exists_resp.text() |
232 | 245 | | self.logger.debug(url_not_exists) |
233 | | - | self.logger.debug(non_exists_resp.status_code) |
234 | | - | self.logger.debug(non_exists_resp.text) |
| 246 | + | self.logger.debug(non_exists_resp.status) |
| 247 | + | self.logger.debug(non_exists_resp_text) |
235 | 248 | | |
236 | | - | a = exists_resp.text |
237 | | - | b = non_exists_resp.text |
| 249 | + | a = exists_resp_text |
| 250 | + | b = non_exists_resp_text |
238 | 251 | | |
239 | 252 | | tokens_a = set(re.split(f'[{self.SEPARATORS}]', a)) |
240 | 253 | | tokens_b = set(re.split(f'[{self.SEPARATORS}]', b)) |
| skipped 144 lines |