STRLCPY/maigret

Added a couple of sites, fixed false positives (#286)
Soxoj committed with GitHub 3 years ago

ecabf88c

1 parent 8801f7e6

Revision indexing in progress... (symbol navigation in revisions will be accurate once indexing completes)

■ ■ ■ ■ ■ ■

maigret/maigret.py

		skipped 535 lines
536	536		site_data = get_top_sites_for_id(args.id_type)
537	537
538	538		if args.new_site_to_submit:
539		-	submitter = Submitter(db=db, logger=logger, settings=settings)
	539	+	submitter = Submitter(db=db, logger=logger, settings=settings, args=args)
540	540		is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
541	541		if is_submitted:
542	542		db.save_to_file(db_file)
		skipped 189 lines

■ ■ ■ ■ ■ ■

maigret/resources/data.json

		skipped 1832 lines
1833	1833		"usernameUnclaimed": "noonewouldeverusethis7"
1834	1834		},
1835	1835		"Bestfantasybooks": {
	1836	+	"disabled": true,
1836	1837		"tags": [
1837	1838		"us"
1838	1839		],
		skipped 2593 lines
4432	4433		]
4433	4434		},
4434	4435		"Facenama": {
	4436	+	"disabled": true,
4435	4437		"tags": [
4436	4438		"ir"
4437	4439		],
		skipped 24002 lines
28440	28442		"usernameUnclaimed": "noonewouldeverusethis7",
28441	28443		"checkType": "message",
28442	28444		"alexaRank": 6859
	28445	+	},
	28446	+	"Worldis.me": {
	28447	+	"absenceStrs": [
	28448	+	"user_password",
	28449	+	"send_email"
	28450	+	],
	28451	+	"presenseStrs": [
	28452	+	"my_profile",
	28453	+	"profile_upi",
	28454	+	"UserInfo"
	28455	+	],
	28456	+	"url": "http://en.worldis.me/{username}",
	28457	+	"urlMain": "http://en.worldis.me",
	28458	+	"usernameClaimed": "admin",
	28459	+	"usernameUnclaimed": "noonewouldeverusethis7",
	28460	+	"checkType": "message",
	28461	+	"alexaRank": 3233509,
	28462	+	"tags": [
	28463	+	"ru"
	28464	+	]
	28465	+	},
	28466	+	"photoshop-kopona.com": {
	28467	+	"absenceStrs": [
	28468	+	"<title>noonewouldeverusethis7 » \u0420\u0435\u0441\u0443\u0440\u0441\u044b \u0434\u043b\u044f \u0424\u043e\u0442\u043e\u0448\u043e\u043f\u0430</title>"
	28469	+	],
	28470	+	"presenseStrs": [
	28471	+	"offline",
	28472	+	"uspusertitle"
	28473	+	],
	28474	+	"url": "https://photoshop-kopona.com/ru/user/{username}/",
	28475	+	"urlMain": "https://photoshop-kopona.com",
	28476	+	"usernameClaimed": "test",
	28477	+	"usernameUnclaimed": "noonewouldeverusethis7",
	28478	+	"checkType": "message",
	28479	+	"alexaRank": 44106,
	28480	+	"tags": [
	28481	+	"ru"
	28482	+	]
	28483	+	},
	28484	+	"dumskaya.net": {
	28485	+	"absenceStrs": [
	28486	+	"><img class=nobo src=/banner/ps2_/ alt="
	28487	+	],
	28488	+	"presenseStrs": [
	28489	+	"><img class=nobo src=/banner/prague_/ alt="
	28490	+	],
	28491	+	"url": "https://dumskaya.net/user/{username}/",
	28492	+	"urlMain": "https://dumskaya.net",
	28493	+	"usernameClaimed": "test",
	28494	+	"usernameUnclaimed": "noonewouldeverusethis7",
	28495	+	"checkType": "message",
	28496	+	"alexaRank": 73617,
	28497	+	"tags": [
	28498	+	"ru"
	28499	+	]
	28500	+	},
	28501	+	"rblx.trade": {
	28502	+	"absenceStrs": [
	28503	+	"isRblxTradeException"
	28504	+	],
	28505	+	"presenseStrs": [
	28506	+	"userId"
	28507	+	],
	28508	+	"url": "https://rblx.trade/p/{username}",
	28509	+	"urlMain": "https://rblx.trade",
	28510	+	"usernameClaimed": "test",
	28511	+	"usernameUnclaimed": "noonewouldeverusethis7",
	28512	+	"checkType": "message",
	28513	+	"alexaRank": 362185,
	28514	+	"tags": [
	28515	+	"gaming"
	28516	+	]
	28517	+	},
	28518	+	"monitoringminecraft.ru": {
	28519	+	"absenceStrs": [
	28520	+	"shadowi"
	28521	+	],
	28522	+	"presenseStrs": [
	28523	+	"small"
	28524	+	],
	28525	+	"url": "https://monitoringminecraft.ru/player/{username}",
	28526	+	"urlMain": "https://monitoringminecraft.ru",
	28527	+	"usernameClaimed": "test",
	28528	+	"usernameUnclaimed": "noonewouldeverusethis7",
	28529	+	"checkType": "message",
	28530	+	"alexaRank": 115209,
	28531	+	"tags": [
	28532	+	"gaming"
	28533	+	]
	28534	+	},
	28535	+	"profi.ru": {
	28536	+	"absenceStrs": [
	28537	+	"page-404__paragraph"
	28538	+	],
	28539	+	"presenseStrs": [
	28540	+	"PROFILE",
	28541	+	"profiles",
	28542	+	"profileOIO",
	28543	+	"fullProfile",
	28544	+	"profileUGC2"
	28545	+	],
	28546	+	"url": "https://profi.ru/profile/{username}/",
	28547	+	"urlMain": "https://profi.ru",
	28548	+	"usernameClaimed": "EgorovRV",
	28549	+	"usernameUnclaimed": "noonewouldeverusethis7",
	28550	+	"checkType": "message",
	28551	+	"alexaRank": 12037,
	28552	+	"tags": [
	28553	+	"freelance"
	28554	+	]
	28555	+	},
	28556	+	"app.airnfts.com": {
	28557	+	"absenceStrs": [
	28558	+	"user-not-found-div"
	28559	+	],
	28560	+	"presenseStrs": [
	28561	+	"username",
	28562	+	"ownerUsername",
	28563	+	"creatorUsername",
	28564	+	"name",
	28565	+	"user"
	28566	+	],
	28567	+	"url": "https://app.airnfts.com/creators/{username}",
	28568	+	"urlMain": "https://app.airnfts.com",
	28569	+	"usernameClaimed": "test",
	28570	+	"usernameUnclaimed": "noonewouldeverusethis7",
	28571	+	"checkType": "message",
	28572	+	"alexaRank": 30223
	28573	+	},
	28574	+	"xgm.guru": {
	28575	+	"absenceStrs": [
	28576	+	">Username:</label>"
	28577	+	],
	28578	+	"presenseStrs": [
	28579	+	"email",
	28580	+	"usernamereg",
	28581	+	"username-top",
	28582	+	"\u041e\u043f\u044b\u0442 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044f",
	28583	+	"check-username"
	28584	+	],
	28585	+	"url": "https://xgm.guru/user/{username}",
	28586	+	"urlMain": "https://xgm.guru",
	28587	+	"usernameClaimed": "test",
	28588	+	"usernameUnclaimed": "noonewouldeverusethis7",
	28589	+	"checkType": "message",
	28590	+	"alexaRank": 692341,
	28591	+	"tags": [
	28592	+	"forum",
	28593	+	"gaming"
	28594	+	]
28443	28595		}
28444	28596		},
28445	28597		"engines": {
		skipped 263 lines

■ ■ ■ ■ ■ ■

maigret/submit.py

		skipped 2 lines
3	3		import re
4	4		from typing import List
5	5		import xml.etree.ElementTree as ET
	6	+	from aiohttp import TCPConnector, ClientSession
6	7		import requests
7	8
8	9		from .activation import import_aiohttp_cookies
		skipped 15 lines
24	25		TOP_FEATURES = 5
25	26		URL_RE = re.compile(r"https?://(www\.)?")
26	27
27		-	def __init__(self, db: MaigretDatabase, settings: Settings, logger):
	28	+	def __init__(self, db: MaigretDatabase, settings: Settings, logger, args):
28	29		self.settings = settings
	30	+	self.args = args
29	31		self.db = db
30	32		self.logger = logger
31	33
	34	+	from aiohttp_socks import ProxyConnector
	35	+	proxy = self.args.proxy
	36	+	cookie_jar = None
	37	+	if args.cookie_file:
	38	+	cookie_jar = import_aiohttp_cookies(args.cookie_file)
	39	+
	40	+	connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False)
	41	+	connector.verify_ssl = False
	42	+	self.session = ClientSession(
	43	+	connector=connector, trust_env=True, cookie_jar=cookie_jar
	44	+	)
	45	+
32	46		@staticmethod
33	47		def get_alexa_rank(site_url_main):
34	48		url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
		skipped 28 lines
63	77		results_dict = await maigret(
64	78		username=username,
65	79		site_dict={site.name: site},
	80	+	proxy=self.args.proxy,
66	81		logger=self.logger,
67	82		timeout=30,
68	83		id_type=site.type,
		skipped 57 lines
126	141		return fields
127	142
128	143		async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
	144	+	resp_text = ''
129	145		try:
130		-	r = requests.get(url_mainpage)
131		-	self.logger.debug(r.text)
	146	+	r = await self.session.get(url_mainpage)
	147	+	resp_text = await r.text()
	148	+	self.logger.debug(resp_text)
132	149		except Exception as e:
133	150		self.logger.warning(e)
134	151		print("Some error while checking main page")
		skipped 1 lines
136	153
137	154		for engine in self.db.engines:
138	155		strs_to_check = engine.__dict__.get("presenseStrs")
139		-	if strs_to_check and r and r.text:
	156	+	if strs_to_check and resp_text:
140	157		all_strs_in_response = True
141	158		for s in strs_to_check:
142		-	if s not in r.text:
	159	+	if s not in resp_text:
143	160		all_strs_in_response = False
144	161		sites = []
145	162		if all_strs_in_response:
		skipped 63 lines
209	226		headers = dict(self.HEADERS)
210	227		headers.update(custom_headers)
211	228
212		-	# cookies
213		-	cookie_dict = None
214		-	if cookie_file:
215		-	self.logger.info(f'Use {cookie_file} for cookies')
216		-	cookie_jar = import_aiohttp_cookies(cookie_file)
217		-	cookie_dict = {c.key: c.value for c in cookie_jar}
218		-
219		-	exists_resp = requests.get(
220		-	url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
	229	+	exists_resp = await self.session.get(
	230	+	url_exists,
	231	+	headers=headers,
	232	+	allow_redirects=redirects,
221	233		)
	234	+	exists_resp_text = await exists_resp.text()
222	235		self.logger.debug(url_exists)
223		-	self.logger.debug(exists_resp.status_code)
224		-	self.logger.debug(exists_resp.text)
	236	+	self.logger.debug(exists_resp.status)
	237	+	self.logger.debug(exists_resp_text)
225	238
226		-	non_exists_resp = requests.get(
	239	+	non_exists_resp = await self.session.get(
227	240		url_not_exists,
228		-	cookies=cookie_dict,
229	241		headers=headers,
230	242		allow_redirects=redirects,
231	243		)
	244	+	non_exists_resp_text = await non_exists_resp.text()
232	245		self.logger.debug(url_not_exists)
233		-	self.logger.debug(non_exists_resp.status_code)
234		-	self.logger.debug(non_exists_resp.text)
	246	+	self.logger.debug(non_exists_resp.status)
	247	+	self.logger.debug(non_exists_resp_text)
235	248
236		-	a = exists_resp.text
237		-	b = non_exists_resp.text
	249	+	a = exists_resp_text
	250	+	b = non_exists_resp_text
238	251
239	252		tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
240	253		tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
		skipped 144 lines

■ ■ ■ ■ ■ ■

utils/update_site_data.py

		skipped 24 lines
25	25		'100000000': '100M',
26	26		})
27	27
28		-	SEMAPHORE = threading.Semaphore(10)
	28	+	SEMAPHORE = threading.Semaphore(20)
29	29
30	30		def get_rank(domain_to_query, site, print_errors=True):
31	31		with SEMAPHORE:
		skipped 113 lines

Added a couple of sites, fixed false positives (#286)