STRLCPY/Maryam

fix web_scrap errors
saeeddhqan committed 3 years ago

d4728c30

1 parent 79beacd0

Revision indexing in progress... (symbol navigation in revisions will be accurate once indexing completes)

■ ■ ■ ■ ■ ■

maryam/core/util/urlib.py

		skipped 36 lines
37	37		def unparse(self, urparse):
38	38		return urlparse.urlunparse(urparse)
39	39
40		-	def sub_service(self, serv=None):
	40	+	def sub_service(self, serv=None, ifany=False):
41	41		'''Add protocol to url or replace it or clean it'''
42	42		urparse = re.split(r'://', self.url)
43	43		if not serv:
		skipped 3 lines
47	47		# Add protocol
48	48		serv = re.sub(r'://', '', serv)
49	49		if len(urparse) == 2:
	50	+	if ifany:
	51	+	return self.url
50	52		del urparse[0]
51	53		url = f"{serv}://{''.join(urparse)}"
52	54		else:
		skipped 99 lines

■ ■ ■ ■ ■ ■

maryam/core/util/web_scrap.py

		skipped 17 lines
18	18		import re
19	19		import concurrent.futures
20	20
21		-	# Web Scraper v5.1
	21	+	# Web Scrap v5.1
22	22
23	23		class main:
24	24
		skipped 3 lines
28	28		url : First page address
29	29		debug : Show the result at moment
30	30		limit : Web scrap level(if it's 1 that's mean just search in first page)
31		-	thread_count : Count of links for open at per lap
	31	+	thread_count : Number of links for each lap
32	32		"""
33	33		self.framework = main.framework
34		-	self.url = url
	34	+	self.parser = self.framework.urlib(url)
	35	+	self.url = self.parser.sub_service(self.framework._global_options['protocol'], ifany=True)
35	36		self.urlib = self.framework.urlib
36	37		self.debug = debug
37	38		self.limit = limit
		skipped 68 lines
106	107		return True
107	108
108	109		def joiner(self, url):
109		-	url = str(url)
110		-	# ADD slash to end url
	110	+	url = url
111	111		urparse = self.urlib(url)
112	112		urparse.url = urparse.quote if '%' not in url else url
113		-	urparse2 = self.urlib(str(self.url))
114		-	cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:')
	113	+	cond1 = url.lower() in ('%20', '', '/', '%23', '#', 'https:', 'http:') or '.' not in url
115	114		cond12 = url.endswith(':')
116	115		cond2 = len(
117	116		urparse.url) > 1 and '%3a//' not in urparse.url and urparse.url[:2] != '//'
		skipped 1 lines
119	118		if cond1 or cond12:
120	119		return False
121	120		elif cond2:
122		-	urparse.url = urparse2.join(url)
	121	+	urparse.url = self.parser.join(url)
123	122		elif cond3:
124	123		urparse.url = url
125	124		else:
126		-	urparse.url = urparse2.join(url)
127		-	return str(urparse.url)
	125	+	urparse.url = self.parser.join(url)
	126	+	return urparse.url
128	127
129	128		def link_category(self, urls):
130	129		links = []
131	130		for url in urls:
132	131		join = self.joiner(url)
133		-
134	132		##########################
135	133		# ADD CDN, PHONE and EMAIL
136	134		##########################
		skipped 1 lines
138	136		if cond1:
139	137		continue
140	138
141		-	ends = join.endswith
142	139		join = str(join).replace('\/', '/')
143	140		##########################
144	141		# ADD OUT SCOPE
		skipped 9 lines
154	151		if urparse.query != '':
155	152		self._QUERY_LINKS = self.rept(join, self._QUERY_LINKS)
156	153
	154	+	# If the link is a media link(mp4,..) not a web page
157	155		broke = 0
158	156		for ext in self.media_exts:
159		-	if (f'.{ext}/' in join) or ends(f'.{ext}'):
	157	+	if (f'.{ext}/' in join) or join.endswith(f'.{ext}'):
160	158		self._MEDIA = self.rept(join, self._MEDIA)
161	159		broke = 1
162	160		break
		skipped 20 lines
183	181		return []
184	182		self.passed.append(url)
185	183		# Send Request
186		-	# try:
187		-	req = self.framework.request(url)
188		-	# except:
189		-	# return False
190		-	# else:
191		-	resp = req.text
	184	+	try:
	185	+	req = self.framework.request(url)
	186	+	except:
	187	+	return False
	188	+	else:
	189	+	resp = req.text
192	190
193	191		pp = self.framework.page_parse(resp)
194	192
		skipped 140 lines

fix web_scrap errors