STRLCPY/snscrape

Made the Telegram scraper return only the channel username, rather than full channel info, for the forwarded_from attribute; fixed video edge cases.
Tristan Lee committed 2 years ago

babcddda

1 parent 4e59638e

Total 1 files

■ ■ ■ ■ ■ ■

snscrape/modules/telegram.py

		skipped 26 lines
27	27		@dataclasses.dataclass
28	28		class Channel(snscrape.base.Entity):
29	29		username: str
30		-	title: str
31		-	verified: bool
32		-	photo: str
	30	+	title: typing.Optional[str] = None
	31	+	verified: typing.Optional[bool] = None
	32	+	photo: typing.Optional[str] = None
33	33		description: typing.Optional[str] = None
34	34		members: typing.Optional[int] = None
35	35		photos: typing.Optional[snscrape.base.IntWithGranularity] = None
		skipped 87 lines
123	123		content = message.get_text(separator="\n")
124	124
125	125		for video_player in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
126		-
127		-	style = video_player.find('i')['style']
128		-	videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)
129		-	videoTag = video_player.find('video')
130		-	if videoTag is None:
	126	+	iTag = video_player.find('i')
	127	+	if iTag is None:
131	128		videoUrl = None
	129	+	videoThumbnailUrl = None
132	130		else:
133		-	videoUrl = videoTag['src']
	131	+	style = iTag['style']
	132	+	videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)[0]
	133	+	videoTag = video_player.find('video')
	134	+	if videoTag is None:
	135	+	videoUrl = None
	136	+	else:
	137	+	videoUrl = videoTag['src']
134	138		mKwargs = {
135	139		'thumbnailUrl': videoThumbnailUrl,
136	140		'url': videoUrl,
		skipped 9 lines
146	150		if (forward_tag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
147	151		forwardedUrl = forward_tag['href']
148	152		forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
149		-	forwardedChannelScraper = TelegramChannelScraper(name = forwardedName)
150		-	forwarded = forwardedChannelScraper._get_entity()
	153	+	forwarded = Channel(username = forwardedName)
151	154
152	155		outlinks = []
153	156		for link in post.find_all('a'):
		skipped 59 lines
213	216		if not pageLink:
214	217		break
215	218		nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
216		-	r = self._get(nextPageUrl, headers = self._headers)
	219	+	r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
217	220		if r.status_code != 200:
218	221		raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
219	222		soup = bs4.BeautifulSoup(r.text, 'lxml')
		skipped 60 lines
280	283		return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
281	284		else:
282	285		return int(s), 1
	286	+
	287	+	def telegramResponseOkCallback(r):
	288	+	if r.status_code == 200:
	289	+	return (True, None)
	290	+	elif r.status_code // 100 == 5:
	291	+	return (False, f'status code: {r.status_code}')
	292	+	else:
	293	+	return (False, None)

Made the Telegram scraper return only the channel username, rather than full channel info, for the forwarded_from attribute; fixed video edge cases.