STRLCPY/snscrape

added additional attributes for hashtags and user mentions, removed redundant outlinks
Tristan Lee committed 2 years ago

9b3faec9

1 parent 97d38e5c

Total 1 file

■ ■ ■ ■ ■ ■

snscrape/modules/telegram.py

		skipped 49 lines
50	50		url: str
51	51		date: datetime.datetime
52	52		content: str
53		-	outlinks: list
	53	+	outlinks: typing.List[str] = None
	54	+	mentions: typing.List[str] = None
	55	+	hashtags: typing.List[str] = None
54	56		forwarded: typing.Optional['Channel'] = None
55	57		forwardedUrl: typing.Optional[str] = None
56	58		media: typing.Optional[typing.List['Medium']] = None
		skipped 76 lines
133	135		content = None
134	136
135	137		outlinks = []
	138	+	mentions = []
	139	+	hashtags = []
136	140		for link in post.find_all('a'):
137	141		if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
138	142		# Author links at the top (avatar and name)
		skipped 15 lines
154	158		# encoded_string = base64.b64encode(resp.content)
155	159		# Individual photo or video link
156	160		continue
	161	+	if link.text.startswith('@'):
	162	+	mentions.append(link.text.strip('@'))
	163	+	continue
	164	+	if link.text.startswith('#'):
	165	+	hashtags.append(link.text.strip('#'))
	166	+	continue
157	167		href = urllib.parse.urljoin(pageUrl, link['href'])
158		-	if (href not in outlinks) and (href != rawUrl):
	168	+	if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
159	169		outlinks.append(href)
160	170
161	171		for voice_player in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
		skipped 55 lines
217	227		else:
218	228		views = parse_num(viewsSpan.text)
219	229
220		-	yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
	230	+	yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
221	231
222	232		def get_items(self):
223	233		r, soup = self._initial_page()
		skipped 101 lines

added additional attributes for hashtags and user mentions, removed redundant outlinks