■ ■ ■ ■ ■ ■
snscrape/modules/telegram.py
| skipped 49 lines |
50 | 50 | | url: str |
51 | 51 | | date: datetime.datetime |
52 | 52 | | content: str |
53 | | - | outlinks: list |
| 53 | + | outlinks: typing.List[str] = None |
| 54 | + | mentions: typing.List[str] = None |
| 55 | + | hashtags: typing.List[str] = None |
54 | 56 | | forwarded: typing.Optional['Channel'] = None |
55 | 57 | | forwardedUrl: typing.Optional[str] = None |
56 | 58 | | media: typing.Optional[typing.List['Medium']] = None |
| skipped 76 lines |
133 | 135 | | content = None |
134 | 136 | | |
135 | 137 | | outlinks = [] |
| 138 | + | mentions = [] |
| 139 | + | hashtags = [] |
136 | 140 | | for link in post.find_all('a'): |
137 | 141 | | if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')): |
138 | 142 | | # Author links at the top (avatar and name) |
| skipped 15 lines |
154 | 158 | | # encoded_string = base64.b64encode(resp.content) |
155 | 159 | | # Individual photo or video link |
156 | 160 | | continue |
| 161 | + | if link.text.startswith('@'): |
| 162 | + | mentions.append(link.text.strip('@')) |
| 163 | + | continue |
| 164 | + | if link.text.startswith('#'): |
| 165 | + | hashtags.append(link.text.strip('#')) |
| 166 | + | continue |
157 | 167 | | href = urllib.parse.urljoin(pageUrl, link['href']) |
158 | | - | if (href not in outlinks) and (href != rawUrl): |
| 168 | + | if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl): |
159 | 169 | | outlinks.append(href) |
160 | 170 | | |
161 | 171 | | for voice_player in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}): |
| skipped 55 lines |
217 | 227 | | else: |
218 | 228 | | views = parse_num(viewsSpan.text) |
219 | 229 | | |
220 | | - | yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views) |
| 230 | + | yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views) |
221 | 231 | | |
222 | 232 | | def get_items(self): |
223 | 233 | | r, soup = self._initial_page() |
| skipped 101 lines |