Projects STRLCPY snscrape Commits 9b3faec9
🤬
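  • Added `mentions` and `hashtags` attributes for user mentions and hashtags; removed redundant outlinks (mention links, hashtag links, and the forwarded-post URL are no longer listed in `outlinks`)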

  • Loading...
  • Tristan Lee committed 2 years ago
    9b3faec9
    1 parent 97d38e5c
  • ■ ■ ■ ■ ■
    snscrape/modules/telegram.py
    skipped 49 lines
    50 50   url: str
    51 51   date: datetime.datetime
    52 52   content: str
    53  - outlinks: list
     53 + outlinks: typing.List[str] = None
     54 + mentions: typing.List[str] = None
     55 + hashtags: typing.List[str] = None
    54 56   forwarded: typing.Optional['Channel'] = None
    55 57   forwardedUrl: typing.Optional[str] = None
    56 58   media: typing.Optional[typing.List['Medium']] = None
    skipped 76 lines
    133 135   content = None
    134 136   
    135 137   outlinks = []
     138 + mentions = []
     139 + hashtags = []
    136 140   for link in post.find_all('a'):
    137 141   if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
    138 142   # Author links at the top (avatar and name)
    skipped 15 lines
    154 158   # encoded_string = base64.b64encode(resp.content)
    155 159   # Individual photo or video link
    156 160   continue
     161 + if link.text.startswith('@'):
     162 + mentions.append(link.text.strip('@'))
     163 + continue
     164 + if link.text.startswith('#'):
     165 + hashtags.append(link.text.strip('#'))
     166 + continue
    157 167   href = urllib.parse.urljoin(pageUrl, link['href'])
    158  - if (href not in outlinks) and (href != rawUrl):
     168 + if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
    159 169   outlinks.append(href)
    160 170   
    161 171   for voice_player in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
    skipped 55 lines
    217 227   else:
    218 228   views = parse_num(viewsSpan.text)
    219 229  
    220  - yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
     230 + yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
    221 231   
    222 232   def get_items(self):
    223 233   r, soup = self._initial_page()
    skipped 101 lines
Please wait...
Page is in error, reload to recover