Projects STRLCPY snscrape Commits babcddda
🤬
  • Made the Telegram scraper stop returning full channel info for the forwarded_from attribute; fixed video edge cases.

  • Loading...
  • Tristan Lee committed 2 years ago
    babcddda
    1 parent 4e59638e
  • ■ ■ ■ ■ ■ ■
    snscrape/modules/telegram.py
    skipped 26 lines
    27 27  @dataclasses.dataclass
    28 28  class Channel(snscrape.base.Entity):
    29 29   username: str
    30  - title: str
    31  - verified: bool
    32  - photo: str
     30 + title: typing.Optional[str] = None
     31 + verified: typing.Optional[bool] = None
     32 + photo: typing.Optional[str] = None
    33 33   description: typing.Optional[str] = None
    34 34   members: typing.Optional[int] = None
    35 35   photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    skipped 87 lines
    123 123   content = message.get_text(separator="\n")
    124 124   
    125 125   for video_player in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
    126  - 
    127  - style = video_player.find('i')['style']
    128  - videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)
    129  - videoTag = video_player.find('video')
    130  - if videoTag is None:
     126 + iTag = video_player.find('i')
     127 + if iTag is None:
    131 128   videoUrl = None
     129 + videoThumbnailUrl = None
    132 130   else:
    133  - videoUrl = videoTag['src']
     131 + style = iTag['style']
     132 + videoThumbnailUrl = re.findall('url\(\'(.*?)\'\)', style)[0]
     133 + videoTag = video_player.find('video')
     134 + if videoTag is None:
     135 + videoUrl = None
     136 + else:
     137 + videoUrl = videoTag['src']
    134 138   mKwargs = {
    135 139   'thumbnailUrl': videoThumbnailUrl,
    136 140   'url': videoUrl,
    skipped 9 lines
    146 150   if (forward_tag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
    147 151   forwardedUrl = forward_tag['href']
    148 152   forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
    149  - forwardedChannelScraper = TelegramChannelScraper(name = forwardedName)
    150  - forwarded = forwardedChannelScraper._get_entity()
     153 + forwarded = Channel(username = forwardedName)
    151 154   
    152 155   outlinks = []
    153 156   for link in post.find_all('a'):
    skipped 59 lines
    213 216   if not pageLink:
    214 217   break
    215 218   nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
    216  - r = self._get(nextPageUrl, headers = self._headers)
     219 + r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
    217 220   if r.status_code != 200:
    218 221   raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
    219 222   soup = bs4.BeautifulSoup(r.text, 'lxml')
    skipped 60 lines
    280 283   return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
    281 284   else:
    282 285   return int(s), 1
     286 + 
     287 +def telegramResponseOkCallback(r):
     288 + if r.status_code == 200:
     289 + return (True, None)
     290 + elif r.status_code // 100 == 5:
     291 + return (False, f'status code: {r.status_code}')
     292 + else:
     293 + return (False, None)
Please wait...
Page is in error, reload to recover