Projects STRLCPY snscrape Commits 07a5f6fd
🤬
  • ■ ■ ■ ■ ■
    snscrape/_cli.py
    skipped 132 lines
    133 133   fp.write('Stack:\n')
    134 134   for frameRecord in trace:
    135 135   fp.write(f' File "{frameRecord.filename}", line {frameRecord.lineno}, in {frameRecord.function}\n')
    136  - for line in frameRecord.code_context:
    137  - fp.write(f' {line.strip()}\n')
     136 + if frameRecord.code_context is not None:
     137 + for line in frameRecord.code_context:
     138 + fp.write(f' {line.strip()}\n')
    138 139   fp.write('\n')
    139 140   
    140  - for frameRecord in trace:
    141  - module = inspect.getmodule(frameRecord[0])
     141 + modules = [inspect.getmodule(frameRecord[0]) for frameRecord in trace]
     142 + for i, (module, frameRecord) in enumerate(zip(modules, trace)):
     143 + if module is None:
     144 + # Module-less frame, e.g. dataclass.__init__
     145 + for j in reversed(range(i)):
     146 + if modules[j] is not None:
     147 + break
     148 + else:
     149 + # No previous module scope
     150 + continue
     151 + module = modules[j]
    142 152   if not module.__name__.startswith('snscrape.') and module.__name__ != 'snscrape':
    143 153   continue
    144 154   locals_ = frameRecord[0].f_locals
    skipped 185 lines
  • ■ ■ ■ ■ ■ ■
    snscrape/base.py
    skipped 162 lines
    163 163   return self._get_entity()
    164 164   
    165 165   def _request(self, method, url, params = None, data = None, headers = None, timeout = 10, responseOkCallback = None, allowRedirects = True, proxies = None):
    166  - proxies = proxies or self._proxies
     166 + proxies = proxies or self._proxies or {}
    167 167   for attempt in range(self._retries + 1):
    168 168   # The request is newly prepared on each retry because of potential cookie updates.
    169 169   req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers))
     170 + environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None)
    170 171   logger.info(f'Retrieving {req.url}')
    171 172   logger.debug(f'... with headers: {headers!r}')
    172 173   if data:
    173 174   logger.debug(f'... with data: {data!r}')
     175 + if environmentSettings:
     176 + logger.debug(f'... with environmentSettings: {environmentSettings!r}')
    174 177   try:
    175  - r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, proxies = proxies)
     178 + r = self._session.send(req, allow_redirects = allowRedirects, timeout = timeout, **environmentSettings)
    176 179   except requests.exceptions.RequestException as exc:
    177 180   if attempt < self._retries:
    178 181   retrying = ', retrying'
    skipped 66 lines
  • ■ ■ ■ ■ ■ ■
    snscrape/modules/instagram.py
    skipped 95 lines
    96 96   def _check_json_callback(self, r):
    97 97   if r.status_code != 200:
    98 98   return False, f'status code {r.status_code}'
     99 + if r.url.startswith('https://www.instagram.com/accounts/login/'):
     100 + raise snscrape.base.ScraperException('Redirected to login page')
    99 101   try:
    100 102   obj = json.loads(r.text)
    101 103   except json.JSONDecodeError as e:
    skipped 141 lines
  • ■ ■ ■ ■ ■ ■
    snscrape/modules/reddit.py
    skipped 19 lines
    20 20  @dataclasses.dataclass
    21 21  class Submission(snscrape.base.Item):
    22 22   author: typing.Optional[str] # E.g. submission hf7k6
    23  - created: datetime.datetime
     23 + date: datetime.datetime
    24 24   id: str
    25 25   link: typing.Optional[str]
    26 26   selftext: typing.Optional[str]
    skipped 1 lines
    28 28   title: str
    29 29   url: str
    30 30   
     31 + created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
     32 + 
    31 33   def __str__(self):
    32 34   return self.url
    33 35   
    skipped 2 lines
    36 38  class Comment(snscrape.base.Item):
    37 39   author: typing.Optional[str]
    38 40   body: str
    39  - created: datetime.datetime
     41 + date: datetime.datetime
    40 42   id: str
    41 43   parentId: typing.Optional[str]
    42 44   subreddit: typing.Optional[str]
    43 45   url: str
    44 46   
     47 + created = snscrape.base._DeprecatedProperty('created', lambda self: self.date, 'date')
     48 + 
    45 49   def __str__(self):
    46 50   return self.url
    47 51   
    skipped 63 lines
    111 115   
    112 116   kwargs = {
    113 117   'author': d.get('author'),
    114  - 'created': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
     118 + 'date': datetime.datetime.fromtimestamp(d['created_utc'], datetime.timezone.utc),
    115 119   'url': f'https://old.reddit.com{permalink}',
    116 120   'subreddit': d.get('subreddit'),
    117 121   }
    skipped 74 lines
    192 196   
    193 197   while True:
    194 198   # Return newer first; if both have the same creation datetime, return the comment first
    195  - if tipSubmission.created > tipComment.created:
     199 + if tipSubmission.date > tipComment.date:
    196 200   yield tipSubmission
    197 201   try:
    198 202   tipSubmission = next(submissionsIter)
    skipped 84 lines
  • ■ ■ ■ ■ ■
    snscrape/modules/telegram.py
    skipped 11 lines
    12 12   
    13 13  _logger = logging.getLogger(__name__)
    14 14  _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$')
    15  - 
     15 +_STYLE_MEDIA_URL_PATTERN = re.compile(r'url\(\'(.*?)\'\)')
    16 16   
    17 17  @dataclasses.dataclass
    18 18  class LinkPreview:
    skipped 5 lines
    24 24   
    25 25   
    26 26  @dataclasses.dataclass
    27  -class TelegramPost(snscrape.base.Item):
    28  - url: str
    29  - date: datetime.datetime
    30  - content: str
    31  - outlinks: list
    32  - images: list
    33  - video: str
    34  - forwarded: str
    35  - linkPreview: typing.Optional[LinkPreview] = None
    36  - 
    37  - outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
    38  - 
    39  - def __str__(self):
    40  - return self.url
    41  - 
    42  - 
    43  -@dataclasses.dataclass
    44 27  class Channel(snscrape.base.Entity):
    45 28   username: str
    46  - title: str
    47  - verified: bool
    48  - photo: str
     29 + title: typing.Optional[str] = None
     30 + verified: typing.Optional[bool] = None
     31 + photo: typing.Optional[str] = None
    49 32   description: typing.Optional[str] = None
    50 33   members: typing.Optional[int] = None
    51 34   photos: typing.Optional[snscrape.base.IntWithGranularity] = None
    skipped 10 lines
    62 45   return f'https://t.me/s/{self.username}'
    63 46   
    64 47   
     48 +@dataclasses.dataclass
     49 +class TelegramPost(snscrape.base.Item):
     50 + url: str
     51 + date: datetime.datetime
     52 + content: str
     53 + outlinks: typing.List[str] = None
     54 + mentions: typing.List[str] = None
     55 + hashtags: typing.List[str] = None
     56 + forwarded: typing.Optional['Channel'] = None
     57 + forwardedUrl: typing.Optional[str] = None
     58 + media: typing.Optional[typing.List['Medium']] = None
     59 + views: typing.Optional[int] = None
     60 + linkPreview: typing.Optional[LinkPreview] = None
     61 + 
     62 + outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks), 'outlinks')
     63 + 
     64 + def __str__(self):
     65 + return self.url
     66 + 
     67 + 
     68 +class Medium:
     69 + pass
     70 + 
     71 + 
     72 +@dataclasses.dataclass
     73 +class Photo(Medium):
     74 + url: str
     75 + 
     76 + 
     77 +@dataclasses.dataclass
     78 +class Video(Medium):
     79 + thumbnailUrl: str
     80 + duration: float
     81 + url: typing.Optional[str] = None
     82 + 
     83 + 
     84 +@dataclasses.dataclass
     85 +class VoiceMessage(Medium):
     86 + url: str
     87 + duration: str
     88 + bars:typing.List[float]
     89 + 
     90 + 
     91 +@dataclasses.dataclass
     92 +class Gif(Medium):
     93 + thumbnailUrl: str
     94 + url: typing.Optional[str] = None
     95 + 
     96 + 
    65 97  class TelegramChannelScraper(snscrape.base.Scraper):
    66 98   name = 'telegram-channel'
    67 99   
    skipped 24 lines
    92 124   _logger.warning(f'Possibly incorrect URL: {rawUrl!r}')
    93 125   url = rawUrl.replace('//t.me/', '//t.me/s/')
    94 126   date = datetime.datetime.strptime(dateDiv.find('time', datetime = True)['datetime'].replace('-', '', 2).replace(':', ''), '%Y%m%dT%H%M%S%z')
    95  - images = []
    96  - video = None
     127 + media = []
     128 + outlinks = []
     129 + mentions = []
     130 + hashtags = []
    97 131   forwarded = None
     132 + forwardedUrl = None
     133 + 
     134 + if (forwardTag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
     135 + forwardedUrl = forwardTag['href']
     136 + forwardedName = forwardedUrl.split('t.me/')[1].split('/')[0]
     137 + forwarded = Channel(username = forwardedName)
     138 + 
    98 139   if (message := post.find('div', class_ = 'tgme_widget_message_text')):
    99 140   content = message.get_text(separator="\n")
     141 + else:
     142 + content = None
    100 143   
    101  - for video_tag in post.find_all('video'):
    102  - video = video_tag['src']
     144 + for link in post.find_all('a'):
     145 + if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
     146 + # Author links at the top (avatar and name)
     147 + continue
     148 + if link['href'] == rawUrl or link['href'] == url:
     149 + style = link.attrs.get('style', '')
     150 + # Generic filter of links to the post itself, catches videos, photos, and the date link
     151 + if style != '':
     152 + imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
     153 + if len(imageUrls) == 1:
     154 + media.append(Photo(url = imageUrls[0]))
     155 + continue
     156 + if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']):
     157 + style = link.attrs.get('style', '')
     158 + imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style)
     159 + if len(imageUrls) == 1:
     160 + media.append(Photo(url = imageUrls[0]))
     161 + # resp = self._get(image[0])
     162 + # encoded_string = base64.b64encode(resp.content)
     163 + # Individual photo or video link
     164 + continue
     165 + if link.text.startswith('@'):
     166 + mentions.append(link.text.strip('@'))
     167 + continue
     168 + if link.text.startswith('#'):
     169 + hashtags.append(link.text.strip('#'))
     170 + continue
     171 + href = urllib.parse.urljoin(pageUrl, link['href'])
     172 + if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl):
     173 + outlinks.append(href)
    103 174   
    104  - if (forward_tag := post.find('a', class_ = 'tgme_widget_message_forwarded_from_name')):
    105  - forwarded = forward_tag['href'].split('t.me/')[1].split('/')[0]
     175 + for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}):
     176 + audioUrl = voicePlayer.find('audio')['src']
     177 + durationStr = voicePlayer.find('time').text
     178 + duration = durationStrToSeconds(durationStr)
     179 + barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')]
     180 + 
     181 + media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights))
     182 + 
     183 + for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}):
     184 + iTag = videoPlayer.find('i')
     185 + if iTag is None:
     186 + videoUrl = None
     187 + videoThumbnailUrl = None
     188 + else:
     189 + style = iTag['style']
     190 + videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0]
     191 + videoTag = videoPlayer.find('video')
     192 + videoUrl = None if videoTag is None else videoTag['src']
     193 + mKwargs = {
     194 + 'thumbnailUrl': videoThumbnailUrl,
     195 + 'url': videoUrl,
     196 + }
     197 + timeTag = videoPlayer.find('time')
     198 + if timeTag is None:
     199 + cls = Gif
     200 + else:
     201 + cls = Video
     202 + durationStr = videoPlayer.find('time').text
     203 + mKwargs['duration'] = durationStrToSeconds(durationStr)
     204 + media.append(cls(**mKwargs))
    106 205   
     206 +<<<<<<< HEAD
    107 207   outlinks = []
    108 208   for link in post.find_all('a'):
    109 209   if any(x in link.parent.attrs.get('class', []) for x in ('tgme_widget_message_user', 'tgme_widget_message_author')):
    skipped 21 lines
    131 231   outlinks = []
    132 232   images = []
    133 233   video = None
     234 +=======
     235 +>>>>>>> master
    134 236   linkPreview = None
    135 237   if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')):
    136 238   kwargs = {}
    skipped 10 lines
    147 249   else:
    148 250   _logger.warning(f'Could not process link preview image on {url}')
    149 251   linkPreview = LinkPreview(**kwargs)
    150  - yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, linkPreview = linkPreview, images = images, video = video, forwarded = forwarded)
     252 + if kwargs['href'] in outlinks:
     253 + outlinks.remove(kwargs['href'])
     254 + 
     255 + viewsSpan = post.find('span', class_ = 'tgme_widget_message_views')
     256 + views = None if viewsSpan is None else parse_num(viewsSpan.text)
     257 +
     258 + yield TelegramPost(url = url, date = date, content = content, outlinks = outlinks, mentions = mentions, hashtags = hashtags, linkPreview = linkPreview, media = media, forwarded = forwarded, forwardedUrl = forwardedUrl, views = views)
    151 259   
    152 260   def get_items(self):
    153 261   r, soup = self._initial_page()
    154 262   if '/s/' not in r.url:
    155 263   _logger.warning('No public post list for this user')
    156 264   return
     265 + nextPageUrl = ''
    157 266   while True:
    158 267   yield from self._soup_to_items(soup, r.url)
     268 + try:
     269 + if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1':
     270 + # if message 1 is the first message in the page, terminate scraping
     271 + break
     272 + except:
     273 + pass
    159 274   pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True})
    160 275   if not pageLink:
    161  - break
     276 + # some pages are missing a "tme_messages_more" tag, causing early termination
     277 + if '=' not in nextPageUrl:
     278 + nextPageUrl = soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href']
     279 + nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20
     280 + if nextPostIndex > 20:
     281 + pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'}
     282 + else:
     283 + break
    162 284   nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href'])
    163  - r = self._get(nextPageUrl, headers = self._headers)
     285 + r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = telegramResponseOkCallback)
    164 286   if r.status_code != 200:
    165 287   raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
    166 288   soup = bs4.BeautifulSoup(r.text, 'lxml')
    skipped 6 lines
    173 295   raise snscrape.base.ScraperException(f'Got status code {r.status_code}')
    174 296   soup = bs4.BeautifulSoup(r.text, 'lxml')
    175 297   membersDiv = soup.find('div', class_ = 'tgme_page_extra')
    176  - if membersDiv.text.endswith(' members'):
    177  - kwargs['members'] = int(membersDiv.text[:-8].replace(' ', ''))
    178  - kwargs['photo'] = soup.find('img', class_ = 'tgme_page_photo_image').attrs['src']
     298 + if membersDiv.text.endswith((' members', ' subscribers')):
     299 + kwargs['members'] = int(''.join(membersDiv.text.split(' ')[:-1]))
     300 + photoImg = soup.find('img', class_ = 'tgme_page_photo_image')
     301 + if photoImg is not None:
     302 + kwargs['photo'] = photoImg.attrs['src']
     303 + else:
     304 + kwargs['photo'] = None
    179 305   
    180 306   r, soup = self._initial_page()
    181 307   if '/s/' not in r.url: # Redirect on channels without public posts
    skipped 14 lines
    196 322   if (descriptionDiv := channelInfoDiv.find('div', class_ = 'tgme_channel_info_description')):
    197 323   kwargs['description'] = descriptionDiv.text
    198 324   
    199  - def parse_num(s):
    200  - s = s.replace(' ', '')
    201  - if s.endswith('M'):
    202  - return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
    203  - elif s.endswith('K'):
    204  - return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
    205  - else:
    206  - return int(s), 1
    207  - 
    208 325   for div in channelInfoDiv.find_all('div', class_ = 'tgme_channel_info_counter'):
    209 326   value, granularity = parse_num(div.find('span', class_ = 'counter_value').text)
    210 327   type_ = div.find('span', class_ = 'counter_type').text
    skipped 13 lines
    224 341   def _cli_from_args(cls, args):
    225 342   return cls._cli_construct(args, args.channel)
    226 343   
     344 +def parse_num(s):
     345 + s = s.replace(' ', '')
     346 + if s.endswith('M'):
     347 + return int(float(s[:-1]) * 1e6), 10 ** (6 if '.' not in s else 6 - len(s[:-1].split('.')[1]))
     348 + elif s.endswith('K'):
     349 + return int(float(s[:-1]) * 1000), 10 ** (3 if '.' not in s else 3 - len(s[:-1].split('.')[1]))
     350 + return int(s), 1
     351 + 
     352 +def durationStrToSeconds(durationStr):
     353 + durationList = durationStr.split(':')
     354 + return sum([int(s) * int(g) for s, g in zip([1, 60, 360], reversed(durationList))])
     355 + 
     356 +def telegramResponseOkCallback(r):
     357 + if r.status_code == 200:
     358 + return (True, None)
     359 + return (False, f'{r.status_code=}')
     360 +
  • ■ ■ ■ ■ ■ ■
    snscrape/modules/twitter.py
    skipped 98 lines
    99 99  class Video(Medium):
    100 100   thumbnailUrl: str
    101 101   variants: typing.List[VideoVariant]
    102  - duration: float
     102 + duration: typing.Optional[float] = None
    103 103   views: typing.Optional[int] = None
    104 104   
    105 105   
    skipped 26 lines
    132 132   countryCode: str
    133 133   
    134 134   
    135  -@dataclasses.dataclass
    136 135  class Card:
     136 + pass
     137 + 
     138 + 
     139 +@dataclasses.dataclass
     140 +class SummaryCard(Card):
    137 141   title: str
    138 142   url: str
    139 143   description: typing.Optional[str] = None
    140 144   thumbnailUrl: typing.Optional[str] = None
     145 + siteUser: typing.Optional['User'] = None
     146 + creatorUser: typing.Optional['User'] = None
     147 + 
     148 + 
     149 +@dataclasses.dataclass
     150 +class AppCard(SummaryCard):
     151 + pass
     152 + 
     153 + 
     154 +@dataclasses.dataclass
     155 +class PollCard(Card):
     156 + options: typing.List['PollOption']
     157 + endDate: datetime.datetime
     158 + duration: int
     159 + finalResults: bool
     160 + lastUpdateDate: typing.Optional[datetime.datetime] = None
     161 + medium: typing.Optional[Medium] = None
     162 + 
     163 + 
     164 +@dataclasses.dataclass
     165 +class PollOption:
     166 + label: str
     167 + count: typing.Optional[int] = None
     168 + 
     169 + 
     170 +@dataclasses.dataclass
     171 +class PlayerCard(Card):
     172 + title: str
     173 + url: str
     174 + description: typing.Optional[str] = None
     175 + imageUrl: typing.Optional[str] = None
     176 + siteUser: typing.Optional['User'] = None
     177 + 
     178 + 
     179 +@dataclasses.dataclass
     180 +class PromoConvoCard(Card):
     181 + actions: typing.List['PromoConvoAction']
     182 + thankYouText: str
     183 + medium: Medium
     184 + thankYouUrl: typing.Optional[str] = None
     185 + thankYouTcoUrl: typing.Optional[str] = None
     186 + cover: typing.Optional['Photo'] = None
     187 + 
     188 + 
     189 +@dataclasses.dataclass
     190 +class PromoConvoAction:
     191 + label: str
     192 + tweet: str
     193 + 
     194 + 
     195 +@dataclasses.dataclass
     196 +class BroadcastCard(Card):
     197 + id: str
     198 + url: str
     199 + title: str
     200 + state: typing.Optional[str] = None
     201 + broadcaster: typing.Optional['User'] = None
     202 + thumbnailUrl: typing.Optional[str] = None
     203 + source: typing.Optional[str] = None
     204 + siteUser: typing.Optional['User'] = None
     205 + 
     206 + 
     207 +@dataclasses.dataclass
     208 +class PeriscopeBroadcastCard(Card):
     209 + id: str
     210 + url: str
     211 + title: str
     212 + description: str
     213 + state: str
     214 + totalParticipants: int
     215 + thumbnailUrl: str
     216 + source: typing.Optional[str] = None
     217 + broadcaster: typing.Optional['User'] = None
     218 + siteUser: typing.Optional['User'] = None
     219 + 
     220 + 
     221 +@dataclasses.dataclass
     222 +class EventCard(Card):
     223 + event: 'Event'
     224 + 
     225 + 
     226 +@dataclasses.dataclass
     227 +class Event:
     228 + id: int
     229 + category: str
     230 + photo: Photo
     231 + title: typing.Optional[str] = None
     232 + description: typing.Optional[str] = None
     233 + 
     234 + @property
     235 + def url(self):
     236 + return f'https://twitter.com/i/events/{self.id}'
     237 + 
     238 + 
     239 +@dataclasses.dataclass
     240 +class NewsletterCard(Card):
     241 + title: str
     242 + description: str
     243 + imageUrl: str
     244 + url: str
     245 + revueAccountId: int
     246 + issueCount: int
     247 + 
     248 + 
     249 +@dataclasses.dataclass
     250 +class NewsletterIssueCard(Card):
     251 + newsletterTitle: str
     252 + newsletterDescription: str
     253 + issueTitle: str
     254 + issueNumber: int
     255 + url: str
     256 + revueAccountId: int
     257 + issueDescription: typing.Optional[str] = None
     258 + imageUrl: typing.Optional[str] = None
     259 + 
     260 + 
     261 +@dataclasses.dataclass
     262 +class AmplifyCard(Card):
     263 + id: str
     264 + video: Video
     265 + 
     266 + 
     267 +@dataclasses.dataclass
     268 +class AppPlayerCard(Card):
     269 + title: str
     270 + video: Video
     271 + appCategory: str
     272 + playerOwnerId: int
     273 + siteUser: typing.Optional['User'] = None
     274 + 
     275 + 
     276 +@dataclasses.dataclass
     277 +class SpacesCard(Card):
     278 + url: str
     279 + id: str
     280 + 
     281 + 
     282 +@dataclasses.dataclass
     283 +class MessageMeCard(Card):
     284 + recipient: 'User'
     285 + url: str
     286 + buttonText: str
     287 + 
     288 + 
     289 +UnifiedCardComponentKey = str
     290 +UnifiedCardDestinationKey = str
     291 +UnifiedCardMediumKey = str
     292 +UnifiedCardAppKey = str
     293 + 
     294 + 
     295 +@dataclasses.dataclass
     296 +class UnifiedCard(Card):
     297 + componentObjects: typing.Dict[UnifiedCardComponentKey, 'UnifiedCardComponentObject']
     298 + destinations: typing.Dict[UnifiedCardDestinationKey, 'UnifiedCardDestination']
     299 + media: typing.Dict[UnifiedCardMediumKey, Medium]
     300 + apps: typing.Optional[typing.Dict[UnifiedCardAppKey, typing.List['UnifiedCardApp']]] = None
     301 + components: typing.Optional[typing.List[UnifiedCardComponentKey]] = None
     302 + swipeableLayoutSlides: typing.Optional[typing.List['UnifiedCardSwipeableLayoutSlide']] = None
     303 + type: typing.Optional[str] = None
     304 + 
     305 + def __post_init__(self):
     306 + if (self.components is None) == (self.swipeableLayoutSlides is None):
     307 + raise ValueError('did not get exactly one of components or swipeableLayoutSlides')
     308 + if self.components and not all(k in self.componentObjects for k in self.components):
     309 + raise ValueError('missing components')
     310 + if self.swipeableLayoutSlides and not all(s.mediumComponentKey in self.componentObjects and s.componentKey in self.componentObjects for s in self.swipeableLayoutSlides):
     311 + raise ValueError('missing components')
     312 + if any(c.destinationKey not in self.destinations for c in self.componentObjects.values() if hasattr(c, 'destinationKey')):
     313 + raise ValueError('missing destinations')
     314 + if any(b.destinationKey not in self.destinations for c in self.componentObjects.values() if isinstance(c, UnifiedCardButtonGroupComponentObject) for b in c.buttons):
     315 + raise ValueError('missing destinations')
     316 + mediaKeys = []
     317 + for c in self.componentObjects.values():
     318 + if isinstance(c, UnifiedCardMediumComponentObject):
     319 + mediaKeys.append(c.mediumKey)
     320 + elif isinstance(c, UnifiedCardSwipeableMediaComponentObject):
     321 + mediaKeys.extend(x.mediumKey for x in c.media)
     322 + mediaKeys.extend(d.mediumKey for d in self.destinations.values() if d.mediumKey is not None)
     323 + mediaKeys.extend(a.iconMediumKey for l in (self.apps.values() if self.apps is not None else []) for a in l if a.iconMediumKey is not None)
     324 + if any(k not in self.media for k in mediaKeys):
     325 + raise ValueError('missing media')
     326 + if any(c.appKey not in self.apps for c in self.componentObjects.values() if hasattr(c, 'appKey')):
     327 + raise ValueError('missing apps')
     328 + if any(d.appKey not in self.apps for d in self.destinations.values() if d.appKey is not None):
     329 + raise ValueError('missing apps')
     330 + 
     331 + 
     332 +class UnifiedCardComponentObject:
     333 + pass
     334 + 
     335 + 
     336 +@dataclasses.dataclass
     337 +class UnifiedCardDetailComponentObject(UnifiedCardComponentObject):
     338 + content: str
     339 + destinationKey: UnifiedCardDestinationKey
     340 + 
     341 + 
     342 +@dataclasses.dataclass
     343 +class UnifiedCardMediumComponentObject(UnifiedCardComponentObject):
     344 + mediumKey: UnifiedCardMediumKey
     345 + destinationKey: UnifiedCardDestinationKey
     346 + 
     347 + 
     348 +@dataclasses.dataclass
     349 +class UnifiedCardButtonGroupComponentObject(UnifiedCardComponentObject):
     350 + buttons: typing.List['UnifiedCardButton']
     351 + 
     352 + 
     353 +@dataclasses.dataclass
     354 +class UnifiedCardButton:
     355 + text: str
     356 + destinationKey: UnifiedCardDestinationKey
     357 + 
     358 + 
     359 +@dataclasses.dataclass
     360 +class UnifiedCardSwipeableMediaComponentObject(UnifiedCardComponentObject):
     361 + media: typing.List['UnifiedCardSwipeableMediaMedium']
     362 + 
     363 + 
     364 +@dataclasses.dataclass
     365 +class UnifiedCardSwipeableMediaMedium:
     366 + mediumKey: UnifiedCardMediumKey
     367 + destinationKey: UnifiedCardDestinationKey
     368 + 
     369 + 
     370 +@dataclasses.dataclass
     371 +class UnifiedCardAppStoreComponentObject(UnifiedCardComponentObject):
     372 + appKey: UnifiedCardAppKey
     373 + destinationKey: UnifiedCardDestinationKey
     374 + 
     375 + 
     376 +@dataclasses.dataclass
     377 +class UnifiedCardTwitterListDetailsComponentObject(UnifiedCardComponentObject):
     378 + name: str
     379 + memberCount: int
     380 + subscriberCount: int
     381 + user: 'User'
     382 + destinationKey: UnifiedCardDestinationKey
     383 + 
     384 + 
     385 +@dataclasses.dataclass
     386 +class UnifiedCardTwitterCommunityDetailsComponentObject(UnifiedCardComponentObject):
     387 + name: str
     388 + theme: str
     389 + membersCount: int
     390 + destinationKey: UnifiedCardDestinationKey
     391 + membersFacepile: typing.Optional[typing.List['User']] = None
     392 + 
     393 + 
     394 +@dataclasses.dataclass
     395 +class UnifiedCardDestination:
     396 + url: typing.Optional[str] = None
     397 + appKey: typing.Optional[UnifiedCardAppKey] = None
     398 + mediumKey: typing.Optional[UnifiedCardMediumKey] = None
     399 + 
     400 + def __post_init__(self):
     401 + if (self.url is None) == (self.appKey is None):
     402 + raise ValueError('did not get exactly one of url and appKey')
     403 + 
     404 + 
     405 +@dataclasses.dataclass
     406 +class UnifiedCardApp:
     407 + type: str
     408 + id: str
     409 + title: str
     410 + category: str
     411 + countryCode: str
     412 + url: str
     413 + description: typing.Optional[str] = None
     414 + iconMediumKey: typing.Optional[UnifiedCardMediumKey] = None
     415 + size: typing.Optional[int] = None
     416 + installs: typing.Optional[int] = None
     417 + ratingAverage: typing.Optional[float] = None
     418 + ratingCount: typing.Optional[int] = None
     419 + isFree: typing.Optional[bool] = None
     420 + isEditorsChoice: typing.Optional[bool] = None
     421 + hasInAppPurchases: typing.Optional[bool] = None
     422 + hasInAppAds: typing.Optional[bool] = None
     423 + 
     424 + 
     425 +@dataclasses.dataclass
     426 +class UnifiedCardSwipeableLayoutSlide:
     427 + mediumComponentKey: UnifiedCardComponentKey
     428 + componentKey: UnifiedCardComponentKey
    141 429   
    142 430   
    143 431  @dataclasses.dataclass
    skipped 49 lines
    193 481   
    194 482   
    195 483  @dataclasses.dataclass
     484 +class UserRef:
     485 + id: int
     486 + 
     487 + 
     488 +@dataclasses.dataclass
    196 489  class Trend(snscrape.base.Item):
    197 490   name: str
    198 491   domainContext: str
    skipped 88 lines
    287 580   def reset(self):
    288 581   super().reset()
    289 582   with self._lock:
    290  - os.remove(self._file)
     583 + _logger.info(f'Deleting guest token file {self._file}')
     584 + try:
     585 + os.remove(self._file)
     586 + except FileNotFoundError:
     587 + # Another process likely already removed the file
     588 + pass
    291 589   
    292 590   
    293 591  class _TwitterAPIType(enum.Enum):
    skipped 45 lines
    339 637   r = self._post('https://api.twitter.com/1.1/guest/activate.json', data = b'', headers = self._apiHeaders, responseOkCallback = self._check_guest_token_response)
    340 638   o = r.json()
    341 639   if not o.get('guest_token'):
    342  - raise snscrape.base.ScraperError('Unable to retrieve guest token')
     640 + raise snscrape.base.ScraperException('Unable to retrieve guest token')
    343 641   self._guestTokenManager.token = o['guest_token']
    344 642   assert self._guestTokenManager.token
    345 643   _logger.debug(f'Using guest token {self._guestTokenManager.token}')
    skipped 159 lines
    505 803   raise snscrape.base.ScraperException(f'Unable to handle entry {entryId!r}')
    506 804   yield self._tweet_to_tweet(tweet, obj)
    507 805   
     806 + def _get_tweet_id(self, tweet):
     807 + return tweet['id'] if 'id' in tweet else int(tweet['id_str'])
     808 + 
    508 809   def _make_tweet(self, tweet, user, retweetedTweet = None, quotedTweet = None, card = None):
     810 + tweetId = self._get_tweet_id(tweet)
    509 811   kwargs = {}
    510  - kwargs['id'] = tweet['id'] if 'id' in tweet else int(tweet['id_str'])
     812 + kwargs['id'] = tweetId
    511 813   kwargs['content'] = tweet['full_text']
    512 814   kwargs['renderedContent'] = self._render_text_with_urls(tweet['full_text'], tweet['entities'].get('urls'))
    513 815   kwargs['user'] = user
    skipped 1 lines
    515 817   if tweet['entities'].get('urls'):
    516 818   kwargs['outlinks'] = [u['expanded_url'] for u in tweet['entities']['urls']]
    517 819   kwargs['tcooutlinks'] = [u['url'] for u in tweet['entities']['urls']]
    518  - kwargs['url'] = f'https://twitter.com/{user.username}/status/{kwargs["id"]}'
     820 + kwargs['url'] = f'https://twitter.com/{user.username}/status/{tweetId}'
    519 821   kwargs['replyCount'] = tweet['reply_count']
    520 822   kwargs['retweetCount'] = tweet['retweet_count']
    521 823   kwargs['likeCount'] = tweet['favorite_count']
    skipped 8 lines
    530 832   if 'extended_entities' in tweet and 'media' in tweet['extended_entities']:
    531 833   media = []
    532 834   for medium in tweet['extended_entities']['media']:
    533  - if medium['type'] == 'photo':
    534  - if '.' not in medium['media_url_https']:
    535  - _logger.warning(f'Skipping malformed medium URL on tweet {kwargs["id"]}: {medium["media_url_https"]!r} contains no dot')
    536  - continue
    537  - baseUrl, format = medium['media_url_https'].rsplit('.', 1)
    538  - if format not in ('jpg', 'png'):
    539  - _logger.warning(f'Skipping photo with unknown format on tweet {kwargs["id"]}: {format!r}')
    540  - continue
    541  - media.append(Photo(
    542  - previewUrl = f'{baseUrl}?format={format}&name=small',
    543  - fullUrl = f'{baseUrl}?format={format}&name=large',
    544  - ))
    545  - elif medium['type'] == 'video' or medium['type'] == 'animated_gif':
    546  - variants = []
    547  - for variant in medium['video_info']['variants']:
    548  - variants.append(VideoVariant(contentType = variant['content_type'], url = variant['url'], bitrate = variant.get('bitrate')))
    549  - mKwargs = {
    550  - 'thumbnailUrl': medium['media_url_https'],
    551  - 'variants': variants,
    552  - }
    553  - if medium['type'] == 'video':
    554  - mKwargs['duration'] = medium['video_info']['duration_millis'] / 1000
    555  - if (ext := medium.get('ext')) and (mediaStats := ext['mediaStats']) and isinstance(r := mediaStats['r'], dict) and 'ok' in r and isinstance(r['ok'], dict):
    556  - mKwargs['views'] = int(r['ok']['viewCount'])
    557  - elif (mediaStats := medium.get('mediaStats')):
    558  - mKwargs['views'] = mediaStats['viewCount']
    559  - cls = Video
    560  - elif medium['type'] == 'animated_gif':
    561  - cls = Gif
    562  - media.append(cls(**mKwargs))
     835 + if (mediumO := self._make_medium(medium, tweetId)):
     836 + media.append(mediumO)
    563 837   if media:
    564 838   kwargs['media'] = media
    565 839   if retweetedTweet:
    skipped 34 lines
    600 874   kwargs['cashtags'] = [o['text'] for o in tweet['entities']['symbols']]
    601 875   if card:
    602 876   kwargs['card'] = card
    603  - # Try to convert the URL to the non-shortened/t.co one
    604  - try:
    605  - i = kwargs['tcooutlinks'].index(card.url)
    606  - except ValueError:
    607  - _logger.warning('Could not find card URL in tcooutlinks')
    608  - else:
    609  - card.url = kwargs['outlinks'][i]
     877 + if hasattr(card, 'url') and '//t.co/' in card.url:
     878 + # Try to convert the URL to the non-shortened/t.co one
     879 + # Retweets inherit the card but not the outlinks; try to get them from the retweeted tweet instead in that case.
     880 + if 'tcooutlinks' in kwargs and card.url in kwargs['tcooutlinks']:
     881 + card.url = kwargs['outlinks'][kwargs['tcooutlinks'].index(card.url)]
     882 + elif retweetedTweet and retweetedTweet.tcooutlinks and card.url in retweetedTweet.tcooutlinks:
     883 + card.url = retweetedTweet.outlinks[retweetedTweet.tcooutlinks.index(card.url)]
     884 + else:
     885 + _logger.warning(f'Could not translate t.co card URL on tweet {tweetId}')
    610 886   return Tweet(**kwargs)
    611 887   
    612  - def _make_card(self, card, apiType):
    613  - cardKwargs = {}
    614  - for key, kwarg in [('title', 'title'), ('description', 'description'), ('card_url', 'url'), ('thumbnail_image_original', 'thumbnailUrl')]:
    615  - if apiType is _TwitterAPIType.V2:
    616  - value = card['binding_values'].get(key)
    617  - elif apiType is _TwitterAPIType.GRAPHQL:
    618  - value = next((o['value'] for o in card['legacy']['binding_values'] if o['key'] == key), None)
    619  - if not value:
     888 + def _make_medium(self, medium, tweetId):
     889 + if medium['type'] == 'photo':
     890 + if '?format=' in medium['media_url_https'] or '&format=' in medium['media_url_https']:
     891 + return Photo(previewUrl = medium['media_url_https'], fullUrl = medium['media_url_https'])
     892 + if '.' not in medium['media_url_https']:
     893 + _logger.warning(f'Skipping malformed medium URL on tweet {tweetId}: {medium["media_url_https"]!r} contains no dot')
     894 + return
     895 + baseUrl, format = medium['media_url_https'].rsplit('.', 1)
     896 + if format not in ('jpg', 'png'):
     897 + _logger.warning(f'Skipping photo with unknown format on tweet {tweetId}: {format!r}')
     898 + return
     899 + return Photo(
     900 + previewUrl = f'{baseUrl}?format={format}&name=small',
     901 + fullUrl = f'{baseUrl}?format={format}&name=large',
     902 + )
     903 + elif medium['type'] == 'video' or medium['type'] == 'animated_gif':
     904 + variants = []
     905 + for variant in medium['video_info']['variants']:
     906 + variants.append(VideoVariant(contentType = variant['content_type'], url = variant['url'], bitrate = variant.get('bitrate')))
     907 + mKwargs = {
     908 + 'thumbnailUrl': medium['media_url_https'],
     909 + 'variants': variants,
     910 + }
     911 + if medium['type'] == 'video':
     912 + mKwargs['duration'] = medium['video_info']['duration_millis'] / 1000
     913 + if (ext := medium.get('ext')) and (mediaStats := ext.get('mediaStats')) and isinstance(r := mediaStats['r'], dict) and 'ok' in r and isinstance(r['ok'], dict):
     914 + mKwargs['views'] = int(r['ok']['viewCount'])
     915 + elif (mediaStats := medium.get('mediaStats')):
     916 + mKwargs['views'] = mediaStats['viewCount']
     917 + cls = Video
     918 + elif medium['type'] == 'animated_gif':
     919 + cls = Gif
     920 + return cls(**mKwargs)
     921 + else:
     922 + _logger.warning(f'Unsupported medium type on tweet {tweetId}: {medium["type"]!r}')
     923 + 
     924 + def _make_card(self, card, apiType, tweetId):
     925 + bindingValues = {}
     926 + 
     927 + def _kwargs_from_map(keyKwargMap):
     928 + nonlocal bindingValues
     929 + return {kwarg: bindingValues[key] for key, kwarg in keyKwargMap.items() if key in bindingValues}
     930 + 
     931 + userRefs = {}
     932 + if apiType is _TwitterAPIType.V2:
     933 + for o in card.get('users', {}).values():
     934 + userId = o['id']
     935 + assert userId not in userRefs
     936 + userRefs[userId] = self._user_to_user(o)
     937 + elif apiType is _TwitterAPIType.GRAPHQL:
     938 + for o in card['legacy'].get('user_refs', {}):
     939 + userId = int(o['rest_id'])
     940 + if userId in userRefs:
     941 + _logger.warning(f'Duplicate user {userId} in card on tweet {tweetId}')
     942 + continue
     943 + if 'legacy' in o:
     944 + userRefs[userId] = self._user_to_user(o['legacy'], id_ = userId)
     945 + else:
     946 + userRefs[userId] = UserRef(id = userId)
     947 + 
     948 + if apiType is _TwitterAPIType.V2:
     949 + messyBindingValues = card['binding_values'].items()
     950 + elif apiType is _TwitterAPIType.GRAPHQL:
     951 + messyBindingValues = ((x['key'], x['value']) for x in card['legacy']['binding_values'])
     952 + for key, value in messyBindingValues:
     953 + if 'type' not in value:
     954 + # Silently ignore creator/site entries since they frequently appear like this.
     955 + if key not in ('creator', 'site'):
     956 + _logger.warning(f'Skipping type-less card value {key!r} on tweet {tweetId}')
    620 957   continue
    621 958   if value['type'] == 'STRING':
    622  - cardKwargs[kwarg] = value['string_value']
     959 + bindingValues[key] = value['string_value']
     960 + if key.endswith('_datetime_utc'):
     961 + bindingValues[key] = datetime.datetime.strptime(bindingValues[key], '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo = datetime.timezone.utc)
    623 962   elif value['type'] == 'IMAGE':
    624  - cardKwargs[kwarg] = value['image_value']['url']
     963 + bindingValues[key] = value['image_value']['url']
     964 + elif value['type'] == 'IMAGE_COLOR':
     965 + # Silently discard this.
     966 + pass
     967 + elif value['type'] == 'BOOLEAN':
     968 + bindingValues[key] = value['boolean_value']
     969 + elif value['type'] == 'USER':
     970 + bindingValues[key] = userRefs[int(value['user_value']['id_str'])]
    625 971   else:
    626  - raise snscrape.base.ScraperError(f'Unknown card value type: {value["type"]!r}')
    627  - return Card(**cardKwargs)
     972 + _logger.warning(f'Unsupported card value type on {key!r} on tweet {tweetId}: {value["type"]!r}')
     973 + 
     974 + if apiType is _TwitterAPIType.V2:
     975 + cardName = card['name']
     976 + elif apiType is _TwitterAPIType.GRAPHQL:
     977 + cardName = card['legacy']['name']
     978 + 
     979 + if cardName in ('summary', 'summary_large_image', 'app', 'direct_store_link_app'):
     980 + keyKwargMap = {
     981 + 'title': 'title',
     982 + 'description': 'description',
     983 + 'card_url': 'url',
     984 + 'site': 'siteUser',
     985 + 'creator': 'creatorUser',
     986 + }
     987 + if cardName in ('app', 'direct_store_link_app'):
     988 + keyKwargMap['thumbnail_original'] = 'thumbnailUrl'
     989 + return AppCard(**_kwargs_from_map(keyKwargMap))
     990 + else:
     991 + keyKwargMap['thumbnail_image_original'] = 'thumbnailUrl'
     992 + return SummaryCard(**_kwargs_from_map(keyKwargMap))
     993 + elif any(cardName.startswith(x) for x in ('poll2choice_', 'poll3choice_', 'poll4choice_')) and cardName.split('_', 1)[1] in ('text_only', 'image', 'video'):
     994 + kwargs = _kwargs_from_map({'end_datetime_utc': 'endDate', 'last_updated_datetime_utc': 'lastUpdateDate', 'duration_minutes': 'duration', 'counts_are_final': 'finalResults'})
     995 + 
     996 + options = []
     997 + for key in sorted(bindingValues):
     998 + if key.startswith('choice') and key.endswith('_label'):
     999 + optKwargs = {'label': bindingValues[key]}
     1000 + if (count := bindingValues.get(f'{key[:-5]}count')):
     1001 + optKwargs['count'] = int(count)
     1002 + options.append(PollOption(**optKwargs))
     1003 + kwargs['options'] = options
     1004 + kwargs['duration'] = int(kwargs['duration'])
     1005 + 
     1006 + if cardName.endswith('_image'):
     1007 + kwargs['medium'] = Photo(previewUrl = bindingValues['image_small'], fullUrl = bindingValues['image_original'])
     1008 + elif cardName.endswith('_video'):
     1009 + variants = []
     1010 + variants.append(VideoVariant(contentType = 'application/x-mpegurl', url = bindingValues['player_hls_url'], bitrate = None))
     1011 + if 'vmap' not in bindingValues['player_stream_url']:
     1012 + _logger.warning(f'Non-VMAP URL in {cardName} player_stream_url on tweet {tweetId}')
     1013 + variants.append(VideoVariant(contentType = 'text/xml', url = bindingValues['player_stream_url'], bitrate = None))
     1014 + kwargs['medium'] = Video(thumbnailUrl = bindingValues['player_image_original'], variants = variants, duration = int(bindingValues['content_duration_seconds']))
     1015 + 
     1016 + return PollCard(**kwargs)
     1017 + elif cardName == 'player':
     1018 + return PlayerCard(**_kwargs_from_map({'title': 'title', 'description': 'description', 'card_url': 'url', 'player_image_original': 'imageUrl', 'site': 'siteUser'}))
     1019 + elif cardName in ('promo_image_convo', 'promo_video_convo'):
     1020 + kwargs = _kwargs_from_map({'thank_you_text': 'thankYouText', 'thank_you_url': 'thankYouUrl', 'thank_you_shortened_url': 'thankYouTcoUrl'})
     1021 + kwargs['actions'] = []
     1022 + for l in ('one', 'two', 'three', 'four'):
     1023 + if f'cta_{l}' in bindingValues:
     1024 + kwargs['actions'].append(PromoConvoAction(label = bindingValues[f'cta_{l}'], tweet = bindingValues[f'cta_{l}_tweet']))
     1025 + if 'image' in cardName:
     1026 + kwargs['medium'] = Photo(previewUrl = bindingValues['promo_image_small'], fullUrl = bindingValues['promo_image_original'])
     1027 + if 'cover_promo_image' in bindingValues:
     1028 + kwargs['cover'] = Photo(previewUrl = bindingValues['cover_promo_image_small'], fullUrl = bindingValues['cover_promo_image_original'])
     1029 + elif 'video' in cardName:
     1030 + variants = []
     1031 + variants.append(VideoVariant(contentType = bindingValues['player_stream_content_type'], url = bindingValues['player_stream_url'], bitrate = None))
     1032 + if bindingValues['player_stream_url'] != bindingValues['player_url']:
     1033 + if 'vmap' not in bindingValues['player_url']:
     1034 + _logger.warning(f'Non-VMAP URL in {cardName} player_url on tweet {tweetId}')
     1035 + variants.append(VideoVariant(contentType = 'text/xml', url = bindingValues['player_url'], bitrate = None))
     1036 + kwargs['medium'] = Video(thumbnailUrl = bindingValues['player_image_original'], variants = variants, duration = int(bindingValues['content_duration_seconds']))
     1037 + return PromoConvoCard(**kwargs)
     1038 + elif cardName in ('745291183405076480:broadcast', '3691233323:periscope_broadcast'):
     1039 + keyKwargMap = {'broadcast_state': 'state', 'broadcast_source': 'source', 'site': 'siteUser'}
     1040 + if cardName == '745291183405076480:broadcast':
     1041 + keyKwargMap = {**keyKwargMap, 'broadcast_id': 'id', 'broadcast_url': 'url', 'broadcast_title': 'title', 'broadcast_thumbnail_original': 'thumbnailUrl'}
     1042 + else:
     1043 + keyKwargMap = {**keyKwargMap, 'id': 'id', 'url': 'url', 'title': 'title', 'description': 'description', 'total_participants': 'totalParticipants', 'full_size_thumbnail_url': 'thumbnailUrl'}
     1044 + kwargs = _kwargs_from_map(keyKwargMap)
     1045 + if 'broadcaster_twitter_id' in bindingValues:
     1046 + kwargs['broadcaster'] = User(id = int(bindingValues['broadcaster_twitter_id']), username = bindingValues['broadcaster_username'], displayname = bindingValues['broadcaster_display_name'])
     1047 + if 'siteUser' not in kwargs:
     1048 + kwargs['siteUser'] = None
     1049 + if cardName == '745291183405076480:broadcast':
     1050 + return BroadcastCard(**kwargs)
     1051 + else:
     1052 + kwargs['totalParticipants'] = int(kwargs['totalParticipants'])
     1053 + return PeriscopeBroadcastCard(**kwargs)
     1054 + elif cardName == '745291183405076480:live_event':
     1055 + kwargs = _kwargs_from_map({'event_id': 'id', 'event_title': 'title', 'event_category': 'category', 'event_subtitle': 'description'})
     1056 + kwargs['id'] = int(kwargs['id'])
     1057 + kwargs['photo'] = Photo(previewUrl = bindingValues['event_thumbnail_small'], fullUrl = bindingValues.get('event_thumbnail_original') or bindingValues['event_thumbnail'])
     1058 + return EventCard(event = Event(**kwargs))
     1059 + elif cardName == '3337203208:newsletter_publication':
     1060 + kwargs = _kwargs_from_map({'newsletter_title': 'title', 'newsletter_description': 'description', 'newsletter_image_original': 'imageUrl', 'card_url': 'url', 'revue_account_id': 'revueAccountId', 'issue_count': 'issueCount'})
     1061 + kwargs['revueAccountId'] = int(kwargs['revueAccountId'])
     1062 + kwargs['issueCount'] = int(kwargs['issueCount'])
     1063 + return NewsletterCard(**kwargs)
     1064 + elif cardName == '3337203208:newsletter_issue':
     1065 + kwargs = _kwargs_from_map({
     1066 + 'newsletter_title': 'newsletterTitle',
     1067 + 'newsletter_description': 'newsletterDescription',
     1068 + 'issue_title': 'issueTitle',
     1069 + 'issue_description': 'issueDescription',
     1070 + 'issue_number': 'issueNumber',
     1071 + 'issue_image_original': 'imageUrl',
     1072 + 'card_url': 'url',
     1073 + 'revue_account_id': 'revueAccountId'
     1074 + })
     1075 + kwargs['issueNumber'] = int(kwargs['issueNumber'])
     1076 + kwargs['revueAccountId'] = int(kwargs['revueAccountId'])
     1077 + return NewsletterIssueCard(**kwargs)
     1078 + elif cardName == 'amplify':
     1079 + return AmplifyCard(
     1080 + id = bindingValues['amplify_content_id'],
     1081 + video = Video(
     1082 + thumbnailUrl = bindingValues['player_image'],
     1083 + variants = [VideoVariant(contentType = bindingValues['player_stream_content_type'], url = bindingValues['amplify_url_vmap'], bitrate = None)],
     1084 + ),
     1085 + )
     1086 + elif cardName == 'appplayer':
     1087 + kwargs = _kwargs_from_map({'title': 'title', 'app_category': 'appCategory', 'player_owner_id': 'playerOwnerId', 'site': 'siteUser'})
     1088 + kwargs['playerOwnerId'] = int(kwargs['playerOwnerId'])
     1089 + variants = []
     1090 + variants.append(VideoVariant(contentType = 'application/x-mpegurl', url = bindingValues['player_hls_url'], bitrate = None))
     1091 + if 'vmap' not in bindingValues['player_url']:
     1092 + _logger.warning(f'Non-VMAP URL in {cardName} player_url on tweet {tweetId}')
     1093 + variants.append(VideoVariant(contentType = 'text/xml', url = bindingValues['player_url'], bitrate = None))
     1094 + kwargs['video'] = Video(thumbnailUrl = bindingValues['player_image_original'], variants = variants, duration = int(bindingValues['content_duration_seconds']))
     1095 + return AppPlayerCard(**kwargs)
     1096 + elif cardName == '3691233323:audiospace':
     1097 + return SpacesCard(**_kwargs_from_map({'card_url': 'url', 'id': 'id'}))
     1098 + elif cardName == '2586390716:message_me':
     1099 + # Note that the strings in Twitter's JS appear to have an incorrect mapping that then gets changed somewhere in the 1.8 MiB of JS!
     1100 + # cta_1, 3, and 4 should mean 'Message us', 'Send a private message', and 'Send me a private message', but the correct mapping is currently unknown.
     1101 + ctas = {'message_me_card_cta_2': 'Send us a private message'}
     1102 + if bindingValues['cta'] not in ctas:
     1103 + _logger.warning(f'Unsupported message_me card cta on tweet {tweetId}: {bindingValues["cta"]!r}')
     1104 + return
     1105 + return MessageMeCard(**_kwargs_from_map({'recipient': 'recipient', 'card_url': 'url'}), buttonText = ctas[bindingValues['cta']])
     1106 + elif cardName == 'unified_card':
     1107 + o = json.loads(bindingValues['unified_card'])
     1108 + kwargs = {}
     1109 + if 'type' in o:
     1110 + unifiedCardType = o.get('type')
     1111 + if unifiedCardType not in (
     1112 + 'image_app',
     1113 + 'image_carousel_app',
     1114 + 'image_carousel_website',
     1115 + 'image_multi_dest_carousel_website',
     1116 + 'image_website',
     1117 + 'mixed_media_multi_dest_carousel_website',
     1118 + 'mixed_media_single_dest_carousel_app',
     1119 + 'mixed_media_single_dest_carousel_website',
     1120 + 'video_app',
     1121 + 'video_carousel_app',
     1122 + 'video_carousel_website',
     1123 + 'video_multi_dest_carousel_website',
     1124 + 'video_website',
     1125 + ):
     1126 + _logger.warning(f'Unsupported unified_card type on tweet {tweetId}: {unifiedCardType!r}')
     1127 + return
     1128 + kwargs['type'] = unifiedCardType
     1129 + elif set(c['type'] for c in o['component_objects'].values()) not in ({'media', 'twitter_list_details'}, {'media', 'community_details'}):
     1130 + _logger.warning(f'Unsupported unified_card type on tweet {tweetId}')
     1131 + return
     1132 + 
     1133 + kwargs['componentObjects'] = {}
     1134 + for k, v in o['component_objects'].items():
     1135 + if v['type'] == 'details':
     1136 + co = UnifiedCardDetailComponentObject(content = v['data']['title']['content'], destinationKey = v['data']['destination'])
     1137 + elif v['type'] == 'media':
     1138 + co = UnifiedCardMediumComponentObject(mediumKey = v['data']['id'], destinationKey = v['data']['destination'])
     1139 + elif v['type'] == 'button_group':
     1140 + if not all(b['type'] == 'cta' for b in v['data']['buttons']):
     1141 + _logger.warning(f'Unsupported unified_card button_group button type on tweet {tweetId}')
     1142 + return
     1143 + buttons = [UnifiedCardButton(text = b['action'][0].upper() + re.sub('[A-Z]', lambda x: f' {x[0]}', b['action'][1:]), destinationKey = b['destination']) for b in v['data']['buttons']]
     1144 + co = UnifiedCardButtonGroupComponentObject(buttons = buttons)
     1145 + elif v['type'] == 'swipeable_media':
     1146 + media = [UnifiedCardSwipeableMediaMedium(mediumKey = m['id'], destinationKey = m['destination']) for m in v['data']['media_list']]
     1147 + co = UnifiedCardSwipeableMediaComponentObject(media = media)
     1148 + elif v['type'] == 'app_store_details':
     1149 + co = UnifiedCardAppStoreComponentObject(appKey = v['data']['app_id'], destinationKey = v['data']['destination'])
     1150 + elif v['type'] == 'twitter_list_details':
     1151 + co = UnifiedCardTwitterListDetailsComponentObject(
     1152 + name = v['data']['name']['content'],
     1153 + memberCount = v['data']['member_count'],
     1154 + subscriberCount = v['data']['subscriber_count'],
     1155 + user = self._user_to_user(o['users'][v['data']['user_id']]),
     1156 + destinationKey = v['data']['destination'],
     1157 + )
     1158 + elif v['type'] == 'community_details':
     1159 + co = UnifiedCardTwitterCommunityDetailsComponentObject(
     1160 + name = v['data']['name']['content'],
     1161 + theme = v['data']['theme'],
     1162 + membersCount = v['data']['member_count'],
     1163 + destinationKey = v['data']['destination'],
     1164 + membersFacepile = [self._user_to_user(u) for u in map(o['users'].get, v['data']['members_facepile']) if u],
     1165 + )
     1166 + else:
     1167 + _logger.warning(f'Unsupported unified_card component type on tweet {tweetId}: {v["type"]!r}')
     1168 + return
     1169 + kwargs['componentObjects'][k] = co
     1170 + 
     1171 + kwargs['destinations'] = {}
     1172 + for k, v in o['destination_objects'].items():
     1173 + dKwargs = {}
     1174 + if 'url_data' in v['data']:
     1175 + dKwargs['url'] = v['data']['url_data']['url']
     1176 + if 'app_id' in v['data']:
     1177 + dKwargs['appKey'] = v['data']['app_id']
     1178 + if 'media_id' in v['data']:
     1179 + dKwargs['mediumKey'] = v['data']['media_id']
     1180 + kwargs['destinations'][k] = UnifiedCardDestination(**dKwargs)
     1181 + 
     1182 + kwargs['media'] = {}
     1183 + for k, v in o['media_entities'].items():
     1184 + if (medium := self._make_medium(v, tweetId)):
     1185 + kwargs['media'][k] = medium
     1186 + 
     1187 + if 'app_store_data' in o:
     1188 + kwargs['apps'] = {}
     1189 + for k, v in o['app_store_data'].items():
     1190 + variants = []
     1191 + for var in v:
     1192 + vKwargsMap = {
     1193 + 'type': 'type',
     1194 + 'id': 'id',
     1195 + 'icon_media_key': 'iconMediumKey',
     1196 + 'country_code': 'countryCode',
     1197 + 'num_installs': 'installs',
     1198 + 'size_bytes': 'size',
     1199 + 'is_free': 'isFree',
     1200 + 'is_editors_choice': 'isEditorsChoice',
     1201 + 'has_in_app_purchases': 'hasInAppPurchases',
     1202 + 'has_in_app_ads': 'hasInAppAds',
     1203 + }
     1204 + vKwargs = {kwarg: var[key] for key, kwarg in vKwargsMap.items() if key in var}
     1205 + vKwargs['title'] = var['title']['content']
     1206 + if 'description' in var:
     1207 + vKwargs['description'] = var['description']['content']
     1208 + vKwargs['category'] = var['category']['content']
     1209 + if (ratings := var['ratings']):
     1210 + vKwargs['ratingAverage'] = var['ratings']['star']
     1211 + vKwargs['ratingCount'] = var['ratings']['count']
     1212 + vKwargs['url'] = f'https://play.google.com/store/apps/details?id={var["id"]}' if var['type'] == 'android_app' else f'https://itunes.apple.com/app/id{var["id"]}'
     1213 + variants.append(UnifiedCardApp(**vKwargs))
     1214 + kwargs['apps'][k] = variants
     1215 + 
     1216 + if o['components']:
     1217 + kwargs['components'] = o['components']
     1218 + 
     1219 + if 'layout' in o:
     1220 + if o['layout']['type'] != 'swipeable':
     1221 + _logger.warning(f'Unsupported unified_card layout type on tweet {tweetId}: {o["layout"]["type"]!r}')
     1222 + return
     1223 + kwargs['swipeableLayoutSlides'] = [UnifiedCardSwipeableLayoutSlide(mediumComponentKey = v[0], componentKey = v[1]) for v in o['layout']['data']['slides']]
     1224 + 
     1225 + return UnifiedCard(**kwargs)
     1226 + 
     1227 + _logger.warning(f'Unsupported card type on tweet {tweetId}: {cardName!r}')
    628 1228   
    629 1229   def _tweet_to_tweet(self, tweet, obj):
    630 1230   user = self._user_to_user(obj['globalObjects']['users'][tweet['user_id_str']])
    skipped 3 lines
    634 1234   if 'quoted_status_id_str' in tweet and tweet['quoted_status_id_str'] in obj['globalObjects']['tweets']:
    635 1235   kwargs['quotedTweet'] = self._tweet_to_tweet(obj['globalObjects']['tweets'][tweet['quoted_status_id_str']], obj)
    636 1236   if 'card' in tweet:
    637  - kwargs['card'] = self._make_card(tweet['card'], _TwitterAPIType.V2)
     1237 + kwargs['card'] = self._make_card(tweet['card'], _TwitterAPIType.V2, self._get_tweet_id(tweet))
    638 1238   return self._make_tweet(tweet, user, **kwargs)
    639 1239   
    640 1240   def _graphql_timeline_tweet_item_result_to_tweet(self, result):
    skipped 3 lines
    644 1244   #TODO Include result['softInterventionPivot'] in the Tweet object
    645 1245   result = result['tweet']
    646 1246   else:
    647  - raise snscrape.base.ScraperError(f'Unknown result type {result["__typename"]!r}')
     1247 + raise snscrape.base.ScraperException(f'Unknown result type {result["__typename"]!r}')
    648 1248   tweet = result['legacy']
    649 1249   userId = int(result['core']['user_results']['result']['rest_id'])
    650 1250   user = self._user_to_user(result['core']['user_results']['result']['legacy'], id_ = userId)
    skipped 10 lines
    661 1261   kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str']))
    662 1262   else:
    663 1263   kwargs['quotedTweet'] = TweetRef(id = int(result['quotedRefResult']['result']['rest_id']))
     1264 + elif 'quoted_status_id_str' in tweet:
     1265 + kwargs['quotedTweet'] = TweetRef(id = int(tweet['quoted_status_id_str']))
    664 1266   if 'card' in result:
    665  - kwargs['card'] = self._make_card(result['card'], _TwitterAPIType.GRAPHQL)
     1267 + kwargs['card'] = self._make_card(result['card'], _TwitterAPIType.GRAPHQL, self._get_tweet_id(tweet))
    666 1268   return self._make_tweet(tweet, user, **kwargs)
    667 1269   
    668 1270   def _graphql_timeline_instructions_to_tweets(self, instructions, includeConversationThreads = False):
    skipped 435 lines
  • ■ ■ ■ ■ ■
    snscrape/modules/vkontakte.py
    skipped 176 lines
    177 177   continue
    178 178   if 'data-video' in a.attrs:
    179 179   # Video
     180 + if 'data-link-attr' in a.attrs:
     181 + hrefUrl = urllib.parse.unquote(a.attrs['data-link-attr'].split('to=')[1].split('&')[0])
     182 + else:
     183 + hrefUrl = f'https://vk.com{a["href"]}'
    180 184   video = Video(
    181 185   id = a['data-video'],
    182 186   list = a['data-list'],
    183 187   duration = int(a['data-duration']),
    184  - url = f'https://vk.com{a["href"]}',
     188 + url = hrefUrl,
    185 189   thumbUrl = a['style'][(begin := a['style'].find('background-image: url(') + 22) : a['style'].find(')', begin)],
    186 190   )
    187 191   continue
    skipped 202 lines
  • ■ ■ ■ ■
    snscrape/modules/weibo.py
    skipped 69 lines
    70 70   _logger.warning('User does not exist')
    71 71   self._user = _userDoesNotExist
    72 72   else:
    73  - raise snscrape.base.ScraperError(f'Got unexpected response on resolving username ({r.status_code})')
     73 + raise snscrape.base.ScraperException(f'Got unexpected response on resolving username ({r.status_code})')
    74 74   
    75 75   def _check_timeline_response(self, r):
    76 76   if r.status_code == 200 and r.content == b'{"ok":0,"msg":"\\u8fd9\\u91cc\\u8fd8\\u6ca1\\u6709\\u5185\\u5bb9","data":{"cards":[]}}':
    skipped 76 lines
Please wait...
Page is in error, reload to recover