Projects STRLCPY snscrape Commits dc6bc9bf
🤬
  • Refactor how links on Twitter are handled

    All links in text (tweets, profile descriptions, and profile links) are now represented by TextLink objects, which contain all relevant information: the displayed text (if available), the URL, the short t.co URL, and the indices in the text at which it appears.
    
    Closes #478
  • Loading...
  • JustAnotherArchivist committed 2 years ago
    dc6bc9bf
    1 parent 01cf6a09
  • ■ ■ ■ ■ ■ ■
    snscrape/modules/twitter.py
    1 1  __all__ = [
    2  - 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionURL', 'Coordinates', 'Place',
     2 + 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'TextLink', 'Coordinates', 'Place',
    3 3   'User', 'UserLabel',
    4 4   'Trend',
    5 5   'GuestTokenManager',
    skipped 25 lines
    31 31  import time
    32 32  import typing
    33 33  import urllib.parse
     34 +import warnings
     35 + 
     36 + 
     37 +# DescriptionURL deprecation
          # Maps each deprecated module-level name to the name of its replacement in this module.
     38 +_DEPRECATED_NAMES = {'DescriptionURL': 'TextLink'}
          # Module-level attribute hook: a deprecated name resolves to its replacement object
          # (looked up in this module's globals) after emitting a FutureWarning; any other
          # name raises AttributeError.
     39 +def __getattr__(name):
     40 + if name in _DEPRECATED_NAMES:
     41 + warnings.warn(f'{name} is deprecated, use {_DEPRECATED_NAMES[name]} instead', FutureWarning, stacklevel = 2)
     42 + return globals()[_DEPRECATED_NAMES[name]]
     43 + raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
          # Module-level dir hook: returns the names in __all__ plus the deprecated names, sorted.
     44 +def __dir__():
     45 + return sorted(__all__ + list(_DEPRECATED_NAMES.keys()))
    34 46   
    35 47   
    36 48  _logger = logging.getLogger(__name__)
    skipped 19 lines
    56 68   source: str
    57 69   sourceUrl: typing.Optional[str] = None
    58 70   sourceLabel: typing.Optional[str] = None
    59  - outlinks: typing.Optional[typing.List[str]] = None
    60  - tcooutlinks: typing.Optional[typing.List[str]] = None
     71 + links: typing.Optional[typing.List['TextLink']] = None
    61 72   media: typing.Optional[typing.List['Medium']] = None
    62 73   retweetedTweet: typing.Optional['Tweet'] = None
    63 74   quotedTweet: typing.Optional['Tweet'] = None
    skipped 7 lines
    71 82   card: typing.Optional['Card'] = None
    72 83   
    73 84   username = snscrape.base._DeprecatedProperty('username', lambda self: self.user.username, 'user.username')
    74  - outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks) if self.outlinks else '', 'outlinks')
    75  - tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(self.tcooutlinks) if self.tcooutlinks else '', 'tcooutlinks')
     85 + outlinks = snscrape.base._DeprecatedProperty('outlinks', lambda self: [x.url for x in self.links] if self.links else [], 'links (url attribute)')
     86 + outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(x.url for x in self.links) if self.links else '', 'links (url attribute)')
     87 + tcooutlinks = snscrape.base._DeprecatedProperty('tcooutlinks', lambda self: [x.tcourl for x in self.links] if self.links else [], 'links (tcourl attribute)')
     88 + tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(x.tcourl for x in self.links) if self.links else '', 'links (tcourl attribute)')
    76 89   
    77 90   def __str__(self):
    78 91   return self.url
    79 92   
    80 93   
     94 +@dataclasses.dataclass
     95 +class TextLink:
          # A link appearing in a text (tweet, profile description, or profile link).
          # text: displayed text if available (filled from the URL entity's 'display_url')
     96 + text: typing.Optional[str]
          # url: the link URL (filled from the URL entity's 'expanded_url')
     97 + url: str
          # tcourl: the short t.co URL (filled from the URL entity's 'url')
     98 + tcourl: str
          # indices: the indices in the text at which the link appears (the entity's 'indices' as a tuple)
     99 + indices: typing.Tuple[int, int]
     100 + 
     101 + 
    81 102  class Medium:
    82 103   pass
    83 104   
    skipped 23 lines
    107 128  class Gif(Medium):
    108 129   thumbnailUrl: str
    109 130   variants: typing.List[VideoVariant]
    110  - 
    111  - 
    112  -@dataclasses.dataclass
    113  -class DescriptionURL:
    114  - text: typing.Optional[str]
    115  - url: str
    116  - tcourl: str
    117  - indices: typing.Tuple[int, int]
    118 131   
    119 132   
    120 133  @dataclasses.dataclass
    skipped 326 lines
    447 460   displayname: typing.Optional[str] = None
    448 461   description: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced
    449 462   rawDescription: typing.Optional[str] = None # Raw description with the URL(s) intact
    450  - descriptionUrls: typing.Optional[typing.List[DescriptionURL]] = None
     463 + descriptionLinks: typing.Optional[typing.List[TextLink]] = None
    451 464   verified: typing.Optional[bool] = None
    452 465   created: typing.Optional[datetime.datetime] = None
    453 466   followersCount: typing.Optional[int] = None
    skipped 4 lines
    458 471   mediaCount: typing.Optional[int] = None
    459 472   location: typing.Optional[str] = None
    460 473   protected: typing.Optional[bool] = None
    461  - linkUrl: typing.Optional[str] = None
    462  - linkTcourl: typing.Optional[str] = None
     474 + link: typing.Optional[TextLink] = None
    463 475   profileImageUrl: typing.Optional[str] = None
    464 476   profileBannerUrl: typing.Optional[str] = None
    465 477   label: typing.Optional['UserLabel'] = None
     478 + 
     479 + descriptionUrls = snscrape.base._DeprecatedProperty('descriptionUrls', lambda self: self.descriptionLinks, 'descriptionLinks')
     480 + linkUrl = snscrape.base._DeprecatedProperty('linkUrl', lambda self: self.link.url if self.link else None, 'link.url')
     481 + linkTcourl = snscrape.base._DeprecatedProperty('linkTcourl', lambda self: self.link.tcourl if self.link else None, 'link.tcourl')
    466 482   
    467 483   @property
    468 484   def url(self):
    skipped 346 lines
    815 831   kwargs['user'] = user
    816 832   kwargs['date'] = email.utils.parsedate_to_datetime(tweet['created_at'])
    817 833   if tweet['entities'].get('urls'):
    818  - kwargs['outlinks'] = [u['expanded_url'] for u in tweet['entities']['urls']]
    819  - kwargs['tcooutlinks'] = [u['url'] for u in tweet['entities']['urls']]
     834 + kwargs['links'] = [TextLink(
     835 + text = u.get('display_url'),
     836 + url = u['expanded_url'],
     837 + tcourl = u['url'],
     838 + indices = tuple(u['indices']),
     839 + ) for u in tweet['entities']['urls']]
    820 840   kwargs['url'] = f'https://twitter.com/{user.username}/status/{tweetId}'
    821 841   kwargs['replyCount'] = tweet['reply_count']
    822 842   kwargs['retweetCount'] = tweet['retweet_count']
    skipped 54 lines
    877 897   if hasattr(card, 'url') and '//t.co/' in card.url:
    878 898   # Try to convert the URL to the non-shortened/t.co one
    879 899   # Retweets inherit the card but not the outlinks; try to get them from the retweeted tweet instead in that case.
    880  - if 'tcooutlinks' in kwargs and card.url in kwargs['tcooutlinks']:
    881  - card.url = kwargs['outlinks'][kwargs['tcooutlinks'].index(card.url)]
    882  - elif retweetedTweet and retweetedTweet.tcooutlinks and card.url in retweetedTweet.tcooutlinks:
    883  - card.url = retweetedTweet.outlinks[retweetedTweet.tcooutlinks.index(card.url)]
     900 + candidates = []
     901 + if 'links' in kwargs:
     902 + candidates.extend(kwargs['links'])
     903 + if retweetedTweet:
     904 + candidates.extend(retweetedTweet.links)
     905 + for u in candidates:
     906 + if u.tcourl == card.url:
     907 + card.url = u.url
     908 + break
    884 909   else:
    885 910   _logger.warning(f'Could not translate t.co card URL on tweet {tweetId}')
    886 911   return Tweet(**kwargs)
    skipped 416 lines
    1303 1328   kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls'))
    1304 1329   kwargs['rawDescription'] = user['description']
    1305 1330   if user['entities']['description'].get('urls'):
    1306  - kwargs['descriptionUrls'] = [DescriptionURL(
    1307  - text = x.get('display_url'),
    1308  - url = x['expanded_url'],
    1309  - tcourl = x['url'],
    1310  - indices = tuple(x['indices']),
    1311  - ) for x in user['entities']['description']['urls']]
     1331 + kwargs['descriptionLinks'] = [TextLink(
     1332 + text = x.get('display_url'),
     1333 + url = x['expanded_url'],
     1334 + tcourl = x['url'],
     1335 + indices = tuple(x['indices']),
     1336 + ) for x in user['entities']['description']['urls']]
    1312 1337   kwargs['verified'] = user.get('verified')
    1313 1338   kwargs['created'] = email.utils.parsedate_to_datetime(user['created_at'])
    1314 1339   kwargs['followersCount'] = user['followers_count']
    skipped 4 lines
    1319 1344   kwargs['mediaCount'] = user['media_count']
    1320 1345   kwargs['location'] = user['location']
    1321 1346   kwargs['protected'] = user.get('protected')
    1322  - if 'url' in user['entities']:
    1323  - kwargs['linkUrl'] = (user['entities']['url']['urls'][0].get('expanded_url') or user.get('url'))
    1324  - kwargs['linkTcourl'] = user.get('url')
     1347 + if user.get('url'):
     1348 + entity = user['entities'].get('url', {}).get('urls', [None])[0]
     1349 + if not entity or entity['url'] != user['url']:
     1350 + self.logger.warning(f'Link inconsistency on user {kwargs["id"]}')
     1351 + if not entity:
     1352 + entity = {'display_url': None, 'expanded_url': user['url'], 'indices': (0, len(user['url']))}
     1353 + kwargs['link'] = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['url'], indices = tuple(entity['indices']))
    1325 1354   kwargs['profileImageUrl'] = user['profile_image_url_https']
    1326 1355   kwargs['profileBannerUrl'] = user.get('profile_banner_url')
    1327 1356   if 'ext' in user and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')):
    skipped 119 lines
    1447 1476   user = obj['data']['user']['result']
    1448 1477   rawDescription = user['legacy']['description']
    1449 1478   description = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls'])
     1479 + link = None
     1480 + if user['legacy'].get('url'):
     1481 + entity = user['legacy']['entities'].get('url', {}).get('urls', [None])[0]
     1482 + if not entity or entity['url'] != user['legacy']['url']:
     1483 + self.logger.warning(f'Link inconsistency on user')
     1484 + if not entity:
     1485 + entity = {'display_url': None, 'expanded_url': user['legacy']['url'], 'indices': (0, len(user['legacy']['url']))}
     1486 + link = TextLink(text = entity['display_url'], url = entity['expanded_url'], tcourl = user['legacy']['url'], indices = tuple(entity['indices']))
    1450 1487   label = None
    1451 1488   if (labelO := user['affiliates_highlighted_label'].get('label')):
    1452 1489   label = self._user_label_to_user_label(labelO)
    skipped 3 lines
    1456 1493   displayname = user['legacy']['name'],
    1457 1494   description = description,
    1458 1495   rawDescription = rawDescription,
    1459  - descriptionUrls = [DescriptionURL(
    1460  - text = x.get('display_url'),
    1461  - url = x['expanded_url'],
    1462  - tcourl = x['url'],
    1463  - indices = tuple(x['indices']),
    1464  - ) for x in user['legacy']['entities']['description']['urls']],
     1496 + descriptionLinks = [TextLink(
     1497 + text = x.get('display_url'),
     1498 + url = x['expanded_url'],
     1499 + tcourl = x['url'],
     1500 + indices = tuple(x['indices']),
     1501 + ) for x in user['legacy']['entities']['description']['urls']],
    1465 1502   verified = user['legacy']['verified'],
    1466 1503   created = email.utils.parsedate_to_datetime(user['legacy']['created_at']),
    1467 1504   followersCount = user['legacy']['followers_count'],
    skipped 4 lines
    1472 1509   mediaCount = user['legacy']['media_count'],
    1473 1510   location = user['legacy']['location'],
    1474 1511   protected = user['legacy']['protected'],
    1475  - linkUrl = user['legacy']['entities']['url']['urls'][0]['expanded_url'] if 'url' in user['legacy']['entities'] else None,
    1476  - linkTcourl = user['legacy'].get('url'),
     1512 + link = link,
    1477 1513   profileImageUrl = user['legacy']['profile_image_url_https'],
    1478 1514   profileBannerUrl = user['legacy'].get('profile_banner_url'),
    1479 1515   label = label,
    skipped 236 lines
Please wait...
Page is in error, reload to recover