Projects STRLCPY snscrape Commits 82351800
🤬
  • ■ ■ ■ ■
    snscrape/base.py
    skipped 228 lines
    229 229   
    230 230   @classmethod
    231 231   def _cli_from_args(cls, args):
    232  - return cls._construct(args)
     232 + return cls._cli_construct(args)
    233 233   
    234 234   @classmethod
    235 235   def _cli_construct(cls, argparseArgs, *args, **kwargs):
    skipped 12 lines
  • ■ ■ ■ ■ ■ ■
    snscrape/modules/twitter.py
    1 1  __all__ = [
    2  - 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'DescriptionUrl', 'Coordinates', 'Place',
     2 + 'Tweet', 'Medium', 'Photo', 'VideoVariant', 'Video', 'Gif', 'TextLink', 'Coordinates', 'Place',
    3 3   'User', 'UserLabel',
    4 4   'Trend',
    5 5   'GuestTokenManager',
    skipped 25 lines
    31 31  import time
    32 32  import typing
    33 33  import urllib.parse
     34 +import warnings
     35 + 
     36 + 
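     37 +# Backwards compatibility: DescriptionURL was renamed to TextLink; the old name still resolves through the module-level __getattr__ below but emits a FutureWarning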
     38 +_DEPRECATED_NAMES = {'DescriptionURL': 'TextLink'}
     39 +def __getattr__(name):
     40 + if name in _DEPRECATED_NAMES:
     41 + warnings.warn(f'{name} is deprecated, use {_DEPRECATED_NAMES[name]} instead', FutureWarning, stacklevel = 2)
     42 + return globals()[_DEPRECATED_NAMES[name]]
     43 + raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
     44 +def __dir__():
     45 + return sorted(__all__ + list(_DEPRECATED_NAMES.keys()))
    34 46   
    35 47   
    36 48  _logger = logging.getLogger(__name__)
    skipped 6 lines
    43 55  class Tweet(snscrape.base.Item):
    44 56   url: str
    45 57   date: datetime.datetime
    46  - content: str
     58 + rawContent: str
    47 59   renderedContent: str
    48 60   id: int
    49 61   user: 'User'
    skipped 6 lines
    56 68   source: str
    57 69   sourceUrl: typing.Optional[str] = None
    58 70   sourceLabel: typing.Optional[str] = None
    59  - outlinks: typing.Optional[typing.List[str]] = None
    60  - tcooutlinks: typing.Optional[typing.List[str]] = None
     71 + links: typing.Optional[typing.List['TextLink']] = None
    61 72   media: typing.Optional[typing.List['Medium']] = None
    62 73   retweetedTweet: typing.Optional['Tweet'] = None
    63 74   quotedTweet: typing.Optional['Tweet'] = None
    skipped 7 lines
    71 82   card: typing.Optional['Card'] = None
    72 83   
    73 84   username = snscrape.base._DeprecatedProperty('username', lambda self: self.user.username, 'user.username')
    74  - outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(self.outlinks) if self.outlinks else '', 'outlinks')
    75  - tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(self.tcooutlinks) if self.tcooutlinks else '', 'tcooutlinks')
     85 + outlinks = snscrape.base._DeprecatedProperty('outlinks', lambda self: [x.url for x in self.links] if self.links else [], 'links (url attribute)')
     86 + outlinksss = snscrape.base._DeprecatedProperty('outlinksss', lambda self: ' '.join(x.url for x in self.links) if self.links else '', 'links (url attribute)')
     87 + tcooutlinks = snscrape.base._DeprecatedProperty('tcooutlinks', lambda self: [x.tcourl for x in self.links] if self.links else [], 'links (tcourl attribute)')
     88 + tcooutlinksss = snscrape.base._DeprecatedProperty('tcooutlinksss', lambda self: ' '.join(x.tcourl for x in self.links) if self.links else '', 'links (tcourl attribute)')
     89 + content = snscrape.base._DeprecatedProperty('content', lambda self: self.rawContent, 'rawContent')
    76 90   
    77 91   def __str__(self):
    78 92   return self.url
     93 + 
     94 + 
     95 +@dataclasses.dataclass
     96 +class TextLink:
     97 + text: typing.Optional[str]
     98 + url: str
     99 + tcourl: str
     100 + indices: typing.Tuple[int, int]
    79 101   
    80 102   
    81 103  class Medium:
    skipped 28 lines
    110 132   
    111 133   
    112 134  @dataclasses.dataclass
    113  -class DescriptionURL:
    114  - text: typing.Optional[str]
    115  - url: str
    116  - tcourl: str
    117  - indices: typing.Tuple[int, int]
    118  - 
    119  - 
    120  -@dataclasses.dataclass
    121 135  class Coordinates:
    122 136   longitude: float
    123 137   latitude: float
    skipped 321 lines
    445 459   username: str
    446 460   id: int
    447 461   displayname: typing.Optional[str] = None
    448  - description: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced
    449 462   rawDescription: typing.Optional[str] = None # Raw description with the URL(s) intact
    450  - descriptionUrls: typing.Optional[typing.List[DescriptionURL]] = None
     463 + renderedDescription: typing.Optional[str] = None # Description as it's displayed on the web interface with URLs replaced
     464 + descriptionLinks: typing.Optional[typing.List[TextLink]] = None
    451 465   verified: typing.Optional[bool] = None
    452 466   created: typing.Optional[datetime.datetime] = None
    453 467   followersCount: typing.Optional[int] = None
    skipped 4 lines
    458 472   mediaCount: typing.Optional[int] = None
    459 473   location: typing.Optional[str] = None
    460 474   protected: typing.Optional[bool] = None
    461  - linkUrl: typing.Optional[str] = None
    462  - linkTcourl: typing.Optional[str] = None
     475 + link: typing.Optional[TextLink] = None
    463 476   profileImageUrl: typing.Optional[str] = None
    464 477   profileBannerUrl: typing.Optional[str] = None
    465 478   label: typing.Optional['UserLabel'] = None
    466 479   
     480 + descriptionUrls = snscrape.base._DeprecatedProperty('descriptionUrls', lambda self: self.descriptionLinks, 'descriptionLinks')
     481 + linkUrl = snscrape.base._DeprecatedProperty('linkUrl', lambda self: self.link.url if self.link else None, 'link.url')
     482 + linkTcourl = snscrape.base._DeprecatedProperty('linkTcourl', lambda self: self.link.tcourl if self.link else None, 'link.tcourl')
     483 + description = snscrape.base._DeprecatedProperty('description', lambda self: self.renderedDescription, 'renderedDescription')
     484 + 
    467 485   @property
    468 486   def url(self):
    469 487   return f'https://twitter.com/{self.username}'
    skipped 78 lines
    548 566   return None
    549 567   _logger.info(f'Reading guest token from {self._file}')
    550 568   with open(self._file, 'r') as fp:
    551  - o = json.load(fp)
     569 + try:
     570 + o = json.load(fp)
     571 + except json.JSONDecodeError as e:
     572 + _logger.warning(f'Malformed guest token file {self._file}: {e!s}')
     573 + self.reset()
     574 + return None
    552 575   self._token = o['token']
    553 576   self._setTime = o['setTime']
    554 577   if self._setTime < time.time() - _GUEST_TOKEN_VALIDITY:
    skipped 255 lines
    810 833   tweetId = self._get_tweet_id(tweet)
    811 834   kwargs = {}
    812 835   kwargs['id'] = tweetId
    813  - kwargs['content'] = tweet['full_text']
     836 + kwargs['rawContent'] = tweet['full_text']
    814 837   kwargs['renderedContent'] = self._render_text_with_urls(tweet['full_text'], tweet['entities'].get('urls'))
    815 838   kwargs['user'] = user
    816 839   kwargs['date'] = email.utils.parsedate_to_datetime(tweet['created_at'])
    817 840   if tweet['entities'].get('urls'):
    818  - kwargs['outlinks'] = [u['expanded_url'] for u in tweet['entities']['urls']]
    819  - kwargs['tcooutlinks'] = [u['url'] for u in tweet['entities']['urls']]
     841 + kwargs['links'] = [TextLink(
     842 + text = u.get('display_url'),
     843 + url = u['expanded_url'],
     844 + tcourl = u['url'],
     845 + indices = tuple(u['indices']),
     846 + ) for u in tweet['entities']['urls']]
    820 847   kwargs['url'] = f'https://twitter.com/{user.username}/status/{tweetId}'
    821 848   kwargs['replyCount'] = tweet['reply_count']
    822 849   kwargs['retweetCount'] = tweet['retweet_count']
    skipped 54 lines
    877 904   if hasattr(card, 'url') and '//t.co/' in card.url:
    878 905   # Try to convert the URL to the non-shortened/t.co one
    879 906   # Retweets inherit the card but not the outlinks; try to get them from the retweeted tweet instead in that case.
    880  - if 'tcooutlinks' in kwargs and card.url in kwargs['tcooutlinks']:
    881  - card.url = kwargs['outlinks'][kwargs['tcooutlinks'].index(card.url)]
    882  - elif retweetedTweet and retweetedTweet.tcooutlinks and card.url in retweetedTweet.tcooutlinks:
    883  - card.url = retweetedTweet.outlinks[retweetedTweet.tcooutlinks.index(card.url)]
     907 + candidates = []
     908 + if 'links' in kwargs:
     909 + candidates.extend(kwargs['links'])
     910 + if retweetedTweet:
     911 + candidates.extend(retweetedTweet.links)
     912 + for u in candidates:
     913 + if u.tcourl == card.url:
     914 + card.url = u.url
     915 + break
    884 916   else:
    885 917   _logger.warning(f'Could not translate t.co card URL on tweet {tweetId}')
    886 918   return Tweet(**kwargs)
    skipped 413 lines
    1300 1332   kwargs['username'] = user['screen_name']
    1301 1333   kwargs['id'] = id_ if id_ else user['id'] if 'id' in user else int(user['id_str'])
    1302 1334   kwargs['displayname'] = user['name']
    1303  - kwargs['description'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls'))
    1304 1335   kwargs['rawDescription'] = user['description']
     1336 + kwargs['renderedDescription'] = self._render_text_with_urls(user['description'], user['entities']['description'].get('urls'))
    1305 1337   if user['entities']['description'].get('urls'):
    1306  - kwargs['descriptionUrls'] = [{'text': x.get('display_url'), 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in user['entities']['description']['urls']]
     1338 + kwargs['descriptionLinks'] = [TextLink(
     1339 + text = x.get('display_url'),
     1340 + url = x['expanded_url'],
     1341 + tcourl = x['url'],
     1342 + indices = tuple(x['indices']),
     1343 + ) for x in user['entities']['description']['urls']]
    1307 1344   kwargs['verified'] = user.get('verified')
    1308 1345   kwargs['created'] = email.utils.parsedate_to_datetime(user['created_at'])
    1309 1346   kwargs['followersCount'] = user['followers_count']
    skipped 4 lines
    1314 1351   kwargs['mediaCount'] = user['media_count']
    1315 1352   kwargs['location'] = user['location']
    1316 1353   kwargs['protected'] = user.get('protected')
    1317  - if 'url' in user['entities']:
    1318  - kwargs['linkUrl'] = (user['entities']['url']['urls'][0].get('expanded_url') or user.get('url'))
    1319  - kwargs['linkTcourl'] = user.get('url')
     1354 + if user.get('url'):
     1355 + entity = user['entities'].get('url', {}).get('urls', [None])[0]
     1356 + if not entity or entity['url'] != user['url']:
     1357 + self.logger.warning(f'Link inconsistency on user {kwargs["id"]}')
     1358 + if not entity:
     1359 + entity = {'indices': (0, len(user['url']))}
     1360 + kwargs['link'] = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['url']), tcourl = user['url'], indices = tuple(entity['indices']))
    1320 1361   kwargs['profileImageUrl'] = user['profile_image_url_https']
    1321 1362   kwargs['profileBannerUrl'] = user.get('profile_banner_url')
    1322 1363   if 'ext' in user and (label := user['ext']['highlightedLabel']['r']['ok'].get('label')):
    skipped 50 lines
    1373 1414   'include_mute_edge': '1',
    1374 1415   'include_can_dm': '1',
    1375 1416   'include_can_media_tag': '1',
     1417 + 'include_ext_has_nft_avatar': '1',
    1376 1418   'skip_status': '1',
    1377 1419   'cards_platform': 'Web-12',
    1378 1420   'include_cards': '1',
    skipped 5 lines
    1384 1426   'include_user_entities': 'true',
    1385 1427   'include_ext_media_color': 'true',
    1386 1428   'include_ext_media_availability': 'true',
     1429 + 'include_ext_sensitive_media_warning': 'true',
     1430 + 'include_ext_trusted_friends_metadata': 'true',
    1387 1431   'send_error_codes': 'true',
    1388  - 'simple_quoted_tweets': 'true',
     1432 + 'simple_quoted_tweet': 'true',
    1389 1433   'q': self._query,
    1390 1434   'tweet_search_mode': 'live',
    1391  - 'count': '100',
     1435 + 'count': '20',
    1392 1436   'query_source': 'spelling_expansion_revert_click',
    1393 1437   'cursor': None,
    1394 1438   'pc': '1',
    1395 1439   'spelling_corrections': '1',
    1396  - 'ext': 'mediaStats,highlightedLabel',
     1440 + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo',
    1397 1441   }
    1398 1442   params = paginationParams.copy()
    1399 1443   del params['cursor']
    skipped 41 lines
    1441 1485   return None
    1442 1486   user = obj['data']['user']['result']
    1443 1487   rawDescription = user['legacy']['description']
    1444  - description = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls'])
     1488 + renderedDescription = self._render_text_with_urls(rawDescription, user['legacy']['entities']['description']['urls'])
     1489 + link = None
     1490 + if user['legacy'].get('url'):
     1491 + entity = user['legacy']['entities'].get('url', {}).get('urls', [None])[0]
     1492 + if not entity or entity['url'] != user['legacy']['url']:
     1493 + self.logger.warning(f'Link inconsistency on user')
     1494 + if not entity:
     1495 + entity = {'indices': (0, len(user['legacy']['url']))}
     1496 + link = TextLink(text = entity.get('display_url'), url = entity.get('expanded_url', user['legacy']['url']), tcourl = user['legacy']['url'], indices = tuple(entity['indices']))
    1445 1497   label = None
    1446 1498   if (labelO := user['affiliates_highlighted_label'].get('label')):
    1447 1499   label = self._user_label_to_user_label(labelO)
    skipped 1 lines
    1449 1501   username = user['legacy']['screen_name'],
    1450 1502   id = int(user['rest_id']),
    1451 1503   displayname = user['legacy']['name'],
    1452  - description = description,
    1453 1504   rawDescription = rawDescription,
    1454  - descriptionUrls = [{'text': x.get('display_url'), 'url': x['expanded_url'], 'tcourl': x['url'], 'indices': tuple(x['indices'])} for x in user['legacy']['entities']['description']['urls']],
     1505 + renderedDescription = renderedDescription,
     1506 + descriptionLinks = [TextLink(
     1507 + text = x.get('display_url'),
     1508 + url = x['expanded_url'],
     1509 + tcourl = x['url'],
     1510 + indices = tuple(x['indices']),
     1511 + ) for x in user['legacy']['entities']['description']['urls']],
    1455 1512   verified = user['legacy']['verified'],
    1456 1513   created = email.utils.parsedate_to_datetime(user['legacy']['created_at']),
    1457 1514   followersCount = user['legacy']['followers_count'],
    skipped 4 lines
    1462 1519   mediaCount = user['legacy']['media_count'],
    1463 1520   location = user['legacy']['location'],
    1464 1521   protected = user['legacy']['protected'],
    1465  - linkUrl = user['legacy']['entities']['url']['urls'][0]['expanded_url'] if 'url' in user['legacy']['entities'] else None,
    1466  - linkTcourl = user['legacy'].get('url'),
     1522 + link = link,
    1467 1523   profileImageUrl = user['legacy']['profile_image_url_https'],
    1468 1524   profileBannerUrl = user['legacy'].get('profile_banner_url'),
    1469 1525   label = label,
    skipped 203 lines
    1673 1729   'include_mute_edge': '1',
    1674 1730   'include_can_dm': '1',
    1675 1731   'include_can_media_tag': '1',
     1732 + 'include_ext_has_nft_avatar': '1',
    1676 1733   'skip_status': '1',
    1677 1734   'cards_platform': 'Web-12',
    1678 1735   'include_cards': '1',
    skipped 5 lines
    1684 1741   'include_user_entities': 'true',
    1685 1742   'include_ext_media_color': 'true',
    1686 1743   'include_ext_media_availability': 'true',
     1744 + 'include_ext_sensitive_media_warning': 'true',
     1745 + 'include_ext_trusted_friends_metadata': 'true',
    1687 1746   'send_error_codes': 'true',
    1688 1747   'simple_quoted_tweet': 'true',
    1689 1748   'count': '20',
    1690 1749   'candidate_source': 'trends',
    1691 1750   'include_page_configuration': 'false',
    1692 1751   'entity_tokens': 'false',
    1693  - 'ext': 'mediaStats,highlightedLabel,voiceInfo',
     1752 + 'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo',
    1694 1753   }
    1695 1754   obj = self._get_api_data('https://twitter.com/i/api/2/guide.json', _TwitterAPIType.V2, params)
    1696 1755   for instruction in obj['timeline']['instructions']:
    skipped 9 lines
  • ■ ■ ■ ■
    snscrape/modules/vkontakte.py
    skipped 31 lines
    32 32  _months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    33 33  _datePattern = re.compile(r'^(?P<date>today'
    34 34   r'|yesterday'
    35  - r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + ')(\s+(?P<year1>\d{4}))?'
     35 + r'|(?P<day1>\d+)\s+(?P<month1>' + '|'.join(_months) + r')(\s+(?P<year1>\d{4}))?'
    36 36   r'|(?P<month2>' + '|'.join(_months) + r')\s+(?P<day2>\d+),\s+(?P<year2>\d{4})'
    37 37   ')'
    38 38   r'\s+at\s+(?P<hour>\d+):(?P<minute>\d+)\s+(?P<ampm>[ap]m)$')
    skipped 355 lines
Please wait...
Page is in error, reload to recover