Projects STRLCPY snscrape Commits 5a084af8
🤬
  • Fix Instagram

    Instagram dropped the max_id parameter, so it is no longer possible to page through a user's posts that way. Switch to the GraphQL endpoint instead, which is also what the browser uses.
  • Loading...
  • JustAnotherArchivist committed 6 years ago
    5a084af8
    1 parent 14831d41
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)
  • ■ ■ ■ ■ ■ ■
    socialmediascraper/modules/instagram.py
    skipped 12 lines
    13 13   super().__init__(**kwargs)
    14 14   self._username = username
    15 15   
    16  - def _response_to_items(self, response):
    17  - username = response['user']['username'] # Might have different capitalisation than self._username
    18  - 
    19  - for node in response['user']['media']['nodes']:
    20  - code = node['code']
     16 + def _response_to_items(self, response, username):
     17 + for node in response['user']['edge_owner_to_timeline_media']['edges']:
     18 + code = node['node']['shortcode']
    21 19   yield socialmediascraper.base.URLItem(f'https://www.instagram.com/p/{code}/?taken-by={username}') #TODO: Do we want the taken-by parameter in here?
    22 20   
    23 21   def get_items(self):
    24 22   headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    25 23   
    26  - maxID = None
     24 + logger.info('Retrieving initial data')
     25 + r = self._get(f'https://www.instagram.com/{self._username}/?__a=1', headers = headers)
     26 + if r.status_code == 404:
     27 + logger.warning('User does not exist')
     28 + return
     29 + elif r.status_code != 200:
     30 + logger.error(f'Got status code {r.status_code}')
     31 + return
     32 + response = json.loads(r.text)
     33 + if response['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
     34 + logger.info('User has no posts')
     35 + return
     36 + if not response['graphql']['user']['edge_owner_to_timeline_media']['edges']:
     37 + logger.warning('Private account')
     38 + return
     39 + userID = response['graphql']['user']['id']
     40 + username = response['graphql']['user']['username'] # Might have different capitalisation than self._username
     41 + yield from self._response_to_items(response['graphql'], username)
     42 + if not response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
     43 + return
     44 + endCursor = response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
    27 45   
     46 + # Cf. https://stackoverflow.com/questions/49265339/instagram-a-1-url-doesnt-allow-max-id and https://github.com/rarcega/instagram-scraper
    28 47   while True:
    29  - logger.info(f'Retrieving max_id = {maxID!r}')
    30  - if maxID is None:
    31  - url = f'https://www.instagram.com/{self._username}/?__a=1'
    32  - else:
    33  - url = f'https://www.instagram.com/{self._username}/?__a=1&max_id={maxID}'
    34  - r = self._get(url, headers = headers)
     48 + logger.info(f'Retrieving endCursor = {endCursor!r}')
     49 + r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={{"id":"{userID}","first":12,"after":"{endCursor}"}}', headers = headers)
    35 50   
    36  - #TODO: Handle 404 (HTML)
     51 + if r.status_code != 200:
     52 + logger.error(f'Got status code {r.status_code}')
     53 + return
    37 54   
    38 55   response = json.loads(r.text)
    39  - if not response['user']['media']['nodes']:
     56 + if not response['data']['user']['edge_owner_to_timeline_media']['edges']:
    40 57   return
    41  - yield from self._response_to_items(response)
    42  - maxID = response['user']['media']['nodes'][-1]['id']
     58 + yield from self._response_to_items(response['data'], username)
     59 + if not response['data']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
     60 + return
     61 + endCursor = response['data']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
    43 62   
    44 63   @classmethod
    45 64   def setup_parser(cls, subparser):
    skipped 6 lines
Please wait...
Page is in error, reload to recover