Projects STRLCPY snscrape Commits e28a2cdb
🤬
  • Fix Instagram again

    - __a=1 is no longer supported, so we need to extract the JSON from the HTML page instead.
    - There is now an X-Instagram-GIS header that needs to be set correctly.
  • Loading...
  • JustAnotherArchivist committed 6 years ago
    e28a2cdb
    1 parent 5a084af8
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)
  • ■ ■ ■ ■ ■ ■
    socialmediascraper/modules/instagram.py
     1 +import hashlib
    1 2  import json
    2 3  import logging
    3 4  import socialmediascraper.base
    skipped 18 lines
    22 23   headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    23 24   
    24 25   logger.info('Retrieving initial data')
    25  - r = self._get(f'https://www.instagram.com/{self._username}/?__a=1', headers = headers)
     26 + r = self._get(f'https://www.instagram.com/{self._username}/', headers = headers)
    26 27   if r.status_code == 404:
    27 28   logger.warning('User does not exist')
    28 29   return
    29 30   elif r.status_code != 200:
    30 31   logger.error(f'Got status code {r.status_code}')
    31 32   return
    32  - response = json.loads(r.text)
    33  - if response['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
     33 + jsonData = r.text.split('<script type="text/javascript">window._sharedData = ')[1].split(';</script>')[0] # May throw an IndexError if Instagram changes something again; we just let that bubble.
     34 + response = json.loads(jsonData)
     35 + rhxGis = response['rhx_gis']
     36 + if response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
    34 37   logger.info('User has no posts')
    35 38   return
    36  - if not response['graphql']['user']['edge_owner_to_timeline_media']['edges']:
     39 + if not response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
    37 40   logger.warning('Private account')
    38 41   return
    39  - userID = response['graphql']['user']['id']
    40  - username = response['graphql']['user']['username'] # Might have different capitalisation than self._username
    41  - yield from self._response_to_items(response['graphql'], username)
    42  - if not response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
     42 + userID = response['entry_data']['ProfilePage'][0]['graphql']['user']['id']
     43 + username = response['entry_data']['ProfilePage'][0]['graphql']['user']['username'] # Might have different capitalisation than self._username
     44 + yield from self._response_to_items(response['entry_data']['ProfilePage'][0]['graphql'], username)
     45 + if not response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
    43 46   return
    44  - endCursor = response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
     47 + endCursor = response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
    45 48   
    46  - # Cf. https://stackoverflow.com/questions/49265339/instagram-a-1-url-doesnt-allow-max-id and https://github.com/rarcega/instagram-scraper
    47 49   while True:
    48 50   logger.info(f'Retrieving endCursor = {endCursor!r}')
    49  - r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={{"id":"{userID}","first":12,"after":"{endCursor}"}}', headers = headers)
     51 + variables = f'{{"id":"{userID}","first":50,"after":"{endCursor}"}}'
     52 + headers['X-Requested-With'] = 'XMLHttpRequest'
     53 + headers['X-Instagram-GIS'] = hashlib.md5(f'{rhxGis}:{variables}'.encode('utf-8')).hexdigest()
     54 + r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={variables}', headers = headers)
    50 55   
    51 56   if r.status_code != 200:
    52 57   logger.error(f'Got status code {r.status_code}')
    skipped 18 lines
Please wait...
Page is in error, reload to recover