- __a=1 is no longer supported, so we need to extract the JSON from the HTML page instead.
- There is now an X-Instagram-GIS header that needs to be set correctly.
Revision indexing in progress... (symbol navigation in revisions will be accurate once indexing completes)
Total 1 file
■ ■ ■ ■ ■ ■
socialmediascraper/modules/instagram.py
1
+
import hashlib
1
2
import json
2
3
import logging
3
4
import socialmediascraper.base
skipped 18 lines
22
23
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
23
24
24
25
logger.info('Retrieving initial data')
25
-
r = self._get(f'https://www.instagram.com/{self._username}/?__a=1', headers = headers)
26
+
r = self._get(f'https://www.instagram.com/{self._username}/', headers = headers)
26
27
if r.status_code == 404:
27
28
logger.warning('User does not exist')
28
29
return
29
30
elif r.status_code != 200:
30
31
logger.error(f'Got status code {r.status_code}')
31
32
return
32
-
response = json.loads(r.text)
33
-
if response['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
33
+
jsonData = r.text.split('<script type="text/javascript">window._sharedData = ')[1].split(';</script>')[0] # May throw an IndexError if Instagram changes something again; we just let that bubble.
34
+
response = json.loads(jsonData)
35
+
rhxGis = response['rhx_gis']
36
+
if response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
34
37
logger.info('User has no posts')
35
38
return
36
-
if not response['graphql']['user']['edge_owner_to_timeline_media']['edges']:
39
+
if not response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
37
40
logger.warning('Private account')
38
41
return
39
-
userID = response['graphql']['user']['id']
40
-
username = response['graphql']['user']['username'] # Might have different capitalisation than self._username
41
-
yield from self._response_to_items(response['graphql'], username)
42
-
if not response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']: