STRLCPY/snscrape

Fix Instagram again

- __a=1 is no longer supported, so we need to extract the JSON from the HTML page instead.
- There is now an X-Instagram-GIS header that needs to be set correctly.

JustAnotherArchivist committed 6 years ago

e28a2cdb

1 parent 5a084af8

Revision indexing in progress... (symbol navigation in revisions will be accurate once indexing has completed)

Total 1 file

■ ■ ■ ■ ■ ■

socialmediascraper/modules/instagram.py

	1	+	import hashlib
1	2		import json
2	3		import logging
3	4		import socialmediascraper.base
		skipped 18 lines
22	23		headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
23	24
24	25		logger.info('Retrieving initial data')
25		-	r = self._get(f'https://www.instagram.com/{self._username}/?__a=1', headers = headers)
	26	+	r = self._get(f'https://www.instagram.com/{self._username}/', headers = headers)
26	27		if r.status_code == 404:
27	28		logger.warning('User does not exist')
28	29		return
29	30		elif r.status_code != 200:
30	31		logger.error(f'Got status code {r.status_code}')
31	32		return
32		-	response = json.loads(r.text)
33		-	if response['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
	33	+	jsonData = r.text.split('<script type="text/javascript">window._sharedData = ')[1].split(';</script>')[0] # May throw an IndexError if Instagram changes something again; we just let that bubble.
	34	+	response = json.loads(jsonData)
	35	+	rhxGis = response['rhx_gis']
	36	+	if response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['count'] == 0:
34	37		logger.info('User has no posts')
35	38		return
36		-	if not response['graphql']['user']['edge_owner_to_timeline_media']['edges']:
	39	+	if not response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
37	40		logger.warning('Private account')
38	41		return
39		-	userID = response['graphql']['user']['id']
40		-	username = response['graphql']['user']['username'] # Might have different capitalisation than self._username
41		-	yield from self._response_to_items(response['graphql'], username)
42		-	if not response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
	42	+	userID = response['entry_data']['ProfilePage'][0]['graphql']['user']['id']
	43	+	username = response['entry_data']['ProfilePage'][0]['graphql']['user']['username'] # Might have different capitalisation than self._username
	44	+	yield from self._response_to_items(response['entry_data']['ProfilePage'][0]['graphql'], username)
	45	+	if not response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info']['has_next_page']:
43	46		return
44		-	endCursor = response['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
	47	+	endCursor = response['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info']['end_cursor']
45	48
46		-	# Cf. https://stackoverflow.com/questions/49265339/instagram-a-1-url-doesnt-allow-max-id and https://github.com/rarcega/instagram-scraper
47	49		while True:
48	50		logger.info(f'Retrieving endCursor = {endCursor!r}')
49		-	r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={{"id":"{userID}","first":12,"after":"{endCursor}"}}', headers = headers)
	51	+	variables = f'{{"id":"{userID}","first":50,"after":"{endCursor}"}}'
	52	+	headers['X-Requested-With'] = 'XMLHttpRequest'
	53	+	headers['X-Instagram-GIS'] = hashlib.md5(f'{rhxGis}:{variables}'.encode('utf-8')).hexdigest()
	54	+	r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={variables}', headers = headers)
50	55
51	56		if r.status_code != 200:
52	57		logger.error(f'Got status code {r.status_code}')
		skipped 18 lines