■ ■ ■ ■ ■ ■
socialmediascraper/modules/instagram.py
# Excerpt from the body of the Instagram scraper class. The class header and
# the 12 lines before this point are outside the excerpt; the visible tail of
# __init__ is:
#     super().__init__(**kwargs)
#     self._username = username


def _response_to_items(self, response, username):
    """Yield one URLItem per post contained in a timeline response.

    ``response`` is the mapping that holds the ``user`` object, i.e. the
    ``graphql`` member of the profile JSON or the ``data`` member of a
    GraphQL query result.  ``username`` is placed in the ``taken-by`` query
    parameter of every generated post URL.
    """
    for edge in response['user']['edge_owner_to_timeline_media']['edges']:
        code = edge['node']['shortcode']
        # TODO: Do we want the taken-by parameter in here?
        yield socialmediascraper.base.URLItem(f'https://www.instagram.com/p/{code}/?taken-by={username}')


def get_items(self):
    """Yield a URLItem for every post on the user's timeline.

    The first page comes from the profile's ``?__a=1`` JSON; further pages
    are fetched from the GraphQL query endpoint by following
    ``page_info['end_cursor']`` until ``has_next_page`` is false or a page
    has no edges.

    The generator stops early, after logging, when the profile request
    returns 404 or any other non-200 status, when the post count is zero,
    when the count is non-zero but no edges are returned (reported as a
    private account), or when a pagination request returns a non-200 status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    logger.info('Retrieving initial data')
    r = self._get(f'https://www.instagram.com/{self._username}/?__a=1', headers=headers)
    if r.status_code == 404:
        logger.warning('User does not exist')
        return
    if r.status_code != 200:
        logger.error(f'Got status code {r.status_code}')
        return

    response = json.loads(r.text)
    user = response['graphql']['user']
    timeline = user['edge_owner_to_timeline_media']
    if timeline['count'] == 0:
        logger.info('User has no posts')
        return
    if not timeline['edges']:
        # Posts exist according to the count, but none were returned.
        logger.warning('Private account')
        return

    user_id = user['id']
    username = user['username']  # Might have different capitalisation than self._username
    yield from self._response_to_items(response['graphql'], username)

    page_info = timeline['page_info']
    if not page_info['has_next_page']:
        return
    end_cursor = page_info['end_cursor']

    # Cf. https://stackoverflow.com/questions/49265339/instagram-a-1-url-doesnt-allow-max-id and https://github.com/rarcega/instagram-scraper
    while True:
        logger.info(f'Retrieving endCursor = {end_cursor!r}')
        # NOTE(review): the cursor is interpolated into the query string
        # without URL-escaping -- confirm that self._get encodes it.
        r = self._get(f'https://www.instagram.com/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables={{"id":"{user_id}","first":12,"after":"{end_cursor}"}}', headers=headers)

        if r.status_code != 200:
            logger.error(f'Got status code {r.status_code}')
            return

        response = json.loads(r.text)
        timeline = response['data']['user']['edge_owner_to_timeline_media']
        if not timeline['edges']:
            return
        yield from self._response_to_items(response['data'], username)

        page_info = timeline['page_info']
        if not page_info['has_next_page']:
            return
        end_cursor = page_info['end_cursor']


# The next definition of the class starts here and continues beyond this
# excerpt (6 further lines are not shown):
#     @classmethod
#     def setup_parser(cls, subparser):