■ ■ ■ ■ ■ ■
socialmediascraper/modules/instagram.py
| 1 | + | import json |
| 2 | + | import logging |
| 3 | + | import socialmediascraper.base |
| 4 | + | |
| 5 | + | |
| 6 | + | logger = logging.getLogger(__name__) |
| 7 | + | |
| 8 | + | |
class InstagramUserScraper(socialmediascraper.base.Scraper):
    """Scraper that yields the post URLs of a single Instagram user.

    The user's profile is fetched through its ``?__a=1`` JSON view and paged
    with the ``max_id`` query parameter, using the ``id`` of the last media
    node of each page as the cursor for the next request.
    """

    name = 'instagram-user'

    def __init__(self, username, **kwargs):
        """Store the target username.

        Args:
            username: The Instagram username whose posts are listed.
            **kwargs: Forwarded unchanged to the base class constructor.
        """
        super().__init__(**kwargs)
        self._username = username

    def _response_to_items(self, response):
        """Yield one ``URLItem`` per media node of a decoded profile response.

        Args:
            response: Decoded JSON of one profile page; it is read at
                ``response['user']['username']`` and
                ``response['user']['media']['nodes']``, and each node must
                carry a ``'code'`` key.

        Yields:
            ``socialmediascraper.base.URLItem`` objects wrapping the post URL.
        """
        user = response['user']
        username = user['username']  # Might have different capitalisation than self._username

        for node in user['media']['nodes']:
            code = node['code']
            # TODO: Do we want the taken-by parameter in here?
            yield socialmediascraper.base.URLItem(f'https://www.instagram.com/p/{code}/?taken-by={username}')

    def get_items(self):
        """Yield items for every post of the user, page by page.

        Iteration stops when a page has no media nodes, when the response body
        is not valid JSON (e.g. an HTML error page), or when the pagination
        cursor fails to advance.

        Yields:
            The items produced by ``_response_to_items`` for each page.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

        max_id = None

        while True:
            logger.info('Retrieving max_id = %r', max_id)
            url = f'https://www.instagram.com/{self._username}/?__a=1'
            if max_id is not None:
                url += f'&max_id={max_id}'
            # self._get is not defined in this class; presumably inherited
            # from the base Scraper -- only its ``.text`` attribute is used.
            r = self._get(url, headers=headers)

            # TODO: Handle 404 (HTML) explicitly; for now any body that is not
            # JSON ends the iteration instead of raising out of the generator.
            try:
                response = json.loads(r.text)
            except json.JSONDecodeError:
                logger.error('Response for %s is not valid JSON, stopping', url)
                return

            nodes = response['user']['media']['nodes']
            if not nodes:
                return
            yield from self._response_to_items(response)

            next_max_id = nodes[-1]['id']
            if next_max_id == max_id:
                # The next request would be identical to this one, so the
                # loop would fetch the same page forever.
                logger.warning('Pagination cursor did not advance past %r, stopping', max_id)
                return
            max_id = next_max_id

    @classmethod
    def setup_parser(cls, subparser):
        """Register the positional ``username`` argument on ``subparser``."""
        subparser.add_argument('username', help='An Instagram username')

    @classmethod
    def from_args(cls, args):
        """Build a scraper from parsed arguments (``args.username``, ``args.retries``)."""
        return cls(args.username, retries=args.retries)
| 51 | + | |