■ ■ ■ ■ ■ ■
socialmediascraper/modules/googleplus.py
| 1 | + | import datetime |
| 2 | + | import itertools |
| 3 | + | import json |
| 4 | + | import logging |
| 5 | + | import re |
| 6 | + | import socialmediascraper.base |
| 7 | + | |
| 8 | + | |
| 9 | + | logger = logging.getLogger(__name__) |
| 10 | + | |
| 11 | + | |
class GooglePlusUserScraper(socialmediascraper.base.Scraper):
    """Scraper yielding the URLs of all posts on a Google+ user's public stream.

    The first page is extracted from the embedded ``AF_initDataCallback`` data
    on the user's profile page; subsequent pages are fetched through the
    internal PlusAppUi JSON endpoint using the cursor from the previous page.
    """

    name = 'googleplus-user'

    def __init__(self, user, **kwargs):
        # user: a Google+ username (with leading '+') or a numeric user ID;
        # it is interpolated verbatim into the profile URL.
        super().__init__(**kwargs)
        self._user = user

    def get_items(self):
        """Yield a URLItem for each post, following pagination until exhausted.

        Stops early (without raising) on HTTP errors, a nonexistent user, or
        an unparseable response; those conditions are logged instead.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

        logger.info('Retrieving initial data')
        r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
        if r.status_code == 404:
            logger.warning('User does not exist')
            return
        elif r.status_code != 200:
            logger.error(f'Got status code {r.status_code}')
            return

        # Global data; only needed for the session ID
        #TODO: Make this more robust somehow
        match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
        if not match:
            logger.error('Unable to find session ID')
            return
        sid = match.group('sid')

        # Page data
        # As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would work also, but this is more generic and less likely to break:
        match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
        if not match:
            logger.error('Unable to extract data')
            return
        jsonData = match.group('data')
        response = json.loads(jsonData)
        if response[0][7] is None:
            logger.info('User has no posts')
            return
        for postObj in response[0][7]:
            yield socialmediascraper.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
        cursor = response[0][1] # 'ADSJ_x'
        if cursor is None:
            # No further pages
            return
        # Seconds since UTC midnight; used to fabricate plausible _reqid values
        # the way the web client does. now(timezone.utc) replaces the
        # deprecated utcnow() and yields identical wall-clock components.
        baseDate = datetime.datetime.now(datetime.timezone.utc)
        baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
        userid = response[1] # Alternatively and more ugly: response[0][7][0][6]['33558957'][16]

        for counter in itertools.count(start = 2):
            logger.info('Retrieving next page')
            reqid = 1 + baseSeconds + int(1e5) * counter
            r = self._post(
                f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
                data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
                headers = headers
            )
            if r.status_code != 200:
                logger.error(f'Got status code {r.status_code}')
                return

            # As if everything up to here wasn't terrible already, this is where it gets *really* bad.
            # The API contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
            # The remainder is effectively a self-made chunked transfer encoding but with decimal digits and including everything except the digits themselves in the chunk size.
            # It sucks.
            # Each chunk is actually one JSON object; you'd think that we can just read the first one and parse that, but there are some quirks that make this difficult.
            # I was unable to figure out what the "chunk size" actually covers exactly; the response is UTF-8 encoded, but the chunk size matches neither the binary nor the decoded length.
            # Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data using a parser that doesn't care if there's junk after the JSON.

            garbage = r.text
            # Validate the anti-CSRF prefix (")]}'" plus two newlines) explicitly
            # instead of with assert: asserts vanish under -O, and a malformed
            # response from the network is an expected failure mode, not a bug.
            if garbage[:6] != ")]}'\n\n":
                logger.error('Unexpected response prefix')
                return
            pos = 6
            # Skip the leading chunk-size digits and any surrounding whitespace;
            # bounds check guards against a pathological all-digit remainder.
            while pos < len(garbage) and (garbage[pos].isdigit() or garbage[pos].isspace()):
                pos += 1
            # raw_decode parses only the first JSON structure and ignores the
            # trailing chunks, which is exactly the lenient behavior needed here.
            response = json.JSONDecoder().raw_decode(garbage[pos:])[0]

            for postObj in response[0][2]['74333095'][0][7]:
                yield socialmediascraper.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')

            cursor = response[0][2]['74333095'][0][1]

            if cursor is None:
                break

    @classmethod
    def setup_parser(cls, subparser):
        """Register this scraper's CLI arguments on the given argparse subparser."""
        subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')

    @classmethod
    def from_args(cls, args):
        """Construct a scraper instance from parsed CLI arguments."""
        return cls(args.user, retries = args.retries)
| 103 | + | |