snscrape, commit 9fb3ac60
    socialmediascraper/modules/googleplus.py
import datetime
import itertools
import json
import logging
import re
import socialmediascraper.base


logger = logging.getLogger(__name__)


class GooglePlusUserScraper(socialmediascraper.base.Scraper):
    name = 'googleplus-user'

    def __init__(self, user, **kwargs):
        super().__init__(**kwargs)
        self._user = user

    def get_items(self):
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

        logger.info('Retrieving initial data')
        r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
        if r.status_code == 404:
            logger.warning('User does not exist')
            return
        elif r.status_code != 200:
            logger.error(f'Got status code {r.status_code}')
            return

        # Global data; only needed for the session ID
        #TODO: Make this more robust somehow
        match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
        if not match:
            logger.error('Unable to find session ID')
            return
        sid = match.group('sid')

        # Page data
        # As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would also work, but this one is more generic and less likely to break:
        match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
        if not match:
            logger.error('Unable to extract data')
            return
        jsonData = match.group('data')
        response = json.loads(jsonData)
        if response[0][7] is None:
            logger.info('User has no posts')
            return
        for postObj in response[0][7]:
            yield socialmediascraper.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
        cursor = response[0][1] # 'ADSJ_x'
        if cursor is None:
            # No further pages
            return
        baseDate = datetime.datetime.utcnow()
        baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
        userid = response[1] # Alternatively, and uglier: response[0][7][0][6]['33558957'][16]

        for counter in itertools.count(start = 2):
            logger.info('Retrieving next page')
            reqid = 1 + baseSeconds + int(1e5) * counter
            r = self._post(
                f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
                data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
                headers = headers
            )
            if r.status_code != 200:
                logger.error(f'Got status code {r.status_code}')
                return

            # As if everything up to here wasn't terrible already, this is where it gets *really* bad.
            # The response contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
            # The remainder is effectively a self-made chunked transfer encoding, but with decimal chunk sizes that count everything except the size digits themselves.
            # It sucks.
            # Each chunk is one JSON structure; you'd think we could just read the first one and parse it, but there are quirks that make this difficult.
            # I was unable to figure out exactly what the "chunk size" covers: the response is UTF-8 encoded, but the size matches neither the binary nor the decoded length.
            # Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data with a parser that doesn't care about junk after the JSON.

            garbage = r.text
            assert garbage[:6] == ")]}'\n\n" # anti-CSRF junk and two newlines
            pos = 6
            while garbage[pos].isdigit() or garbage[pos].isspace(): # Skip the chunk size and any leading whitespace
                pos += 1
            response = json.JSONDecoder().raw_decode(garbage[pos:])[0] # Parses only the first structure in the stream without complaining about the extra data after it

            for postObj in response[0][2]['74333095'][0][7]:
                yield socialmediascraper.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')

            cursor = response[0][2]['74333095'][0][1]

            if cursor is None:
                break

    @classmethod
    def setup_parser(cls, subparser):
        subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')

    @classmethod
    def from_args(cls, args):
        return cls(args.user, retries = args.retries)
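The workaround described in the long comment block hinges on one standard-library detail: json.JSONDecoder().raw_decode() parses the first JSON value in a string and returns it together with the offset where parsing stopped, whereas json.loads() raises an "Extra data" error on the same input. It does still raise on leading whitespace, which is why the loop above strips digits and whitespace before decoding. A minimal sketch with a made-up payload standing in for one page of the chunked response:

import json

# Made-up stand-in for the response body after the ")]}'" prefix and the
# first chunk size are stripped: one JSON structure (the first chunk),
# followed by the next chunk's size and more data.
payload = '{"items": ["post1", "post2"]}25[["more", "chunked", "data"]]'

obj, end = json.JSONDecoder().raw_decode(payload)
print(obj)  # {'items': ['post1', 'post2']}
print(end)  # 29: the index just past the first JSON structure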
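The _reqid computation is easier to follow with concrete numbers: it seeds the ID with the seconds elapsed since midnight UTC and adds 100000 per request, so IDs stay unique within a session. The counter starts at 2, presumably because the initial profile fetch counts as the first request. A worked example (the timestamp is made up):

import datetime

base = datetime.datetime(2018, 5, 18, 14, 30, 5)  # pretend "now" is 14:30:05 UTC
baseSeconds = base.hour * 3600 + base.minute * 60 + base.second  # 52205

for counter in (2, 3, 4):  # pagination starts at counter=2
    reqid = 1 + baseSeconds + int(1e5) * counter
    print(reqid)  # 252206, then 352206, then 452206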
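For context, a hypothetical driver for the class above, bypassing the CLI glue in setup_parser()/from_args(). The retries keyword is the only constructor option visible in this commit, and that URLItem prints as its URL is an assumption:

from socialmediascraper.modules.googleplus import GooglePlusUserScraper

# Hypothetical usage; mirrors what the CLI wiring does via from_args().
scraper = GooglePlusUserScraper('+GoogleDevelopers', retries = 3)  # '+username' or numeric ID
for item in scraper.get_items():
    print(item)  # assumes URLItem renders as its URL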