■ ■ ■ ■ ■ ■
socialmediascraper/modules/googleplus.py
| 1 | + | import datetime |
| 2 | + | import itertools |
| 3 | + | import json |
| 4 | + | import logging |
| 5 | + | import re |
| 6 | + | import socialmediascraper.base |
| 7 | + | |
| 8 | + | |
| 9 | + | logger = logging.getLogger(__name__) |
| 10 | + | |
| 11 | + | |
class GooglePlusUserScraper(socialmediascraper.base.Scraper):
    """Scraper yielding the URLs of all posts on a Google+ user's public stream.

    The first page is extracted from the embedded ``AF_initDataCallback`` data
    on the user's profile page; subsequent pages are fetched through the
    internal PlusAppUi JSON endpoint using the cursor from the previous page.
    """

    name = 'googleplus-user'

    def __init__(self, user, **kwargs):
        # user: a Google+ username (with leading '+') or a numeric user ID;
        # it is interpolated verbatim into the profile URL.
        super().__init__(**kwargs)
        self._user = user

    def get_items(self):
        """Yield a URLItem for each post, following pagination until exhausted.

        Stops early (without raising) on HTTP errors, a nonexistent user, or
        an unparseable response; those conditions are logged instead.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

        logger.info('Retrieving initial data')
        r = self._get(f'https://plus.google.com/{self._user}', headers = headers)
        if r.status_code == 404:
            logger.warning('User does not exist')
            return
        elif r.status_code != 200:
            logger.error(f'Got status code {r.status_code}')
            return

        # Global data; only needed for the session ID
        #TODO: Make this more robust somehow
        match = re.search(r'''(['"])FdrFJe\1\s*:\s*(['"])(?P<sid>.*?)\2''', r.text)
        if not match:
            logger.error('Unable to find session ID')
            return
        sid = match.group('sid')

        # Page data
        # As of 2018-05-18, the much simpler regex r'''<script[^>]*>AF_initDataCallback\(\{key: 'ds:6',.*?return (.*?)\}\}\);</script>''' would work also, but this is more generic and less likely to break:
        match = re.search(r'''<script[^>]*>\s*(?:.*?)\s*\(\s*\{(?:|.*?,)\s*key\s*:\s*(['"])ds:6\1\s*,.*?,\s*data\s*:\s*function\s*\(\s*\)\s*\{\s*return\s*(?P<data>.*?)\}\s*\}\s*\)\s*;\s*</script>''', r.text, re.DOTALL)
        if not match:
            logger.error('Unable to extract data')
            return
        jsonData = match.group('data')
        response = json.loads(jsonData)
        if response[0][7] is None:
            logger.info('User has no posts')
            return
        for postObj in response[0][7]:
            yield socialmediascraper.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')
        cursor = response[0][1] # 'ADSJ_x'
        if cursor is None:
            # No further pages
            return
        # Seconds since UTC midnight; used to fabricate plausible _reqid values
        # the way the web client does. now(timezone.utc) replaces the
        # deprecated utcnow() and yields identical wall-clock components.
        baseDate = datetime.datetime.now(datetime.timezone.utc)
        baseSeconds = baseDate.hour * 3600 + baseDate.minute * 60 + baseDate.second
        userid = response[1] # Alternatively and more ugly: response[0][7][0][6]['33558957'][16]

        for counter in itertools.count(start = 2):
            logger.info('Retrieving next page')
            reqid = 1 + baseSeconds + int(1e5) * counter
            r = self._post(
                f'https://plus.google.com/_/PlusAppUi/data?ds.extension=74333095&f.sid={sid}&hl=en-US&soc-app=199&soc-platform=1&soc-device=1&_reqid={reqid}&rt=c',
                data = [('f.req', '[[[74333095,[{"74333095":["' + cursor + '","' + userid + '"]}],null,null,0]]]'), ('', '')],
                headers = headers
            )
            if r.status_code != 200:
                logger.error(f'Got status code {r.status_code}')
                return

            # As if everything up to here wasn't terrible already, this is where it gets *really* bad.
            # The API contains a few junk characters at the beginning, apparently as an anti-CSRF measure.
            # The remainder is effectively a self-made chunked transfer encoding but with decimal digits and including everything except the digits themselves in the chunk size.
            # It sucks.
            # Each chunk is actually one JSON object; you'd think that we can just read the first one and parse that, but there are some quirks that make this difficult.
            # I was unable to figure out what the "chunk size" actually covers exactly; the response is UTF-8 encoded, but the chunk size matches neither the binary nor the decoded length.
            # Enter the awful workaround: strip away the initial chunk size, then parse the beginning of the remaining data using a parser that doesn't care if there's junk after the JSON.

            garbage = r.text
            # Validate the anti-CSRF prefix (")]}'" plus two newlines) explicitly
            # instead of with assert: asserts vanish under -O, and a malformed
            # response from the network is an expected failure mode, not a bug.
            if garbage[:6] != ")]}'\n\n":
                logger.error('Unexpected response prefix')
                return
            pos = 6
            # Skip the leading chunk-size digits and any surrounding whitespace;
            # bounds check guards against a pathological all-digit remainder.
            while pos < len(garbage) and (garbage[pos].isdigit() or garbage[pos].isspace()):
                pos += 1
            # raw_decode parses only the first JSON structure and ignores the
            # trailing chunks, which is exactly the lenient behavior needed here.
            response = json.JSONDecoder().raw_decode(garbage[pos:])[0]

            for postObj in response[0][2]['74333095'][0][7]:
                yield socialmediascraper.base.URLItem(f'https://plus.google.com/{postObj[6]["33558957"][21]}')

            cursor = response[0][2]['74333095'][0][1]

            if cursor is None:
                break

    @classmethod
    def setup_parser(cls, subparser):
        """Register this scraper's CLI arguments on the given argparse subparser."""
        subparser.add_argument('user', help = 'A Google Plus username (with leading "+") or numeric ID')

    @classmethod
    def from_args(cls, args):
        """Construct a scraper instance from parsed CLI arguments."""
        return cls(args.user, retries = args.retries)
| 103 | + | |