| skipped 85 lines |
86 | 86 | |
87 | 87 | | # monkeypatch the read class method in RobotFileParser |
88 | 88 | | # many sites will block access to robots.txt without a standard User-Agent header |
| 89 | + | robot_timeout = 3 # seconds; bounds each robots.txt fetch
89 | 90 | | class RobotFileParserUserAgent(robotparser.RobotFileParser): |
| 91 | + | |
| 92 | + | timeout = robot_timeout # short timeout so a stalled robots.txt fetch cannot hang the crawl
| 93 | + | |
90 | 94 | | def read(self): |
91 | 95 | | """Reads the robots.txt URL and feeds it to the parser.""" |
92 | 96 | | try: |
93 | 97 | | headers = {'User-Agent': user_agent, } |
94 | 98 | | request = urllib.request.Request(self.url, None, headers) |
95 | | - | f = urllib.request.urlopen(request) |
| 99 | + | f = urllib.request.urlopen(request, timeout=self.timeout)
96 | 100 | | # f = urllib.request.urlopen(self.url) #! original code |
97 | 101 | | except urllib.error.HTTPError as err: |
98 | 102 | | if err.code in (401, 403): |
| skipped 416 lines |
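For context, here is a minimal, self-contained sketch of the subclass in the hunk above. The tail of `read()` beyond the shown lines is filled in following the stdlib `RobotFileParser.read()` implementation, and the `user_agent` string and example URL are placeholders, not values from this codebase:

```python
import urllib.error
import urllib.request
import urllib.robotparser as robotparser

user_agent = 'Mozilla/5.0 (compatible; ExampleBot/1.0)'  # placeholder UA string

robot_timeout = 3  # seconds; bounds the robots.txt fetch

class RobotFileParserUserAgent(robotparser.RobotFileParser):

    timeout = robot_timeout

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            headers = {'User-Agent': user_agent}
            request = urllib.request.Request(self.url, None, headers)
            f = urllib.request.urlopen(request, timeout=self.timeout)
        except urllib.error.HTTPError as err:
            # mirror the stdlib's handling of HTTP errors
            if err.code in (401, 403):
                self.disallow_all = True
            elif 400 <= err.code < 500:
                self.allow_all = True
        else:
            self.parse(f.read().decode('utf-8').splitlines())

rp = RobotFileParserUserAgent()
rp.set_url('https://example.com/robots.txt')
rp.read()
print(rp.can_fetch(user_agent, 'https://example.com/some/page'))
```

Note that a fetch that hits the socket timeout surfaces as an exception from `urlopen()` rather than a return value, which is why `check_robots()` below wraps `rp.read()` in a try/except.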
515 | 519 | | def get_websearch(self,query): |
516 | 520 | | '''HTTP GET of a websearch, then add any embedded links.''' |
517 | 521 | | url = uprs.urlunparse(uprs.urlparse(self.search_url)._replace(query='q={}&safe=active'.format(query))) |
| 522 | + | signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # register hang handler; SIGALRM has a single process-wide handler slot
518 | 523 | | signal.alarm(self.timeout+2) # set an alarm |
519 | 524 | | try: |
520 | 525 | | self.session.get(url) # selenium driver |
| skipped 23 lines |
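The pattern added in this hunk, and again in `get_url()` below, is a SIGALRM watchdog around a blocking call: register the handler, arm the alarm, do the work, cancel the alarm. A standalone sketch of the idea with illustrative names (the real code raises its own `self.TimeoutError` from the registered handler):

```python
import signal

class WatchdogTimeout(Exception):
    """Raised by the alarm handler when the guarded call runs too long."""

def hang_handler(signum, frame):
    raise WatchdogTimeout('operation is taking too long')

def guarded(work, timeout):
    # Re-register before every use: a process has one SIGALRM handler
    # slot, so another caller may have installed its own handler since.
    signal.signal(signal.SIGALRM, hang_handler)
    signal.alarm(timeout)          # deliver SIGALRM after `timeout` seconds
    try:
        return work()              # the blocking call being guarded
    except WatchdogTimeout:
        return None
    finally:
        signal.alarm(0)            # always cancel any pending alarm

# usage: guarded(lambda: some_blocking_fetch(url), timeout=5)
```

SIGALRM exists only on Unix and fires only in the main thread, so this guard does not port to Windows or to worker threads.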
544 | 549 | | def get_url(self,url): |
545 | 550 | | '''HTTP GET of the url, and add any embedded links.''' |
546 | 551 | | if not self.check_robots(url): return # bail out if robots.txt says to |
| 552 | + | signal.signal(signal.SIGALRM, self.phantomjs_hang_handler) # re-register hang handler; check_robots() above installs robot_hang_handler
547 | 553 | | signal.alarm(self.timeout+2) # set an alarm |
548 | 554 | | try: |
549 | 555 | | self.session.get(url) # selenium driver |
| skipped 20 lines |
570 | 576 | | return [] |
571 | 577 | | |
572 | 578 | | def check_robots(self,url): |
573 | | - | result = False |
| 579 | + | result = True |
| 580 | + | url_robots = uprs.urlunparse( |
| 581 | + | uprs.urlparse(url)._replace(scheme='https', path='/robots.txt', query='', params='')) |
| 582 | + | signal.signal(signal.SIGALRM, self.robot_hang_handler) # register hang handler |
| 583 | + | # signal.alarm(robot_timeout+1) # set a short-term alarm a little longer than robot_timeout |
574 | 584 | | try: |
575 | | - | url_robots = uprs.urlunparse(uprs.urlparse(url)._replace(scheme='https',path='/robots.txt',query='',params='')) |
576 | 585 | | rp = RobotFileParserUserAgent() |
577 | 586 | | rp.set_url(url_robots) |
578 | 587 | | rp.read() |
579 | 588 | | result = rp.can_fetch(self.user_agent,url) |
580 | | - | except Exception as e: |
| 589 | + | except (self.TimeoutError, Exception) as e:
581 | 590 | | if self.debug: print('rp.read() exception:\n{}'.format(e)) |
| 591 | + | finally: |
| 592 | + | signal.alarm(0) # cancel the alarm |
582 | 593 | | del rp # ensure self.close() in urllib |
583 | 594 | | return result |
584 | 595 | | |
| skipped 53 lines |
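The rewritten `check_robots()` above also flips the default to `result = True`, so a site whose robots.txt cannot be fetched in time is now crawled (fail-open) rather than skipped. The robots.txt address itself is derived by swapping fields on the parsed URL; a standalone sketch, assuming `uprs` aliases `urllib.parse` as the code's usage suggests:

```python
import urllib.parse as uprs

def robots_url(url):
    # ParseResult is a namedtuple, so _replace() swaps fields immutably
    parts = uprs.urlparse(url)._replace(
        scheme='https', path='/robots.txt', query='', params='')
    return uprs.urlunparse(parts)

print(robots_url('http://example.com/products/widget?id=42'))
# -> https://example.com/robots.txt
```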
638 | 649 | | if self.debug: print('.quit_session() exception:\n{}'.format(e)) |
639 | 650 | | raise self.TimeoutError('Unable to quit the session as well.') |
640 | 651 | | raise self.TimeoutError('phantomjs is taking too long') |
| 652 | + | |
| 653 | + | def robot_hang_handler(self, signum, frame): |
| 654 | + | if self.debug: print('Looks like robotparser has hung.') |
| 655 | + | raise self.TimeoutError('robotparser is taking too long') |
641 | 656 | | |
642 | 657 | | def check_phantomjs_process(self): |
643 | 658 | | '''Check if phantomjs is running.''' |
| skipped 39 lines |
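Finally, the new `robot_hang_handler()` mirrors the existing `phantomjs_hang_handler()`. Because a process has only one SIGALRM handler slot, whichever handler was registered most recently receives the signal, which is why each method re-registers its own handler before use. A runnable sketch of that interplay (Unix-only, with hypothetical stand-ins for the class internals):

```python
import signal
import time

class TimeoutError(Exception):  # stand-in for the class's self.TimeoutError
    pass

def phantomjs_hang_handler(signum, frame):
    raise TimeoutError('phantomjs is taking too long')

def robot_hang_handler(signum, frame):
    raise TimeoutError('robotparser is taking too long')

signal.signal(signal.SIGALRM, robot_hang_handler)       # check_robots() path
signal.signal(signal.SIGALRM, phantomjs_hang_handler)   # get_url() re-registers
signal.alarm(2)  # from here on, a hang surfaces as the phantomjs timeout
try:
    time.sleep(5)              # stand-in for a hung session.get(url)
except TimeoutError as e:
    print(e)                   # -> phantomjs is taking too long
finally:
    signal.alarm(0)            # cancel the pending alarm
```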