STRLCPY/changedetection.io

■ ■ ■ ■ ■ ■

README.md

		skipped 90 lines
91	91		```bash
92	92		docker-compose pull && docker-compose up -d
93	93		```
	94	+	### Filters
	95	+	XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, and you can use XPath expressions exported from various XPath element query creation tools.
94	96
95	97		### Notifications
96	98
		skipped 77 lines

■ ■ ■ ■ ■ ■

changedetectionio/__init__.py

		skipped 803 lines
804	804		compress_type=zipfile.ZIP_DEFLATED,
805	805		compresslevel=8)
806	806
807		-	return send_from_directory(datastore_o.datastore_path, backupname, as_attachment=True)
	807	+	# send_from_directory needs the directory as a full absolute path
	808	+	return send_from_directory(os.path.abspath(datastore_o.datastore_path), backupname, as_attachment=True)
808	809
809	810		@app.route("/static/<string:group>/<string:filename>", methods=['GET'])
810	811		def static_content(group, filename):
		skipped 192 lines

■ ■ ■ ■ ■ ■

changedetectionio/fetch_site_status.py

		skipped 113 lines
114	114		if 'json:' in css_filter_rule:
115	115		stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
116	116		is_html = False
117		-	else:
118		-	# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
119		-	stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
120	117
121	118		if is_html:
122	119		# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
123	120		html_content = fetcher.content
124	121		if has_filter_rule:
125		-	html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
	122	+	# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
	123	+	if css_filter_rule[0] == '/':
	124	+	html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
	125	+	else:
	126	+	# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
	127	+	html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
126	128
127	129		# get_text() via inscriptis
128	130		stripped_text_from_html = get_text(html_content)
		skipped 65 lines

■ ■ ■ ■ ■ ■ ■

changedetectionio/forms.py

		skipped 180 lines
181	181		message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
182	182		raise ValidationError(message % (line))
183	183
184		-	class ValidateCSSJSONInput(object):
	184	+	class ValidateCSSJSONXPATHInput(object):
185	185		"""
186	186		Filter validation
187	187		@todo CSS validator ;)
		skipped 3 lines
191	191		self.message = message
192	192
193	193		def __call__(self, form, field):
	194	+
	195	+	# Nothing to see here
	196	+	if not len(field.data.strip()):
	197	+	return
	198	+
	199	+	# Does it look like XPath?
	200	+	if field.data.strip()[0] == '/':
	201	+	from lxml import html, etree
	202	+	tree = html.fromstring("<html></html>")
	203	+
	204	+	try:
	205	+	tree.xpath(field.data.strip())
	206	+	except etree.XPathEvalError as e:
	207	+	message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
	208	+	raise ValidationError(message % (field.data, str(e)))
	209	+	except:
	210	+	raise ValidationError("A system-error occurred when validating your XPath expression")
	211	+
194	212		if 'json:' in field.data:
195	213		from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError
196	214		from jsonpath_ng.ext import parse
		skipped 5 lines
202	220		except (JsonPathParserError, JsonPathLexerError) as e:
203	221		message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
204	222		raise ValidationError(message % (input, str(e)))
	223	+	except:
	224	+	raise ValidationError("A system-error occurred when validating your JSONPath expression")
205	225
206	226		# Re #265 - maybe in the future fetch the page and offer a
207	227		# warning/notice that it's possible the rule doesn't yet match anything?
		skipped 21 lines
229	249
230	250		minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck',
231	251		[validators.Optional(), validators.NumberRange(min=1)])
232		-	css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()])
	252	+	css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
233	253		title = StringField('Title')
234	254
235	255		ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
		skipped 27 lines

■ ■ ■ ■ ■ ■

changedetectionio/html_tools.py

		skipped 15 lines
16	16
17	17		return html_block + "\n"
18	18
	19	+
	20	+	# Return str Utf-8 of matched rules
	21	+	def xpath_filter(xpath_filter, html_content):
	22	+	from lxml import html
	23	+	from lxml import etree
	24	+
	25	+	tree = html.fromstring(html_content)
	26	+	html_block = ""
	27	+
	28	+	for item in tree.xpath(xpath_filter.strip()):
	29	+	html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"<br/>"
	30	+
	31	+	return html_block
	32	+
	33	+
19	34		# Extract/find element
20	35		def extract_element(find='title', html_content=''):
21	36
		skipped 72 lines

■ ■ ■ ■ ■ ■

changedetectionio/templates/edit.html

		skipped 94 lines
95	95		<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
96	96		<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <b>"json:"</b>, <a
97	97		href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
	98	+	<li>XPATH - Limit text to this XPath rule; simply start the rule with a forward-slash, for example <b>//*[contains(@class, 'sametext')]</b>, <a
	99	+	href="http://xpather.com/" target="new">test your XPath here</a></li>
98	100		</ul>
99		-	Please be sure that you thoroughly understand how to write CSS or JSONPath selector rules before filing an issue on GitHub! <a
	101	+	Please be sure that you thoroughly understand how to write CSS, JSONPath or XPath selector rules before filing an issue on GitHub! <a
100	102		href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
101	103		</span>
102	104		</div>
		skipped 41 lines

■ ■ ■ ■ ■ ■

changedetectionio/tests/test_xpath_selector.py

1	+	#!/usr/bin/python3
2	+
3	+	import time
4	+	from flask import url_for
5	+	from . util import live_server_setup
6	+
7	+	from ..html_tools import *
8	+
9	+	def test_setup(live_server):
10	+	live_server_setup(live_server)
11	+
12	+	def set_original_response():
13	+	test_return_data = """<html>
14	+	<body>
15	+	Some initial text</br>
16	+	<p>Which is across multiple lines</p>
17	+	</br>
18	+	So let's see what happens. </br>
19	+	<div class="sametext">Some text thats the same</div>
20	+	<div class="changetext">Some text that will change</div>
21	+	</body>
22	+	</html>
23	+	"""
24	+
25	+	with open("test-datastore/endpoint-content.txt", "w") as f:
26	+	f.write(test_return_data)
27	+	return None
28	+
29	+	def set_modified_response():
30	+	test_return_data = """<html>
31	+	<body>
32	+	Some initial text</br>
33	+	<p>Which is across multiple lines</p>
34	+	</br>
35	+	So let's see what happens. THIS CHANGES AND SHOULDNT TRIGGER A CHANGE</br>
36	+	<div class="sametext">Some text thats the same</div>
37	+	<div class="changetext">Some new text</div>
38	+	</body>
39	+	</html>
40	+	"""
41	+
42	+	with open("test-datastore/endpoint-content.txt", "w") as f:
43	+	f.write(test_return_data)
44	+
45	+	return None
46	+
47	+
48	+	def test_check_markup_xpath_filter_restriction(client, live_server):
49	+	sleep_time_for_fetch_thread = 3
50	+
51	+	xpath_filter = "//*[contains(@class, 'sametext')]"
52	+
53	+	set_original_response()
54	+
55	+	# Give the endpoint time to spin up
56	+	time.sleep(1)
57	+
58	+	# Add our URL to the import page
59	+	test_url = url_for('test_endpoint', _external=True)
60	+	res = client.post(
61	+	url_for("import_page"),
62	+	data={"urls": test_url},
63	+	follow_redirects=True
64	+	)
65	+	assert b"1 Imported" in res.data
66	+
67	+	# Trigger a check
68	+	client.get(url_for("api_watch_checknow"), follow_redirects=True)
69	+
70	+	# Give the thread time to pick it up
71	+	time.sleep(sleep_time_for_fetch_thread)
72	+
	73	+	# Go to the edit page and set our XPath filter
	74	+	# (it is submitted through the css_filter field)
75	+	res = client.post(
76	+	url_for("edit_page", uuid="first"),
77	+	data={"css_filter": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
78	+	follow_redirects=True
79	+	)
80	+	assert b"Updated watch." in res.data
81	+
82	+	# Give the thread time to pick it up
83	+	time.sleep(sleep_time_for_fetch_thread)
84	+
85	+	# view it/reset state back to viewed
86	+	client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True)
87	+
88	+	# Make a change
89	+	set_modified_response()
90	+
91	+	# Trigger a check
92	+	client.get(url_for("api_watch_checknow"), follow_redirects=True)
93	+	# Give the thread time to pick it up
94	+	time.sleep(sleep_time_for_fetch_thread)
95	+
96	+	res = client.get(url_for("index"))
97	+	assert b'unviewed' not in res.data
98	+
99	+	def test_xpath_validation(client, live_server):
100	+
101	+	# Give the endpoint time to spin up
102	+	time.sleep(1)
103	+
104	+	# Add our URL to the import page
105	+	test_url = url_for('test_endpoint', _external=True)
106	+	res = client.post(
107	+	url_for("import_page"),
108	+	data={"urls": test_url},
109	+	follow_redirects=True
110	+	)
111	+	assert b"1 Imported" in res.data
112	+
113	+	res = client.post(
114	+	url_for("edit_page", uuid="first"),
115	+	data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
116	+	follow_redirects=True
117	+	)
118	+	assert b"is not a valid XPath expression" in res.data

■ ■ ■ ■ ■ ■

requirements.txt

		skipped 25 lines
26	26		# ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
27	27		cryptography ~= 3.4
28	28
29		-	# Used for CSS filtering, replace with soupsieve and lxml for xpath
	29	+	# Used for CSS filtering
30	30		bs4
	31	+
	32	+	# XPath filtering. lxml is only an optional parser for bs4, so list it here explicitly.
	33	+	lxml
31	34
32	35		# 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0
33	36		selenium ~= 4.1.0
		skipped 1 lines

Merge branch 'master' of github.com:dgtlmoon/changedetection.io