STRLCPY/changedetection.io

Distill.io JSON export file importer (#592)
dgtlmoon committed with GitHub 2 years ago

f28c2605

1 parent 18f0b63b

Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)

■ ■ ■ ■ ■ ■

changedetectionio/__init__.py

		skipped 682 lines
683	683		@app.route("/import", methods=['GET', "POST"])
684	684		@login_required
685	685		def import_page():
686		-	import validators
687	686		remaining_urls = []
688		-
689		-	good = 0
690		-
691	687		if request.method == 'POST':
692		-	now=time.time()
693		-	urls = request.values.get('urls').split("\n")
	688	+	from .importer import import_url_list, import_distill_io_json
694	689
695		-	if (len(urls) > 5000):
696		-	flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
	690	+	# URL List import
	691	+	if request.values.get('urls') and len(request.values.get('urls').strip()):
	692	+	# Import and push into the queue for immediate update check
	693	+	importer = import_url_list()
	694	+	importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
	695	+	for uuid in importer.new_uuids:
	696	+	update_q.put(uuid)
697	697
698		-	for url in urls:
699		-	url = url.strip()
700		-	url, *tags = url.split(" ")
701		-	# Flask wtform validators wont work with basic auth, use validators package
702		-	# Up to 5000 per batch so we dont flood the server
703		-	if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
704		-	new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False)
705		-	if new_uuid:
706		-	# Straight into the queue.
707		-	update_q.put(new_uuid)
708		-	good += 1
709		-	continue
	698	+	if len(importer.remaining_data) == 0:
	699	+	return redirect(url_for('index'))
	700	+	else:
	701	+	remaining_urls = importer.remaining_data
710	702
711		-	if len(url.strip()):
712		-	remaining_urls.append(url)
	703	+	# Distill.io import
	704	+	if request.values.get('distill-io') and len(request.values.get('distill-io').strip()):
	705	+	# Import and push into the queue for immediate update check
	706	+	d_importer = import_distill_io_json()
	707	+	d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
	708	+	for uuid in d_importer.new_uuids:
	709	+	update_q.put(uuid)
713	710
714		-	flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls)))
715		-	datastore.needs_write = True
716	711
717		-	if len(remaining_urls) == 0:
718		-	# Looking good, redirect to index.
719		-	return redirect(url_for('index'))
720	712
721	713		# Could be some remaining, or we could be on GET
722	714		output = render_template("import.html",
723		-	remaining="\n".join(remaining_urls)
	715	+	import_url_list_remaining="\n".join(remaining_urls),
	716	+	original_distill_json=''
724	717		)
725	718		return output
726	719
		skipped 566 lines

■ ■ ■ ■ ■ ■

changedetectionio/fetch_site_status.py

		skipped 16 lines
17	17		self.datastore = datastore
18	18
19	19		# If there was a proxy list enabled, figure out what proxy_args/which proxy to use
20		-	# if watch.proxy use that
21		-	# fetcher.proxy_override = watch.proxy or main config proxy
22		-	# Allows override the proxy on a per-request basis
23		-	# ALWAYS use the first one is nothing selected
	20	+	# if watch.proxy use that
	21	+	# fetcher.proxy_override = watch.proxy or main config proxy
	22	+	# Allows override the proxy on a per-request basis
	23	+	# ALWAYS use the first one if nothing selected
24	24
25	25		def set_proxy_from_list(self, watch):
26	26		proxy_args = None
		skipped 122 lines
149	149		# Then we assume HTML
150	150		if has_filter_rule:
151	151		# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
152		-	if css_filter_rule[0] == '/':
153		-	html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
	152	+	if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
	153	+	html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
	154	+	html_content=fetcher.content)
154	155		else:
155	156		# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
156	157		html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
	158	+
157	159		if has_subtractive_selectors:
158	160		html_content = html_tools.element_removal(subtractive_selectors, html_content)
159	161
		skipped 12 lines
172	174
173	175		# Re #340 - return the content before the 'ignore text' was applied
174	176		text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
175		-
176	177
177	178		# Re #340 - return the content before the 'ignore text' was applied
178	179		text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
		skipped 46 lines
225	226		update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
226	227
227	228		return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
	229	+

■ ■ ■ ■ ■ ■

changedetectionio/importer.py

1	+	from abc import ABC, abstractmethod
2	+	import time
3	+	import validators
4	+
5	+
class Importer(ABC):
    """Abstract base class shared by the watch importers.

    An instance collects the outcome of an import: the UUIDs of the watches
    that were created and the input that could not be imported.

    Deriving from ABC makes the @abstractmethod marker on run() effective,
    so a subclass that forgets to implement run() cannot be instantiated.
    """

    # Input items that run() did not import
    remaining_data: list
    # UUIDs returned by the datastore for the watches run() created
    new_uuids: list
    # Counter of imported items
    good: int

    def __init__(self):
        # Fresh containers per instance so importers never share results
        self.new_uuids = []
        self.good = 0
        self.remaining_data = []

    @abstractmethod
    def run(self,
            data,
            flash,
            datastore):
        """Import `data`; every concrete importer must implement this.

        :param data: the raw text to import
        :param flash: callable used to report messages, called as
                      flash(message) or flash(message, 'error')
        :param datastore: object providing add_watch(), used to create watches
        """
22	+
23	+
class import_url_list(Importer):
    """
    Imports a list, can be in <code>https://example.com tag1, tag2, last tag</code> format
    """
    def run(self,
            data,
            flash,
            datastore,
            ):
        """Create a watch for every valid URL line found in `data`.

        Each non-empty line is "<url>" optionally followed by a space and a
        comma separated tag list. At most 5000 watches are created per call.
        Lines that were not imported (invalid URL, over the limit, or rejected
        by the datastore) are kept in full, tags included, in
        self.remaining_data so they can be re-submitted unchanged.

        :param data: newline separated text to import
        :param flash: callable used to report the result message
        :param datastore: object providing add_watch(url=, tag=, write_to_disk_now=)
        """
        lines = data.split("\n")
        good = 0
        now = time.time()

        if len(lines) > 5000:
            flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # 'tags' should be a csv list after the URL
            url, _, tags = line.partition(" ")

            # Flask wtform validators wont work with basic auth, use validators package
            # Up to 5000 per batch so we dont flood the server
            if good < 5000 and validators.url(url.replace('source:', '')):
                new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False)
                if new_uuid:
                    # Collected so the caller can queue it for an immediate check
                    self.new_uuids.append(new_uuid)
                    good += 1
                    continue

            # Worked past the 'continue' above, keep the whole line (URL and tags) in the bad list
            self.remaining_data.append(line)

        # Expose the counter on the instance as well
        self.good = good
        flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data)))
68	+
69	+
class import_distill_io_json(Importer):
    """Imports watches from the text of a Distill.io JSON export."""

    def run(self,
            data,
            flash,
            datastore,
            ):
        """Create a watch for every usable entry of a Distill.io export.

        The export is expected to be a JSON object with a 'data' list. For
        each entry, 'uri' becomes the watch URL, 'name' the title and 'tags'
        the tag list. The entry's 'config' is itself a JSON string; its first
        include selector becomes 'css_filter' (prefixed with 'xpath:' for
        XPath expressions) and its first CSS exclude selector becomes
        'subtractive_selectors'. Selector, tag and config extraction is
        best-effort: a missing or malformed part is ignored and the entry is
        still imported.

        Entries that cannot be imported (no usable 'uri', over the 5000 limit,
        or rejected by the datastore) are stored JSON-encoded in
        self.remaining_data, so the reported "Skipped" count is accurate.

        :param data: text of the JSON export
        :param flash: callable used to report messages, called as
                      flash(message) or flash(message, 'error')
        :param datastore: object providing add_watch(url=, extras=, write_to_disk_now=)
        """
        import json
        good = 0
        now = time.time()
        self.new_uuids = []

        try:
            data = json.loads(data.strip())
        except json.decoder.JSONDecodeError:
            flash("Unable to read JSON file, was it broken?", 'error')
            return

        # Valid JSON is not enough: we need an object holding a non-empty 'data' list
        if not isinstance(data, dict) or not isinstance(data.get('data'), list) or not data['data']:
            flash("JSON structure looks invalid, was it broken?", 'error')
            return

        for d in data['data']:
            uri = d.get('uri') if isinstance(d, dict) else None
            if not isinstance(uri, str) or not uri.strip() or good >= 5000:
                # Nothing to watch, or batch limit reached - record it as skipped
                self.remaining_data.append(json.dumps(d))
                continue

            extras = {'title': d['name']} if 'name' in d else {}

            # 'config' is a JSON document embedded as a string; a broken one only costs us the filters
            try:
                d_config = json.loads(d['config'])
            except (KeyError, TypeError, ValueError):
                d_config = {}

            try:
                frame = d_config['selections'][0]['frames'][0]
            except (KeyError, IndexError, TypeError):
                frame = {}

            try:
                # @todo we only support CSS ones at the moment
                exclude = frame['excludes'][0]
                if exclude['type'] == 'css':
                    extras['subtractive_selectors'] = exclude['expr']
            except (KeyError, IndexError, TypeError):
                pass

            try:
                include = frame['includes'][0]
                extras['css_filter'] = include['expr']
                if include['type'] == 'xpath':
                    extras['css_filter'] = 'xpath:' + extras['css_filter']
            except (KeyError, IndexError, TypeError):
                pass

            try:
                extras['tag'] = ", ".join(d['tags'])
            except (KeyError, TypeError):
                pass

            new_uuid = datastore.add_watch(url=uri.strip(),
                                           extras=extras,
                                           write_to_disk_now=False)

            if new_uuid:
                # Collected so the caller can queue it for an immediate check
                self.new_uuids.append(new_uuid)
                good += 1
            else:
                self.remaining_data.append(json.dumps(d))

        # Expose the counter on the instance as well
        self.good = good
        flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data)))
134	+

■ ■ ■ ■ ■ ■

changedetectionio/templates/edit.html

		skipped 130 lines
131	131		<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
132	132		<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
133	133		href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
134		-	<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code>, <a
	134	+	<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
135	135		href="http://xpather.com/" target="new">test your XPath here</a></li>
136	136		</ul>
137	137		Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
		skipped 64 lines

■ ■ ■ ■ ■ ■

changedetectionio/templates/import.html

1	1		{% extends 'base.html' %}
	2	+	{% block content %}
	3	+	<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
	4	+	<div class="edit-form monospaced-textarea">
2	5
3		-	{% block content %}
4		-	<div class="edit-form">
5		-	<div class="inner">
	6	+	<div class="tabs collapsable">
	7	+	<ul>
	8	+	<li class="tab" id="default-tab"><a href="#url-list">URL List</a></li>
	9	+	<li class="tab"><a href="#distill-io">Distill.io</a></li>
	10	+	</ul>
	11	+	</div>
	12	+
	13	+	<div class="box-wrap inner">
6	14		<form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
7	15		<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
8		-	<fieldset class="pure-group">
9		-	<legend>
10		-	Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
11		-	<br>
12		-	<code>https://example.com tag1, tag2, last tag</code>
13		-	<br>
14		-	URLs which do not pass validation will stay in the textarea.
15		-	</legend>
	16	+	<div class="tab-pane-inner" id="url-list">
	17	+	<fieldset class="pure-group">
	18	+	<legend>
	19	+	Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma
	20	+	(,):
	21	+	<br>
	22	+	<code>https://example.com tag1, tag2, last tag</code>
	23	+	<br>
	24	+	URLs which do not pass validation will stay in the textarea.
	25	+	</legend>
16	26
17	27
18		-	<textarea name="urls" class="pure-input-1-2" placeholder="https://"
19		-	style="width: 100%;
	28	+	<textarea name="urls" class="pure-input-1-2" placeholder="https://"
	29	+	style="width: 100%;
20	30		font-family:monospace;
21	31		white-space: pre;
22	32		overflow-wrap: normal;
23		-	overflow-x: scroll;" rows="25">{{ remaining }}</textarea>
24		-	</fieldset>
	33	+	overflow-x: scroll;" rows="25">{{ import_url_list_remaining }}</textarea>
	34	+	</fieldset>
	35	+
	36	+
	37	+	</div>
	38	+
	39	+	<div class="tab-pane-inner" id="distill-io">
	40	+
	41	+
	42	+	<fieldset class="pure-group">
	43	+	<legend>
	44	+	Copy and Paste your Distill.io watch 'export' file, this should be a JSON file.<br/>
	45	+	This is <i>experimental</i>, supported fields are <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, the rest (including <code>schedule</code>) are ignored.
	46	+	<br/>
	47	+	<p>
	48	+	How to export? <a href="https://distill.io/docs/web-monitor/how-export-and-import-monitors/">https://distill.io/docs/web-monitor/how-export-and-import-monitors/</a><br/>
	49	+	Be sure to set your default fetcher to Chrome if required.<br/>
	50	+	</p>
	51	+	</legend>
	52	+
	53	+
	54	+	<textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
	55	+	font-family:monospace;
	56	+	white-space: pre;
	57	+	overflow-wrap: normal;
	58	+	overflow-x: scroll;" placeholder="Example Distill.io JSON export file
	59	+
	60	+	{
	61	+	"client": {
	62	+	"local": 1
	63	+	},
	64	+	"data": [
	65	+	{
	66	+	"name": "Unraid | News",
	67	+	"uri": "https://unraid.net/blog",
	68	+	"config": "{\"selections\":[{\"frames\":[{\"index\":0,\"excludes\":[],\"includes\":[{\"type\":\"xpath\",\"expr\":\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\"}]}],\"dynamic\":true,\"delay\":2}],\"ignoreEmptyText\":true,\"includeStyle\":false,\"dataAttr\":\"text\"}",
	69	+	"tags": [],
	70	+	"content_type": 2,
	71	+	"state": 40,
	72	+	"schedule": "{\"type\":\"INTERVAL\",\"params\":{\"interval\":4447}}",
	73	+	"ts": "2022-03-27T15:51:15.667Z"
	74	+	}
	75	+	]
	76	+	}
	77	+	" rows="25">{{ original_distill_json }}</textarea>
	78	+	</fieldset>
	79	+	</div>
25	80		<button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
26	81		</form>
27		-	</div>
	82	+
	83	+	</div>
28	84		</div>
29	85
30	86		{% endblock %}
		skipped 1 lines

■ ■ ■ ■ ■ ■ ■

changedetectionio/tests/test_import.py

		skipped 4 lines
5	5		from flask import url_for
6	6
7	7		from .util import live_server_setup
8		-
9		-
10		-	def test_import(client, live_server):
11		-
	8	+	def test_setup(client, live_server):
12	9		live_server_setup(live_server)
13	10
	11	+	def test_import(client, live_server):
14	12		# Give the endpoint time to spin up
15	13		time.sleep(1)
16	14
17	15		res = client.post(
18	16		url_for("import_page"),
19	17		data={
	18	+	"distill-io": "",
20	19		"urls": """https://example.com
21	20		https://example.com tag1
22	21		https://example.com tag1, other tag"""
		skipped 3 lines
26	25		assert b"3 Imported" in res.data
27	26		assert b"tag1" in res.data
28	27		assert b"other tag" in res.data
	28	+	res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
	29	+
	30	+	# Clear flask alerts
	31	+	res = client.get( url_for("index"))
	32	+	res = client.get( url_for("index"))
	33	+
	34	+	def xtest_import_skip_url(client, live_server):
	35	+
	36	+
	37	+	# Give the endpoint time to spin up
	38	+	time.sleep(1)
	39	+
	40	+	res = client.post(
	41	+	url_for("import_page"),
	42	+	data={
	43	+	"distill-io": "",
	44	+	"urls": """https://example.com
	45	+	:ht000000broken
	46	+	"""
	47	+	},
	48	+	follow_redirects=True,
	49	+	)
	50	+	assert b"1 Imported" in res.data
	51	+	assert b"ht000000broken" in res.data
	52	+	assert b"1 Skipped" in res.data
	53	+	res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
	54	+	# Clear flask alerts
	55	+	res = client.get( url_for("index"))
	56	+
	57	+	def test_import_distillio(client, live_server):
	58	+
	59	+	distill_data='''
	60	+	{
	61	+	"client": {
	62	+	"local": 1
	63	+	},
	64	+	"data": [
	65	+	{
	66	+	"name": "Unraid \| News",
	67	+	"uri": "https://unraid.net/blog",
	68	+	"config": "{\\"selections\\":[{\\"frames\\":[{\\"index\\":0,\\"excludes\\":[],\\"includes\\":[{\\"type\\":\\"xpath\\",\\"expr\\":\\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\\"}]}],\\"dynamic\\":true,\\"delay\\":2}],\\"ignoreEmptyText\\":true,\\"includeStyle\\":false,\\"dataAttr\\":\\"text\\"}",
	69	+	"tags": ["nice stuff", "nerd-news"],
	70	+	"content_type": 2,
	71	+	"state": 40,
	72	+	"schedule": "{\\"type\\":\\"INTERVAL\\",\\"params\\":{\\"interval\\":4447}}",
	73	+	"ts": "2022-03-27T15:51:15.667Z"
	74	+	}
	75	+	]
	76	+	}
	77	+
	78	+	'''
	79	+
	80	+	# Give the endpoint time to spin up
	81	+	time.sleep(1)
	82	+	client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
	83	+	res = client.post(
	84	+	url_for("import_page"),
	85	+	data={
	86	+	"distill-io": distill_data,
	87	+	"urls" : ''
	88	+	},
	89	+	follow_redirects=True,
	90	+	)
	91	+
	92	+
	93	+	assert b"Unable to read JSON file, was it broken?" not in res.data
	94	+	assert b"1 Imported from Distill.io" in res.data
	95	+
	96	+	res = client.get( url_for("edit_page", uuid="first"))
	97	+
	98	+	assert b"https://unraid.net/blog" in res.data
	99	+	assert b"Unraid \| News" in res.data
	100	+
	101	+
	102	+	# flask/wtforms should recode this, check we see it
	103	+	# wtforms encodes it like id=&#39 ,but html.escape makes it like id=&#x27
	104	+	# - so just check it manually :(
	105	+	#import json
	106	+	#import html
	107	+	#d = json.loads(distill_data)
	108	+	# embedded_d=json.loads(d['data'][0]['config'])
	109	+	# x=html.escape(embedded_d['selections'][0]['frames'][0]['includes'][0]['expr']).encode('utf-8')
	110	+	assert b"xpath:(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]" in res.data
	111	+
	112	+	# did the tags work?
	113	+	res = client.get( url_for("index"))
	114	+
	115	+	assert b"nice stuff" in res.data
	116	+	assert b"nerd-news" in res.data
	117	+
	118	+	res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
	119	+	# Clear flask alerts
	120	+	res = client.get(url_for("index"))
29	121

■ ■ ■ ■ ■ ■

changedetectionio/tests/test_xpath_selector.py

		skipped 116 lines
117	117		follow_redirects=True
118	118		)
119	119		assert b"is not a valid XPath expression" in res.data
	120	+
	121	+
	122	+	# actually only really used by the distll.io importer, but could be handy too
	123	+	def test_check_with_prefix_css_filter(client, live_server):
	124	+	res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
	125	+	assert b'Deleted' in res.data
	126	+
	127	+	# Give the endpoint time to spin up
	128	+	time.sleep(1)
	129	+
	130	+	set_original_response()
	131	+
	132	+	# Add our URL to the import page
	133	+	test_url = url_for('test_endpoint', _external=True)
	134	+	res = client.post(
	135	+	url_for("import_page"),
	136	+	data={"urls": test_url},
	137	+	follow_redirects=True
	138	+	)
	139	+	assert b"1 Imported" in res.data
	140	+	time.sleep(3)
	141	+
	142	+	res = client.post(
	143	+	url_for("edit_page", uuid="first"),
	144	+	data={"css_filter": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
	145	+	follow_redirects=True
	146	+	)
	147	+
	148	+	assert b"Updated watch." in res.data
	149	+	time.sleep(3)
	150	+
	151	+	res = client.get(
	152	+	url_for("preview_page", uuid="first"),
	153	+	follow_redirects=True
	154	+	)
	155	+
	156	+	with open('/tmp/fuck.html', 'wb') as f:
	157	+	f.write(res.data)
	158	+	assert b"Some text thats the same" in res.data #in selector
	159	+	assert b"Some text that will change" not in res.data #not in selector
	160	+
	161	+	client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
	162	+

Distill.io JSON export file importer (#592)