🤬
Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)
  • ■ ■ ■ ■ ■ ■
    changedetectionio/__init__.py
    skipped 682 lines
    683 683   @app.route("/import", methods=['GET', "POST"])
    684 684   @login_required
    685 685   def import_page():
    686  - import validators
    687 686   remaining_urls = []
    688  - 
    689  - good = 0
    690  - 
    691 687   if request.method == 'POST':
    692  - now=time.time()
    693  - urls = request.values.get('urls').split("\n")
     688 + from .importer import import_url_list, import_distill_io_json
    694 689   
    695  - if (len(urls) > 5000):
    696  - flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
     690 + # URL List import
     691 + if request.values.get('urls') and len(request.values.get('urls').strip()):
     692 + # Import and push into the queue for immediate update check
     693 + importer = import_url_list()
     694 + importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
     695 + for uuid in importer.new_uuids:
     696 + update_q.put(uuid)
    697 697   
    698  - for url in urls:
    699  - url = url.strip()
    700  - url, *tags = url.split(" ")
    701  - # Flask wtform validators wont work with basic auth, use validators package
    702  - # Up to 5000 per batch so we dont flood the server
    703  - if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
    704  - new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False)
    705  - if new_uuid:
    706  - # Straight into the queue.
    707  - update_q.put(new_uuid)
    708  - good += 1
    709  - continue
     698 + if len(importer.remaining_data) == 0:
     699 + return redirect(url_for('index'))
     700 + else:
     701 + remaining_urls = importer.remaining_data
    710 702   
    711  - if len(url.strip()):
    712  - remaining_urls.append(url)
     703 + # Distill.io import
     704 + if request.values.get('distill-io') and len(request.values.get('distill-io').strip()):
     705 + # Import and push into the queue for immediate update check
     706 + d_importer = import_distill_io_json()
     707 + d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
     708 + for uuid in d_importer.new_uuids:
     709 + update_q.put(uuid)
    713 710   
    714  - flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls)))
    715  - datastore.needs_write = True
    716 711   
    717  - if len(remaining_urls) == 0:
    718  - # Looking good, redirect to index.
    719  - return redirect(url_for('index'))
    720 712   
    721 713   # Could be some remaining, or we could be on GET
    722 714   output = render_template("import.html",
    723  - remaining="\n".join(remaining_urls)
     715 + import_url_list_remaining="\n".join(remaining_urls),
     716 + original_distill_json=''
    724 717   )
    725 718   return output
    726 719   
    skipped 566 lines
  • ■ ■ ■ ■ ■ ■
    changedetectionio/fetch_site_status.py
    skipped 16 lines
    17 17   self.datastore = datastore
    18 18   
    19 19   # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
    20  - # if watch.proxy use that
    21  - # fetcher.proxy_override = watch.proxy or main config proxy
    22  - # Allows override the proxy on a per-request basis
    23  - # ALWAYS use the first one is nothing selected
     20 + # if watch.proxy use that
     21 + # fetcher.proxy_override = watch.proxy or main config proxy
     22 + # Allows override the proxy on a per-request basis
     23 + # ALWAYS use the first one if nothing is selected
    24 24   
    25 25   def set_proxy_from_list(self, watch):
    26 26   proxy_args = None
    skipped 122 lines
    149 149   # Then we assume HTML
    150 150   if has_filter_rule:
    151 151   # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
    152  - if css_filter_rule[0] == '/':
    153  - html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
     152 + if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
     153 + html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
     154 + html_content=fetcher.content)
    154 155   else:
    155 156   # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
    156 157   html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
     158 + 
    157 159   if has_subtractive_selectors:
    158 160   html_content = html_tools.element_removal(subtractive_selectors, html_content)
    159 161   
    skipped 12 lines
    172 174   
    173 175   # Re #340 - return the content before the 'ignore text' was applied
    174 176   text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
    175  - 
    176 177   
    177 178   # Re #340 - return the content before the 'ignore text' was applied
    178 179   text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
    skipped 46 lines
    225 226   update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
    226 227   
    227 228   return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
     229 + 
  • ■ ■ ■ ■ ■ ■
    changedetectionio/importer.py
     1 +from abc import ABC, abstractmethod
     2 +import time
     3 +import validators
     4 + 
     5 + 
     6 +class Importer():
     7 + remaining_data = []
     8 + new_uuids = []
     9 + good = 0
     10 + 
     11 + def __init__(self):
     12 + self.new_uuids = []
     13 + self.good = 0
     14 + self.remaining_data = []
     15 + 
     16 + @abstractmethod
     17 + def run(self,
     18 + data,
     19 + flash,
     20 + datastore):
     21 + pass
     22 + 
     23 + 
     24 +class import_url_list(Importer):
     25 + """
     26 + Imports a list, can be in <code>https://example.com tag1, tag2, last tag</code> format
     27 + """
     28 + def run(self,
     29 + data,
     30 + flash,
     31 + datastore,
     32 + ):
     33 + 
     34 + urls = data.split("\n")
     35 + good = 0
     36 + now = time.time()
     37 + 
     38 + if (len(urls) > 5000):
     39 + flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
     40 + 
     41 + for url in urls:
     42 + url = url.strip()
     43 + if not len(url):
     44 + continue
     45 + 
     46 + tags = ""
     47 + 
     48 + # 'tags' should be a csv list after the URL
     49 + if ' ' in url:
     50 + url, tags = url.split(" ", 1)
     51 + 
     52 + # Flask wtform validators won't work with basic auth, use validators package
     53 + # Up to 5000 per batch so we don't flood the server
     54 + if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
     55 + new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False)
     56 + if new_uuid:
     57 + # Straight into the queue.
     58 + self.new_uuids.append(new_uuid)
     59 + good += 1
     60 + continue
     61 + 
     62 + # Worked past the 'continue' above, append it to the bad list
     63 + if self.remaining_data is None:
     64 + self.remaining_data = []
     65 + self.remaining_data.append(url)
     66 + 
     67 + flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data)))
     68 + 
     69 + 
     70 +class import_distill_io_json(Importer):
     71 + def run(self,
     72 + data,
     73 + flash,
     74 + datastore,
     75 + ):
     76 + 
     77 + import json
     78 + good = 0
     79 + now = time.time()
     80 + self.new_uuids=[]
     81 + 
     82 + 
     83 + try:
     84 + data = json.loads(data.strip())
     85 + except json.decoder.JSONDecodeError:
     86 + flash("Unable to read JSON file, was it broken?", 'error')
     87 + return
     88 + 
     89 + if not data.get('data'):
     90 + flash("JSON structure looks invalid, was it broken?", 'error')
     91 + return
     92 + 
     93 + for d in data.get('data'):
     94 + d_config = json.loads(d['config'])
     95 + extras = {'title': d['name']}
     96 + 
     97 + if len(d['uri']) and good < 5000:
     98 + try:
     99 + # @todo we only support CSS ones at the moment
     100 + if d_config['selections'][0]['frames'][0]['excludes'][0]['type'] == 'css':
     101 + extras['subtractive_selectors'] = d_config['selections'][0]['frames'][0]['excludes'][0]['expr']
     102 + except KeyError:
     103 + pass
     104 + except IndexError:
     105 + pass
     106 + 
     107 + try:
     108 + extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr']
     109 + if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath':
     110 + extras['css_filter'] = 'xpath:' + extras['css_filter']
     111 + 
     112 + except KeyError:
     113 + pass
     114 + except IndexError:
     115 + pass
     116 + 
     117 + try:
     118 + extras['tag'] = ", ".join(d['tags'])
     119 + except KeyError:
     120 + pass
     121 + except IndexError:
     122 + pass
     123 + 
     124 + new_uuid = datastore.add_watch(url=d['uri'].strip(),
     125 + extras=extras,
     126 + write_to_disk_now=False)
     127 + 
     128 + if new_uuid:
     129 + # Straight into the queue.
     130 + self.new_uuids.append(new_uuid)
     131 + good += 1
     132 + 
     133 + flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data)))
     134 + 
  • ■ ■ ■ ■
    changedetectionio/templates/edit.html
    skipped 130 lines
    131 131   <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
    132 132   <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
    133 133   href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
    134  - <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code>, <a
     134 + <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
    135 135   href="http://xpather.com/" target="new">test your XPath here</a></li>
    136 136   </ul>
    137 137   Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
    skipped 64 lines
  • ■ ■ ■ ■ ■
    changedetectionio/templates/import.html
    1 1  {% extends 'base.html' %}
     2 +{% block content %}
     3 +<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
     4 +<div class="edit-form monospaced-textarea">
    2 5   
    3  -{% block content %}
    4  -<div class="edit-form">
    5  - <div class="inner">
     6 + <div class="tabs collapsable">
     7 + <ul>
     8 + <li class="tab" id="default-tab"><a href="#url-list">URL List</a></li>
     9 + <li class="tab"><a href="#distill-io">Distill.io</a></li>
     10 + </ul>
     11 + </div>
     12 + 
     13 + <div class="box-wrap inner">
    6 14   <form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
    7 15   <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
    8  - <fieldset class="pure-group">
    9  - <legend>
    10  - Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
    11  - <br>
    12  - <code>https://example.com tag1, tag2, last tag</code>
    13  - <br>
    14  - URLs which do not pass validation will stay in the textarea.
    15  - </legend>
     16 + <div class="tab-pane-inner" id="url-list">
     17 + <fieldset class="pure-group">
     18 + <legend>
     19 + Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma
     20 + (,):
     21 + <br>
     22 + <code>https://example.com tag1, tag2, last tag</code>
     23 + <br>
     24 + URLs which do not pass validation will stay in the textarea.
     25 + </legend>
    16 26   
    17 27   
    18  - <textarea name="urls" class="pure-input-1-2" placeholder="https://"
    19  - style="width: 100%;
     28 + <textarea name="urls" class="pure-input-1-2" placeholder="https://"
     29 + style="width: 100%;
    20 30   font-family:monospace;
    21 31   white-space: pre;
    22 32   overflow-wrap: normal;
    23  - overflow-x: scroll;" rows="25">{{ remaining }}</textarea>
    24  - </fieldset>
     33 + overflow-x: scroll;" rows="25">{{ import_url_list_remaining }}</textarea>
     34 + </fieldset>
     35 + 
     36 + 
     37 + </div>
     38 + 
     39 + <div class="tab-pane-inner" id="distill-io">
     40 + 
     41 + 
     42 + <fieldset class="pure-group">
     43 + <legend>
     44 + Copy and Paste your Distill.io watch 'export' file, this should be a JSON file.</br>
     45 + This is <i>experimental</i>, supported fields are <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, the rest (including <code>schedule</code>) are ignored.
     46 + <br/>
     47 + <p>
     48 + How to export? <a href="https://distill.io/docs/web-monitor/how-export-and-import-monitors/">https://distill.io/docs/web-monitor/how-export-and-import-monitors/</a><br/>
     49 + Be sure to set your default fetcher to Chrome if required.</br>
     50 + </p>
     51 + </legend>
     52 + 
     53 + 
     54 + <textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
     55 + font-family:monospace;
     56 + white-space: pre;
     57 + overflow-wrap: normal;
     58 + overflow-x: scroll;" placeholder="Example Distill.io JSON export file
     59 + 
     60 +{
     61 + &quot;client&quot;: {
     62 + &quot;local&quot;: 1
     63 + },
     64 + &quot;data&quot;: [
     65 + {
     66 + &quot;name&quot;: &quot;Unraid | News&quot;,
     67 + &quot;uri&quot;: &quot;https://unraid.net/blog&quot;,
     68 + &quot;config&quot;: &quot;{\&quot;selections\&quot;:[{\&quot;frames\&quot;:[{\&quot;index\&quot;:0,\&quot;excludes\&quot;:[],\&quot;includes\&quot;:[{\&quot;type\&quot;:\&quot;xpath\&quot;,\&quot;expr\&quot;:\&quot;(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\&quot;}]}],\&quot;dynamic\&quot;:true,\&quot;delay\&quot;:2}],\&quot;ignoreEmptyText\&quot;:true,\&quot;includeStyle\&quot;:false,\&quot;dataAttr\&quot;:\&quot;text\&quot;}&quot;,
     69 + &quot;tags&quot;: [],
     70 + &quot;content_type&quot;: 2,
     71 + &quot;state&quot;: 40,
     72 + &quot;schedule&quot;: &quot;{\&quot;type\&quot;:\&quot;INTERVAL\&quot;,\&quot;params\&quot;:{\&quot;interval\&quot;:4447}}&quot;,
     73 + &quot;ts&quot;: &quot;2022-03-27T15:51:15.667Z&quot;
     74 + }
     75 + ]
     76 +}
     77 +" rows="25">{{ original_distill_json }}</textarea>
     78 + </fieldset>
     79 + </div>
    25 80   <button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
    26 81   </form>
    27  - </div>
     82 + 
     83 + </div>
    28 84  </div>
    29 85   
    30 86  {% endblock %}
    skipped 1 lines
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/test_import.py
    skipped 4 lines
    5 5  from flask import url_for
    6 6   
    7 7  from .util import live_server_setup
    8  - 
    9  - 
    10  -def test_import(client, live_server):
    11  - 
     8 +def test_setup(client, live_server):
    12 9   live_server_setup(live_server)
    13 10   
     11 +def test_import(client, live_server):
    14 12   # Give the endpoint time to spin up
    15 13   time.sleep(1)
    16 14   
    17 15   res = client.post(
    18 16   url_for("import_page"),
    19 17   data={
     18 + "distill-io": "",
    20 19   "urls": """https://example.com
    21 20  https://example.com tag1
    22 21  https://example.com tag1, other tag"""
    skipped 3 lines
    26 25   assert b"3 Imported" in res.data
    27 26   assert b"tag1" in res.data
    28 27   assert b"other tag" in res.data
     28 + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
     29 + 
     30 + # Clear flask alerts
     31 + res = client.get( url_for("index"))
     32 + res = client.get( url_for("index"))
     33 + 
     34 +def xtest_import_skip_url(client, live_server):
     35 + 
     36 + 
     37 + # Give the endpoint time to spin up
     38 + time.sleep(1)
     39 + 
     40 + res = client.post(
     41 + url_for("import_page"),
     42 + data={
     43 + "distill-io": "",
     44 + "urls": """https://example.com
     45 +:ht000000broken
     46 +"""
     47 + },
     48 + follow_redirects=True,
     49 + )
     50 + assert b"1 Imported" in res.data
     51 + assert b"ht000000broken" in res.data
     52 + assert b"1 Skipped" in res.data
     53 + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
     54 + # Clear flask alerts
     55 + res = client.get( url_for("index"))
     56 + 
     57 +def test_import_distillio(client, live_server):
     58 + 
     59 + distill_data='''
     60 +{
     61 + "client": {
     62 + "local": 1
     63 + },
     64 + "data": [
     65 + {
     66 + "name": "Unraid | News",
     67 + "uri": "https://unraid.net/blog",
     68 + "config": "{\\"selections\\":[{\\"frames\\":[{\\"index\\":0,\\"excludes\\":[],\\"includes\\":[{\\"type\\":\\"xpath\\",\\"expr\\":\\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\\"}]}],\\"dynamic\\":true,\\"delay\\":2}],\\"ignoreEmptyText\\":true,\\"includeStyle\\":false,\\"dataAttr\\":\\"text\\"}",
     69 + "tags": ["nice stuff", "nerd-news"],
     70 + "content_type": 2,
     71 + "state": 40,
     72 + "schedule": "{\\"type\\":\\"INTERVAL\\",\\"params\\":{\\"interval\\":4447}}",
     73 + "ts": "2022-03-27T15:51:15.667Z"
     74 + }
     75 + ]
     76 +}
     77 + 
     78 +'''
     79 + 
     80 + # Give the endpoint time to spin up
     81 + time.sleep(1)
     82 + client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
     83 + res = client.post(
     84 + url_for("import_page"),
     85 + data={
     86 + "distill-io": distill_data,
     87 + "urls" : ''
     88 + },
     89 + follow_redirects=True,
     90 + )
     91 + 
     92 + 
     93 + assert b"Unable to read JSON file, was it broken?" not in res.data
     94 + assert b"1 Imported from Distill.io" in res.data
     95 + 
     96 + res = client.get( url_for("edit_page", uuid="first"))
     97 + 
     98 + assert b"https://unraid.net/blog" in res.data
     99 + assert b"Unraid | News" in res.data
     100 + 
     101 + 
     102 + # flask/wtforms should recode this, check we see it
     103 + # wtforms encodes it like id=&#39, but html.escape makes it like id=&#x27
     104 + # - so just check it manually :(
     105 + #import json
     106 + #import html
     107 + #d = json.loads(distill_data)
     108 + # embedded_d=json.loads(d['data'][0]['config'])
     109 + # x=html.escape(embedded_d['selections'][0]['frames'][0]['includes'][0]['expr']).encode('utf-8')
     110 + assert b"xpath:(//div[@id=&#39;App&#39;]/div[contains(@class,&#39;flex&#39;)]/main[contains(@class,&#39;relative&#39;)]/section[contains(@class,&#39;relative&#39;)]/div[@class=&#39;container&#39;]/div[contains(@class,&#39;flex&#39;)]/div[contains(@class,&#39;w-full&#39;)])[1]" in res.data
     111 + 
     112 + # did the tags work?
     113 + res = client.get( url_for("index"))
     114 + 
     115 + assert b"nice stuff" in res.data
     116 + assert b"nerd-news" in res.data
     117 + 
     118 + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
     119 + # Clear flask alerts
     120 + res = client.get(url_for("index"))
    29 121   
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/test_xpath_selector.py
    skipped 116 lines
    117 117   follow_redirects=True
    118 118   )
    119 119   assert b"is not a valid XPath expression" in res.data
     120 + 
     121 + 
     122 +# actually only really used by the distill.io importer, but could be handy too
     123 +def test_check_with_prefix_css_filter(client, live_server):
     124 + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
     125 + assert b'Deleted' in res.data
     126 + 
     127 + # Give the endpoint time to spin up
     128 + time.sleep(1)
     129 + 
     130 + set_original_response()
     131 + 
     132 + # Add our URL to the import page
     133 + test_url = url_for('test_endpoint', _external=True)
     134 + res = client.post(
     135 + url_for("import_page"),
     136 + data={"urls": test_url},
     137 + follow_redirects=True
     138 + )
     139 + assert b"1 Imported" in res.data
     140 + time.sleep(3)
     141 + 
     142 + res = client.post(
     143 + url_for("edit_page", uuid="first"),
     144 + data={"css_filter": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
     145 + follow_redirects=True
     146 + )
     147 + 
     148 + assert b"Updated watch." in res.data
     149 + time.sleep(3)
     150 + 
     151 + res = client.get(
     152 + url_for("preview_page", uuid="first"),
     153 + follow_redirects=True
     154 + )
     155 + 
     156 + with open('/tmp/fuck.html', 'wb') as f:
     157 + f.write(res.data)
     158 + assert b"Some text thats the same" in res.data #in selector
     159 + assert b"Some text that will change" not in res.data #not in selector
     160 + 
     161 + client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
     162 + 
Please wait...
Page is in error, reload to recover