🤬
  • ■ ■ ■ ■ ■ ■
    README.md
    skipped 90 lines
    91 91  ```bash
    92 92  docker-compose pull && docker-compose up -d
    93 93  ```
     94 +### Filters
     95 +XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, and you can use XPath expressions exported from various XPath element query creation tools.
    94 96   
    95 97  ### Notifications
    96 98   
    skipped 77 lines
  • ■ ■ ■ ■ ■
    changedetectionio/__init__.py
    skipped 803 lines
    804 804   compress_type=zipfile.ZIP_DEFLATED,
    805 805   compresslevel=8)
    806 806   
    807  - return send_from_directory(datastore_o.datastore_path, backupname, as_attachment=True)
     807 + # send_from_directory() needs the full absolute path of the directory
     808 + return send_from_directory(os.path.abspath(datastore_o.datastore_path), backupname, as_attachment=True)
    808 809   
    809 810   @app.route("/static/<string:group>/<string:filename>", methods=['GET'])
    810 811   def static_content(group, filename):
    skipped 192 lines
  • ■ ■ ■ ■ ■ ■
    changedetectionio/fetch_site_status.py
    skipped 113 lines
    114 114   if 'json:' in css_filter_rule:
    115 115   stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
    116 116   is_html = False
    117  - else:
    118  - # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
    119  - stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
    120 117   
    121 118   if is_html:
    122 119   # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
    123 120   html_content = fetcher.content
    124 121   if has_filter_rule:
    125  - html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
     122 + # For HTML/XML we offer XPath as an option; the rule just has to start like a regular XPath, with "/"
     123 + if css_filter_rule[0] == '/':
     124 + html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
     125 + else:
     126 + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
     127 + html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
    126 128   
    127 129   # get_text() via inscriptis
    128 130   stripped_text_from_html = get_text(html_content)
    skipped 65 lines
  • ■ ■ ■ ■ ■ ■
    changedetectionio/forms.py
    skipped 180 lines
    181 181   message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
    182 182   raise ValidationError(message % (line))
    183 183   
    184  -class ValidateCSSJSONInput(object):
     184 +class ValidateCSSJSONXPATHInput(object):
    185 185   """
    186 186   Filter validation
    187 187   @todo CSS validator ;)
    skipped 3 lines
    191 191   self.message = message
    192 192   
    193 193   def __call__(self, form, field):
     194 + 
     195 + # Nothing to see here
     196 + if not len(field.data.strip()):
     197 + return
     198 + 
     199 + # Does it look like XPath?
     200 + if field.data.strip()[0] == '/':
     201 + from lxml import html, etree
     202 + tree = html.fromstring("<html></html>")
     203 + 
     204 + try:
     205 + tree.xpath(field.data.strip())
     206 + except etree.XPathEvalError as e:
     207 + message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
     208 + raise ValidationError(message % (field.data, str(e)))
     209 + except:
     210 + raise ValidationError("A system-error occurred when validating your XPath expression")
     211 + 
    194 212   if 'json:' in field.data:
    195 213   from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError
    196 214   from jsonpath_ng.ext import parse
    skipped 5 lines
    202 220   except (JsonPathParserError, JsonPathLexerError) as e:
    203 221   message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
    204 222   raise ValidationError(message % (input, str(e)))
     223 + except:
     224 + raise ValidationError("A system-error occurred when validating your JSONPath expression")
    205 225   
    206 226   # Re #265 - maybe in the future fetch the page and offer a
    207 227   # warning/notice that it's possible the rule doesn't yet match anything?
    skipped 21 lines
    229 249   
    230 250   minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck',
    231 251   [validators.Optional(), validators.NumberRange(min=1)])
    232  - css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()])
     252 + css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
    233 253   title = StringField('Title')
    234 254   
    235 255   ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
    skipped 27 lines
  • ■ ■ ■ ■ ■ ■
    changedetectionio/html_tools.py
    skipped 15 lines
    16 16   
    17 17   return html_block + "\n"
    18 18   
     19 + 
     20 +# Return str Utf-8 of matched rules
     21 +def xpath_filter(xpath_filter, html_content):
     22 + from lxml import html
     23 + from lxml import etree
     24 + 
     25 + tree = html.fromstring(html_content)
     26 + html_block = ""
     27 + 
     28 + for item in tree.xpath(xpath_filter.strip()):
     29 + html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"<br/>"
     30 + 
     31 + return html_block
     32 + 
     33 + 
    19 34  # Extract/find element
    20 35  def extract_element(find='title', html_content=''):
    21 36   
    skipped 72 lines
  • ■ ■ ■ ■ ■
    changedetectionio/templates/edit.html
    skipped 94 lines
    95 95   <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
    96 96   <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <b>"json:"</b>, <a
    97 97   href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
     98 + <li>XPATH - Limit text to this XPath rule, simply start with a forward-slash, example <b>//*[contains(@class, 'sametext')]</b>, <a
     99 + href="http://xpather.com/" target="new">test your XPath here</a></li>
    98 100   </ul>
    99  - Please be sure that you thoroughly understand how to write CSS or JSONPath selector rules before filing an issue on GitHub! <a
     101 + Please be sure that you thoroughly understand how to write CSS, JSONPath or XPath selector rules before filing an issue on GitHub! <a
    100 102   href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
    101 103   </span>
    102 104   </div>
    skipped 41 lines
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/test_xpath_selector.py
     1 +#!/usr/bin/python3
     2 + 
     3 +import time
     4 +from flask import url_for
     5 +from . util import live_server_setup
     6 + 
     7 +from ..html_tools import *
     8 + 
     9 +def test_setup(live_server):
     10 + live_server_setup(live_server)
     11 + 
     12 +def set_original_response():
     13 + test_return_data = """<html>
     14 + <body>
     15 + Some initial text</br>
     16 + <p>Which is across multiple lines</p>
     17 + </br>
     18 + So let's see what happens. </br>
     19 + <div class="sametext">Some text thats the same</div>
     20 + <div class="changetext">Some text that will change</div>
     21 + </body>
     22 + </html>
     23 + """
     24 + 
     25 + with open("test-datastore/endpoint-content.txt", "w") as f:
     26 + f.write(test_return_data)
     27 + return None
     28 + 
     29 +def set_modified_response():
     30 + test_return_data = """<html>
     31 + <body>
     32 + Some initial text</br>
     33 + <p>Which is across multiple lines</p>
     34 + </br>
     35 + So let's see what happens. THIS CHANGES AND SHOULDNT TRIGGER A CHANGE</br>
     36 + <div class="sametext">Some text thats the same</div>
     37 + <div class="changetext">Some new text</div>
     38 + </body>
     39 + </html>
     40 + """
     41 + 
     42 + with open("test-datastore/endpoint-content.txt", "w") as f:
     43 + f.write(test_return_data)
     44 + 
     45 + return None
     46 + 
     47 + 
     48 +def test_check_markup_xpath_filter_restriction(client, live_server):
     49 + sleep_time_for_fetch_thread = 3
     50 + 
     51 + xpath_filter = "//*[contains(@class, 'sametext')]"
     52 + 
     53 + set_original_response()
     54 + 
     55 + # Give the endpoint time to spin up
     56 + time.sleep(1)
     57 + 
     58 + # Add our URL to the import page
     59 + test_url = url_for('test_endpoint', _external=True)
     60 + res = client.post(
     61 + url_for("import_page"),
     62 + data={"urls": test_url},
     63 + follow_redirects=True
     64 + )
     65 + assert b"1 Imported" in res.data
     66 + 
     67 + # Trigger a check
     68 + client.get(url_for("api_watch_checknow"), follow_redirects=True)
     69 + 
     70 + # Give the thread time to pick it up
     71 + time.sleep(sleep_time_for_fetch_thread)
     72 + 
     73 + # Go to the edit page and save the watch
     74 + # with our XPath rule set as its filter
     75 + res = client.post(
     76 + url_for("edit_page", uuid="first"),
     77 + data={"css_filter": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
     78 + follow_redirects=True
     79 + )
     80 + assert b"Updated watch." in res.data
     81 + 
     82 + # Give the thread time to pick it up
     83 + time.sleep(sleep_time_for_fetch_thread)
     84 + 
     85 + # view it/reset state back to viewed
     86 + client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True)
     87 + 
     88 + # Make a change
     89 + set_modified_response()
     90 + 
     91 + # Trigger a check
     92 + client.get(url_for("api_watch_checknow"), follow_redirects=True)
     93 + # Give the thread time to pick it up
     94 + time.sleep(sleep_time_for_fetch_thread)
     95 + 
     96 + res = client.get(url_for("index"))
     97 + assert b'unviewed' not in res.data
     98 + 
     99 +def test_xpath_validation(client, live_server):
     100 + 
     101 + # Give the endpoint time to spin up
     102 + time.sleep(1)
     103 + 
     104 + # Add our URL to the import page
     105 + test_url = url_for('test_endpoint', _external=True)
     106 + res = client.post(
     107 + url_for("import_page"),
     108 + data={"urls": test_url},
     109 + follow_redirects=True
     110 + )
     111 + assert b"1 Imported" in res.data
     112 + 
     113 + res = client.post(
     114 + url_for("edit_page", uuid="first"),
     115 + data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
     116 + follow_redirects=True
     117 + )
     118 + assert b"is not a valid XPath expression" in res.data
  • ■ ■ ■ ■
    requirements.txt
    skipped 25 lines
    26 26  # ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
    27 27  cryptography ~= 3.4
    28 28   
    29  -# Used for CSS filtering, replace with soupsieve and lxml for xpath
     29 +# Used for CSS filtering
    30 30  bs4
     31 + 
     32 +# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
     33 +lxml
    31 34   
    32 35  # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0
    33 36  selenium ~= 4.1.0
    skipped 1 lines
Please wait...
Page is in error, reload to recover