🤬
  • ■ ■ ■ ■
    Dockerfile
    skipped 21 lines
    22 22   
    23 23  # Playwright is an alternative to Selenium
    24 24  # Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing
    25  -RUN pip install --target=/dependencies playwright~=1.24 \
     25 +RUN pip install --target=/dependencies playwright~=1.25 \
    26 26   || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled."
    27 27   
    28 28  # Final image stage
    skipped 39 lines
  • ■ ■ ■ ■ ■ ■
    README.md
    skipped 11 lines
    12 12   
    13 13  [**Don't have time? Let us host it for you! try our $6.99/month subscription - use our proxies and support!**](https://lemonade.changedetection.io/start) , _half the price of other website change monitoring services and comes with unlimited watches & checks!_
    14 14   
     15 +- Chrome browser included.
     16 +- Super fast, no registration needed setup.
     17 +- Start watching and receiving change notifications instantly.
    15 18   
    16 19   
    17  -- Automatic Updates, Automatic Backups, No Heroku "paused application", don't miss a change!
    18  -- Javascript browser included
    19  -- Unlimited checks and watches!
     20 +Easily see what changed, examine by word, line, or individual character.
     21 + 
     22 +<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot-diff.png" style="max-width:100%;" alt="Self-hosted web page change monitoring context difference " title="Self-hosted web page change monitoring context difference " />
    20 23   
    21 24   
    22 25  #### Example use cases
    skipped 27 lines
    50 53  - Execute JS before extracting text (Good for logging in, see examples in the UI!)
    51 54  - Override Request Headers, Specify `POST` or `GET` and other methods
    52 55  - Use the "Visual Selector" to help target specific elements
     56 +- Configurable [proxy per watch](https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration)
    53 57   
     58 +We [recommend and use Bright Data](https://brightdata.grsm.io/n0r16zf7eivq) global proxy services; Bright Data will match any first deposit up to $100 when you use our signup link.
    54 59   
    55 60  ## Screenshots
    56  - 
    57  -### Examine differences in content.
    58  - 
    59  -Easily see what changed, examine by word, line, or individual character.
    60  - 
    61  -<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot-diff.png" style="max-width:100%;" alt="Self-hosted web page change monitoring context difference " title="Self-hosted web page change monitoring context difference " />
    62 61   
    63 62  Please :star: star :star: this project and help it grow! https://github.com/dgtlmoon/changedetection.io/
    64 63   
    skipped 147 lines
  • ■ ■ ■ ■ ■
    changedetectionio/__init__.py
    skipped 548 lines
    549 549   
    550 550   # Defaults for proxy choice
    551 551   if datastore.proxy_list is not None: # When enabled
     552 + # @todo
    552 553   # Radio needs '' not None, or in case the chosen one no longer exists
    553 554   if default['proxy'] is None or not any(default['proxy'] in tup for tup in datastore.proxy_list):
    554 555   default['proxy'] = ''
    skipped 7 lines
    562 563   # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead
    563 564   del form.proxy
    564 565   else:
    565  - form.proxy.choices = [('', 'Default')] + datastore.proxy_list
     566 + form.proxy.choices = [('', 'Default')]
     567 + for p in datastore.proxy_list:
     568 + form.proxy.choices.append(tuple((p, datastore.proxy_list[p]['label'])))
     569 + 
    566 570   
    567 571   if request.method == 'POST' and form.validate():
    568 572   extra_update_obj = {}
    skipped 101 lines
    670 674   
    671 675   default = deepcopy(datastore.data['settings'])
    672 676   if datastore.proxy_list is not None:
     677 + available_proxies = list(datastore.proxy_list.keys())
    673 678   # When enabled
    674 679   system_proxy = datastore.data['settings']['requests']['proxy']
    675 680   # In case it doesn't exist anymore
    676  - if not any([system_proxy in tup for tup in datastore.proxy_list]):
     681 + if not system_proxy in available_proxies:
    677 682   system_proxy = None
    678 683   
    679  - default['requests']['proxy'] = system_proxy if system_proxy is not None else datastore.proxy_list[0][0]
     684 + default['requests']['proxy'] = system_proxy if system_proxy is not None else available_proxies[0]
    680 685   # Used by the form handler to keep or remove the proxy settings
    681  - default['proxy_list'] = datastore.proxy_list
     686 + default['proxy_list'] = available_proxies[0]
    682 687   
    683 688   
    684 689   # Don't use form.data on POST so that it doesn't override the checkbox status from the POST status
    skipped 8 lines
    693 698   # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead
    694 699   del form.requests.form.proxy
    695 700   else:
    696  - form.requests.form.proxy.choices = datastore.proxy_list
     701 + form.requests.form.proxy.choices = []
     702 + for p in datastore.proxy_list:
     703 + form.requests.form.proxy.choices.append(tuple((p, datastore.proxy_list[p]['label'])))
     704 + 
    697 705   
    698 706   if request.method == 'POST':
    699 707   # Password unset is a GET, but we can lock the session to a salted env password to always need the password
    skipped 834 lines
    1534 1542   import random
    1535 1543   from changedetectionio import update_worker
    1536 1544   
     1545 + proxy_last_called_time = {}
     1546 + 
    1537 1547   recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20))
    1538 1548   print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds)
    1539 1549   
    skipped 54 lines
    1594 1604   if watch.jitter_seconds == 0:
    1595 1605   watch.jitter_seconds = random.uniform(-abs(jitter), jitter)
    1596 1606   
    1597  - 
    1598 1607   seconds_since_last_recheck = now - watch['last_checked']
     1608 + 
    1599 1609   if seconds_since_last_recheck >= (threshold + watch.jitter_seconds) and seconds_since_last_recheck >= recheck_time_minimum_seconds:
    1600 1610   if not uuid in running_uuids and uuid not in [q_uuid for p,q_uuid in update_q.queue]:
     1611 + 
     1612 + # Proxies can be set to have a limit on seconds between which they can be called
     1613 + watch_proxy = datastore.get_preferred_proxy_for_watch(uuid=uuid)
     1614 + if watch_proxy and watch_proxy in list(datastore.proxy_list.keys()):
     1615 + # Proxy may also have some threshold minimum
     1616 + proxy_list_reuse_time_minimum = int(datastore.proxy_list.get(watch_proxy, {}).get('reuse_time_minimum', 0))
     1617 + if proxy_list_reuse_time_minimum:
     1618 + proxy_last_used_time = proxy_last_called_time.get(watch_proxy, 0)
     1619 + time_since_proxy_used = int(time.time() - proxy_last_used_time)
     1620 + if time_since_proxy_used < proxy_list_reuse_time_minimum:
     1621 + # Not enough time difference reached, skip this watch
     1622 + print("> Skipped UUID {} using proxy '{}', not enough time between proxy requests {}s/{}s".format(uuid,
     1623 + watch_proxy,
     1624 + time_since_proxy_used,
     1625 + proxy_list_reuse_time_minimum))
     1626 + continue
     1627 + else:
     1628 + # Record the last used time
     1629 + proxy_last_called_time[watch_proxy] = int(time.time())
     1630 + 
    1601 1631   # Use Epoch time as priority, so we get a "sorted" PriorityQueue, but we can still push a priority 1 into it.
    1602 1632   priority = int(time.time())
    1603 1633   print(
    skipped 19 lines
  • ■ ■ ■ ■ ■
    changedetectionio/content_fetcher.py
    skipped 317 lines
    318 318   import playwright._impl._api_types
    319 319   from playwright._impl._api_types import Error, TimeoutError
    320 320   response = None
     321 + 
    321 322   with sync_playwright() as p:
    322 323   browser_type = getattr(p, self.browser_type)
    323 324   
    skipped 51 lines
    375 376   print("response object was none")
    376 377   raise EmptyReply(url=url, status_code=None)
    377 378   
    378  - # Bug 2(?) Set the viewport size AFTER loading the page
    379  - page.set_viewport_size({"width": 1280, "height": 1024})
     379 + 
     380 + # Removed browser-set-size, seemed to be needed to make screenshots work reliably in older playwright versions
     381 + # Was causing exceptions like 'waiting for page but content is changing' etc
     382 + # https://www.browserstack.com/docs/automate/playwright/change-browser-window-size 1280x720 should be the default
     383 +
    380 384   extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
    381 385   time.sleep(extra_wait)
    382 386   
    skipped 16 lines
    399 403   pass
    400 404   
    401 405   raise JSActionExceptions(status_code=response.status, screenshot=error_screenshot, message=str(e), url=url)
     406 + 
     407 + page.wait_for_timeout(500)
    402 408   
    403 409   self.content = page.content()
    404 410   self.raw_content = page.content()
    skipped 223 lines
  • ■ ■ ■ ■ ■
    changedetectionio/fetch_processor/json_html_plaintext.py
    skipped 13 lines
    14 14  # Some common stuff here that can be moved to a base class
    15 15  # (set_proxy_from_list)
    16 16  class perform_site_check(fetch_processor):
     17 + screenshot = None
     18 + xpath_data = None
    17 19   
    18  - xpath_data = None
     20 + def __init__(self, *args, datastore, **kwargs):
     21 + super().__init__(*args, **kwargs)
     22 + self.datastore = datastore
    19 23   
    20 24   # Doesn't look like python supports forward slash auto enclosure in re.findall
    21 25   # So convert it to inline flag "foobar(?i)" type configuration
    skipped 15 lines
    37 41   stripped_text_from_html = ""
    38 42   
    39 43   watch = self.datastore.data['watching'].get(uuid)
     44 + if not watch:
     45 + return
    40 46   
    41 47   # Protect against file:// access
    42 48   if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
    skipped 16 lines
    59 65   if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
    60 66   request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
    61 67   
    62  - timeout = self.datastore.data['settings']['requests']['timeout']
     68 + timeout = self.datastore.data['settings']['requests'].get('timeout')
    63 69   url = watch.get('url')
    64 70   request_body = self.datastore.data['watching'][uuid].get('body')
    65 71   request_method = self.datastore.data['watching'][uuid].get('method')
    skipped 13 lines
    79 85   # If the klass doesn't exist, just use a default
    80 86   klass = getattr(content_fetcher, "html_requests")
    81 87   
     88 + proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid)
     89 + proxy_url = None
     90 + if proxy_id:
     91 + proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
     92 + print ("UUID {} Using proxy {}".format(uuid, proxy_url))
    82 93   
    83  - proxy_args = self.set_proxy_from_list(watch)
    84  - fetcher = klass(proxy_override=proxy_args)
     94 + fetcher = klass(proxy_override=proxy_url)
    85 95   
    86 96   # Configurable per-watch or global extra delay before extracting text (for webDriver types)
    87 97   system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
    skipped 203 lines
  • ■ ■ ■ ■ ■ ■
    changedetectionio/run_all_tests.sh
    skipped 48 lines
    49 49   
    50 50  unset PLAYWRIGHT_DRIVER_URL
    51 51  docker kill $$-test_browserless
     52 + 
     53 +# Test proxy list handling, starting two squids on different ports
     54 +# Each squid adds a different header to the response, which is the main thing we test for.
     55 +docker run -d --name $$-squid-one --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf -p 3128:3128 ubuntu/squid:4.13-21.10_edge
     56 +docker run -d --name $$-squid-two --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf -p 3129:3128 ubuntu/squid:4.13-21.10_edge
     57 + 
     58 + 
     59 +# So, basic HTTP as env var test
     60 +export HTTP_PROXY=http://localhost:3128
     61 +export HTTPS_PROXY=http://localhost:3128
     62 +pytest tests/proxy_list/test_proxy.py
     63 +docker logs $$-squid-one 2>/dev/null|grep one.changedetection.io
     64 +if [ $? -ne 0 ]
     65 +then
     66 + echo "Did not see a request to one.changedetection.io in the squid logs (while checking env vars HTTP_PROXY/HTTPS_PROXY)"
     67 +fi
     68 +unset HTTP_PROXY
     69 +unset HTTPS_PROXY
     70 + 
     71 + 
     72 +# 2nd test actually choose the preferred proxy from proxies.json
     73 +cp tests/proxy_list/proxies.json-example ./test-datastore/proxies.json
     74 +# Makes a watch use a preferred proxy
     75 +pytest tests/proxy_list/test_multiple_proxy.py
     76 + 
     77 +# Should be a request in the default "first" squid
     78 +docker logs $$-squid-one 2>/dev/null|grep chosen.changedetection.io
     79 +if [ $? -ne 0 ]
     80 +then
     81 + echo "Did not see a request to chosen.changedetection.io in the squid logs (while checking preferred proxy)"
     82 +fi
     83 + 
     84 +# And one in the 'second' squid (user selects this as preferred)
     85 +docker logs $$-squid-two 2>/dev/null|grep chosen.changedetection.io
     86 +if [ $? -ne 0 ]
     87 +then
     88 + echo "Did not see a request to chosen.changedetection.io in the squid logs (while checking preferred proxy)"
     89 +fi
     90 + 
     91 +# @todo - test system override proxy selection and watch defaults, setup a 3rd squid?
     92 +docker kill $$-squid-one
     93 +docker kill $$-squid-two
     94 + 
     95 + 
     96 + 
  • ■ ■ ■ ■ ■ ■
    changedetectionio/store.py
    skipped 112 lines
    113 113   self.__data['settings']['application']['api_access_token'] = secret
    114 114   
    115 115   # Proxy list support - available as a selection in settings when text file is imported
    116  - # CSV list
    117  - # "name, address", or just "name"
    118  - proxy_list_file = "{}/proxies.txt".format(self.datastore_path)
     116 + proxy_list_file = "{}/proxies.json".format(self.datastore_path)
    119 117   if path.isfile(proxy_list_file):
    120 118   self.import_proxy_list(proxy_list_file)
    121 119   
    skipped 315 lines
    437 435   unlink(item)
    438 436   
    439 437   def import_proxy_list(self, filename):
    440  - import csv
    441  - with open(filename, newline='') as f:
    442  - reader = csv.reader(f, skipinitialspace=True)
    443  - # @todo This loop can could be improved
    444  - l = []
    445  - for row in reader:
    446  - if len(row):
    447  - if len(row)>=2:
    448  - l.append(tuple(row[:2]))
    449  - else:
    450  - l.append(tuple([row[0], row[0]]))
    451  - self.proxy_list = l if len(l) else None
     438 + with open(filename) as f:
     439 + self.proxy_list = json.load(f)
     440 + print ("Registered proxy list", list(self.proxy_list.keys()))
     441 + 
     442 + 
     443 + def get_preferred_proxy_for_watch(self, uuid):
     444 + """
     445 + Returns the preferred proxy by ID key
     446 + :param uuid: UUID
     447 + :return: proxy "key" id
     448 + """
    452 449   
     450 + proxy_id = None
     451 + if self.proxy_list is None:
     452 + return None
     453 + 
     454 + # If its a valid one
     455 + watch = self.data['watching'].get(uuid)
     456 + 
     457 + if watch.get('proxy') and watch.get('proxy') in list(self.proxy_list.keys()):
     458 + return watch.get('proxy')
     459 + 
     460 + # not valid (including None), try the system one
     461 + else:
     462 + system_proxy_id = self.data['settings']['requests'].get('proxy')
     463 + # Is not None and exists
     464 + if self.proxy_list.get(system_proxy_id):
     465 + return system_proxy_id
     466 + 
     467 + # Fallback - Did not resolve anything, use the first available
     468 + if system_proxy_id is None:
     469 + first_default = list(self.proxy_list)[0]
     470 + return first_default
     471 + 
     472 + return None
    453 473   
    454 474   # Run all updates
    455 475   # IMPORTANT - Each update could be run even when they have a new install and the schema is correct
    skipped 105 lines
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/proxy_list/__init__.py
     1 +"""Tests for the app."""
     2 + 
     3 + 
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/proxy_list/conftest.py
     1 +#!/usr/bin/python3
     2 + 
     3 +from .. import conftest
     4 + 
     5 +#def pytest_addoption(parser):
     6 +# parser.addoption("--url_suffix", action="store", default="identifier for request")
     7 + 
     8 + 
     9 +#def pytest_generate_tests(metafunc):
     10 +# # This is called for every test. Only get/set command line arguments
     11 +# # if the argument is specified in the list of test "fixturenames".
     12 +# option_value = metafunc.config.option.url_suffix
     13 +# if 'url_suffix' in metafunc.fixturenames and option_value is not None:
     14 +# metafunc.parametrize("url_suffix", [option_value])
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/proxy_list/proxies.json-example
     1 +{
     2 + "proxy-one": {
     3 + "label": "One",
     4 + "url": "http://127.0.0.1:3128"
     5 + },
     6 + "proxy-two": {
     7 + "label": "two",
     8 + "url": "http://127.0.0.1:3129"
     9 + }
     10 +}
     11 + 
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/proxy_list/squid.conf
     1 +acl localnet src 0.0.0.1-0.255.255.255 # RFC 1122 "this" network (LAN)
     2 +acl localnet src 10.0.0.0/8 # RFC 1918 local private network (LAN)
     3 +acl localnet src 100.64.0.0/10 # RFC 6598 shared address space (CGN)
     4 +acl localnet src 169.254.0.0/16 # RFC 3927 link-local (directly plugged) machines
     5 +acl localnet src 172.16.0.0/12 # RFC 1918 local private network (LAN)
     6 +acl localnet src 192.168.0.0/16 # RFC 1918 local private network (LAN)
     7 +acl localnet src fc00::/7 # RFC 4193 local private network range
     8 +acl localnet src fe80::/10 # RFC 4291 link-local (directly plugged) machines
     9 +acl localnet src 159.65.224.174
     10 +acl SSL_ports port 443
     11 +acl Safe_ports port 80 # http
     12 +acl Safe_ports port 21 # ftp
     13 +acl Safe_ports port 443 # https
     14 +acl Safe_ports port 70 # gopher
     15 +acl Safe_ports port 210 # wais
     16 +acl Safe_ports port 1025-65535 # unregistered ports
     17 +acl Safe_ports port 280 # http-mgmt
     18 +acl Safe_ports port 488 # gss-http
     19 +acl Safe_ports port 591 # filemaker
     20 +acl Safe_ports port 777 # multiling http
     21 +acl CONNECT method CONNECT
     22 + 
     23 +http_access deny !Safe_ports
     24 +http_access deny CONNECT !SSL_ports
     25 +http_access allow localhost manager
     26 +http_access deny manager
     27 +http_access allow localhost
     28 +http_access allow localnet
     29 +http_access deny all
     30 +http_port 3128
     31 +coredump_dir /var/spool/squid
     32 +refresh_pattern ^ftp: 1440 20% 10080
     33 +refresh_pattern ^gopher: 1440 0% 1440
     34 +refresh_pattern -i (/cgi-bin/|\?) 0 0% 0
     35 +refresh_pattern \/(Packages|Sources)(|\.bz2|\.gz|\.xz)$ 0 0% 0 refresh-ims
     36 +refresh_pattern \/Release(|\.gpg)$ 0 0% 0 refresh-ims
     37 +refresh_pattern \/InRelease$ 0 0% 0 refresh-ims
     38 +refresh_pattern \/(Translation-.*)(|\.bz2|\.gz|\.xz)$ 0 0% 0 refresh-ims
     39 +refresh_pattern . 0 20% 4320
     40 +logfile_rotate 0
     41 + 
     42 + 
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/proxy_list/test_multiple_proxy.py
     1 +#!/usr/bin/python3
     2 + 
     3 +import time
     4 +from flask import url_for
     5 +from ..util import live_server_setup
     6 + 
     7 +def test_preferred_proxy(client, live_server):
     8 + time.sleep(1)
     9 + live_server_setup(live_server)
     10 + time.sleep(1)
     11 + url = "http://chosen.changedetection.io"
     12 + 
     13 + res = client.post(
     14 + url_for("import_page"),
     15 + # Because a URL won't show in squid/proxy logs due to it being SSL-encrypted
     16 + # Use plain HTTP or a specific domain-name here
     17 + data={"urls": url},
     18 + follow_redirects=True
     19 + )
     20 + 
     21 + assert b"1 Imported" in res.data
     22 + 
     23 + time.sleep(2)
     24 + res = client.post(
     25 + url_for("edit_page", uuid="first"),
     26 + data={
     27 + "css_filter": "",
     28 + "fetch_backend": "html_requests",
     29 + "headers": "",
     30 + "proxy": "proxy-two",
     31 + "tag": "",
     32 + "url": url,
     33 + },
     34 + follow_redirects=True
     35 + )
     36 + assert b"Updated watch." in res.data
     37 + time.sleep(2)
     38 + # Now the request should appear in the second-squid logs
     39 + 
  • ■ ■ ■ ■ ■ ■
    changedetectionio/tests/proxy_list/test_proxy.py
     1 +#!/usr/bin/python3
     2 + 
     3 +import time
     4 +from flask import url_for
     5 +from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
     6 + 
     7 +# just make a request, we will grep in the docker logs to see it actually got called
     8 +def test_check_basic_change_detection_functionality(client, live_server):
     9 + live_server_setup(live_server)
     10 + res = client.post(
     11 + url_for("import_page"),
     12 + # Because a URL won't show in squid/proxy logs due to it being SSL-encrypted
     13 + # Use plain HTTP or a specific domain-name here
     14 + data={"urls": "http://one.changedetection.io"},
     15 + follow_redirects=True
     16 + )
     17 + 
     18 + assert b"1 Imported" in res.data
     19 + time.sleep(3)
     20 + 
  • ■ ■ ■ ■ ■ ■
    docker-compose.yml
    skipped 5 lines
    6 6   hostname: changedetection
    7 7   volumes:
    8 8   - changedetection-data:/datastore
     9 +# Configurable proxy list support, see https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#proxy-list-support
     10 +# - ./proxies.json:/datastore/proxies.json
    9 11   
    10 12   # environment:
    11 13   # Default listening port, can also be changed with the -p option
    skipped 80 lines
Please wait...
Page is in error, reload to recover