■ ■ ■ ■ ■ ■
changedetectionio/fetch_processor/rendered_webpage.py
1 | | - | import hashlib |
2 | | - | import imagehash |
3 | | - | from PIL import Image |
4 | | - | import io |
5 | | - | import logging |
6 | | - | import os |
7 | | - | import re |
8 | | - | import time |
9 | | - | import urllib3 |
10 | | - | |
11 | | - | # fetch processor for requesting and comparing a single image |
12 | | - | # can use both requests and playwright/selenium |
13 | | - | |
14 | | - | # - imagehash for change detection (or https://github.com/dgtlmoon/changedetection.io/pull/419/files#diff-7d3854710a6c0faead783f75850100a4c4b69409309200d3a83692dc9783bf6eR17 ?) |
15 | | - | # - skimage.metrics import structural_similarity for viewing the diff |
16 | | - | |
17 | | - | |
18 | | - | from changedetectionio import content_fetcher, html_tools |
19 | | - | |
20 | | - | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) |
21 | | - | |
22 | | - | from . import fetch_processor |
23 | | - | |
24 | | - | |
25 | | - | # Some common stuff here that can be moved to a base class |
26 | | - | # (set_proxy_from_list) |
class perform_site_check(fetch_processor):
    """Fetch processor that detects changes by hashing a rendered screenshot.

    The watch URL is loaded through the configured content fetcher, the
    resulting screenshot is opened with PIL and reduced to an
    ``imagehash.average_hash`` string, and that string is compared with the
    value stored under the watch's ``previous_md5`` key.
    """

    xpath_data = None

    def run(self, uuid):
        """Fetch the watch identified by *uuid* and compare its screenshot hash.

        Args:
            uuid: Key of the watch inside ``self.datastore.data['watching']``.

        Returns:
            A ``(changed_detected, update_obj)`` tuple. ``changed_detected``
            is True when the freshly computed hash differs from the stored
            ``previous_md5`` value (which includes the case where nothing is
            stored yet). ``update_obj`` is a dict of fields for the caller to
            write back to the watch.

        Raises:
            Exception: If the URL starts with ``file`` and ``ALLOW_FILE_URI``
                is not set, or if the watch's ``fetch_backend`` is not
                ``html_webdriver``.
        """
        changed_detected = False

        watch = self.datastore.data['watching'].get(uuid)

        # Protect against file:// access
        if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
            raise Exception(
                "file:// type access is denied for security reasons."
            )

        if watch.get('fetch_backend') != 'html_webdriver':
            raise Exception(
                "Requires a Chrome compatible fetcher enabled."
            )

        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False}

        # A watch without per-watch headers yields None here; fall back to an
        # empty mapping so the merge below cannot raise TypeError.
        extra_headers = watch.get('headers') or {}

        # Tweak the base config with the per-watch ones
        request_headers = self.datastore.data['settings']['headers'].copy()
        request_headers.update(extra_headers)

        # https://github.com/psf/requests/issues/4525
        # Requests doesn't yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
        # do this by accident.
        if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
            request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')

        timeout = self.datastore.data['settings']['requests']['timeout']
        url = watch.get('url')
        request_body = watch.get('body')
        request_method = watch.get('method')
        ignore_status_codes = watch.get('ignore_status_codes', False)

        prefer_backend = watch['fetch_backend']
        if hasattr(content_fetcher, prefer_backend):
            klass = getattr(content_fetcher, prefer_backend)
        else:
            # If the klass doesn't exist, just use a default
            klass = getattr(content_fetcher, "html_requests")

        proxy_args = self.set_proxy_from_list(watch)
        fetcher = klass(proxy_override=proxy_args)

        # Always release the fetcher, even when the fetch itself raises,
        # so a failed check does not leave it un-quit.
        try:
            fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes)
        finally:
            fetcher.quit()

        # if not image/foobar in mimetype
        # raise content_fetcher.NotAnImage(mimetype) ?
        # or better to try load with PIL and catch exception?

        update_obj["last_check_status"] = fetcher.get_last_status_code()

        self.contents = fetcher.screenshot

        image = Image.open(io.BytesIO(fetcher.screenshot))

        # @todo different choice?
        # https://github.com/JohannesBuchner/imagehash#references
        fetched_hash = str(imagehash.average_hash(image))

        # The main thing that all this at the moment comes down to :)
        # .get() so a watch that has never stored a hash compares against None
        # instead of raising KeyError.
        if watch.get('previous_md5') != fetched_hash:
            changed_detected = True

        # Always record the new checksum (the key is named 'previous_md5' but
        # holds the image hash string computed above).
        update_obj["previous_md5"] = fetched_hash

        # On the first run of a site, watch['previous_md5'] will be None, set it the current one.
        if not watch.get('previous_md5'):
            watch['previous_md5'] = fetched_hash

        return changed_detected, update_obj
106 | | - | |