| skipped 14 lines |
15 | 15 | | along with this program. If not, see <http://www.gnu.org/licenses/>. |
16 | 16 | | """ |
17 | 17 | | |
18 | | - | import re |
19 | | - | import os |
20 | 18 | | |
21 | 19 | | class main: |
22 | 20 | | # framework = None |
def __init__(self, q, limit=1, count=10):
    """ google.com search engine

        q     : Query for search
        limit : Number of pages (stored internally as ``limit + 1``)
        count : Number of results (stored as ``self.count``)
    """
    # The framework handle is read from the class attribute, which is
    # expected to be assigned on ``main`` before instantiation.
    self.framework = main.framework
    self.q = q
    self.agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'
    self.url = 'https://www.google.com/search'
    # Accumulated raw text of every fetched page.
    self._pages = ''
    self.limit = limit + 1
    self.count = count

    # Symbolic name -> XPath expression for each part of a result entry.
    names = {
        'results': '//div[@class="g"]',
        'results_content': './/div[@class="IsZvec"]',
        'results_title': './/h3[1]',
        'results_a': './/div[@class="yuRUbf"]/a',
        'results_cite': './/div[@class="yuRUbf"]/a//cite'
    }
    self.xpath_name = names

    # Result-container XPath -> list of relative XPaths looked up under it.
    child_keys = ('results_content', 'results_title', 'results_a', 'results_cite')
    self.xpath = {names['results']: [names[key] for key in child_keys]}
40 | 50 | | |
def run_crawl(self):
    """Fetch Google result pages and append their text to ``self._pages``.

    Pages are requested from ``self.url`` starting at page 1, with
    ``self.count`` passed as ``num`` and the ``start`` offset advanced by
    ``self.count`` per page. The loop ends when:

    * the page counter reaches ``self.limit``,
    * the response status is 503 or 429 (reported as a CAPTCHA), or
    * the request raises (reported through ``self.framework.error``).

    A 301/302 response is followed once, using its ``location`` header.
    Returns ``None``; the collected text is available in ``self._pages``.
    """
    page = 1
    payload = {
        'num': self.count,
        'start': (page - 1) * self.count,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'q': self.q,
        'filter': '0',
    }
    while True:
        self.framework.verbose(f"[GOOGLE] Searching in {page} page...", end='\r')
        try:
            req = self.framework.request(
                url=self.url,
                params=payload,
                headers={'user-agent': self.agent},
                allow_redirects=True)
        except Exception as e:
            self.framework.error(f"ConnectionError: {e}", 'util/google', 'run_crawl')
            # Stop here: neither the page counter nor the payload changed,
            # so looping again would repeat the same failing request forever.
            break

        if req.status_code in (503, 429):
            self.framework.error('Google CAPTCHA triggered.', 'util/google', 'run_crawl')
            break

        if req.status_code in (301, 302):
            redirect = req.headers['location']
            req = self.framework.request(url=redirect, allow_redirects=False)

        self._pages += req.text
        page += 1
        payload['start'] = (page - 1) * self.count
        if page >= self.limit:
            break
@property
def results(self):
    """Return the parsed search results as a list of dicts.

    Each dict has the keys ``'title'``, ``'a'`` (the anchor's ``href``),
    ``'cite'`` and ``'content'``. One entry is produced per anchor node.
    If the title, cite or content list is shorter than the anchor list,
    the missing field is an empty string rather than an ``IndexError``.

    Returns an empty list when the parser yields nothing for ``self.xpath``.
    """
    parser = self.framework.page_parse(self._pages)
    xpath_results = parser.html_fromstring(self.xpath)
    if not xpath_results:
        return []

    names = self.xpath_name
    # NOTE(review): assumes html_fromstring returns a mapping keyed by the
    # container XPath, whose value maps each child XPath to a list of nodes
    # (inferred from the indexing below) - confirm against page_parse.
    root = xpath_results[names['results']]
    anchors = root[names['results_a']]
    if not anchors:
        return []

    titles = root[names['results_title']]
    cites = root[names['results_cite']]
    contents = root[names['results_content']]

    def text_at(nodes, index):
        # Entries are driven by the anchor list; tolerate shorter siblings.
        return nodes[index].text_content() if index < len(nodes) else ''

    return [
        {
            'title': text_at(titles, i),
            'a': anchor.get('href'),
            'cite': text_at(cites, i),
            'content': text_at(contents, i),
        }
        for i, anchor in enumerate(anchors)
    ]
110 | 97 | | |
111 | 98 | | @property |
112 | 99 | | def pages(self): |
| skipped 1 lines |
114 | 101 | | |
@property
def links(self):
    """Return the ``'a'`` value of every entry in ``self.results``, in order."""
    return [entry['a'] for entry in self.results]
118 | 106 | | |
@property
def dns(self):
    """Return ``get_dns(self.q, self.links)`` from a parser built over the fetched pages."""
    page_parser = self.framework.page_parse(self._pages)
    return page_parser.get_dns(self.q, self.links)
122 | 110 | | |
123 | 111 | | @property |
124 | 112 | | def emails(self): |
| skipped 6 lines |