| 1 | + | import re |
| 2 | + | import requests |
| 3 | + | |
| 4 | + | try: |
| 5 | + | from urllib.parse import quote_plus as url_encode |
| 6 | + | except ImportError: |
| 7 | + | from urllib import quote_plus as url_encode |
| 8 | + | |
def decode_html(string):
    """Decode common HTML/XML entities in *string* and return the result.

    Two replacement passes are made (forward, then reversed order) so that
    double-encoded entities such as ``&amp;quot;`` are fully decoded.
    """
    new_string = string
    decoded = ['>', '<', '"', '&', '\'']
    # BUG FIX: this list previously held the already-decoded characters
    # (identical to `decoded`), so every replace() was a no-op; these are
    # the actual entity strings the page contains.
    encoded = ['&gt;', '&lt;', '&quot;', '&amp;', '&#39;']
    for e, d in zip(encoded, decoded):
        new_string = new_string.replace(e, d)
    # Reversed second pass unwraps double-encoded entities, e.g.
    # '&amp;gt;' -> '&gt;' (first pass) -> '>' (this pass).
    for e, d in zip(encoded[::-1], decoded[::-1]):
        new_string = new_string.replace(e, d)
    return new_string
| 19 | + | |
def parse(string):
    """Extract result links from a decoded Google results page.

    Returns a dict mapping the result index (0-based) to a record with
    keys 'url', 'text' and 'summary'. Only the URL is currently captured;
    'text' and 'summary' are always empty strings.
    """
    link_pattern = r'''<div class="[^"]+"><a href="/url\?q=(.+?)&sa=[^"]+">'''
    return {
        index: {'url': hit.group(1), 'text': '', 'summary': ''}
        for index, hit in enumerate(re.finditer(link_pattern, string))
    }
| 36 | + | |
def search(query, cookie, page=0, full=False):
    """Search Google through the Facebook developers debug-echo endpoint.

    The Google search URL is built first, then URL-encoded again so the
    whole thing survives as the ``q=`` parameter of the echo URL.

    Args:
        query  - search string
        cookie - facebook cookie (sent verbatim in the Cookie header)
        page   - search result page number, 10 results per page (optional)
        full   - if True, disable Google's duplicate-result filter (optional)

    Returns:
        dict of parsed results, as produced by parse().
    """
    offset = page * 10
    # Google's "filter" parameter: 1 collapses near-duplicate results.
    # Held in `dup_filter` so the builtin filter() is not shadowed.
    dup_filter = 0 if full else 1
    escaped = url_encode('https://google.com/search?q=%s&start=%i&filter=%i'
                         % (url_encode(query), offset, dup_filter))
    headers = {
        'Host': 'developers.facebook.com',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'deflate',
        'Connection': 'keep-alive',
        'Cookie': cookie,
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
        'TE': 'Trailers'
    }
    # BUG FIX: requests.get() has no default timeout, so a stalled
    # connection would hang this call forever; bound it explicitly.
    response = requests.get(
        'https://developers.facebook.com/tools/debug/echo/?q=%s' % escaped,
        headers=headers, timeout=30)
    cleaned_response = decode_html(response.text)
    return parse(cleaned_response)
| 64 | + | |