| skipped 20 lines |
21 | 21 | | |
22 | 22 | | __version__ = '1.1' |
23 | 23 | | |
24 | | - | import argparse as ap, datetime as dt, numpy as np, numpy.random as npr, os, psutil, random, requests, signal, sys, tarfile, time |
| 24 | + | import argparse as ap, datetime as dt, importlib, numpy as np, numpy.random as npr, os, psutil, random, requests, signal, sys, tarfile, time, warnings as warn |
25 | 25 | | import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs |
26 | 26 | | from selenium import webdriver |
27 | 27 | | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities |
28 | 28 | | from selenium.webdriver.support.ui import WebDriverWait |
29 | 29 | | from io import BytesIO |
30 | 30 | | from faker import Factory |
| 31 | + | |
# Warn at import time when pyopenssl is missing: without it, `requests` may
# lack SNI support and fail against TLS-protected hosted domains.
# https://stackoverflow.com/questions/18578439/using-requests-with-tls-doesnt-give-sni-support/18579484#18579484
# `import importlib` alone does not guarantee that the `util` submodule is
# loaded, so import it explicitly rather than relying on another package
# having pulled it in as a side effect.
import importlib.util

if importlib.util.find_spec('OpenSSL') is None:
    msg = 'Use the pyopenssl package to enable SNI support for TLS-protected hosted domains.'
    # Emit on stdout as well as through the warnings machinery, so the hint
    # is still visible when warnings are filtered.
    print(msg)
    warn.warn(msg)
31 | 38 | | |
32 | 39 | | # headless Raspberry Pi |
33 | 40 | | try: |
| skipped 179 lines |
213 | 220 | | self.session.capabilities["driverVersion"])) |
214 | 221 | | phantomjs_version = tuple(int(i) for i in self.session.capabilities["version"].split('.')) |
215 | 222 | | if phantomjs_version < recommended_version: |
216 | | - | print("""{} version is {}; |
| 223 | + | warn.warn("""{} version is {}; |
217 | 224 | | please upgrade to at least version {} from http://phantomjs.org. |
218 | 225 | | """.format(self.session.capabilities["browserName"],self.session.capabilities["version"], |
219 | 226 | | '.'.join(str(i) for i in recommended_version))) |
| skipped 376 lines |
596 | 603 | | a fraction of the time. """ |
597 | 604 | | url = None |
598 | 605 | | if hasattr(self,'current_preferred_domain') and npr.uniform() < current_preferred_domain_fraction: |
599 | | - | while url is not None: # loop until `self.current_preferred_domain` has a url |
| 606 | + | while url is not None and len(self.domain_links) > 0: # loop until `self.current_preferred_domain` has a url |
600 | 607 | | url = self.draw_link_from_domain(self.current_preferred_domain) |
601 | 608 | | if url is None: self.current_preferred_domain = self.draw_domain() |
602 | 609 | | if url is None: url = self.draw_link() |
| skipped 286 lines |