diff --git a/tests/test_cli.py b/tests/test_cli.py
index b7c1700..dc4768f 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -23,7 +23,6 @@ def test_save():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -42,7 +41,6 @@ def test_save():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -63,7 +61,6 @@ def test_json():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -84,7 +81,6 @@ def test_archive_url():
         archive_url=True,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -105,7 +101,6 @@ def test_oldest():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -128,7 +123,6 @@ def test_oldest():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -150,7 +144,6 @@ def test_newest():
         archive_url=False,
         newest=True,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -173,7 +166,6 @@ def test_newest():
         archive_url=False,
         newest=True,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -195,7 +187,6 @@ def test_total_archives():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -217,7 +208,6 @@ def test_known_urls():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=True,
         get=None,
@@ -237,7 +227,6 @@ def test_known_urls():
         archive_url=False,
         newest=False,
         near=False,
-        alive=True,
         subdomain=True,
         known_urls=True,
         get=None,
@@ -259,7 +248,6 @@ def test_near():
         archive_url=False,
         newest=False,
         near=True,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -287,7 +275,6 @@ def test_near():
         archive_url=False,
         newest=False,
         near=True,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -314,7 +301,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="url",
@@ -334,7 +320,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="oldest",
@@ -354,7 +339,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="newest",
@@ -374,7 +358,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="save",
@@ -394,7 +377,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="foobar",
diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py
index 54a6b4a..6950f82 100644
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@@ -128,8 +128,5 @@ def test_total_archives():
 
 
 def test_known_urls():
-    target = Url("akamhy.github.io", user_agent)
-    assert len(target.known_urls(alive=True, subdomain=False)) > 2
-
     target = Url("akamhy.github.io", user_agent)
     assert len(target.known_urls()) > 3
diff --git a/waybackpy/cli.py b/waybackpy/cli.py
index 370f586..c043b23 100644
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -120,11 +120,7 @@ def _known_urls(obj, args):
     if args.subdomain:
         subdomain = True
 
-    alive = False
-    if args.alive:
-        alive = True
-
-    url_list = obj.known_urls(alive=alive, subdomain=subdomain)
+    url_list = obj.known_urls(subdomain=subdomain)
     total_urls = len(url_list)
 
     if total_urls > 0:
@@ -267,8 +263,6 @@ def add_knownUrlArg(knownUrlArg):
     )
     help_text = "Use with '--known_urls' to include known URLs for subdomains."
     knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
-    help_text = "Only include live URLs. Will not inlclude dead links."
-    knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)
 
 
 def add_nearArg(nearArg):
diff --git a/waybackpy/utils.py b/waybackpy/utils.py
index ac7102d..fa217d9 100644
--- a/waybackpy/utils.py
+++ b/waybackpy/utils.py
@@ -11,6 +11,12 @@ quote = requests.utils.quote
 default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
 
 
+def _latest_version(package_name, headers):
+    endpoint = "https://pypi.org/pypi/" + package_name + "/json"
+    json = _get_response(endpoint, headers=headers).json()
+    return json["info"]["version"]
+
+
 def _unix_ts_to_wayback_ts(unix_ts):
     return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
 
@@ -183,7 +189,7 @@ def _get_total_pages(url, user_agent):
     return int((_get_response(total_pages_url, headers=headers).text).strip())
 
 
-def _archive_url_parser(header, url):
+def _archive_url_parser(header, url, latest_version=__version__):
     """
     The wayback machine's save API doesn't
     return JSON response, we are required
@@ -226,15 +232,25 @@
     if m:
         return m.group(1)
 
-    raise WaybackError(
-        "No archive URL found in the API response. "
-        "If '{url}' can be accessed via your web browser then either "
-        "this version of waybackpy ({version}) is out of date or WayBack Machine is malfunctioning. Visit "
-        "'https://github.com/akamhy/waybackpy' for the latest version "
-        "of waybackpy.\nHeader:\n{header}".format(
-            url=url, version=__version__, header=header
-        )
-    )
+    if __version__ == latest_version:
+        exc_message = (
+            "No archive URL found in the API response. "
+            "If '{url}' can be accessed via your web browser then either "
+            "Wayback Machine is malfunctioning or it refused to archive your URL."
+            "\nHeader:\n{header}".format(url=url, header=header)
+        )
+    else:
+        exc_message = (
+            "No archive URL found in the API response. "
+            "If '{url}' can be accessed via your web browser then either "
+            "this version of waybackpy ({version}) is out of date or WayBack "
+            "Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' "
+            "for the latest version of waybackpy.\nHeader:\n{header}".format(
+                url=url, version=__version__, header=header
+            )
+        )
+
+    raise WaybackError(exc_message)
 
 
 def _wayback_timestamp(**kwargs):
@@ -292,20 +308,27 @@ def _get_response(
 
     # From https://stackoverflow.com/a/35504626
     # By https://stackoverflow.com/users/401467/datashaman
+
     s = requests.Session()
+
     retries = Retry(
         total=retries,
         backoff_factor=backoff_factor,
         status_forcelist=[500, 502, 503, 504],
     )
+
     s.mount("https://", HTTPAdapter(max_retries=retries))
+
     url = _full_url(endpoint, params)
+
     try:
         if not return_full_url:
             return s.get(url, headers=headers)
         return (url, s.get(url, headers=headers))
     except Exception as e:
-        exc_message = "Error while retrieving {url}".format(url=url)
+        exc_message = "Error while retrieving {url}.\n{reason}".format(
+            url=url, reason=str(e)
+        )
         exc = WaybackError(exc_message)
         exc.__cause__ = e
         raise exc
diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index d11ebb8..22f42ea 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -12,6 +12,7 @@ from .utils import (
     _cleaned_url,
     _ts,
     _unix_ts_to_wayback_ts,
+    _latest_version,
 )
 
 
@@ -23,7 +24,7 @@ class Url:
         self._archive_url = None
         self.timestamp = None
         self._JSON = None
-        self._alive_url_list = []
+        self.latest_version = None
 
     def __repr__(self):
         return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
@@ -141,7 +142,11 @@
         response = _get_response(
             request_url, params=None, headers=headers, backoff_factor=2
         )
-        self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
+        if not self.latest_version:
+            self.latest_version = _latest_version("waybackpy", headers=headers)
+        self._archive_url = "https://" + _archive_url_parser(
+            response.headers, self.url, self.latest_version
+        )
         self.timestamp = datetime.utcnow()
         return self
 
@@ -291,26 +296,7 @@
             i = i + 1
         return i
 
-    def live_urls_finder(self, url):
-        """
-        This method is used to check if supplied url
-        is >= 400.
-        """
-
-        try:
-            response_code = requests.get(url).status_code
-        except Exception:
-            return  # we don't care if Exception
-
-        # 200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300
-        if response_code >= 400:
-            return
-
-        self._alive_url_list.append(url)
-
-    def known_urls(
-        self, alive=False, subdomain=False, start_timestamp=None, end_timestamp=None
-    ):
+    def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
         """
         Returns list of URLs known to exist for given domain name
         because these URLs were crawled by WayBack Machine spider.
@@ -347,10 +333,4 @@
         for snapshot in snapshots:
             url_list.append(snapshot.original)
 
-        # Remove all deadURLs from url_list if alive=True
-        if alive:
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                executor.map(self.live_urls_finder, url_list)
-            url_list = self._alive_url_list
-
         return url_list
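The net effect of the waybackpy/utils.py and waybackpy/wrapper.py hunks above: when no archive URL can be parsed out of the save API response, waybackpy now compares the installed __version__ against the latest release published on PyPI (fetched once per Url instance via the new _latest_version helper) and tailors the WaybackError message accordingly, rather than always suggesting an upgrade. A minimal standalone sketch of that decision, assuming only the requests library and a hypothetical installed version string:

    import requests

    def latest_pypi_version(package_name):
        # PyPI's JSON API reports the newest published release of a package.
        endpoint = "https://pypi.org/pypi/" + package_name + "/json"
        return requests.get(endpoint).json()["info"]["version"]

    installed = "2.4.0"  # hypothetical stand-in for waybackpy.__version__
    latest = latest_pypi_version("waybackpy")
    if installed == latest:
        # Client is current, so the failure is on the service's side.
        print("Wayback Machine is malfunctioning or refused to archive the URL.")
    else:
        # Client is stale, so suggest upgrading before blaming the service.
        print("waybackpy {} is out of date; latest is {}.".format(installed, latest))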