better error messages (str), check the latest version before asking for an upgrade, and remove alive checking

Akash Mahanty 2021-01-15 16:47:26 +05:30
parent dcd7b03302
commit 712471176b
5 changed files with 42 additions and 66 deletions

View File

@@ -23,7 +23,6 @@ def test_save():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -42,7 +41,6 @@ def test_save():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -63,7 +61,6 @@ def test_json():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -84,7 +81,6 @@ def test_archive_url():
archive_url=True,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -105,7 +101,6 @@ def test_oldest():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -128,7 +123,6 @@ def test_oldest():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -150,7 +144,6 @@ def test_newest():
archive_url=False,
newest=True,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -173,7 +166,6 @@ def test_newest():
archive_url=False,
newest=True,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -195,7 +187,6 @@ def test_total_archives():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -217,7 +208,6 @@ def test_known_urls():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=True,
get=None,
@@ -237,7 +227,6 @@ def test_known_urls():
archive_url=False,
newest=False,
near=False,
alive=True,
subdomain=True,
known_urls=True,
get=None,
@@ -259,7 +248,6 @@ def test_near():
archive_url=False,
newest=False,
near=True,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -287,7 +275,6 @@ def test_near():
archive_url=False,
newest=False,
near=True,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -314,7 +301,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="url",
@@ -334,7 +320,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="oldest",
@@ -354,7 +339,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="newest",
@@ -374,7 +358,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="save",
@@ -394,7 +377,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="foobar",

View File

@@ -128,8 +128,5 @@ def test_total_archives():
def test_known_urls():
target = Url("akamhy.github.io", user_agent)
assert len(target.known_urls(alive=True, subdomain=False)) > 2
target = Url("akamhy.github.io", user_agent)
assert len(target.known_urls()) > 3

View File

@@ -120,11 +120,7 @@ def _known_urls(obj, args):
if args.subdomain:
subdomain = True
alive = False
if args.alive:
alive = True
url_list = obj.known_urls(alive=alive, subdomain=subdomain)
url_list = obj.known_urls(subdomain=subdomain)
total_urls = len(url_list)
if total_urls > 0:
@@ -267,8 +263,6 @@ def add_knownUrlArg(knownUrlArg):
)
help_text = "Use with '--known_urls' to include known URLs for subdomains."
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
help_text = "Only include live URLs. Will not include dead links."
knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)
def add_nearArg(nearArg):
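With the --alive/-a flag and its help text removed, listing known URLs is driven by --subdomain alone. A hedged example of the updated invocation, assuming waybackpy's usual --url and --user_agent options:

waybackpy --url "akamhy.github.io" --user_agent "my user agent" --known_urls --subdomain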

View File

@@ -11,6 +11,12 @@ quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _latest_version(package_name, headers):
endpoint = "https://pypi.org/pypi/" + package_name + "/json"
json = _get_response(endpoint, headers=headers).json()
return json["info"]["version"]
def _unix_ts_to_wayback_ts(unix_ts):
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
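The new _latest_version helper is what enables the smarter error message below: it asks PyPI's JSON API for the newest release of a package. A standalone sketch of the same lookup (the user agent is a placeholder):

import requests

headers = {"User-Agent": "waybackpy example"}
endpoint = "https://pypi.org/pypi/waybackpy/json"
# PyPI's JSON API exposes the newest release under info -> version.
latest = requests.get(endpoint, headers=headers).json()["info"]["version"]
print(latest)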
@@ -183,7 +189,7 @@ def _get_total_pages(url, user_agent):
return int((_get_response(total_pages_url, headers=headers).text).strip())
def _archive_url_parser(header, url):
def _archive_url_parser(header, url, latest_version=__version__):
"""
The Wayback Machine's save API doesn't
return a JSON response; we are required
@@ -226,15 +232,25 @@ def _archive_url_parser(header, url):
if m:
return m.group(1)
raise WaybackError(
"No archive URL found in the API response. "
"If '{url}' can be accessed via your web browser then either "
"this version of waybackpy ({version}) is out of date or WayBack Machine is malfunctioning. Visit "
"'https://github.com/akamhy/waybackpy' for the latest version "
"of waybackpy.\nHeader:\n{header}".format(
url=url, version=__version__, header=header
if __version__ == latest_version:
exc_message = (
"No archive URL found in the API response. "
"If '{url}' can be accessed via your web browser then either "
"Wayback Machine is malfunctioning or it refused to archive your URL."
"\nHeader:\n{header}".format(url=url, header=header)
)
)
else:
exc_message = (
"No archive URL found in the API response. "
"If '{url}' can be accessed via your web browser then either "
"this version of waybackpy ({version}) is out of date or WayBack "
"Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' "
"for the latest version of waybackpy.\nHeader:\n{header}".format(
url=url, version=__version__, header=header
)
)
raise WaybackError(exc_message)
def _wayback_timestamp(**kwargs):
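Note the latest_version=__version__ default: a caller that never fetched the real latest version compares equal and the upgrade hint is suppressed. A toy illustration of the selection, not library code:

def pick_message(installed, latest):
    # Same branch as _archive_url_parser above, reduced to its essence.
    if installed == latest:
        return "the Wayback Machine is malfunctioning or refused to archive your URL"
    return "this version of waybackpy ({}) may be out of date".format(installed)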
@@ -292,20 +308,27 @@ def _get_response(
# From https://stackoverflow.com/a/35504626
# By https://stackoverflow.com/users/401467/datashaman
s = requests.Session()
retries = Retry(
total=retries,
backoff_factor=backoff_factor,
status_forcelist=[500, 502, 503, 504],
)
s.mount("https://", HTTPAdapter(max_retries=retries))
url = _full_url(endpoint, params)
try:
if not return_full_url:
return s.get(url, headers=headers)
return (url, s.get(url, headers=headers))
except Exception as e:
exc_message = "Error while retrieving {url}".format(url=url)
exc_message = "Error while retrieving {url}.\n{reason}".format(
url=url, reason=str(e)
)
exc = WaybackError(exc_message)
exc.__cause__ = e
raise exc
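For reference, the retry wiring above follows the cited Stack Overflow pattern: mount an HTTPAdapter whose Retry policy re-attempts 5xx responses with exponential backoff. A self-contained sketch (the counts are placeholders, not waybackpy's defaults):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

s = requests.Session()
retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
# Any 500/502/503/504 response is retried up to 5 times before failing.
response = s.get("https://pypi.org/pypi/waybackpy/json", headers={"User-Agent": "example"})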

View File

@@ -12,6 +12,7 @@ from .utils import (
_cleaned_url,
_ts,
_unix_ts_to_wayback_ts,
_latest_version,
)
@@ -23,7 +24,7 @@ class Url:
self._archive_url = None
self.timestamp = None
self._JSON = None
self._alive_url_list = []
self.latest_version = None
def __repr__(self):
return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
@@ -141,7 +142,11 @@ class Url:
response = _get_response(
request_url, params=None, headers=headers, backoff_factor=2
)
self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
if not self.latest_version:
self.latest_version = _latest_version("waybackpy", headers=headers)
self._archive_url = "https://" + _archive_url_parser(
response.headers, self.url, self.latest_version
)
self.timestamp = datetime.utcnow()
return self
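save() now fetches the latest version lazily, once per Url instance, and threads it into _archive_url_parser so the eventual WaybackError can say whether upgrading would help. A hedged end-to-end example of how that error surfaces, assuming waybackpy's public Url class and WaybackError exception:

from waybackpy import Url
from waybackpy.exceptions import WaybackError

try:
    Url("https://example.com", "my user agent").save()
except WaybackError as err:
    # The message now distinguishes "upgrade waybackpy" from
    # "the Wayback Machine itself is misbehaving".
    print(err)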
@@ -291,26 +296,7 @@ class Url:
i = i + 1
return i
def live_urls_finder(self, url):
"""
This method is used to check whether the supplied url
is dead, i.e. unreachable or returning a status code >= 400.
"""
try:
response_code = requests.get(url).status_code
except Exception:
return  # ignore exceptions; treat the URL as dead
# 200s are OK and 300s are usually redirects; if you don't want redirects, replace 400 with 300
if response_code >= 400:
return
self._alive_url_list.append(url)
def known_urls(
self, alive=False, subdomain=False, start_timestamp=None, end_timestamp=None
):
def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
"""
Returns a list of URLs known to exist for the given domain name,
because these URLs were crawled by the Wayback Machine spider.
@@ -347,10 +333,4 @@ class Url:
for snapshot in snapshots:
url_list.append(snapshot.original)
# Remove all dead URLs from url_list if alive=True
if alive:
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(self.live_urls_finder, url_list)
url_list = self._alive_url_list
return url_list
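Callers who relied on alive=True can reproduce the deleted filtering themselves. A sketch mirroring the removed thread-pool code:

import concurrent.futures
import requests
from waybackpy import Url

url_list = Url("akamhy.github.io", "my user agent").known_urls(subdomain=True)

def is_alive(url):
    # Mirrors the removed live_urls_finder: errors and status >= 400 count as dead.
    try:
        return requests.get(url).status_code < 400
    except Exception:
        return False

with concurrent.futures.ThreadPoolExecutor() as executor:
    alive_urls = [u for u, ok in zip(url_list, executor.map(is_alive, url_list)) if ok]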