add: type annotation to waybackpy modules

Author: eggplants
Date: 2022-02-04 04:25:01 +09:00
parent c274c474b2
commit 38088fa0d8
9 changed files with 275 additions and 205 deletions
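Not part of the commit itself: a minimal usage sketch of the class annotated in the diff below, assuming the package-level export `from waybackpy import WaybackMachineSaveAPI`; the URL and user-agent strings are placeholder values.

# Minimal usage sketch (not part of this commit).
# Placeholder URL and user agent; the package-level export is assumed.
from waybackpy import WaybackMachineSaveAPI

save_api = WaybackMachineSaveAPI(
    "https://example.com",
    user_agent="my-user-agent/1.0",
    max_tries=8,
)
archive: str = save_api.save()  # annotated in this commit to return str
print(archive)                  # an https://web.archive.org/web/<14 digits>/... URL
print(save_api.timestamp())     # datetime parsed from the archive URL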


@@ -1,38 +1,42 @@
 import re
 import time
 from datetime import datetime
+from typing import Dict, Optional
 import requests
 from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+# from urllib3.util.retry import Retry
+from requests.packages.urllib3.util.retry import Retry
 from .exceptions import MaximumSaveRetriesExceeded
 from .utils import DEFAULT_USER_AGENT
-class WaybackMachineSaveAPI:
+class WaybackMachineSaveAPI(object):
     """
     WaybackMachineSaveAPI class provides an interface for saving URLs on the
     Wayback Machine.
     """
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
+    def __init__(
+        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
+    ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.request_url = "https://web.archive.org/save/" + self.url
         self.user_agent = user_agent
-        self.request_headers = {"User-Agent": self.user_agent}
+        self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
         if max_tries < 1:
             raise ValueError("max_tries should be positive")
         self.max_tries = max_tries
         self.total_save_retries = 5
         self.backoff_factor = 0.5
         self.status_forcelist = [500, 502, 503, 504]
-        self._archive_url = None
+        self._archive_url: Optional[str] = None
         self.instance_birth_time = datetime.utcnow()
     @property
-    def archive_url(self):
+    def archive_url(self) -> str:
         """
         Returns the archive URL is already cached by _archive_url
         else invoke the save method to save the archive which returns the
@@ -44,7 +48,7 @@ class WaybackMachineSaveAPI:
         else:
             return self.save()
-    def get_save_request_headers(self):
+    def get_save_request_headers(self) -> None:
         """
         Creates a session and tries 'retries' number of times to
         retrieve the archive.
@@ -61,21 +65,21 @@ class WaybackMachineSaveAPI:
         the response URL yourself in the browser.
         """
         session = requests.Session()
-        retries = Retry(
+        retries_ = Retry(
             total=self.total_save_retries,
             backoff_factor=self.backoff_factor,
             status_forcelist=self.status_forcelist,
         )
-        session.mount("https://", HTTPAdapter(max_retries=retries))
+        session.mount("https://", HTTPAdapter(max_retries=retries_))
         self.response = session.get(self.request_url, headers=self.request_headers)
-        self.headers = (
-            self.response.headers
-        )  # <class 'requests.structures.CaseInsensitiveDict'>
+        # requests.response.headers is requests.structures.CaseInsensitiveDict
+        self.headers = self.response.headers
+        self.headers_str = str(self.headers)
         self.status_code = self.response.status_code
         self.response_url = self.response.url
         session.close()
-    def archive_url_parser(self):
+    def archive_url_parser(self) -> Optional[str]:
         """
         Three regexen (like oxen?) are used to search for the
         archive URL in the headers and finally look in the response URL
@@ -83,18 +87,18 @@ class WaybackMachineSaveAPI:
         """
         regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
-        match = re.search(regex1, str(self.headers))
+        match = re.search(regex1, self.headers_str)
         if match:
             return "https://web.archive.org" + match.group(1)
         regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
-        match = re.search(regex2, str(self.headers))
-        if match:
+        match = re.search(regex2, self.headers_str)
+        if match is not None and len(match.groups()) == 1:
             return "https://" + match.group(1)
         regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
-        match = re.search(regex3, str(self.headers))
-        if match:
+        match = re.search(regex3, self.headers_str)
+        if match is not None and len(match.groups()) == 1:
             return "https" + match.group(1)
         if self.response_url:
@@ -105,7 +109,9 @@ class WaybackMachineSaveAPI:
             if match:
                 return "https://" + match.group(0)
-    def sleep(self, tries):
+        return None
+    def sleep(self, tries: int) -> None:
         """
         Ensure that the we wait some time before succesive retries so that we
         don't waste the retries before the page is even captured by the Wayback
@@ -120,7 +126,7 @@ class WaybackMachineSaveAPI:
             sleep_seconds = 10
         time.sleep(sleep_seconds)
-    def timestamp(self):
+    def timestamp(self) -> datetime:
         """
         Read the timestamp off the archive URL and convert the Wayback Machine
         timestamp to datetime object.
@@ -133,9 +139,10 @@ class WaybackMachineSaveAPI:
         didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
         cached archive if last archive was captured before last 45 minutes.
         """
-        m = re.search(
-            r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
-        )
+        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
+        m = re.search(regex, str(self._archive_url))
+        if m is None or len(m.groups()) != 1:
+            raise ValueError("Could not find get timestamp")
         string_timestamp = m.group(1)
         timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
@@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
         return timestamp
-    def save(self):
+    def save(self) -> str:
         """
         Calls the SavePageNow API of the Wayback Machine with required parameters
         and headers to save the URL.
@@ -169,7 +176,7 @@ class WaybackMachineSaveAPI:
                 self.get_save_request_headers()
                 self.saved_archive = self.archive_url_parser()
-                if self.saved_archive is not None:
+                if isinstance(self.saved_archive, str):
                     self._archive_url = self.saved_archive
                     self.timestamp()
                     return self.saved_archive
@@ -179,5 +186,5 @@ class WaybackMachineSaveAPI:
                 raise MaximumSaveRetriesExceeded(
                     "Tried %s times but failed to save and retrieve the" % str(tries)
                     + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
-                    % (self.url, self.response_url, str(self.headers)),
+                    % (self.url, self.response_url, self.headers_str),
                 )
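
For reference, a standalone sketch of the timestamp parsing that the new `m is None` guard in this diff protects; the archive URL below is an invented example, not taken from the commit.

# Standalone sketch of the timestamp-parsing step shown in the diff above.
# The archive URL is an invented example.
import re
from datetime import datetime

archive_url = "https://web.archive.org/web/20220203192409/https://example.com/"
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"

m = re.search(regex, archive_url)
if m is None or len(m.groups()) != 1:
    raise ValueError("Could not find timestamp in the archive URL")

# The 14-digit Wayback Machine timestamp is YYYYMMDDhhmmss (UTC).
print(datetime.strptime(m.group(1), "%Y%m%d%H%M%S"))  # 2022-02-03 19:24:09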