add: type annotation to waybackpy modules

2022-02-04 04:25:01 +09:00
parent c274c474b2
commit 38088fa0d8
9 changed files with 275 additions and 205 deletions
--- a/waybackpy/availability_api.py
+++ b/waybackpy/availability_api.py
@@ -1,6 +1,7 @@
 import json
 import time
 from datetime import datetime
 from typing import Any, Dict, Optional
 import requests
@@ -10,37 +11,41 @@ from .exceptions import (
 )
 from .utils import DEFAULT_USER_AGENT
 ResponseJSON = Dict[str, Any]
-class WaybackMachineAvailabilityAPI:
+
 class WaybackMachineAvailabilityAPI(object):
    """
    Class that interfaces the availability API of the Wayback Machine.
    """
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
+    def __init__(
        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
    ) -> None:
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
-        self.headers = {"User-Agent": self.user_agent}
+        self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
        self.payload = {"url": "{url}".format(url=self.url)}
        self.endpoint = "https://archive.org/wayback/available"
        self.max_tries = max_tries
        self.tries = 0
        self.last_api_call_unix_time = int(time.time())
        self.api_call_time_gap = 5
-        self.JSON = None
+        self.JSON: Optional[ResponseJSON] = None
-    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
+    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp: int) -> str:
        """
        Converts Unix time to wayback Machine timestamp.
        """
        return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
-    def __repr__(self):
+    def __repr__(self) -> str:
        """
        Same as string representation, just return the archive URL as a string.
        """
        return str(self)
-    def __str__(self):
+    def __str__(self) -> str:
        """
        String representation of the class. If atleast one API call was successfully
        made then return the archive URL as a string. Else returns None.
@@ -54,7 +59,7 @@ class WaybackMachineAvailabilityAPI:
        return self.archive_url
-    def json(self):
+    def json(self) -> Optional[ResponseJSON]:
        """
        Makes the API call to the availability API can set the JSON response
        to the JSON attribute of the instance and also returns the JSON attribute.
@@ -79,7 +84,7 @@ class WaybackMachineAvailabilityAPI:
        return self.JSON
-    def timestamp(self):
+    def timestamp(self) -> datetime:
        """
        Converts the timestamp form the JSON response to datetime object.
        If JSON attribute of the instance is None it implies that the either
@@ -91,19 +96,29 @@ class WaybackMachineAvailabilityAPI:
        If you get an URL as a response form the availability API it is guaranteed
        that you can get the datetime object from the timestamp.
        """
-        if not self.JSON or not self.JSON["archived_snapshots"]:
+        if self.JSON is None or "archived_snapshots" not in self.JSON:
            return datetime.max
-
+        elif (
-        return datetime.strptime(
+            self.JSON is not None
-            self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
+            and "archived_snapshots" in self.JSON
-        )
+            and self.JSON["archived_snapshots"] is not None
            and "closest" in self.JSON["archived_snapshots"]
            and self.JSON["archived_snapshots"]["closest"] is not None
            and "timestamp" in self.JSON["archived_snapshots"]["closest"]
        ):
            return datetime.strptime(
                self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
            )
        else:
            raise ValueError("Could not get timestamp from result")
    @property
-    def archive_url(self):
+    def archive_url(self) -> str:
        """
        Reads the the JSON response data and tries to get the timestamp and returns
        the timestamp if found else returns None.
        """
        archive_url = ""
        data = self.JSON
        # If the user didn't used oldest, newest or near but tries to access the
@@ -138,7 +153,7 @@ class WaybackMachineAvailabilityAPI:
            )
        return archive_url
-    def wayback_timestamp(self, **kwargs):
+    def wayback_timestamp(self, **kwargs: int) -> str:
        """
        Prepends zero before the year, month, day, hour and minute so that they
        are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
@@ -148,7 +163,7 @@ class WaybackMachineAvailabilityAPI:
            for key in ["year", "month", "day", "hour", "minute"]
        )
-    def oldest(self):
+    def oldest(self) -> "WaybackMachineAvailabilityAPI":
        """
        Passing the year 1994 should return the oldest archive because
        wayback machine was started in May, 1996 and there should be no archive
@@ -156,7 +171,7 @@ class WaybackMachineAvailabilityAPI:
        """
        return self.near(year=1994)
-    def newest(self):
+    def newest(self) -> "WaybackMachineAvailabilityAPI":
        """
        Passing the current UNIX time should be sufficient to get the newest
        archive considering the API request-response time delay and also the
@@ -166,13 +181,13 @@ class WaybackMachineAvailabilityAPI:
    def near(
        self,
-        year=None,
+        year: Optional[int] = None,
-        month=None,
+        month: Optional[int] = None,
-        day=None,
+        day: Optional[int] = None,
-        hour=None,
+        hour: Optional[int] = None,
-        minute=None,
+        minute: Optional[int] = None,
-        unix_timestamp=None,
+        unix_timestamp: Optional[int] = None,
-    ):
+    ) -> "WaybackMachineAvailabilityAPI":
        """
        The main method for this Class, oldest and newest methods are dependent on this
        method.
@@ -188,11 +203,11 @@ class WaybackMachineAvailabilityAPI:
        else:
            now = datetime.utcnow().timetuple()
            timestamp = self.wayback_timestamp(
-                year=year if year else now.tm_year,
+                year=now.tm_year if year is None else year,
-                month=month if month else now.tm_mon,
+                month=now.tm_mon if month is None else month,
-                day=day if day else now.tm_mday,
+                day=now.tm_mday if day is None else day,
-                hour=hour if hour else now.tm_hour,
+                hour=now.tm_hour if hour is None else hour,
-                minute=minute if minute else now.tm_min,
+                minute=now.tm_min if minute is None else minute,
            )
        self.payload["timestamp"] = timestamp
--- a/waybackpy/cdx_api.py
+++ b/waybackpy/cdx_api.py
@@ -1,3 +1,5 @@
 from typing import Dict, Generator, List, Optional, cast
 from .cdx_snapshot import CDXSnapshot
 from .cdx_utils import (
    check_collapses,
@@ -11,43 +13,48 @@ from .exceptions import WaybackError
 from .utils import DEFAULT_USER_AGENT
-class WaybackMachineCDXServerAPI:
+class WaybackMachineCDXServerAPI(object):
    """
    Class that interfaces the CDX server API of the Wayback Machine.
    """
    # start_timestamp: from, can not use from as it's a keyword
    # end_timestamp: to, not using to as can not use from
    def __init__(
        self,
-        url,
+        url: str,
-        user_agent=DEFAULT_USER_AGENT,
+        user_agent: str = DEFAULT_USER_AGENT,
-        start_timestamp=None,  # from, can not use from as it's a keyword
+        start_timestamp: Optional[str] = None,
-        end_timestamp=None,  # to, not using to as can not use from
+        end_timestamp: Optional[str] = None,
-        filters=[],
+        filters: List[str] = [],
-        match_type=None,
+        match_type: Optional[str] = None,
-        gzip=None,
+        gzip: Optional[str] = None,
-        collapses=[],
+        collapses: List[str] = [],
-        limit=None,
+        limit: Optional[str] = None,
-        max_tries=3,
+        max_tries: int = 3,
-    ):
+    ) -> None:
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
-        self.start_timestamp = str(start_timestamp) if start_timestamp else None
+        self.start_timestamp = (
-        self.end_timestamp = str(end_timestamp) if end_timestamp else None
+            str(start_timestamp) if start_timestamp is not None else None
        )
        self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
        self.filters = filters
        check_filters(self.filters)
-        self.match_type = str(match_type).strip() if match_type else None
+        self.match_type = str(match_type).strip() if match_type is not None else None
        check_match_type(self.match_type, self.url)
-        self.gzip = gzip if gzip else True
+        self.gzip = gzip
        self.collapses = collapses
        check_collapses(self.collapses)
-        self.limit = limit if limit else 5000
+        self.limit = limit if limit is not None else 5000
        self.max_tries = max_tries
-        self.last_api_request_url = None
+        self.last_api_request_url: Optional[str] = None
        self.use_page = False
        self.endpoint = "https://web.archive.org/cdx/search/cdx"
-    def cdx_api_manager(self, payload, headers, use_page=False):
+    def cdx_api_manager(
-
+        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
    ) -> Generator[str, None, None]:
        total_pages = get_total_pages(self.url, self.user_agent)
        # If we only have two or less pages of archives then we care for more accuracy
        # pagination API is lagged sometimes
@@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI:
                url = full_url(self.endpoint, params=payload)
                res = get_response(url, headers=headers)
                if isinstance(res, Exception):
                    raise res
                self.last_api_request_url = url
                text = res.text
@@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI:
                yield text
        else:
            payload["showResumeKey"] = "true"
            payload["limit"] = str(self.limit)
            resumeKey = None
            more = True
            while more:
                if resumeKey:
                    payload["resumeKey"] = resumeKey
                url = full_url(self.endpoint, params=payload)
                res = get_response(url, headers=headers)
                if isinstance(res, Exception):
                    raise res
                self.last_api_request_url = url
@@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI:
                yield text
-    def add_payload(self, payload):
+    def add_payload(self, payload: Dict[str, str]) -> None:
        if self.start_timestamp:
            payload["from"] = self.start_timestamp
        if self.end_timestamp:
            payload["to"] = self.end_timestamp
-        if self.gzip is not True:
+        if self.gzip is None:
            payload["gzip"] = "false"
        if self.match_type:
@@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI:
        # Don't need to return anything as it's dictionary.
        payload["url"] = self.url
-    def snapshots(self):
+    def snapshots(self) -> Generator[CDXSnapshot, None, None]:
-        payload = {}
+        payload: Dict[str, str] = {}
        headers = {"User-Agent": self.user_agent}
        self.add_payload(payload)
@@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI:
                if len(snapshot) < 46:  # 14 + 32 (timestamp+digest)
                    continue
-                properties = {
+                properties: Dict[str, Optional[str]] = {
                    "urlkey": None,
                    "timestamp": None,
                    "original": None,
@@ -190,4 +198,4 @@ class WaybackMachineCDXServerAPI:
                    properties["length"],
                ) = prop_values
-                yield CDXSnapshot(properties)
+                yield CDXSnapshot(cast(Dict[str, str], properties))
--- a/waybackpy/cdx_snapshot.py
+++ b/waybackpy/cdx_snapshot.py
@@ -1,7 +1,8 @@
 from datetime import datetime
 from typing import Dict
-class CDXSnapshot:
+class CDXSnapshot(object):
    """
    Class for the CDX snapshot lines returned by the CDX API,
    Each valid line of the CDX API is casted to an CDXSnapshot object
@@ -10,7 +11,7 @@ class CDXSnapshot:
    of the CDXSnapshot.
    """
-    def __init__(self, properties):
+    def __init__(self, properties: Dict[str, str]) -> None:
        self.urlkey = properties["urlkey"]
        self.timestamp = properties["timestamp"]
        self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
@@ -23,7 +24,7 @@ class CDXSnapshot:
            "https://web.archive.org/web/" + self.timestamp + "/" + self.original
        )
-    def __str__(self):
+    def __str__(self) -> str:
        return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
            urlkey=self.urlkey,
            timestamp=self.timestamp,
--- a/waybackpy/cdx_utils.py
+++ b/waybackpy/cdx_utils.py
@@ -1,23 +1,30 @@
 import re
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import quote
 import requests
 from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+
 # from urllib3.util.retry import Retry
 from requests.packages.urllib3.util.retry import Retry
 from .exceptions import WaybackError
 from .utils import DEFAULT_USER_AGENT
-def get_total_pages(url, user_agent=DEFAULT_USER_AGENT):
+def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
    endpoint = "https://web.archive.org/cdx/search/cdx?"
    payload = {"showNumPages": "true", "url": str(url)}
    headers = {"User-Agent": user_agent}
    request_url = full_url(endpoint, params=payload)
    response = get_response(request_url, headers=headers)
-    return int(response.text.strip())
+    if isinstance(response, requests.Response):
        return int(response.text.strip())
    else:
        raise response
-def full_url(endpoint, params):
+def full_url(endpoint: str, params: Dict[str, Any]) -> str:
    if not params:
        return endpoint
    full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
@@ -26,27 +33,25 @@ def full_url(endpoint, params):
        key = "collapse" if key.startswith("collapse") else key
        amp = "" if full_url.endswith("?") else "&"
        full_url = (
-            full_url
+            full_url + amp + "{key}={val}".format(key=key, val=quote(str(val), safe=""))
            + amp
            + "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
        )
    return full_url
 def get_response(
-    url,
+    url: str,
-    headers=None,
+    headers: Optional[Dict[str, str]] = None,
-    retries=5,
+    retries: int = 5,
-    backoff_factor=0.5,
+    backoff_factor: float = 0.5,
-    no_raise_on_redirects=False,
+    # no_raise_on_redirects=False,
-):
+) -> Union[requests.Response, Exception]:
    session = requests.Session()
-    retries = Retry(
+    retries_ = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[500, 502, 503, 504],
    )
-    session.mount("https://", HTTPAdapter(max_retries=retries))
+    session.mount("https://", HTTPAdapter(max_retries=retries_))
    try:
        response = session.get(url, headers=headers)
@@ -62,23 +67,18 @@ def get_response(
        raise exc
-def check_filters(filters):
+def check_filters(filters: List[str]) -> None:
    if not isinstance(filters, list):
        raise WaybackError("filters must be a list.")
    # [!]field:regex
    for _filter in filters:
-        try:
+        match = re.search(
            r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
            _filter,
        )
-            match = re.search(
+        if match is None or len(match.groups()) != 2:
                r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
                _filter,
            )
            match.group(1)
            match.group(2)
        except Exception:
            exc_message = (
                "Filter '{_filter}' is not following the cdx filter syntax.".format(
@@ -88,43 +88,38 @@ def check_filters(filters):
            raise WaybackError(exc_message)
-def check_collapses(collapses):
+def check_collapses(collapses: List[str]) -> bool:
    if not isinstance(collapses, list):
        raise WaybackError("collapses must be a list.")
-
+    elif len(collapses) == 0:
-    if len(collapses) == 0:
+        return True
        return
    for collapse in collapses:
-        try:
+        match = re.search(
-            match = re.search(
+            r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
-                r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
+            collapse,
-                collapse,
+        )
-            )
+        if match is None or len(match.groups()) != 2:
            match.group(1)
            if 2 == len(match.groups()):
                match.group(2)
        except Exception:
            exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
                collapse=collapse
            )
            raise WaybackError(exc_message)
    else:
        return True
-def check_match_type(match_type, url):
+def check_match_type(match_type: Optional[str], url: str) -> bool:
    legal_match_type = ["exact", "prefix", "host", "domain"]
    if not match_type:
-        return
+        return True
-
+    elif "*" in url:
    if "*" in url:
        raise WaybackError(
            "Can not use wildcard in the URL along with the match_type arguments."
        )
-
+    elif match_type not in legal_match_type:
    legal_match_type = ["exact", "prefix", "host", "domain"]
    if match_type not in legal_match_type:
        exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
            match_type=match_type
        )
        raise WaybackError(exc_message)
    else:
        return True
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -3,6 +3,7 @@ import os
 import random
 import re
 import string
 from typing import Generator, List, Optional
 import click
 import requests
@@ -163,34 +164,34 @@ from .wrapper import Url
    + "will be printed.",
 )
 def main(
-    url,
+    url: Optional[str],
-    user_agent,
+    user_agent: str,
-    version,
+    version: bool,
-    license,
+    license: bool,
-    newest,
+    newest: bool,
-    oldest,
+    oldest: bool,
-    json,
+    json: bool,
-    near,
+    near: bool,
-    year,
+    year: Optional[int],
-    month,
+    month: Optional[int],
-    day,
+    day: Optional[int],
-    hour,
+    hour: Optional[int],
-    minute,
+    minute: Optional[int],
-    save,
+    save: bool,
-    headers,
+    headers: bool,
-    known_urls,
+    known_urls: bool,
-    subdomain,
+    subdomain: bool,
-    file,
+    file: bool,
-    cdx,
+    cdx: bool,
-    start_timestamp,
+    start_timestamp: Optional[str],
-    end_timestamp,
+    end_timestamp: Optional[str],
-    filter,
+    filter: List[str],
-    match_type,
+    match_type: Optional[str],
-    gzip,
+    gzip: Optional[str],
-    collapse,
+    collapse: List[str],
-    limit,
+    limit: Optional[str],
-    cdx_print,
+    cdx_print: List[str],
-):
+) -> None:
    """\b
                         _                _
                        | |              | |
@@ -244,7 +245,9 @@ def main(
        )
        return
-    def echo_availability_api(availability_api_instance):
+    def echo_availability_api(
        availability_api_instance: WaybackMachineAvailabilityAPI,
    ) -> None:
        click.echo("Archive URL:")
        if not availability_api_instance.archive_url:
            archive_url = (
@@ -295,13 +298,14 @@ def main(
            click.echo(save_api.headers)
        return
-    def save_urls_on_file(url_gen):
+    def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
        domain = None
        sys_random = random.SystemRandom()
        uid = "".join(
            sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
        )
        url_count = 0
        file_name = None
        for url in url_gen:
            url_count += 1
@@ -310,7 +314,7 @@ def main(
                domain = "domain-unknown"
-                if match:
+                if match is not None:
                    domain = match.group(1)
                file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
@@ -318,12 +322,12 @@ def main(
                if not os.path.isfile(file_path):
                    open(file_path, "w+").close()
-            with open(file_path, "a") as f:
+                with open(file_path, "a") as f:
-                f.write("{url}\n".format(url=url))
+                    f.write("{url}\n".format(url=url))
            click.echo(url)
-        if url_count > 0:
+        if url_count > 0 or file_name is not None:
            click.echo(
                "\n\n'{file_name}' saved in current working directory".format(
                    file_name=file_name
--- a/waybackpy/exceptions.py
+++ b/waybackpy/exceptions.py
@@ -14,6 +14,8 @@ class WaybackError(Exception):
     All other exceptions are inherited from this class.
    """
    pass
 class RedirectSaveError(WaybackError):
    """
@@ -21,32 +23,44 @@ class RedirectSaveError(WaybackError):
    redirect URL is archived but not the original URL.
    """
    pass
 class URLError(Exception):
    """
    Raised when malformed URLs are passed as arguments.
    """
    pass
 class MaximumRetriesExceeded(WaybackError):
    """
    MaximumRetriesExceeded
    """
    pass
 class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
    """
    MaximumSaveRetriesExceeded
    """
    pass
 class ArchiveNotInAvailabilityAPIResponse(WaybackError):
    """
    Could not parse the archive in the JSON response of the availability API.
    """
    pass
 class InvalidJSONInAvailabilityAPIResponse(WaybackError):
    """
    availability api returned invalid JSON
    """
    pass
--- a/waybackpy/save_api.py
+++ b/waybackpy/save_api.py
@@ -1,38 +1,42 @@
 import re
 import time
 from datetime import datetime
 from typing import Dict, Optional
 import requests
 from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+
 # from urllib3.util.retry import Retry
 from requests.packages.urllib3.util.retry import Retry
 from .exceptions import MaximumSaveRetriesExceeded
 from .utils import DEFAULT_USER_AGENT
-class WaybackMachineSaveAPI:
+class WaybackMachineSaveAPI(object):
    """
    WaybackMachineSaveAPI class provides an interface for saving URLs on the
    Wayback Machine.
    """
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
+    def __init__(
        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
    ) -> None:
        self.url = str(url).strip().replace(" ", "%20")
        self.request_url = "https://web.archive.org/save/" + self.url
        self.user_agent = user_agent
-        self.request_headers = {"User-Agent": self.user_agent}
+        self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
        if max_tries < 1:
            raise ValueError("max_tries should be positive")
        self.max_tries = max_tries
        self.total_save_retries = 5
        self.backoff_factor = 0.5
        self.status_forcelist = [500, 502, 503, 504]
-        self._archive_url = None
+        self._archive_url: Optional[str] = None
        self.instance_birth_time = datetime.utcnow()
    @property
-    def archive_url(self):
+    def archive_url(self) -> str:
        """
        Returns the archive URL is already cached by _archive_url
        else invoke the save method to save the archive which returns the
@@ -44,7 +48,7 @@ class WaybackMachineSaveAPI:
        else:
            return self.save()
-    def get_save_request_headers(self):
+    def get_save_request_headers(self) -> None:
        """
        Creates a session and tries 'retries' number of times to
        retrieve the archive.
@@ -61,21 +65,21 @@ class WaybackMachineSaveAPI:
        the response URL yourself in the browser.
        """
        session = requests.Session()
-        retries = Retry(
+        retries_ = Retry(
            total=self.total_save_retries,
            backoff_factor=self.backoff_factor,
            status_forcelist=self.status_forcelist,
        )
-        session.mount("https://", HTTPAdapter(max_retries=retries))
+        session.mount("https://", HTTPAdapter(max_retries=retries_))
        self.response = session.get(self.request_url, headers=self.request_headers)
-        self.headers = (
+        # requests.response.headers is requests.structures.CaseInsensitiveDict
-            self.response.headers
+        self.headers = self.response.headers
-        )  # <class 'requests.structures.CaseInsensitiveDict'>
+        self.headers_str = str(self.headers)
        self.status_code = self.response.status_code
        self.response_url = self.response.url
        session.close()
-    def archive_url_parser(self):
+    def archive_url_parser(self) -> Optional[str]:
        """
        Three regexen (like oxen?) are used to search for the
        archive URL in the headers and finally look in the response URL
@@ -83,18 +87,18 @@ class WaybackMachineSaveAPI:
        """
        regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
-        match = re.search(regex1, str(self.headers))
+        match = re.search(regex1, self.headers_str)
        if match:
            return "https://web.archive.org" + match.group(1)
        regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
-        match = re.search(regex2, str(self.headers))
+        match = re.search(regex2, self.headers_str)
-        if match:
+        if match is not None and len(match.groups()) == 1:
            return "https://" + match.group(1)
        regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
-        match = re.search(regex3, str(self.headers))
+        match = re.search(regex3, self.headers_str)
-        if match:
+        if match is not None and len(match.groups()) == 1:
            return "https" + match.group(1)
        if self.response_url:
@@ -105,7 +109,9 @@ class WaybackMachineSaveAPI:
                if match:
                    return "https://" + match.group(0)
-    def sleep(self, tries):
+        return None
    def sleep(self, tries: int) -> None:
        """
        Ensure that the we wait some time before succesive retries so that we
        don't waste the retries before the page is even captured by the Wayback
@@ -120,7 +126,7 @@ class WaybackMachineSaveAPI:
            sleep_seconds = 10
        time.sleep(sleep_seconds)
-    def timestamp(self):
+    def timestamp(self) -> datetime:
        """
        Read the timestamp off the archive URL and convert the Wayback Machine
        timestamp to datetime object.
@@ -133,9 +139,10 @@ class WaybackMachineSaveAPI:
        didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
        cached archive if last archive was captured before last 45 minutes.
        """
-        m = re.search(
+        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
-            r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
+        m = re.search(regex, str(self._archive_url))
-        )
+        if m is None or len(m.groups()) != 1:
            raise ValueError("Could not find get timestamp")
        string_timestamp = m.group(1)
        timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
@@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
        return timestamp
-    def save(self):
+    def save(self) -> str:
        """
        Calls the SavePageNow API of the Wayback Machine with required parameters
        and headers to save the URL.
@@ -169,7 +176,7 @@ class WaybackMachineSaveAPI:
                self.get_save_request_headers()
                self.saved_archive = self.archive_url_parser()
-                if self.saved_archive is not None:
+                if isinstance(self.saved_archive, str):
                    self._archive_url = self.saved_archive
                    self.timestamp()
                    return self.saved_archive
@@ -179,5 +186,5 @@ class WaybackMachineSaveAPI:
                raise MaximumSaveRetriesExceeded(
                    "Tried %s times but failed to save and retrieve the" % str(tries)
                    + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
-                    % (self.url, self.response_url, str(self.headers)),
+                    % (self.url, self.response_url, self.headers_str),
                )
--- a/waybackpy/utils.py
+++ b/waybackpy/utils.py
@@ -2,22 +2,43 @@ import requests
 from . import __version__
-DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
+DEFAULT_USER_AGENT: str = (
    "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
 )
-def latest_version_pypi(package_name, user_agent=DEFAULT_USER_AGENT):
+def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
    request_url = "https://pypi.org/pypi/" + package_name + "/json"
    headers = {"User-Agent": user_agent}
    response = requests.get(request_url, headers=headers)
    data = response.json()
-    return data["info"]["version"]
+    if (
        data is not None
        and "info" in data
        and data["info"] is not None
        and "version" in data["info"]
        and data["info"]["version"] is not None
    ):
        return str(data["info"]["version"])
    else:
        raise ValueError("Could not get latest pypi version")
-def latest_version_github(package_name, user_agent=DEFAULT_USER_AGENT):
+def latest_version_github(
    package_name: str, user_agent: str = DEFAULT_USER_AGENT
 ) -> str:
    request_url = (
        "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
    )
    headers = {"User-Agent": user_agent}
    response = requests.get(request_url, headers=headers)
    data = response.json()
-    return data[0]["tag_name"]
+    if (
        data is not None
        and len(data) > 0
        and data[0] is not None
        and "tag_name" in data[0]
    ):
        return str(data[0]["tag_name"])
    else:
        raise ValueError("Could not get latest github version")
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -1,4 +1,5 @@
 from datetime import datetime, timedelta
 from typing import Generator, Optional
 from .availability_api import WaybackMachineAvailabilityAPI
 from .cdx_api import WaybackMachineCDXServerAPI
@@ -19,35 +20,37 @@ the older interface code.
 """
-class Url:
+class Url(object):
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
+    def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
        self.url = url
        self.user_agent = str(user_agent)
-        self.archive_url = None
+        self.archive_url: Optional[str] = None
-        self.timestamp = None
+        self.timestamp: Optional[datetime] = None
        self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
            self.url, user_agent=self.user_agent
        )
-    def __str__(self):
+    def __str__(self) -> str:
        if not self.archive_url:
            self.newest()
-        return self.archive_url
+        return str(self.archive_url)
-    def __len__(self):
+    def __len__(self) -> int:
        td_max = timedelta(
            days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
        )
-        if not self.timestamp:
+        if not isinstance(self.timestamp, datetime):
            self.oldest()
-        if self.timestamp == datetime.max:
+        if not isinstance(self.timestamp, datetime):
            raise TypeError("timestamp must be a datetime")
        elif self.timestamp == datetime.max:
            return td_max.days
        else:
            return (datetime.utcnow() - self.timestamp).days
-        return (datetime.utcnow() - self.timestamp).days
+    def save(self) -> "Url":
    def save(self):
        self.wayback_machine_save_api = WaybackMachineSaveAPI(
            self.url, user_agent=self.user_agent
        )
@@ -58,13 +61,13 @@ class Url:
    def near(
        self,
-        year=None,
+        year: Optional[int] = None,
-        month=None,
+        month: Optional[int] = None,
-        day=None,
+        day: Optional[int] = None,
-        hour=None,
+        hour: Optional[int] = None,
-        minute=None,
+        minute: Optional[int] = None,
-        unix_timestamp=None,
+        unix_timestamp: Optional[int] = None,
-    ):
+    ) -> "Url":
        self.wayback_machine_availability_api.near(
            year=year,
@@ -77,22 +80,24 @@ class Url:
        self.set_availability_api_attrs()
        return self
-    def oldest(self):
+    def oldest(self) -> "Url":
        self.wayback_machine_availability_api.oldest()
        self.set_availability_api_attrs()
        return self
-    def newest(self):
+    def newest(self) -> "Url":
        self.wayback_machine_availability_api.newest()
        self.set_availability_api_attrs()
        return self
-    def set_availability_api_attrs(self):
+    def set_availability_api_attrs(self) -> None:
        self.archive_url = self.wayback_machine_availability_api.archive_url
        self.JSON = self.wayback_machine_availability_api.JSON
        self.timestamp = self.wayback_machine_availability_api.timestamp()
-    def total_archives(self, start_timestamp=None, end_timestamp=None):
+    def total_archives(
        self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
    ) -> int:
        cdx = WaybackMachineCDXServerAPI(
            self.url,
            user_agent=self.user_agent,
@@ -107,12 +112,12 @@ class Url:
    def known_urls(
        self,
-        subdomain=False,
+        subdomain: bool = False,
-        host=False,
+        host: bool = False,
-        start_timestamp=None,
+        start_timestamp: Optional[str] = None,
-        end_timestamp=None,
+        end_timestamp: Optional[str] = None,
-        match_type="prefix",
+        match_type: str = "prefix",
-    ):
+    ) -> Generator[str, None, None]:
        if subdomain:
            match_type = "domain"
        if host: