add: type annotation to waybackpy modules
waybackpy/availability_api.py

@@ -1,6 +1,7 @@
 import json
 import time
 from datetime import datetime
+from typing import Any, Dict, Optional
 
 import requests
 
@@ -10,37 +11,41 @@ from .exceptions import (
 )
 from .utils import DEFAULT_USER_AGENT
 
+ResponseJSON = Dict[str, Any]
+
 
-class WaybackMachineAvailabilityAPI:
+class WaybackMachineAvailabilityAPI(object):
     """
     Class that interfaces the availability API of the Wayback Machine.
     """
 
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
+    def __init__(
+        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
+    ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.user_agent = user_agent
-        self.headers = {"User-Agent": self.user_agent}
+        self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
         self.payload = {"url": "{url}".format(url=self.url)}
         self.endpoint = "https://archive.org/wayback/available"
         self.max_tries = max_tries
         self.tries = 0
         self.last_api_call_unix_time = int(time.time())
         self.api_call_time_gap = 5
-        self.JSON = None
+        self.JSON: Optional[ResponseJSON] = None
 
-    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
+    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp: int) -> str:
         """
         Converts Unix time to a Wayback Machine timestamp.
         """
         return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         """
         Same as the string representation, just return the archive URL as a string.
         """
         return str(self)
 
-    def __str__(self):
+    def __str__(self) -> str:
         """
         String representation of the class. If at least one API call was successfully
         made then return the archive URL as a string. Else returns None.
@@ -54,7 +59,7 @@ class WaybackMachineAvailabilityAPI:
 
         return self.archive_url
 
-    def json(self):
+    def json(self) -> Optional[ResponseJSON]:
         """
         Makes the API call to the availability API and sets the JSON response
         to the JSON attribute of the instance and also returns the JSON attribute.
@@ -79,7 +84,7 @@ class WaybackMachineAvailabilityAPI:
 
         return self.JSON
 
-    def timestamp(self):
+    def timestamp(self) -> datetime:
         """
         Converts the timestamp from the JSON response to a datetime object.
         If the JSON attribute of the instance is None it implies that either
@@ -91,19 +96,29 @@ class WaybackMachineAvailabilityAPI:
         If you get a URL as a response from the availability API it is guaranteed
         that you can get the datetime object from the timestamp.
         """
-        if not self.JSON or not self.JSON["archived_snapshots"]:
+        if self.JSON is None or "archived_snapshots" not in self.JSON:
             return datetime.max
-        return datetime.strptime(
-            self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
-        )
+        elif (
+            self.JSON is not None
+            and "archived_snapshots" in self.JSON
+            and self.JSON["archived_snapshots"] is not None
+            and "closest" in self.JSON["archived_snapshots"]
+            and self.JSON["archived_snapshots"]["closest"] is not None
+            and "timestamp" in self.JSON["archived_snapshots"]["closest"]
+        ):
+            return datetime.strptime(
+                self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
+            )
+        else:
+            raise ValueError("Could not get timestamp from result")
 
     @property
-    def archive_url(self):
+    def archive_url(self) -> str:
         """
         Reads the JSON response data and tries to get the archive URL, and
         returns the archive URL if found.
         """
         archive_url = ""
         data = self.JSON
 
         # If the user didn't use oldest, newest or near but tries to access the
@@ -138,7 +153,7 @@ class WaybackMachineAvailabilityAPI:
         )
         return archive_url
 
-    def wayback_timestamp(self, **kwargs):
+    def wayback_timestamp(self, **kwargs: int) -> str:
         """
         Prepends zero before the year, month, day, hour and minute so that they
         are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
@@ -148,7 +163,7 @@ class WaybackMachineAvailabilityAPI:
             for key in ["year", "month", "day", "hour", "minute"]
         )
 
-    def oldest(self):
+    def oldest(self) -> "WaybackMachineAvailabilityAPI":
         """
         Passing the year 1994 should return the oldest archive because
         wayback machine was started in May, 1996 and there should be no archive
@@ -156,7 +171,7 @@ class WaybackMachineAvailabilityAPI:
         """
         return self.near(year=1994)
 
-    def newest(self):
+    def newest(self) -> "WaybackMachineAvailabilityAPI":
        """
         Passing the current UNIX time should be sufficient to get the newest
         archive considering the API request-response time delay and also the
@@ -166,13 +181,13 @@ class WaybackMachineAvailabilityAPI:
 
     def near(
         self,
-        year=None,
-        month=None,
-        day=None,
-        hour=None,
-        minute=None,
-        unix_timestamp=None,
-    ):
+        year: Optional[int] = None,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        minute: Optional[int] = None,
+        unix_timestamp: Optional[int] = None,
+    ) -> "WaybackMachineAvailabilityAPI":
         """
         The main method for this Class, oldest and newest methods are dependent on this
         method.
@@ -188,11 +203,11 @@ class WaybackMachineAvailabilityAPI:
         else:
             now = datetime.utcnow().timetuple()
             timestamp = self.wayback_timestamp(
-                year=year if year else now.tm_year,
-                month=month if month else now.tm_mon,
-                day=day if day else now.tm_mday,
-                hour=hour if hour else now.tm_hour,
-                minute=minute if minute else now.tm_min,
+                year=now.tm_year if year is None else year,
+                month=now.tm_mon if month is None else month,
+                day=now.tm_mday if day is None else day,
+                hour=now.tm_hour if hour is None else hour,
+                minute=now.tm_min if minute is None else minute,
             )
 
         self.payload["timestamp"] = timestamp
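For context, a minimal usage sketch of the annotated availability API. The class, methods, and return types are taken from the hunks above; the URL and user agent are placeholders:

from waybackpy.availability_api import WaybackMachineAvailabilityAPI

api = WaybackMachineAvailabilityAPI("https://example.com", user_agent="my-agent/1.0")
newest = api.newest()       # near(), oldest() and newest() now return the instance
print(newest.archive_url)   # archive_url is a property annotated to return str
print(newest.timestamp())   # timestamp() -> datetime; datetime.max means no snapshot

Annotating near() with the quoted forward reference "WaybackMachineAvailabilityAPI" is what lets chained calls like api.newest().archive_url type-check.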
waybackpy/cdx_api.py

@@ -1,3 +1,5 @@
+from typing import Dict, Generator, List, Optional, cast
+
 from .cdx_snapshot import CDXSnapshot
 from .cdx_utils import (
     check_collapses,
@@ -11,43 +13,48 @@ from .exceptions import WaybackError
 from .utils import DEFAULT_USER_AGENT
 
 
-class WaybackMachineCDXServerAPI:
+class WaybackMachineCDXServerAPI(object):
     """
     Class that interfaces the CDX server API of the Wayback Machine.
     """
 
+    # start_timestamp: from, can not use from as it's a keyword
+    # end_timestamp: to, not using to as can not use from
     def __init__(
         self,
-        url,
-        user_agent=DEFAULT_USER_AGENT,
-        start_timestamp=None,  # from, can not use from as it's a keyword
-        end_timestamp=None,  # to, not using to as can not use from
-        filters=[],
-        match_type=None,
-        gzip=None,
-        collapses=[],
-        limit=None,
-        max_tries=3,
-    ):
+        url: str,
+        user_agent: str = DEFAULT_USER_AGENT,
+        start_timestamp: Optional[str] = None,
+        end_timestamp: Optional[str] = None,
+        filters: List[str] = [],
+        match_type: Optional[str] = None,
+        gzip: Optional[str] = None,
+        collapses: List[str] = [],
+        limit: Optional[str] = None,
+        max_tries: int = 3,
+    ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.user_agent = user_agent
-        self.start_timestamp = str(start_timestamp) if start_timestamp else None
-        self.end_timestamp = str(end_timestamp) if end_timestamp else None
+        self.start_timestamp = (
+            str(start_timestamp) if start_timestamp is not None else None
+        )
+        self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
         self.filters = filters
         check_filters(self.filters)
-        self.match_type = str(match_type).strip() if match_type else None
+        self.match_type = str(match_type).strip() if match_type is not None else None
         check_match_type(self.match_type, self.url)
-        self.gzip = gzip if gzip else True
+        self.gzip = gzip
         self.collapses = collapses
         check_collapses(self.collapses)
-        self.limit = limit if limit else 5000
+        self.limit = limit if limit is not None else 5000
         self.max_tries = max_tries
-        self.last_api_request_url = None
+        self.last_api_request_url: Optional[str] = None
         self.use_page = False
         self.endpoint = "https://web.archive.org/cdx/search/cdx"
 
-    def cdx_api_manager(self, payload, headers, use_page=False):
+    def cdx_api_manager(
+        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
+    ) -> Generator[str, None, None]:
         total_pages = get_total_pages(self.url, self.user_agent)
         # If we only have two or fewer pages of archives then we care for more accuracy
         # pagination API is lagged sometimes
@@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI:
 
             url = full_url(self.endpoint, params=payload)
             res = get_response(url, headers=headers)
+            if isinstance(res, Exception):
+                raise res
 
             self.last_api_request_url = url
             text = res.text
@@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI:
 
                 yield text
         else:
-
             payload["showResumeKey"] = "true"
             payload["limit"] = str(self.limit)
             resumeKey = None
-
             more = True
             while more:
-
                 if resumeKey:
                     payload["resumeKey"] = resumeKey
 
                 url = full_url(self.endpoint, params=payload)
                 res = get_response(url, headers=headers)
+                if isinstance(res, Exception):
+                    raise res
 
                 self.last_api_request_url = url
 
@@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI:
 
             yield text
 
-    def add_payload(self, payload):
+    def add_payload(self, payload: Dict[str, str]) -> None:
         if self.start_timestamp:
             payload["from"] = self.start_timestamp
 
         if self.end_timestamp:
             payload["to"] = self.end_timestamp
 
-        if self.gzip is not True:
+        if self.gzip is None:
             payload["gzip"] = "false"
 
         if self.match_type:
@@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI:
         # Don't need to return anything as it's dictionary.
         payload["url"] = self.url
 
-    def snapshots(self):
-        payload = {}
+    def snapshots(self) -> Generator[CDXSnapshot, None, None]:
+        payload: Dict[str, str] = {}
         headers = {"User-Agent": self.user_agent}
 
         self.add_payload(payload)
@@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI:
             if len(snapshot) < 46:  # 14 + 32 (timestamp+digest)
                 continue
 
-            properties = {
+            properties: Dict[str, Optional[str]] = {
                 "urlkey": None,
                 "timestamp": None,
                 "original": None,
@@ -190,4 +198,4 @@ class WaybackMachineCDXServerAPI:
                 properties["length"],
             ) = prop_values
 
-            yield CDXSnapshot(properties)
+            yield CDXSnapshot(cast(Dict[str, str], properties))
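A sketch of how the annotated CDX server API is driven; the parameter values are placeholders, and snapshots() is the generator annotated above:

from waybackpy.cdx_api import WaybackMachineCDXServerAPI

cdx = WaybackMachineCDXServerAPI(
    "https://example.com",
    start_timestamp="20200101",   # sent as the CDX "from" parameter
    end_timestamp="20211231",     # sent as the CDX "to" parameter
    filters=["statuscode:200"],
)
for snapshot in cdx.snapshots():  # Generator[CDXSnapshot, None, None]
    print(snapshot.archive_url)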
waybackpy/cdx_snapshot.py

@@ -1,7 +1,8 @@
 from datetime import datetime
+from typing import Dict
 
 
-class CDXSnapshot:
+class CDXSnapshot(object):
     """
     Class for the CDX snapshot lines returned by the CDX API,
     Each valid line of the CDX API is cast to a CDXSnapshot object
@@ -10,7 +11,7 @@ class CDXSnapshot:
     of the CDXSnapshot.
     """
 
-    def __init__(self, properties):
+    def __init__(self, properties: Dict[str, str]) -> None:
         self.urlkey = properties["urlkey"]
         self.timestamp = properties["timestamp"]
         self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
@@ -23,7 +24,7 @@ class CDXSnapshot:
             "https://web.archive.org/web/" + self.timestamp + "/" + self.original
         )
 
-    def __str__(self):
+    def __str__(self) -> str:
         return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
             urlkey=self.urlkey,
             timestamp=self.timestamp,
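CDXSnapshot now declares that it is built from a Dict[str, str]; a hypothetical seven-field record shows the shape it expects:

from waybackpy.cdx_snapshot import CDXSnapshot

properties = {
    "urlkey": "com,example)/",
    "timestamp": "20210102030405",
    "original": "https://example.com/",
    "mimetype": "text/html",
    "statuscode": "200",
    "digest": "A1B2C3D4E5F6G7H8I9J0K1L2M3N4O5P6",
    "length": "1043",
}
snapshot = CDXSnapshot(properties)
print(snapshot.datetime_timestamp)  # parsed with "%Y%m%d%H%M%S"
print(snapshot)                     # the seven fields joined as one CDX line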
waybackpy/cdx_utils.py

@@ -1,23 +1,30 @@
 import re
+from typing import Any, Dict, List, Optional, Union
+from urllib.parse import quote
 
 import requests
 from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+
+# from urllib3.util.retry import Retry
+from requests.packages.urllib3.util.retry import Retry
 
 from .exceptions import WaybackError
 from .utils import DEFAULT_USER_AGENT
 
 
-def get_total_pages(url, user_agent=DEFAULT_USER_AGENT):
+def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
     endpoint = "https://web.archive.org/cdx/search/cdx?"
     payload = {"showNumPages": "true", "url": str(url)}
     headers = {"User-Agent": user_agent}
     request_url = full_url(endpoint, params=payload)
     response = get_response(request_url, headers=headers)
-    return int(response.text.strip())
+    if isinstance(response, requests.Response):
+        return int(response.text.strip())
+    else:
+        raise response
 
 
-def full_url(endpoint, params):
+def full_url(endpoint: str, params: Dict[str, Any]) -> str:
     if not params:
         return endpoint
     full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
@@ -26,27 +33,25 @@ def full_url(endpoint, params):
         key = "collapse" if key.startswith("collapse") else key
         amp = "" if full_url.endswith("?") else "&"
         full_url = (
-            full_url
-            + amp
-            + "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
+            full_url + amp + "{key}={val}".format(key=key, val=quote(str(val), safe=""))
         )
     return full_url
 
 
 def get_response(
-    url,
-    headers=None,
-    retries=5,
-    backoff_factor=0.5,
-    no_raise_on_redirects=False,
-):
+    url: str,
+    headers: Optional[Dict[str, str]] = None,
+    retries: int = 5,
+    backoff_factor: float = 0.5,
+    # no_raise_on_redirects=False,
+) -> Union[requests.Response, Exception]:
     session = requests.Session()
-    retries = Retry(
+    retries_ = Retry(
         total=retries,
         backoff_factor=backoff_factor,
        status_forcelist=[500, 502, 503, 504],
    )
-    session.mount("https://", HTTPAdapter(max_retries=retries))
+    session.mount("https://", HTTPAdapter(max_retries=retries_))
 
     try:
         response = session.get(url, headers=headers)
@@ -62,23 +67,18 @@ def get_response(
     raise exc
 
 
-def check_filters(filters):
+def check_filters(filters: List[str]) -> None:
     if not isinstance(filters, list):
         raise WaybackError("filters must be a list.")
 
     # [!]field:regex
     for _filter in filters:
-        try:
-
-            match = re.search(
-                r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
-                _filter,
-            )
-
-            match.group(1)
-            match.group(2)
-
-        except Exception:
+        match = re.search(
+            r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
+            _filter,
+        )
+
+        if match is None or len(match.groups()) != 2:
 
             exc_message = (
                 "Filter '{_filter}' is not following the cdx filter syntax.".format(
@@ -88,43 +88,38 @@ def check_filters(filters):
             raise WaybackError(exc_message)
 
 
-def check_collapses(collapses):
+def check_collapses(collapses: List[str]) -> bool:
     if not isinstance(collapses, list):
         raise WaybackError("collapses must be a list.")
-
-    if len(collapses) == 0:
-        return
+    elif len(collapses) == 0:
+        return True
 
     for collapse in collapses:
-        try:
-            match = re.search(
-                r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
-                collapse,
-            )
-            match.group(1)
-            if 2 == len(match.groups()):
-                match.group(2)
-        except Exception:
+        match = re.search(
+            r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
+            collapse,
+        )
+        if match is None or len(match.groups()) != 2:
             exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
                 collapse=collapse
            )
            raise WaybackError(exc_message)
+    else:
+        return True
 
 
-def check_match_type(match_type, url):
+def check_match_type(match_type: Optional[str], url: str) -> bool:
+    legal_match_type = ["exact", "prefix", "host", "domain"]
     if not match_type:
-        return
-
-    if "*" in url:
+        return True
+    elif "*" in url:
         raise WaybackError(
             "Can not use wildcard in the URL along with the match_type arguments."
         )
-
-    legal_match_type = ["exact", "prefix", "host", "domain"]
-
-    if match_type not in legal_match_type:
+    elif match_type not in legal_match_type:
         exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
             match_type=match_type
         )
         raise WaybackError(exc_message)
+    else:
+        return True
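A sketch of the helpers above; the params dict is a placeholder, and the collapse0/collapse1 key trick mirrors the key.startswith("collapse") folding in full_url:

from waybackpy.cdx_utils import check_filters, check_match_type, full_url

url = full_url(
    "https://web.archive.org/cdx/search/cdx",
    params={"url": "example.com", "collapse0": "urlkey", "collapse1": "timestamp:4"},
)
print(url)  # both collapse0 and collapse1 are emitted as "collapse", values quoted

check_filters(["statuscode:200", "!mimetype:text/html"])  # WaybackError on bad syntax
check_match_type("prefix", "example.com/path")            # True when the type is legal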
waybackpy/cli.py

@@ -3,6 +3,7 @@ import os
 import random
 import re
 import string
+from typing import Generator, List, Optional
 
 import click
 import requests
@@ -163,34 +164,34 @@ from .wrapper import Url
     + "will be printed.",
 )
 def main(
-    url,
-    user_agent,
-    version,
-    license,
-    newest,
-    oldest,
-    json,
-    near,
-    year,
-    month,
-    day,
-    hour,
-    minute,
-    save,
-    headers,
-    known_urls,
-    subdomain,
-    file,
-    cdx,
-    start_timestamp,
-    end_timestamp,
-    filter,
-    match_type,
-    gzip,
-    collapse,
-    limit,
-    cdx_print,
-):
+    url: Optional[str],
+    user_agent: str,
+    version: bool,
+    license: bool,
+    newest: bool,
+    oldest: bool,
+    json: bool,
+    near: bool,
+    year: Optional[int],
+    month: Optional[int],
+    day: Optional[int],
+    hour: Optional[int],
+    minute: Optional[int],
+    save: bool,
+    headers: bool,
+    known_urls: bool,
+    subdomain: bool,
+    file: bool,
+    cdx: bool,
+    start_timestamp: Optional[str],
+    end_timestamp: Optional[str],
+    filter: List[str],
+    match_type: Optional[str],
+    gzip: Optional[str],
+    collapse: List[str],
+    limit: Optional[str],
+    cdx_print: List[str],
+) -> None:
     """\b
      _                    _
     | |                  | |
@@ -244,7 +245,9 @@ def main(
         )
         return
 
-    def echo_availability_api(availability_api_instance):
+    def echo_availability_api(
+        availability_api_instance: WaybackMachineAvailabilityAPI,
+    ) -> None:
         click.echo("Archive URL:")
         if not availability_api_instance.archive_url:
             archive_url = (
@@ -295,13 +298,14 @@ def main(
         click.echo(save_api.headers)
         return
 
-    def save_urls_on_file(url_gen):
+    def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
         domain = None
         sys_random = random.SystemRandom()
         uid = "".join(
             sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
         )
         url_count = 0
+        file_name = None
 
         for url in url_gen:
             url_count += 1
@@ -310,7 +314,7 @@ def main(
 
                 domain = "domain-unknown"
 
-            if match:
+            if match is not None:
                 domain = match.group(1)
 
             file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
@@ -323,7 +327,7 @@ def main(
 
             click.echo(url)
 
-    if url_count > 0:
+    if url_count > 0 or file_name is not None:
         click.echo(
             "\n\n'{file_name}' saved in current working directory".format(
                 file_name=file_name
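The CLI entry point can be exercised in-process with click's CliRunner; the flag names below are assumptions inferred from main()'s parameters, not shown in this diff:

from click.testing import CliRunner
from waybackpy.cli import main

runner = CliRunner()
result = runner.invoke(main, ["--url", "https://example.com", "--oldest"])  # flags assumed
print(result.output)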
waybackpy/exceptions.py

@@ -14,6 +14,8 @@ class WaybackError(Exception):
     All other exceptions are inherited from this class.
     """
 
+    pass
+
 
 class RedirectSaveError(WaybackError):
     """
@@ -21,32 +23,44 @@ class RedirectSaveError(WaybackError):
     redirect URL is archived but not the original URL.
     """
 
+    pass
+
 
 class URLError(Exception):
     """
     Raised when malformed URLs are passed as arguments.
     """
 
+    pass
+
 
 class MaximumRetriesExceeded(WaybackError):
     """
     MaximumRetriesExceeded
     """
 
+    pass
+
 
 class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
     """
     MaximumSaveRetriesExceeded
     """
 
+    pass
+
 
 class ArchiveNotInAvailabilityAPIResponse(WaybackError):
     """
     Could not parse the archive in the JSON response of the availability API.
     """
 
+    pass
+
 
 class InvalidJSONInAvailabilityAPIResponse(WaybackError):
     """
     availability api returned invalid JSON
     """
 
+    pass
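The hierarchy above means callers can catch narrowly or broadly; a small sketch:

from waybackpy.exceptions import MaximumSaveRetriesExceeded, WaybackError

try:
    ...  # e.g. a save or CDX call
except MaximumSaveRetriesExceeded as exc:
    # subclass of MaximumRetriesExceeded, which subclasses WaybackError
    print("gave up saving:", exc)
except WaybackError as exc:
    print("some other Wayback Machine error:", exc)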
waybackpy/save_api.py

@@ -1,38 +1,42 @@
 import re
 import time
 from datetime import datetime
+from typing import Dict, Optional
 
 import requests
 from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+
+# from urllib3.util.retry import Retry
+from requests.packages.urllib3.util.retry import Retry
 
 from .exceptions import MaximumSaveRetriesExceeded
 from .utils import DEFAULT_USER_AGENT
 
 
-class WaybackMachineSaveAPI:
+class WaybackMachineSaveAPI(object):
     """
     WaybackMachineSaveAPI class provides an interface for saving URLs on the
     Wayback Machine.
     """
 
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
+    def __init__(
+        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
+    ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.request_url = "https://web.archive.org/save/" + self.url
         self.user_agent = user_agent
-        self.request_headers = {"User-Agent": self.user_agent}
+        self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
         if max_tries < 1:
             raise ValueError("max_tries should be positive")
         self.max_tries = max_tries
         self.total_save_retries = 5
         self.backoff_factor = 0.5
         self.status_forcelist = [500, 502, 503, 504]
-        self._archive_url = None
+        self._archive_url: Optional[str] = None
         self.instance_birth_time = datetime.utcnow()
 
     @property
-    def archive_url(self):
+    def archive_url(self) -> str:
         """
         Returns the archive URL if it is already cached by _archive_url,
         else invokes the save method to save the archive which returns the
@@ -44,7 +48,7 @@ class WaybackMachineSaveAPI:
         else:
             return self.save()
 
-    def get_save_request_headers(self):
+    def get_save_request_headers(self) -> None:
         """
         Creates a session and tries 'retries' number of times to
         retrieve the archive.
@@ -61,21 +65,21 @@ class WaybackMachineSaveAPI:
         the response URL yourself in the browser.
         """
         session = requests.Session()
-        retries = Retry(
+        retries_ = Retry(
             total=self.total_save_retries,
             backoff_factor=self.backoff_factor,
             status_forcelist=self.status_forcelist,
         )
-        session.mount("https://", HTTPAdapter(max_retries=retries))
+        session.mount("https://", HTTPAdapter(max_retries=retries_))
         self.response = session.get(self.request_url, headers=self.request_headers)
-        self.headers = (
-            self.response.headers
-        )  # <class 'requests.structures.CaseInsensitiveDict'>
+        # requests.response.headers is requests.structures.CaseInsensitiveDict
+        self.headers = self.response.headers
+        self.headers_str = str(self.headers)
         self.status_code = self.response.status_code
         self.response_url = self.response.url
         session.close()
 
-    def archive_url_parser(self):
+    def archive_url_parser(self) -> Optional[str]:
         """
         Three regexen (like oxen?) are used to search for the
         archive URL in the headers and finally look in the response URL
@@ -83,18 +87,18 @@ class WaybackMachineSaveAPI:
         """
 
         regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
-        match = re.search(regex1, str(self.headers))
+        match = re.search(regex1, self.headers_str)
         if match:
             return "https://web.archive.org" + match.group(1)
 
         regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
-        match = re.search(regex2, str(self.headers))
-        if match:
+        match = re.search(regex2, self.headers_str)
+        if match is not None and len(match.groups()) == 1:
             return "https://" + match.group(1)
 
         regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
-        match = re.search(regex3, str(self.headers))
-        if match:
+        match = re.search(regex3, self.headers_str)
+        if match is not None and len(match.groups()) == 1:
             return "https" + match.group(1)
 
         if self.response_url:
@@ -105,7 +109,9 @@ class WaybackMachineSaveAPI:
         if match:
             return "https://" + match.group(0)
 
-    def sleep(self, tries):
+        return None
+
+    def sleep(self, tries: int) -> None:
         """
         Ensure that we wait some time before successive retries so that we
         don't waste the retries before the page is even captured by the Wayback
@@ -120,7 +126,7 @@ class WaybackMachineSaveAPI:
             sleep_seconds = 10
         time.sleep(sleep_seconds)
 
-    def timestamp(self):
+    def timestamp(self) -> datetime:
         """
         Read the timestamp off the archive URL and convert the Wayback Machine
         timestamp to a datetime object.
@@ -133,9 +139,10 @@ class WaybackMachineSaveAPI:
         didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
         a cached archive if the last archive was captured before the last 45 minutes.
         """
-        m = re.search(
-            r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
-        )
+        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
+        m = re.search(regex, str(self._archive_url))
+        if m is None or len(m.groups()) != 1:
+            raise ValueError("Could not find get timestamp")
         string_timestamp = m.group(1)
         timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
 
@@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
 
         return timestamp
 
-    def save(self):
+    def save(self) -> str:
         """
         Calls the SavePageNow API of the Wayback Machine with required parameters
         and headers to save the URL.
@@ -169,7 +176,7 @@ class WaybackMachineSaveAPI:
             self.get_save_request_headers()
             self.saved_archive = self.archive_url_parser()
 
-            if self.saved_archive is not None:
+            if isinstance(self.saved_archive, str):
                 self._archive_url = self.saved_archive
                 self.timestamp()
                 return self.saved_archive
@@ -179,5 +186,5 @@ class WaybackMachineSaveAPI:
         raise MaximumSaveRetriesExceeded(
             "Tried %s times but failed to save and retrieve the" % str(tries)
             + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
-            % (self.url, self.response_url, str(self.headers)),
+            % (self.url, self.response_url, self.headers_str),
         )
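A minimal sketch of the annotated save API (the URL is a placeholder):

from waybackpy.save_api import WaybackMachineSaveAPI

save_api = WaybackMachineSaveAPI("https://example.com", max_tries=8)
print(save_api.archive_url)  # property: cached _archive_url, or save() -> str
print(save_api.timestamp())  # datetime parsed from the 14-digit Wayback timestamp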
waybackpy/utils.py

@@ -2,22 +2,43 @@ import requests
 
 from . import __version__
 
-DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
+DEFAULT_USER_AGENT: str = (
+    "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
+)
 
 
-def latest_version_pypi(package_name, user_agent=DEFAULT_USER_AGENT):
+def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
     request_url = "https://pypi.org/pypi/" + package_name + "/json"
     headers = {"User-Agent": user_agent}
     response = requests.get(request_url, headers=headers)
     data = response.json()
-    return data["info"]["version"]
+    if (
+        data is not None
+        and "info" in data
+        and data["info"] is not None
+        and "version" in data["info"]
+        and data["info"]["version"] is not None
+    ):
+        return str(data["info"]["version"])
+    else:
+        raise ValueError("Could not get latest pypi version")
 
 
-def latest_version_github(package_name, user_agent=DEFAULT_USER_AGENT):
+def latest_version_github(
+    package_name: str, user_agent: str = DEFAULT_USER_AGENT
+) -> str:
     request_url = (
         "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
     )
     headers = {"User-Agent": user_agent}
     response = requests.get(request_url, headers=headers)
     data = response.json()
-    return data[0]["tag_name"]
+    if (
+        data is not None
+        and len(data) > 0
+        and data[0] is not None
+        and "tag_name" in data[0]
+    ):
+        return str(data[0]["tag_name"])
+    else:
+        raise ValueError("Could not get latest github version")
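Both helpers now promise a str and raise ValueError instead of returning whatever the JSON happened to contain; for example:

from waybackpy.utils import DEFAULT_USER_AGENT, latest_version_pypi

print(DEFAULT_USER_AGENT)                # "waybackpy <version> - https://github.com/akamhy/waybackpy"
print(latest_version_pypi("waybackpy"))  # str, or ValueError if the JSON is malformed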
waybackpy/wrapper.py

@@ -1,4 +1,5 @@
 from datetime import datetime, timedelta
+from typing import Generator, Optional
 
 from .availability_api import WaybackMachineAvailabilityAPI
 from .cdx_api import WaybackMachineCDXServerAPI
@@ -19,35 +20,37 @@ the older interface code.
 """
 
 
-class Url:
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
+class Url(object):
+    def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
         self.url = url
         self.user_agent = str(user_agent)
-        self.archive_url = None
-        self.timestamp = None
+        self.archive_url: Optional[str] = None
+        self.timestamp: Optional[datetime] = None
         self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
             self.url, user_agent=self.user_agent
         )
 
-    def __str__(self):
+    def __str__(self) -> str:
         if not self.archive_url:
             self.newest()
-        return self.archive_url
+        return str(self.archive_url)
 
-    def __len__(self):
+    def __len__(self) -> int:
         td_max = timedelta(
             days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
         )
 
-        if not self.timestamp:
+        if not isinstance(self.timestamp, datetime):
            self.oldest()
 
-        if self.timestamp == datetime.max:
+        if not isinstance(self.timestamp, datetime):
+            raise TypeError("timestamp must be a datetime")
+        elif self.timestamp == datetime.max:
             return td_max.days
 
         else:
             return (datetime.utcnow() - self.timestamp).days
 
-    def save(self):
+    def save(self) -> "Url":
         self.wayback_machine_save_api = WaybackMachineSaveAPI(
             self.url, user_agent=self.user_agent
         )
@@ -58,13 +61,13 @@ class Url:
 
     def near(
         self,
-        year=None,
-        month=None,
-        day=None,
-        hour=None,
-        minute=None,
-        unix_timestamp=None,
-    ):
+        year: Optional[int] = None,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        minute: Optional[int] = None,
+        unix_timestamp: Optional[int] = None,
+    ) -> "Url":
 
         self.wayback_machine_availability_api.near(
             year=year,
@@ -77,22 +80,24 @@ class Url:
         self.set_availability_api_attrs()
         return self
 
-    def oldest(self):
+    def oldest(self) -> "Url":
         self.wayback_machine_availability_api.oldest()
         self.set_availability_api_attrs()
         return self
 
-    def newest(self):
+    def newest(self) -> "Url":
         self.wayback_machine_availability_api.newest()
         self.set_availability_api_attrs()
         return self
 
-    def set_availability_api_attrs(self):
+    def set_availability_api_attrs(self) -> None:
         self.archive_url = self.wayback_machine_availability_api.archive_url
         self.JSON = self.wayback_machine_availability_api.JSON
         self.timestamp = self.wayback_machine_availability_api.timestamp()
 
-    def total_archives(self, start_timestamp=None, end_timestamp=None):
+    def total_archives(
+        self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
+    ) -> int:
         cdx = WaybackMachineCDXServerAPI(
             self.url,
             user_agent=self.user_agent,
@@ -107,12 +112,12 @@ class Url:
 
     def known_urls(
         self,
-        subdomain=False,
-        host=False,
-        start_timestamp=None,
-        end_timestamp=None,
-        match_type="prefix",
-    ):
+        subdomain: bool = False,
+        host: bool = False,
+        start_timestamp: Optional[str] = None,
+        end_timestamp: Optional[str] = None,
+        match_type: str = "prefix",
+    ) -> Generator[str, None, None]:
         if subdomain:
             match_type = "domain"
         if host:
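Finally, a sketch of the legacy Url wrapper with its new annotations (the URL is a placeholder):

from waybackpy.wrapper import Url

url = Url("https://example.com")
print(url.newest())  # __str__ now always returns str(self.archive_url)
print(len(url))      # days since the oldest archive; __len__ guards the timestamp type
for archived_url in url.known_urls(match_type="prefix"):
    print(archived_url)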