Added docstrings, some static type hints, and linting. (#141)

Squashed commit messages:
* added docstrings, added some static type hints and also lint.
* added docstrings and changed some internal variable names for more clarity.
* make flake8 happy
* add descriptive docstrings and type hints in waybackpy/cdx_snapshot.py
* remove useless code and add docstrings and also lint using pylint.
* remove unwarranted test
* added docstrings, lint using pylint, and add a raise on 509 status code
* added docstrings and lint with pylint
* lint
* add docstrings and lint
* add docstrings and lint
parent 004ff26196
commit 97f8b96411
@@ -53,10 +53,6 @@ def test_get_response() -> None:
     response = get_response(url, headers=headers)
     assert not isinstance(response, Exception) and response.status_code == 200
 
-    url = "http/wwhfhfvhvjhmom"
-    with pytest.raises(WaybackError):
-        get_response(url, headers=headers)
-
 
 def test_check_filters() -> None:
     filters: List[str] = []
@@ -1,9 +1,32 @@
+"""
+This module interfaces the Wayback Machine's availability API.
+
+The interface could be useful for looking up archives and finding archives
+that are close to a specific date and time.
+
+It has a class called WaybackMachineAvailabilityAPI, and the class has
+methods such as:
+
+near() for looking up archives close to a specific date and time.
+
+oldest() for retrieving the first archive URL of the webpage.
+
+newest() for retrieving the latest archive of an URL.
+
+The Wayback Machine Availability response should be a valid JSON and
+if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised.
+
+If the Availability API returned valid JSON but archive URL could not be found
+it it then ArchiveNotInAvailabilityAPIResponse is raised.
+"""
+
 import json
 import time
 from datetime import datetime
 from typing import Any, Dict, Optional
 
 import requests
+from requests.models import Response
 
 from .exceptions import (
     ArchiveNotInAvailabilityAPIResponse,
@@ -22,38 +45,43 @@ class WaybackMachineAvailabilityAPI(object):
     def __init__(
         self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
     ) -> None:
 
         self.url = str(url).strip().replace(" ", "%20")
         self.user_agent = user_agent
         self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
-        self.payload = {"url": self.url}
-        self.endpoint = "https://archive.org/wayback/available"
-        self.max_tries = max_tries
-        self.tries = 0
-        self.last_api_call_unix_time = int(time.time())
-        self.api_call_time_gap = 5
+        self.payload: Dict[str, str] = {"url": self.url}
+        self.endpoint: str = "https://archive.org/wayback/available"
+        self.max_tries: int = max_tries
+        self.tries: int = 0
+        self.last_api_call_unix_time: int = int(time.time())
+        self.api_call_time_gap: int = 5
         self.JSON: Optional[ResponseJSON] = None
 
     @staticmethod
     def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
         """
-        Converts Unix time to wayback Machine timestamp.
+        Converts Unix time to wayback Machine timestamp and the Wayback Machine
+        timestamp format is yyyyMMddhhmmss.
         """
 
         return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
 
     def __repr__(self) -> str:
         """
         Same as string representation, just return the archive URL as a string.
         """
 
         return str(self)
 
     def __str__(self) -> str:
         """
-        String representation of the class. If atleast one API call was successfully
-        made then return the archive URL as a string. Else returns None.
+        String representation of the class. If atleast one API
+        call was successfully made then return the archive URL
+        as a string. Else returns "".
         """
 
-        # String must not return anything other than a string object
-        # So, if some asks for string repr before making the API requests
+        # String should not return anything other than a string object
+        # So, if a string repr is asked for before making any API requests
         # just return ""
         if not self.JSON:
             return ""
@@ -62,26 +90,36 @@ class WaybackMachineAvailabilityAPI(object):
 
     def json(self) -> Optional[ResponseJSON]:
         """
-        Makes the API call to the availability API can set the JSON response
-        to the JSON attribute of the instance and also returns the JSON attribute.
+        Makes the API call to the availability API and set the JSON response
+        to the JSON attribute of the instance and also returns the JSON
+        attribute.
+
+        time_diff and sleep_time makes sure that you are not making too many
+        requests in a short interval of item, making too many requests is bad
+        as Wayback Machine may reject them above a certain threshold.
+
+        The end-user can change the api_call_time_gap attribute of the instance
+        to increase or decrease the default time gap between two successive API
+        calls, but it is not recommended to increase it.
         """
 
         time_diff = int(time.time()) - self.last_api_call_unix_time
         sleep_time = self.api_call_time_gap - time_diff
 
         if sleep_time > 0:
             time.sleep(sleep_time)
 
-        self.response = requests.get(
+        self.response: Response = requests.get(
             self.endpoint, params=self.payload, headers=self.headers
         )
         self.last_api_call_unix_time = int(time.time())
         self.tries += 1
         try:
             self.JSON = self.response.json()
-        except json.decoder.JSONDecodeError:
+        except json.decoder.JSONDecodeError as json_decode_error:
             raise InvalidJSONInAvailabilityAPIResponse(
                 f"Response data:\n{self.response.text}"
-            )
+            ) from json_decode_error
 
         return self.JSON
@@ -91,15 +129,17 @@ class WaybackMachineAvailabilityAPI(object):
         If JSON attribute of the instance is None it implies that the either
         the the last API call failed or one was never made.
 
-        If not JSON or if JSON but no timestamp in the JSON response then returns
-        the maximum value for datetime object that is possible.
+        If not JSON or if JSON but no timestamp in the JSON response then
+        returns the maximum value for datetime object that is possible.
 
-        If you get an URL as a response form the availability API it is guaranteed
-        that you can get the datetime object from the timestamp.
+        If you get an URL as a response form the availability API it is
+        guaranteed that you can get the datetime object from the timestamp.
         """
 
         if self.JSON is None or "archived_snapshots" not in self.JSON:
             return datetime.max
-        elif (
+
+        if (
             self.JSON is not None
             and "archived_snapshots" in self.JSON
             and self.JSON["archived_snapshots"] is not None
@@ -110,21 +150,23 @@ class WaybackMachineAvailabilityAPI(object):
             return datetime.strptime(
                 self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
             )
-        else:
-            raise ValueError("Could not get timestamp from result")
+
+        raise ValueError("Could not get timestamp from result")
 
     @property
     def archive_url(self) -> str:
         """
-        Reads the the JSON response data and tries to get the timestamp and returns
-        the timestamp if found else returns None.
+        Reads the the JSON response data and returns
+        the timestamp if found and if not found raises
+        ArchiveNotInAvailabilityAPIResponse.
         """
 
         archive_url = ""
         data = self.JSON
 
-        # If the user didn't used oldest, newest or near but tries to access the
-        # archive_url attribute then, we assume they are fine with any archive
-        # and invoke the oldest archive function.
+        # If the user didn't invoke oldest, newest or near but tries to access the
+        # archive_url attribute then assume they are fine with any archive
+        # and invoke the oldest method.
         if not data:
             self.oldest()
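For reference, the JSON that timestamp() and archive_url navigate usually looks like the sketch below. This is an illustration, not part of the commit; the field values are made up and only the archived_snapshots/closest/timestamp path is taken from the code above.

# Illustrative sketch (not part of the commit): approximate availability API
# response shape that timestamp() parses with datetime.strptime.
from datetime import datetime

example_json = {
    "url": "https://example.com/",
    "archived_snapshots": {
        "closest": {
            "available": True,
            "status": "200",
            "timestamp": "20220101123045",  # yyyyMMddhhmmss
            "url": "https://web.archive.org/web/20220101123045/https://example.com/",
        }
    },
}

closest = example_json["archived_snapshots"]["closest"]
print(datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S"))  # 2022-01-01 12:30:45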
@@ -137,7 +179,7 @@ class WaybackMachineAvailabilityAPI(object):
             self.json()  # It makes a new API call
             data = self.JSON  # json() updated the value of JSON attribute
 
-        # Even if after we exhausted teh max_tries, then we give up and
+        # If we exhausted the max_tries, then we give up and
         # raise exception.
 
         if not data or not data["archived_snapshots"]:
@@ -160,6 +202,7 @@ class WaybackMachineAvailabilityAPI(object):
         Prepends zero before the year, month, day, hour and minute so that they
         are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
         """
+
         return "".join(
             str(kwargs[key]).zfill(2)
             for key in ["year", "month", "day", "hour", "minute"]
@@ -167,18 +210,21 @@ class WaybackMachineAvailabilityAPI(object):
 
     def oldest(self) -> "WaybackMachineAvailabilityAPI":
         """
-        Passing the year 1994 should return the oldest archive because
-        wayback machine was started in May, 1996 and there should be no archive
-        before the year 1994.
+        Passes the date 1994-01-01 to near which should return the oldest archive
+        because Wayback Machine was started in May, 1996 and it is assumed that
+        there would be no archive older than January 1, 1994.
         """
-        return self.near(year=1994)
+
+        return self.near(year=1994, month=1, day=1)
 
     def newest(self) -> "WaybackMachineAvailabilityAPI":
         """
-        Passing the current UNIX time should be sufficient to get the newest
-        archive considering the API request-response time delay and also the
-        database lags on Wayback machine.
+        Passes the current UNIX time to near() for retrieving the newest archive
+        from the availability API.
+
+        We assume that wayback machine can not archive the future of a webpage.
         """
+
         return self.near(unix_timestamp=int(time.time()))
 
     def near(
@@ -191,16 +237,18 @@ class WaybackMachineAvailabilityAPI(object):
         unix_timestamp: Optional[int] = None,
     ) -> "WaybackMachineAvailabilityAPI":
         """
-        The main method for this Class, oldest and newest methods are dependent on this
-        method.
+        The main method for the Class, oldest() and newest() are dependent on it.
 
         It generates the timestamp based on the input either by calling the
         unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
         appropriate arguments for their respective parameters.
 
         Adds the timestamp to the payload dictionary.
 
         And finally invoking the json method to make the API call then returns
         the instance.
         """
 
         if unix_timestamp:
             timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
         else:
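Putting the availability API methods documented in the hunks above together, a minimal usage sketch looks like this. It is not part of the commit; the import path and user agent string are assumptions.

# Minimal usage sketch (not part of the commit), assuming the module is
# importable as waybackpy.availability_api.
from waybackpy.availability_api import WaybackMachineAvailabilityAPI

api = WaybackMachineAvailabilityAPI(
    "https://example.com", user_agent="my-tool/1.0 (contact@example.com)"
)

api.oldest()                         # near(year=1994, month=1, day=1) under the hood
print(api.archive_url)               # str(api) returns the same archive URL

api.newest()                         # near(unix_timestamp=int(time.time()))
print(api.timestamp())               # datetime parsed from the Wayback timestamp

api.near(year=2015, month=6, day=1)  # archive closest to 2015-06-01
print(api.JSON)                      # raw availability API response set by json()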
@@ -1,3 +1,14 @@
+"""
+This module interfaces the Wayback Machine's CDX server API.
+
+The module has WaybackMachineCDXServerAPI which should be used by the users of
+this module to consume the CDX server API.
+
+WaybackMachineCDXServerAPI has a snapshot method that yields the snapshots, and
+the snapshots are yielded as instances of the CDXSnapshot class.
+"""
+
+
 from typing import Dict, Generator, List, Optional, cast
 
 from .cdx_snapshot import CDXSnapshot
@@ -16,6 +27,11 @@ from .utils import DEFAULT_USER_AGENT
 class WaybackMachineCDXServerAPI(object):
     """
     Class that interfaces the CDX server API of the Wayback Machine.
+
+    snapshot() returns a generator that can be iterated upon by the end-user,
+    the generator returns the snapshots/entries as instance of CDXSnapshot to
+    make the usage easy, just use '.' to get any attribute as the attributes are
+    accessible via a dot ".".
     """
 
     # start_timestamp: from, can not use from as it's a keyword
@@ -53,9 +69,35 @@ class WaybackMachineCDXServerAPI(object):
     def cdx_api_manager(
         self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
     ) -> Generator[str, None, None]:
+        """
+        Manages the API calls for the instance, it automatically selects the best
+        parameters by looking as the query of the end-user. For bigger queries
+        automatically use the CDX pagination API and for smaller queries use the
+        normal API.
+
+        CDX Server API is a complex API and to make it easy for the end user to
+        consume it the CDX manager(this method) handles the selection of the
+        API output, whether to use the pagination API or not.
+
+        For doing large/bulk queries, the use of the Pagination API is
+        recommended by the Wayback Machine authors. And it determines if the
+        query would be large or not by using the showNumPages=true parameter,
+        this tells the number of pages of CDX DATA that the pagination API
+        will return.
+
+        If the number of page is less than 2 we use the normal non-pagination
+        API as the pagination API is known to lag and for big queries it should
+        not matter but for queries where the number of pages are less this
+        method chooses accuracy over the pagination API.
+        """
+
+        # number of pages that will returned by the pagination API.
+        # get_total_pages adds the showNumPages=true param to pagination API
+        # requests.
+        # This is a special query that will return a single number indicating
+        # the number of pages.
         total_pages = get_total_pages(self.url, self.user_agent)
-        # If we only have two or less pages of archives then we care for more accuracy
-        # pagination API is lagged sometimes
         if use_page is True and total_pages >= 2:
             blank_pages = 0
             for i in range(total_pages):
@@ -78,11 +120,11 @@ class WaybackMachineCDXServerAPI(object):
         else:
             payload["showResumeKey"] = "true"
             payload["limit"] = str(self.limit)
-            resumeKey = None
+            resume_key = None
             more = True
             while more:
-                if resumeKey:
-                    payload["resumeKey"] = resumeKey
+                if resume_key:
+                    payload["resumeKey"] = resume_key
 
                 url = full_url(self.endpoint, params=payload)
                 res = get_response(url, headers=headers)
@@ -102,13 +144,16 @@ class WaybackMachineCDXServerAPI(object):
 
                 if len(second_last_line) == 0:
 
-                    resumeKey = lines[-1].strip()
-                    text = text.replace(resumeKey, "", 1).strip()
+                    resume_key = lines[-1].strip()
+                    text = text.replace(resume_key, "", 1).strip()
                     more = True
 
                 yield text
 
     def add_payload(self, payload: Dict[str, str]) -> None:
+        """
+        Adds the payload to the payload dictionary.
+        """
         if self.start_timestamp:
             payload["from"] = self.start_timestamp
 
@@ -122,17 +167,35 @@ class WaybackMachineCDXServerAPI(object):
             payload["matchType"] = self.match_type
 
         if self.filters and len(self.filters) > 0:
-            for i, f in enumerate(self.filters):
-                payload["filter" + str(i)] = f
+            for i, _filter in enumerate(self.filters):
+                payload["filter" + str(i)] = _filter
 
         if self.collapses and len(self.collapses) > 0:
-            for i, f in enumerate(self.collapses):
-                payload["collapse" + str(i)] = f
+            for i, collapse in enumerate(self.collapses):
+                payload["collapse" + str(i)] = collapse
 
         # Don't need to return anything as it's dictionary.
         payload["url"] = self.url
 
     def snapshots(self) -> Generator[CDXSnapshot, None, None]:
+        """
+        This function yields the CDX data lines as snapshots.
+
+        As it is a generator it exhaustible, the reason that this is
+        a generator and not a list are:
+
+        a) CDX server API can return millions of entries for a query and list
+        is not suitable for such cases.
+
+        b) Preventing memory usage issues, as told before this method may yield
+        millions of records for some queries and your system may not have enough
+        memory for such a big list. Also Remember this if outputing to Jupyter
+        Notebooks.
+
+        The objects yielded by this method are instance of CDXSnapshot class,
+        you can access the attributes of the entries as the attribute of the instance
+        itself.
+        """
         payload: Dict[str, str] = {}
         headers = {"User-Agent": self.user_agent}
 
@@ -144,18 +207,25 @@ class WaybackMachineCDXServerAPI(object):
         if self.collapses != []:
             self.use_page = False
 
-        texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)
+        entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
 
-        for text in texts:
+        for entry in entries:
 
-            if text.isspace() or len(text) <= 1 or not text:
+            if entry.isspace() or len(entry) <= 1 or not entry:
                 continue
 
-            snapshot_list = text.split("\n")
+            # each line is a snapshot aka entry of the CDX server API.
+            # We are able to split the page by lines because it only
+            # splits the lines on a sinlge page and not all the entries
+            # at once, thus there should be no issues of too much memory usage.
+            snapshot_list = entry.split("\n")
 
             for snapshot in snapshot_list:
 
-                if len(snapshot) < 46:  # 14 + 32 (timestamp+digest)
+                # 14 + 32 == 46 ( timestamp + digest ), ignore the invalid entries.
+                # they are invalid if their length is smaller than sum of length
+                # of a standard wayback_timestamp and standard digest of an entry.
+                if len(snapshot) < 46:
                     continue
 
                 properties: Dict[str, Optional[str]] = {
@@ -168,16 +238,16 @@ class WaybackMachineCDXServerAPI(object):
                     "length": None,
                 }
 
-                prop_values = snapshot.split(" ")
+                property_value = snapshot.split(" ")
 
-                prop_values_len = len(prop_values)
-                properties_len = len(properties)
+                total_property_values = len(property_value)
+                warranted_total_property_values = len(properties)
 
-                if prop_values_len != properties_len:
+                if total_property_values != warranted_total_property_values:
                     raise WaybackError(
-                        f"Snapshot returned by Cdx API has {prop_values_len} "
-                        f"properties instead of expected {properties_len} properties.\n"
-                        f"Problematic Snapshot: {snapshot}"
+                        f"Snapshot returned by CDX API has {total_property_values} prop"
+                        f"erties instead of expected {warranted_total_property_values} "
+                        f"properties.\nProblematic Snapshot: {snapshot}"
                     )
 
                 (
@@ -188,6 +258,6 @@ class WaybackMachineCDXServerAPI(object):
                     properties["statuscode"],
                     properties["digest"],
                     properties["length"],
-                ) = prop_values
+                ) = property_value
 
                 yield CDXSnapshot(cast(Dict[str, str], properties))
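A usage sketch for the CDX server API class whose docstrings were added above. This is not part of the commit; the import path is assumed and the constructor keywords are inferred from the payload keys visible in add_payload(), so treat the exact signature as an assumption.

# Minimal usage sketch (not part of the commit), assuming waybackpy.cdx_api
# as the module path and keyword names matching the attributes used above.
from waybackpy.cdx_api import WaybackMachineCDXServerAPI

cdx = WaybackMachineCDXServerAPI(
    "https://example.com",
    user_agent="my-tool/1.0 (contact@example.com)",
    start_timestamp="2020",      # becomes payload["from"]
    filters=["statuscode:200"],  # becomes payload["filter0"], payload["filter1"], ...
    collapses=["urlkey"],        # becomes payload["collapse0"], ...
)

for snapshot in cdx.snapshots():  # generator of CDXSnapshot instances
    print(snapshot.archive_url, snapshot.statuscode, snapshot.mimetype)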
@@ -1,30 +1,83 @@
+"""
+Module that contains the CDXSnapshot class, CDX records are casted
+to CDXSnapshot objects for easier access.
+
+The CDX index format is plain text data. Each line ('record') indicates a
+crawled document. And these lines are casted to CDXSnapshot.
+"""
+
+
 from datetime import datetime
 from typing import Dict
 
 
 class CDXSnapshot(object):
     """
-    Class for the CDX snapshot lines returned by the CDX API,
+    Class for the CDX snapshot lines('record') returned by the CDX API,
     Each valid line of the CDX API is casted to an CDXSnapshot object
-    by the CDX API interface.
+    by the CDX API interface, just use "." to access any attribute of the
+    CDX server API snapshot.
+
     This provides the end-user the ease of using the data as attributes
     of the CDXSnapshot.
+
+    The string representation of the class is identical to the line returned
+    by the CDX server API.
+
+    Besides all the attributes of the CDX server API this class also provides
+    archive_url attribute, yes it is the archive url of the snapshot.
+
+    Attributes of the this class and what they represents and are useful for:
+
+    urlkey: The document captured, expressed as a SURT
+    SURT stands for Sort-friendly URI Reordering Transform, and is a
+    transformation applied to URIs which makes their left-to-right
+    representation better match the natural hierarchy of domain names.
+    A URI <scheme://domain.tld/path?query> has SURT
+    form <scheme://(tld,domain,)/path?query>.
+
+    timestamp: The timestamp of the archive, format is yyyyMMddhhmmss and type
+    is string.
+
+    datetime_timestamp: The timestamp as a datetime object.
+
+    original: The original URL of the archive. If archive_url is
+    https://web.archive.org/web/20220113130051/https://google.com then the
+    original URL is https://google.com
+
+    mimetype: The document’s file type. e.g. text/html
+
+    statuscode: HTTP response code for the document at the time of its crawling
+
+    digest: Base32-encoded SHA-1 checksum of the document for discriminating
+    with others
+
+    length: Document’s volume of bytes in the WARC file
+
+    archive_url: The archive url of the snapshot, this is not returned by the
+    CDX server API but created by this class on init.
     """
 
     def __init__(self, properties: Dict[str, str]) -> None:
-        self.urlkey = properties["urlkey"]
-        self.timestamp = properties["timestamp"]
-        self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
-        self.original = properties["original"]
-        self.mimetype = properties["mimetype"]
-        self.statuscode = properties["statuscode"]
-        self.digest = properties["digest"]
-        self.length = properties["length"]
-        self.archive_url = (
+        self.urlkey: str = properties["urlkey"]
+        self.timestamp: str = properties["timestamp"]
+        self.datetime_timestamp: datetime = datetime.strptime(
+            self.timestamp, "%Y%m%d%H%M%S"
+        )
+        self.original: str = properties["original"]
+        self.mimetype: str = properties["mimetype"]
+        self.statuscode: str = properties["statuscode"]
+        self.digest: str = properties["digest"]
+        self.length: str = properties["length"]
+        self.archive_url: str = (
             f"https://web.archive.org/web/{self.timestamp}/{self.original}"
         )
 
     def __str__(self) -> str:
+        """
+        The string representation is same as the line returned by the
+        CDX server API for the snapshot.
+        """
         return (
             f"{self.urlkey} {self.timestamp} {self.original} "
             f"{self.mimetype} {self.statuscode} {self.digest} {self.length}"
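To make the attribute list above concrete, here is a small hand-built CDXSnapshot. The property values are made up; the construction itself mirrors how snapshots() builds the properties dictionary.

# Illustrative sketch (not part of the commit): building a CDXSnapshot by hand.
from waybackpy.cdx_snapshot import CDXSnapshot

properties = {
    "urlkey": "com,example)/",
    "timestamp": "20220113130051",
    "original": "https://example.com/",
    "mimetype": "text/html",
    "statuscode": "200",
    "digest": "S5CRXTBLBFNDFAV2NJ44WJCKQEKQ2WS7",  # made-up Base32 SHA-1
    "length": "1234",
}

snapshot = CDXSnapshot(properties)
print(snapshot.archive_url)
# https://web.archive.org/web/20220113130051/https://example.com/
print(str(snapshot) == " ".join(properties.values()))  # True, same order as a CDX line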
@@ -1,3 +1,10 @@
+"""
+Utility functions required for accessing the CDX server API.
+
+These are here in this module so that we don’t make any module too
+big.
+"""
+
 import re
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import quote
@@ -11,28 +18,44 @@ from .utils import DEFAULT_USER_AGENT
 
 
 def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
+    """
+    When using the pagination use adding showNumPages=true to the request
+    URL makes the CDX server return an integer which is the number of pages
+    of CDX pages available for us to query using the pagination API.
+    """
+
     endpoint = "https://web.archive.org/cdx/search/cdx?"
     payload = {"showNumPages": "true", "url": str(url)}
     headers = {"User-Agent": user_agent}
     request_url = full_url(endpoint, params=payload)
     response = get_response(request_url, headers=headers)
 
     if isinstance(response, requests.Response):
         return int(response.text.strip())
-    else:
-        raise response
+    raise response
 
 
 def full_url(endpoint: str, params: Dict[str, Any]) -> str:
+    """
+    As the function's name already implies that it returns
+    full URL, but why we need a function for generating full URL?
+    The CDX server can support multiple arguments for parameters
+    such as filter and collapse and this function adds them without
+    overwriting earlier added arguments.
+    """
+
     if not params:
         return endpoint
-    full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
+    _full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
+
     for key, val in params.items():
         key = "filter" if key.startswith("filter") else key
         key = "collapse" if key.startswith("collapse") else key
-        amp = "" if full_url.endswith("?") else "&"
+        amp = "" if _full_url.endswith("?") else "&"
         val = quote(str(val), safe="")
-        full_url += f"{amp}{key}={val}"
-    return full_url
+        _full_url += f"{amp}{key}={val}"
+
+    return _full_url
 
 
 def get_response(
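A quick illustration of why full_url exists: repeated filter and collapse arguments keep their own key instead of overwriting each other, and every value is percent-quoted. The call below is illustrative and the import path is an assumption.

# Illustrative sketch (not part of the commit); module path assumed.
from waybackpy.cdx_utils import full_url

params = {
    "url": "https://example.com",
    "filter0": "statuscode:200",
    "filter1": "mimetype:text/html",
    "collapse0": "urlkey",
}
print(full_url("https://web.archive.org/cdx/search/cdx?", params))
# Prints (a single line):
# https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fexample.com&filter=statuscode%3A200&filter=mimetype%3Atext%2Fhtml&collapse=urlkey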
@@ -40,29 +63,31 @@ def get_response(
     headers: Optional[Dict[str, str]] = None,
     retries: int = 5,
     backoff_factor: float = 0.5,
-    # no_raise_on_redirects=False,
 ) -> Union[requests.Response, Exception]:
+    """
+    Make get request to the CDX server and return the response.
+    """
+
     session = requests.Session()
 
     retries_ = Retry(
         total=retries,
         backoff_factor=backoff_factor,
         status_forcelist=[500, 502, 503, 504],
     )
-    session.mount("https://", HTTPAdapter(max_retries=retries_))
 
-    try:
-        response = session.get(url, headers=headers)
-        session.close()
-        return response
-    except Exception as e:
-        reason = str(e)
-        exc_message = f"Error while retrieving {url}.\n{reason}"
-        exc = WaybackError(exc_message)
-        exc.__cause__ = e
-        raise exc
+    session.mount("https://", HTTPAdapter(max_retries=retries_))
+    response = session.get(url, headers=headers)
+    session.close()
+    return response
 
 
 def check_filters(filters: List[str]) -> None:
+    """
+    Check that the filter arguments passed by the end-user are valid.
+    If not valid then raise WaybackError.
+    """
+
     if not isinstance(filters, list):
         raise WaybackError("filters must be a list.")
 
@@ -81,9 +106,15 @@ def check_filters(filters: List[str]) -> None:
 
 
 def check_collapses(collapses: List[str]) -> bool:
+    """
+    Check that the collapse arguments passed by the end-user are valid.
+    If not valid then raise WaybackError.
+    """
+
     if not isinstance(collapses, list):
         raise WaybackError("collapses must be a list.")
-    elif len(collapses) == 0:
+
+    if len(collapses) == 0:
         return True
 
     for collapse in collapses:
@@ -103,18 +134,26 @@ def check_collapses(collapses: List[str]) -> bool:
 
 
 def check_match_type(match_type: Optional[str], url: str) -> bool:
+    """
+    Check that the match_type argument passed by the end-user is valid.
+    If not valid then raise WaybackError.
+    """
+
     legal_match_type = ["exact", "prefix", "host", "domain"]
 
     if not match_type:
         return True
-    elif "*" in url:
+
+    if "*" in url:
         raise WaybackError(
             "Can not use wildcard in the URL along with the match_type arguments."
         )
-    elif match_type not in legal_match_type:
+
+    if match_type not in legal_match_type:
         exc_message = (
             f"{match_type} is not an allowed match type.\n"
             "Use one from 'exact', 'prefix', 'host' or 'domain'"
         )
         raise WaybackError(exc_message)
-    else:
-        return True
+
+    return True
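The match_type validator above is easiest to see with a couple of calls. These are illustrative, and the module paths in the imports are assumptions.

# Illustrative sketch (not part of the commit) of check_match_type's behaviour.
from waybackpy.cdx_utils import check_match_type
from waybackpy.exceptions import WaybackError

print(check_match_type(None, "https://example.com"))      # True, nothing to validate
print(check_match_type("domain", "https://example.com"))  # True

try:
    check_match_type("domain", "https://example.com/*")
except WaybackError as error:
    print(error)  # wildcard in the URL can not be combined with match_type

try:
    check_match_type("fuzzy", "https://example.com")
except WaybackError as error:
    print(error)  # "fuzzy is not an allowed match type. ..."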
@@ -1,3 +1,7 @@
+"""
+Module that makes waybackpy a CLI tool.
+"""
+
 import json as JSON
 import os
 import random
@@ -19,7 +23,10 @@ from .wrapper import Url
 def echo_availability_api(
     availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
 ) -> None:
-    click.echo("Archive URL:")
+    """
+    Output availability API depending functions.
+    Near, oldest and newest output by this method.
+    """
     if not availability_api_instance.archive_url:
         archive_url = (
             "NO ARCHIVE FOUND - The requested URL is probably "
@@ -29,6 +36,7 @@ def echo_availability_api(
         )
     else:
         archive_url = availability_api_instance.archive_url
+    click.echo("Archive URL:")
     click.echo(archive_url)
     if json:
         click.echo("JSON response:")
@@ -36,6 +44,10 @@ def echo_availability_api(
 
 
 def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
+    """
+    Save output of CDX API on file.
+    Mainly here because of backwards compatibility.
+    """
     domain = None
     sys_random = random.SystemRandom()
     uid = "".join(
@@ -51,8 +63,8 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
         domain = "domain-unknown" if match is None else match.group(1)
         file_name = f"{domain}-urls-{uid}.txt"
         file_path = os.path.join(os.getcwd(), file_name)
-        with open(file_path, "a") as f:
-            f.write(f"{url}\n")
+        with open(file_path, "a") as file:
+            file.write(f"{url}\n")
 
         click.echo(url)
 
@@ -269,6 +281,7 @@ def main(  # pylint: disable=no-value-for-parameter
     """
     if version:
         click.echo(f"waybackpy version {__version__}")
+
     elif show_license:
         click.echo(
             requests.get(
@@ -277,6 +290,7 @@ def main(  # pylint: disable=no-value-for-parameter
         )
     elif url is None:
         click.echo("No URL detected. Please provide an URL.", err=True)
+
     elif (
         not version
         and not oldest
@@ -291,14 +305,17 @@ def main(  # pylint: disable=no-value-for-parameter
             "Use --help flag for help using waybackpy.",
             err=True,
         )
+
     elif oldest:
         availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
         availability_api.oldest()
         echo_availability_api(availability_api, json)
+
     elif newest:
         availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
         availability_api.newest()
         echo_availability_api(availability_api, json)
+
     elif near:
         availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
         near_args = {}
@@ -309,6 +326,7 @@ def main(  # pylint: disable=no-value-for-parameter
             near_args[key] = arg
         availability_api.near(**near_args)
         echo_availability_api(availability_api, json)
+
     elif save:
         save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
         save_api.save()
@@ -319,15 +337,17 @@ def main(  # pylint: disable=no-value-for-parameter
         if headers:
             click.echo("Save API headers:")
             click.echo(save_api.headers)
+
     elif known_urls:
         wayback = Url(url, user_agent)
         url_gen = wayback.known_urls(subdomain=subdomain)
 
         if file:
             return save_urls_on_file(url_gen)
-        else:
-            for url in url_gen:
-                click.echo(url)
+
+        for url in url_gen:
+            click.echo(url)
 
     elif cdx:
         filters = list(cdx_filter)
         collapses = list(collapse)
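For completeness, the known-urls code path that main() drives when the file flag is set can also be exercised directly from Python. This sketch is not part of the commit, the import paths are assumptions, and it performs real CDX requests when run.

# Illustrative sketch (not part of the commit); module paths assumed.
from waybackpy.cli import save_urls_on_file
from waybackpy.wrapper import Url

wayback = Url("https://example.com", "my-tool/1.0 (contact@example.com)")
url_gen = wayback.known_urls(subdomain=True)

# Writes <domain>-urls-<uid>.txt in the current working directory and echoes each URL.
save_urls_on_file(url_gen)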
@@ -1,3 +1,10 @@
+"""
+This module interfaces the Wayback Machine's SavePageNow (SPN) API.
+
+The module has WaybackMachineSaveAPI class which should be used by the users of
+this module to use the SavePageNow API.
+"""
+
 import re
 import time
 from datetime import datetime
@@ -8,7 +15,7 @@ from requests.adapters import HTTPAdapter
 from requests.structures import CaseInsensitiveDict
 from urllib3.util.retry import Retry
 
-from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError
+from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, WaybackError
 from .utils import DEFAULT_USER_AGENT
 
 
@@ -47,8 +54,8 @@ class WaybackMachineSaveAPI(object):
 
         if self._archive_url:
             return self._archive_url
-        else:
-            return self.save()
+
+        return self.save()
 
     def get_save_request_headers(self) -> None:
         """
@@ -66,6 +73,7 @@ class WaybackMachineSaveAPI(object):
         to be very unreliable thus if it fails first check opening
         the response URL yourself in the browser.
         """
+
         session = requests.Session()
         retries = Retry(
             total=self.total_save_retries,
@@ -79,11 +87,24 @@ class WaybackMachineSaveAPI(object):
         self.status_code = self.response.status_code
         self.response_url = self.response.url
         session.close()
 
         if self.status_code == 429:
+            # why wait 5 minutes and 429?
+            # see https://github.com/akamhy/waybackpy/issues/97
             raise TooManyRequestsError(
-                "Seem to be refused to request by the server. "
-                "Save Page Now receives up to 15 URLs per minutes. "
-                "Wait a moment and run again."
+                f"Can not save '{self.url}'. "
+                f"Save request refused by the server. "
+                f"Save Page Now limits saving 15 URLs per minutes. "
+                f"Try waiting for 5 minutes and then try again."
+            )
+
+        # why 509?
+        # see https://github.com/akamhy/waybackpy/pull/99
+        # also https://t.co/xww4YJ0Iwc
+        if self.status_code == 509:
+            raise WaybackError(
+                f"Can not save '{self.url}'. You have probably reached the "
+                f"limit of active sessions."
             )
 
     def archive_url_parser(self) -> Optional[str]:
@@ -146,13 +167,17 @@ class WaybackMachineSaveAPI(object):
         the Wayback Machine to serve cached archive if last archive was captured
         before last 45 minutes.
         """
-        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
-        m = re.search(regex, str(self._archive_url))
-        if m is None or len(m.groups()) != 1:
-            raise ValueError("Could not get timestamp")
-        string_timestamp = m.group(1)
-        timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
+
+        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
+        match = re.search(regex, str(self._archive_url))
+
+        if match is None or len(match.groups()) != 1:
+            raise ValueError(
+                f"Can not parse timestamp from archive URL, '{self._archive_url}'."
+            )
+
+        string_timestamp = match.group(1)
+        timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
         timestamp_unixtime = time.mktime(timestamp.timetuple())
         instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple())
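A sketch of driving the save API shown above, including the two status-code failures this commit documents (429 and 509). Not part of the commit; the import paths and user agent are assumptions.

# Minimal usage sketch (not part of the commit) for the SavePageNow wrapper.
from waybackpy.save_api import WaybackMachineSaveAPI
from waybackpy.exceptions import TooManyRequestsError, WaybackError

save_api = WaybackMachineSaveAPI("https://example.com", user_agent="my-tool/1.0")

try:
    archive_url = save_api.save()
    print(archive_url)  # e.g. https://web.archive.org/web/<timestamp>/https://example.com/
except TooManyRequestsError:
    print("429: Save Page Now accepts about 15 URLs per minute, wait and retry")
except WaybackError:
    print("509: probably too many active save sessions")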
@@ -1,3 +1,7 @@
+"""
+Utility functions and shared variables like DEFAULT_USER_AGENT are here.
+"""
+
 import requests
 
 from . import __version__
@@ -8,6 +12,7 @@ DEFAULT_USER_AGENT: str = (
 
 
 def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
+    """Latest waybackpy version on PyPi."""
     request_url = "https://pypi.org/pypi/" + package_name + "/json"
     headers = {"User-Agent": user_agent}
     response = requests.get(request_url, headers=headers)
@@ -20,13 +25,14 @@ def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT)
         and data["info"]["version"] is not None
     ):
         return str(data["info"]["version"])
-    else:
-        raise ValueError("Could not get latest pypi version")
+
+    raise ValueError("Could not get latest pypi version")
 
 
 def latest_version_github(
     package_name: str, user_agent: str = DEFAULT_USER_AGENT
 ) -> str:
+    """Latest waybackpy version on GitHub."""
     request_url = (
         "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
     )
@@ -40,5 +46,5 @@ def latest_version_github(
         and "tag_name" in data[0]
     ):
         return str(data[0]["tag_name"])
-    else:
-        raise ValueError("Could not get latest github version")
+
+    raise ValueError("Could not get latest github version")
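Both version helpers above follow the same request-and-validate pattern; calling them is a one-liner each. The printed versions below are illustrative and the import path is assumed.

# Illustrative sketch (not part of the commit); module path assumed.
from waybackpy.utils import latest_version_pypi, latest_version_github

print(latest_version_pypi("waybackpy"))    # e.g. "3.0.2", from https://pypi.org/pypi/waybackpy/json
print(latest_version_github("waybackpy"))  # e.g. "3.0.2", the latest release tag_name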
@@ -1,3 +1,9 @@
+"""
+This module exists because backwards compatibility matters.
+Don't touch this or add any new functionality here and don't use
+the Url class.
+"""
+
 from datetime import datetime, timedelta
 from typing import Generator, Optional
 
@@ -49,12 +55,14 @@ class Url(object):
 
         if not isinstance(self.timestamp, datetime):
             raise TypeError("timestamp must be a datetime")
-        elif self.timestamp == datetime.max:
+
+        if self.timestamp == datetime.max:
             return td_max.days
-        else:
-            return (datetime.utcnow() - self.timestamp).days
+
+        return (datetime.utcnow() - self.timestamp).days
 
     def save(self) -> "Url":
+        """Save the URL on wayback machine."""
         self.wayback_machine_save_api = WaybackMachineSaveAPI(
             self.url, user_agent=self.user_agent
         )
@@ -72,7 +80,7 @@ class Url(object):
         minute: Optional[int] = None,
         unix_timestamp: Optional[int] = None,
     ) -> "Url":
+        """Returns the archive of the URL close to a date and time."""
         self.wayback_machine_availability_api.near(
             year=year,
             month=month,
@@ -85,16 +93,19 @@ class Url(object):
         return self
 
     def oldest(self) -> "Url":
+        """Returns the oldest archive of the URL."""
         self.wayback_machine_availability_api.oldest()
         self.set_availability_api_attrs()
         return self
 
     def newest(self) -> "Url":
+        """Returns the newest archive of the URL."""
         self.wayback_machine_availability_api.newest()
         self.set_availability_api_attrs()
         return self
 
     def set_availability_api_attrs(self) -> None:
+        """Set the attributes for total backwards compatibility."""
         self.archive_url = self.wayback_machine_availability_api.archive_url
         self.JSON = self.wayback_machine_availability_api.JSON
         self.timestamp = self.wayback_machine_availability_api.timestamp()
@@ -102,6 +113,10 @@ class Url(object):
     def total_archives(
         self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
     ) -> int:
+        """
+        Returns an integer which indicates total number of archives for an URL.
+        Useless in my opinion, only here because of backwards compatibility.
+        """
         cdx = WaybackMachineCDXServerAPI(
             self.url,
             user_agent=self.user_agent,
@@ -122,6 +137,7 @@ class Url(object):
         end_timestamp: Optional[str] = None,
         match_type: str = "prefix",
     ) -> Generator[str, None, None]:
+        """Yields known URLs for any URL."""
         if subdomain:
             match_type = "domain"
         if host:
@@ -137,4 +153,4 @@ class Url(object):
         )
 
         for snapshot in cdx.snapshots():
-            yield (snapshot.original)
+            yield snapshot.original
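Although the module docstring warns against new uses of Url, the backwards-compatible surface it keeps alive looks roughly like this. Not part of the commit; the import path is assumed.

# Minimal usage sketch (not part of the commit) for the legacy Url wrapper.
from waybackpy.wrapper import Url

url = Url("https://example.com", "my-tool/1.0 (contact@example.com)")

url.oldest()                     # delegates to WaybackMachineAvailabilityAPI.oldest()
print(url.archive_url, url.timestamp)

print(url.total_archives())      # number of CDX entries recorded for the URL

for original in url.known_urls(subdomain=True):
    print(original)              # snapshot.original for each CDX snapshot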