added docstrings, added some static type hints and also lint.

This commit is contained in:
Akash Mahanty
2022-02-06 20:44:52 +05:30
parent b79f1c471e
commit a2b14fb9a1

View File

@@ -1,9 +1,32 @@
"""
This module interfaces the Wayback Machine's availability API.
The interface could be useful for looking up archives and finding archives
that are close to a specific date and time.
It has a class called WaybackMachineAvailabilityAPI, and the class has
methods such as:
near() for looking up archives close to a specific date and time.
oldest() for retrieving the first archive URL of the webpage.
newest() for retrieving the latest archive of an URL.
The Wayback Machine Availability response should be a valid JSON and
if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised.
If the Availability API returned valid JSON but archive URL could not be found
it it then ArchiveNotInAvailabilityAPIResponse is raised.
"""
import json import json
import time import time
from datetime import datetime from datetime import datetime
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
import requests import requests
from requests.models import Response
from .exceptions import ( from .exceptions import (
ArchiveNotInAvailabilityAPIResponse, ArchiveNotInAvailabilityAPIResponse,
@@ -22,38 +45,43 @@ class WaybackMachineAvailabilityAPI(object):
def __init__( def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3 self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
) -> None: ) -> None:
self.url = str(url).strip().replace(" ", "%20") self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent self.user_agent = user_agent
self.headers: Dict[str, str] = {"User-Agent": self.user_agent} self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
self.payload = {"url": self.url} self.payload: Dict[str, str] = {"url": self.url}
self.endpoint = "https://archive.org/wayback/available" self.endpoint: str = "https://archive.org/wayback/available"
self.max_tries = max_tries self.max_tries: int = max_tries
self.tries = 0 self.tries: int = 0
self.last_api_call_unix_time = int(time.time()) self.last_api_call_unix_time: int = int(time.time())
self.api_call_time_gap = 5 self.api_call_time_gap: int = 5
self.JSON: Optional[ResponseJSON] = None self.JSON: Optional[ResponseJSON] = None
@staticmethod @staticmethod
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str: def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
""" """
Converts Unix time to wayback Machine timestamp. Converts Unix time to wayback Machine timestamp and the Wayback Machine
timestamp format is yyyyMMddhhmmss.
""" """
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def __repr__(self) -> str: def __repr__(self) -> str:
""" """
Same as string representation, just return the archive URL as a string. Same as string representation, just return the archive URL as a string.
""" """
return str(self) return str(self)
def __str__(self) -> str: def __str__(self) -> str:
""" """
String representation of the class. If atleast one API call was successfully String representation of the class. If atleast one API
made then return the archive URL as a string. Else returns None. call was successfully made then return the archive URL
as a string. Else returns "".
""" """
# String must not return anything other than a string object # String should not return anything other than a string object
# So, if some asks for string repr before making the API requests # So, if a string repr is asked for before making any API requests
# just return "" # just return ""
if not self.JSON: if not self.JSON:
return "" return ""
@@ -62,26 +90,36 @@ class WaybackMachineAvailabilityAPI(object):
def json(self) -> Optional[ResponseJSON]: def json(self) -> Optional[ResponseJSON]:
""" """
Makes the API call to the availability API can set the JSON response Makes the API call to the availability API and set the JSON response
to the JSON attribute of the instance and also returns the JSON attribute. to the JSON attribute of the instance and also returns the JSON
attribute.
time_diff and sleep_time makes sure that you are not making too many
requests in a short interval of item, making too many requests is bad
as Wayback Machine may reject them above a certain threshold.
The end-user can change the api_call_time_gap attribute of the instance
to increase or decrease the default time gap between two successive API
calls, but it is not recommended to increase it.
""" """
time_diff = int(time.time()) - self.last_api_call_unix_time time_diff = int(time.time()) - self.last_api_call_unix_time
sleep_time = self.api_call_time_gap - time_diff sleep_time = self.api_call_time_gap - time_diff
if sleep_time > 0: if sleep_time > 0:
time.sleep(sleep_time) time.sleep(sleep_time)
self.response = requests.get( self.response: Response = requests.get(
self.endpoint, params=self.payload, headers=self.headers self.endpoint, params=self.payload, headers=self.headers
) )
self.last_api_call_unix_time = int(time.time()) self.last_api_call_unix_time = int(time.time())
self.tries += 1 self.tries += 1
try: try:
self.JSON = self.response.json() self.JSON = self.response.json()
except json.decoder.JSONDecodeError: except json.decoder.JSONDecodeError as json_decode_error:
raise InvalidJSONInAvailabilityAPIResponse( raise InvalidJSONInAvailabilityAPIResponse(
f"Response data:\n{self.response.text}" f"Response data:\n{self.response.text}"
) ) from json_decode_error
return self.JSON return self.JSON
@@ -91,15 +129,17 @@ class WaybackMachineAvailabilityAPI(object):
If JSON attribute of the instance is None it implies that the either If JSON attribute of the instance is None it implies that the either
the the last API call failed or one was never made. the the last API call failed or one was never made.
If not JSON or if JSON but no timestamp in the JSON response then returns If not JSON or if JSON but no timestamp in the JSON response then
the maximum value for datetime object that is possible. returns the maximum value for datetime object that is possible.
If you get an URL as a response form the availability API it is guaranteed If you get an URL as a response form the availability API it is
that you can get the datetime object from the timestamp. guaranteed that you can get the datetime object from the timestamp.
""" """
if self.JSON is None or "archived_snapshots" not in self.JSON: if self.JSON is None or "archived_snapshots" not in self.JSON:
return datetime.max return datetime.max
elif (
if (
self.JSON is not None self.JSON is not None
and "archived_snapshots" in self.JSON and "archived_snapshots" in self.JSON
and self.JSON["archived_snapshots"] is not None and self.JSON["archived_snapshots"] is not None
@@ -110,21 +150,23 @@ class WaybackMachineAvailabilityAPI(object):
return datetime.strptime( return datetime.strptime(
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
) )
else:
raise ValueError("Could not get timestamp from result") raise ValueError("Could not get timestamp from result")
@property @property
def archive_url(self) -> str: def archive_url(self) -> str:
""" """
Reads the the JSON response data and tries to get the timestamp and returns Reads the the JSON response data and returns
the timestamp if found else returns None. the timestamp if found and if not found raises
ArchiveNotInAvailabilityAPIResponse.
""" """
archive_url = "" archive_url = ""
data = self.JSON data = self.JSON
# If the user didn't used oldest, newest or near but tries to access the # If the user didn't invoke oldest, newest or near but tries to access the
# archive_url attribute then, we assume they are fine with any archive # archive_url attribute then assume they are fine with any archive
# and invoke the oldest archive function. # and invoke the oldest method.
if not data: if not data:
self.oldest() self.oldest()
@@ -137,7 +179,7 @@ class WaybackMachineAvailabilityAPI(object):
self.json() # It makes a new API call self.json() # It makes a new API call
data = self.JSON # json() updated the value of JSON attribute data = self.JSON # json() updated the value of JSON attribute
# Even if after we exhausted teh max_tries, then we give up and # If we exhausted the max_tries, then we give up and
# raise exception. # raise exception.
if not data or not data["archived_snapshots"]: if not data or not data["archived_snapshots"]:
@@ -160,6 +202,7 @@ class WaybackMachineAvailabilityAPI(object):
Prepends zero before the year, month, day, hour and minute so that they Prepends zero before the year, month, day, hour and minute so that they
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
""" """
return "".join( return "".join(
str(kwargs[key]).zfill(2) str(kwargs[key]).zfill(2)
for key in ["year", "month", "day", "hour", "minute"] for key in ["year", "month", "day", "hour", "minute"]
@@ -167,18 +210,21 @@ class WaybackMachineAvailabilityAPI(object):
def oldest(self) -> "WaybackMachineAvailabilityAPI": def oldest(self) -> "WaybackMachineAvailabilityAPI":
""" """
Passing the year 1994 should return the oldest archive because Passes the date 1994-01-01 to near which should return the oldest archive
wayback machine was started in May, 1996 and there should be no archive because Wayback Machine was started in May, 1996 and it is assumed that
before the year 1994. there would be no archive older than January 1, 1994.
""" """
return self.near(year=1994)
return self.near(year=1994, month=1, day=1)
def newest(self) -> "WaybackMachineAvailabilityAPI": def newest(self) -> "WaybackMachineAvailabilityAPI":
""" """
Passing the current UNIX time should be sufficient to get the newest Passes the current UNIX time to near() for retrieving the newest archive
archive considering the API request-response time delay and also the from the availability API.
database lags on Wayback machine.
We assume that wayback machine can not archive the future of a webpage.
""" """
return self.near(unix_timestamp=int(time.time())) return self.near(unix_timestamp=int(time.time()))
def near( def near(
@@ -191,16 +237,18 @@ class WaybackMachineAvailabilityAPI(object):
unix_timestamp: Optional[int] = None, unix_timestamp: Optional[int] = None,
) -> "WaybackMachineAvailabilityAPI": ) -> "WaybackMachineAvailabilityAPI":
""" """
The main method for this Class, oldest and newest methods are dependent on this The main method for the Class, oldest() and newest() are dependent on it.
method.
It generates the timestamp based on the input either by calling the It generates the timestamp based on the input either by calling the
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
appropriate arguments for their respective parameters. appropriate arguments for their respective parameters.
Adds the timestamp to the payload dictionary. Adds the timestamp to the payload dictionary.
And finally invoking the json method to make the API call then returns And finally invoking the json method to make the API call then returns
the instance. the instance.
""" """
if unix_timestamp: if unix_timestamp:
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
else: else: