added docstrings, added some static type hints and also lint.
This commit is contained in:
@@ -1,9 +1,32 @@
|
||||
"""
|
||||
This module interfaces the Wayback Machine's availability API.
|
||||
|
||||
The interface could be useful for looking up archives and finding archives
|
||||
that are close to a specific date and time.
|
||||
|
||||
It has a class called WaybackMachineAvailabilityAPI, and the class has
|
||||
methods such as:
|
||||
|
||||
near() for looking up archives close to a specific date and time.
|
||||
|
||||
oldest() for retrieving the first archive URL of the webpage.
|
||||
|
||||
newest() for retrieving the latest archive of a URL.
|
||||
|
||||
The Wayback Machine Availability response should be a valid JSON and
|
||||
if it is not then an exception, InvalidJSONInAvailabilityAPIResponse, is raised.
|
||||
|
||||
If the Availability API returned valid JSON but archive URL could not be found
|
||||
in it then ArchiveNotInAvailabilityAPIResponse is raised.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import requests
|
||||
from requests.models import Response
|
||||
|
||||
from .exceptions import (
|
||||
ArchiveNotInAvailabilityAPIResponse,
|
||||
@@ -22,38 +45,43 @@ class WaybackMachineAvailabilityAPI(object):
|
||||
def __init__(
|
||||
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
|
||||
) -> None:
|
||||
|
||||
self.url = str(url).strip().replace(" ", "%20")
|
||||
self.user_agent = user_agent
|
||||
self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
|
||||
self.payload = {"url": self.url}
|
||||
self.endpoint = "https://archive.org/wayback/available"
|
||||
self.max_tries = max_tries
|
||||
self.tries = 0
|
||||
self.last_api_call_unix_time = int(time.time())
|
||||
self.api_call_time_gap = 5
|
||||
self.payload: Dict[str, str] = {"url": self.url}
|
||||
self.endpoint: str = "https://archive.org/wayback/available"
|
||||
self.max_tries: int = max_tries
|
||||
self.tries: int = 0
|
||||
self.last_api_call_unix_time: int = int(time.time())
|
||||
self.api_call_time_gap: int = 5
|
||||
self.JSON: Optional[ResponseJSON] = None
|
||||
|
||||
@staticmethod
|
||||
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
|
||||
"""
|
||||
Converts Unix time to wayback Machine timestamp.
|
||||
Converts Unix time to Wayback Machine timestamp and the Wayback Machine
|
||||
timestamp format is yyyyMMddhhmmss.
|
||||
"""
|
||||
|
||||
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""
|
||||
Same as string representation, just return the archive URL as a string.
|
||||
"""
|
||||
|
||||
return str(self)
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""
|
||||
String representation of the class. If atleast one API call was successfully
|
||||
made then return the archive URL as a string. Else returns None.
|
||||
String representation of the class. If at least one API
|
||||
call was successfully made then return the archive URL
|
||||
as a string. Else returns "".
|
||||
"""
|
||||
|
||||
# String must not return anything other than a string object
|
||||
# So, if some asks for string repr before making the API requests
|
||||
# String should not return anything other than a string object
|
||||
# So, if a string repr is asked for before making any API requests
|
||||
# just return ""
|
||||
if not self.JSON:
|
||||
return ""
|
||||
@@ -62,26 +90,36 @@ class WaybackMachineAvailabilityAPI(object):
|
||||
|
||||
def json(self) -> Optional[ResponseJSON]:
|
||||
"""
|
||||
Makes the API call to the availability API can set the JSON response
|
||||
to the JSON attribute of the instance and also returns the JSON attribute.
|
||||
Makes the API call to the availability API and sets the JSON response
|
||||
to the JSON attribute of the instance and also returns the JSON
|
||||
attribute.
|
||||
|
||||
time_diff and sleep_time make sure that you are not making too many
|
||||
requests in a short interval of time; making too many requests is bad
|
||||
as Wayback Machine may reject them above a certain threshold.
|
||||
|
||||
The end-user can change the api_call_time_gap attribute of the instance
|
||||
to increase or decrease the default time gap between two successive API
|
||||
calls, but it is not recommended to decrease it.
|
||||
"""
|
||||
|
||||
time_diff = int(time.time()) - self.last_api_call_unix_time
|
||||
sleep_time = self.api_call_time_gap - time_diff
|
||||
|
||||
if sleep_time > 0:
|
||||
time.sleep(sleep_time)
|
||||
|
||||
self.response = requests.get(
|
||||
self.response: Response = requests.get(
|
||||
self.endpoint, params=self.payload, headers=self.headers
|
||||
)
|
||||
self.last_api_call_unix_time = int(time.time())
|
||||
self.tries += 1
|
||||
try:
|
||||
self.JSON = self.response.json()
|
||||
except json.decoder.JSONDecodeError:
|
||||
except json.decoder.JSONDecodeError as json_decode_error:
|
||||
raise InvalidJSONInAvailabilityAPIResponse(
|
||||
f"Response data:\n{self.response.text}"
|
||||
)
|
||||
) from json_decode_error
|
||||
|
||||
return self.JSON
|
||||
|
||||
@@ -91,15 +129,17 @@ class WaybackMachineAvailabilityAPI(object):
|
||||
If JSON attribute of the instance is None it implies that either
|
||||
the last API call failed or one was never made.
|
||||
|
||||
If not JSON or if JSON but no timestamp in the JSON response then returns
|
||||
the maximum value for datetime object that is possible.
|
||||
If not JSON or if JSON but no timestamp in the JSON response then
|
||||
returns the maximum value for datetime object that is possible.
|
||||
|
||||
If you get an URL as a response form the availability API it is guaranteed
|
||||
that you can get the datetime object from the timestamp.
|
||||
If you get a URL as a response from the availability API it is
|
||||
guaranteed that you can get the datetime object from the timestamp.
|
||||
"""
|
||||
|
||||
if self.JSON is None or "archived_snapshots" not in self.JSON:
|
||||
return datetime.max
|
||||
elif (
|
||||
|
||||
if (
|
||||
self.JSON is not None
|
||||
and "archived_snapshots" in self.JSON
|
||||
and self.JSON["archived_snapshots"] is not None
|
||||
@@ -110,21 +150,23 @@ class WaybackMachineAvailabilityAPI(object):
|
||||
return datetime.strptime(
|
||||
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
|
||||
)
|
||||
else:
|
||||
|
||||
raise ValueError("Could not get timestamp from result")
|
||||
|
||||
@property
|
||||
def archive_url(self) -> str:
|
||||
"""
|
||||
Reads the the JSON response data and tries to get the timestamp and returns
|
||||
the timestamp if found else returns None.
|
||||
Reads the JSON response data and returns
|
||||
the archive URL if found and if not found raises
|
||||
ArchiveNotInAvailabilityAPIResponse.
|
||||
"""
|
||||
|
||||
archive_url = ""
|
||||
data = self.JSON
|
||||
|
||||
# If the user didn't used oldest, newest or near but tries to access the
|
||||
# archive_url attribute then, we assume they are fine with any archive
|
||||
# and invoke the oldest archive function.
|
||||
# If the user didn't invoke oldest, newest or near but tries to access the
|
||||
# archive_url attribute then assume they are fine with any archive
|
||||
# and invoke the oldest method.
|
||||
if not data:
|
||||
self.oldest()
|
||||
|
||||
@@ -137,7 +179,7 @@ class WaybackMachineAvailabilityAPI(object):
|
||||
self.json() # It makes a new API call
|
||||
data = self.JSON # json() updated the value of JSON attribute
|
||||
|
||||
# Even if after we exhausted teh max_tries, then we give up and
|
||||
# If we exhausted the max_tries, then we give up and
|
||||
# raise exception.
|
||||
|
||||
if not data or not data["archived_snapshots"]:
|
||||
@@ -160,6 +202,7 @@ class WaybackMachineAvailabilityAPI(object):
|
||||
Prepends zero before the year, month, day, hour and minute so that they
|
||||
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
|
||||
"""
|
||||
|
||||
return "".join(
|
||||
str(kwargs[key]).zfill(2)
|
||||
for key in ["year", "month", "day", "hour", "minute"]
|
||||
@@ -167,18 +210,21 @@ class WaybackMachineAvailabilityAPI(object):
|
||||
|
||||
def oldest(self) -> "WaybackMachineAvailabilityAPI":
|
||||
"""
|
||||
Passing the year 1994 should return the oldest archive because
|
||||
wayback machine was started in May, 1996 and there should be no archive
|
||||
before the year 1994.
|
||||
Passes the date 1994-01-01 to near which should return the oldest archive
|
||||
because Wayback Machine was started in May, 1996 and it is assumed that
|
||||
there would be no archive older than January 1, 1994.
|
||||
"""
|
||||
return self.near(year=1994)
|
||||
|
||||
return self.near(year=1994, month=1, day=1)
|
||||
|
||||
def newest(self) -> "WaybackMachineAvailabilityAPI":
|
||||
"""
|
||||
Passing the current UNIX time should be sufficient to get the newest
|
||||
archive considering the API request-response time delay and also the
|
||||
database lags on Wayback machine.
|
||||
Passes the current UNIX time to near() for retrieving the newest archive
|
||||
from the availability API.
|
||||
|
||||
We assume that wayback machine can not archive the future of a webpage.
|
||||
"""
|
||||
|
||||
return self.near(unix_timestamp=int(time.time()))
|
||||
|
||||
def near(
|
||||
@@ -191,16 +237,18 @@ class WaybackMachineAvailabilityAPI(object):
|
||||
unix_timestamp: Optional[int] = None,
|
||||
) -> "WaybackMachineAvailabilityAPI":
|
||||
"""
|
||||
The main method for this Class, oldest and newest methods are dependent on this
|
||||
method.
|
||||
The main method for the Class, oldest() and newest() are dependent on it.
|
||||
|
||||
It generates the timestamp based on the input either by calling the
|
||||
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
|
||||
appropriate arguments for their respective parameters.
|
||||
|
||||
Adds the timestamp to the payload dictionary.
|
||||
|
||||
And finally invoking the json method to make the API call then returns
|
||||
the instance.
|
||||
"""
|
||||
|
||||
if unix_timestamp:
|
||||
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
||||
else:
|
||||
|
Reference in New Issue
Block a user