waybackpy/waybackpy/availability_api.py

262 lines
9.7 KiB
Python

"""
This module interfaces the Wayback Machine's availability API.
The interface is useful for looking up archives and finding archives
that are close to a specific date and time.
It has a class WaybackMachineAvailabilityAPI, and the class has
methods like:
near() for retrieving archives close to a specific date and time.
oldest() for retrieving the first archive URL of the webpage.
newest() for retrieving the latest archive of the webpage.
The Wayback Machine Availability API response must be a valid JSON and
if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised.
If the Availability API returned valid JSON but an archive URL could not be
found in it, then ArchiveNotInAvailabilityAPIResponse is raised.
"""
import json
import time
from datetime import datetime, timezone
from typing import Any, Dict, Optional

import requests
from requests.models import Response

from .exceptions import (
    ArchiveNotInAvailabilityAPIResponse,
    InvalidJSONInAvailabilityAPIResponse,
)
from .utils import DEFAULT_USER_AGENT

ResponseJSON = Dict[str, Any]
class WaybackMachineAvailabilityAPI:
    """
    Class that interfaces the Wayback Machine's availability API.

    Typical usage is to construct an instance for a URL, call one of
    oldest(), newest() or near() to query the API, and then read the
    archive_url property or call timestamp() on the same instance.
    """

    def __init__(
        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
    ) -> None:
        """
        Prepares the request parameters; no network call is made here.

        url: the webpage whose archives are looked up. Surrounding whitespace
            is stripped and inner spaces are replaced with "%20".
        user_agent: value sent in the User-Agent request header.
        max_tries: upper bound on API calls made by this instance before
            archive_url gives up looking for an archive.
        """
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
        self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
        self.payload: Dict[str, str] = {"url": self.url}
        self.endpoint: str = "https://archive.org/wayback/available"
        self.max_tries: int = max_tries
        # Number of API calls made so far by this instance.
        self.tries: int = 0
        # Initialised to "now", so the very first API call is also subject
        # to the api_call_time_gap throttle in setup_json().
        self.last_api_call_unix_time: int = int(time.time())
        # Minimum number of seconds between two successive API calls.
        self.api_call_time_gap: int = 5
        self.json: Optional[ResponseJSON] = None
        self.response: Optional[Response] = None

    @staticmethod
    def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
        """
        Converts Unix time to Wayback Machine timestamp, Wayback Machine
        timestamp format is yyyyMMddhhmmss (UTC).
        """
        # Timezone-aware replacement for the deprecated utcfromtimestamp();
        # the formatted output is identical.
        moment = datetime.fromtimestamp(int(unix_timestamp), tz=timezone.utc)
        return moment.strftime("%Y%m%d%H%M%S")

    def __repr__(self) -> str:
        """
        Same as string representation, just return the archive URL as a string.
        """
        return str(self)

    def __str__(self) -> str:
        """
        String representation of the class. If at least one API
        call was successfully made then return the archive URL
        as a string. Else returns "" (empty string literal).
        """
        # __str__ can not return anything other than a string object
        # So, if a string repr is asked even before making a API request
        # just return ""
        if not self.json:
            return ""
        return self.archive_url

    def setup_json(self) -> Optional[ResponseJSON]:
        """
        Makes the API call to the availability API and set the JSON response
        to the JSON attribute of the instance and also returns the JSON
        attribute.

        time_diff and sleep_time make sure that you are not making too many
        requests in a short interval of time; making too many requests is bad
        as Wayback Machine may reject them above a certain threshold.

        The end-user can change the api_call_time_gap attribute of the
        instance to adjust the time gap between two successive API calls.

        Raises InvalidJSONInAvailabilityAPIResponse if the response body can
        not be decoded as JSON.
        """
        time_diff = int(time.time()) - self.last_api_call_unix_time
        sleep_time = self.api_call_time_gap - time_diff

        if sleep_time > 0:
            time.sleep(sleep_time)

        # NOTE(review): no timeout is passed, so this call can block for as
        # long as the server keeps the connection open.
        self.response = requests.get(
            self.endpoint, params=self.payload, headers=self.headers
        )
        self.last_api_call_unix_time = int(time.time())
        self.tries += 1

        try:
            self.json = None if self.response is None else self.response.json()
        except json.decoder.JSONDecodeError as json_decode_error:
            raise InvalidJSONInAvailabilityAPIResponse(
                f"Response data:\n{self.response.text}"
            ) from json_decode_error

        return self.json

    def timestamp(self) -> datetime:
        """
        Converts the timestamp from the JSON response to a datetime object.

        If the JSON attribute of the instance is None (the last API call
        failed or none was ever made), or the JSON has no "archived_snapshots"
        key, returns the maximum possible datetime value (datetime.max).

        If "archived_snapshots" is present but holds no closest snapshot with
        a timestamp, raises ValueError.

        If you get an URL as a response from the availability API it is
        guaranteed that you can get the datetime object from the timestamp.
        """
        if self.json is None or "archived_snapshots" not in self.json:
            return datetime.max

        # "archived_snapshots" and "closest" may be missing, None or empty;
        # normalise every such case to an empty mapping.
        snapshots = self.json["archived_snapshots"] or {}
        closest = snapshots.get("closest") or {}

        if "timestamp" in closest:
            return datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S")

        raise ValueError("Timestamp not found in the Availability API's JSON response.")

    @staticmethod
    def _closest_snapshot_url(data: Optional[ResponseJSON]) -> Optional[str]:
        """
        Returns data["archived_snapshots"]["closest"]["url"] if every level
        of that path exists and the URL is non-empty, otherwise None.
        """
        if not isinstance(data, dict):
            return None
        snapshots = data.get("archived_snapshots")
        if not isinstance(snapshots, dict):
            return None
        closest = snapshots.get("closest")
        if not isinstance(closest, dict):
            return None
        return closest.get("url") or None

    @property
    def archive_url(self) -> str:
        """
        Reads the JSON response data and returns the archive URL (with the
        scheme upgraded to https) if found.

        If no API call has produced JSON yet, oldest() is invoked first. While
        the response holds no archive and fewer than max_tries API calls have
        been made, the API call is repeated.

        Raises ArchiveNotInAvailabilityAPIResponse if no archive URL is found
        once the tries are exhausted.
        """
        # If the user didn't invoke oldest, newest or near but tries to access
        # archive_url attribute then assume that they are fine with any
        # archive and invoke the oldest method.
        if not self.json:
            self.oldest()

        # Always re-read self.json: oldest() and setup_json() replace it.
        url = self._closest_snapshot_url(self.json)

        # No archive in the response: retry until one shows up or the
        # instance's call budget is used up.
        while url is None and self.tries < self.max_tries:
            self.setup_json()  # It makes a new API call
            url = self._closest_snapshot_url(self.json)

        if url is None:
            response_text = "" if self.response is None else self.response.text
            # The response text is appended to the explanation; it must not
            # replace it.
            raise ArchiveNotInAvailabilityAPIResponse(
                "Archive not found in the availability "
                "API response, the URL you requested may not have any archives "
                "yet. You may retry after some time or archive the webpage now.\n"
                f"Response data:\n{response_text}"
            )

        return url.replace(
            "http://web.archive.org/web/", "https://web.archive.org/web/", 1
        )

    @staticmethod
    def wayback_timestamp(**kwargs: int) -> str:
        """
        Zero-pads the year, month, day, hour and minute to at least two digits
        and concatenates them, producing a YYYYMMDDhhmm prefix of the
        YYYYMMDDhhmmss Wayback Machine timestamp format (seconds are omitted).

        All five keyword arguments are required; a missing one raises KeyError.
        """
        return "".join(
            str(kwargs[key]).zfill(2)
            for key in ["year", "month", "day", "hour", "minute"]
        )

    def oldest(self) -> "WaybackMachineAvailabilityAPI":
        """
        Passes the date 1994-01-01 to near which should return the oldest archive
        because Wayback Machine was started in May, 1996 and it is assumed that
        there would be no archive older than January 1, 1994.
        """
        return self.near(year=1994, month=1, day=1)

    def newest(self) -> "WaybackMachineAvailabilityAPI":
        """
        Passes the current UNIX time to near() for retrieving the newest archive
        from the availability API.

        Remember UNIX time is UTC and Wayback Machine is also UTC based.
        """
        return self.near(unix_timestamp=int(time.time()))

    def near(
        self,
        year: Optional[int] = None,
        month: Optional[int] = None,
        day: Optional[int] = None,
        hour: Optional[int] = None,
        minute: Optional[int] = None,
        unix_timestamp: Optional[int] = None,
    ) -> "WaybackMachineAvailabilityAPI":
        """
        The most important method of this Class, oldest() and newest() are
        dependent on it.

        It generates the timestamp based on the input either by calling the
        unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
        appropriate arguments for their respective parameters. When
        unix_timestamp is given it takes precedence over the calendar fields;
        otherwise any calendar field left as None defaults to the current UTC
        value of that field.

        Adds the timestamp to the payload dictionary.

        And finally invokes the setup_json method to make the API call then
        finally returns the instance.
        """
        # Compare against None so that unix_timestamp=0 (the epoch) is honoured
        # instead of being mistaken for "not supplied".
        if unix_timestamp is not None:
            timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
        else:
            # Timezone-aware replacement for the deprecated utcnow().
            now = datetime.now(timezone.utc).timetuple()
            timestamp = self.wayback_timestamp(
                year=now.tm_year if year is None else year,
                month=now.tm_mon if month is None else month,
                day=now.tm_mday if day is None else day,
                hour=now.tm_hour if hour is None else hour,
                minute=now.tm_min if minute is None else minute,
            )

        self.payload["timestamp"] = timestamp
        self.setup_json()
        return self