"""
This module interfaces the Wayback Machine's availability API.

The interface is useful for looking up archives and finding archives
that are close to a specific date and time.

It has a class WaybackMachineAvailabilityAPI, and the class has
methods like:

near() for retrieving archives close to a specific date and time.

oldest() for retrieving the first archive URL of the webpage.

newest() for retrieving the latest archive of the webpage.

The Wayback Machine Availability API response must be valid JSON and
if it is not then an exception, InvalidJSONInAvailabilityAPIResponse,
is raised.

If the Availability API returned valid JSON but an archive URL could not
be found in it then ArchiveNotInAvailabilityAPIResponse is raised.
"""

import json
import time
from datetime import datetime, timezone
from typing import Any, Dict, Optional

import requests
from requests.models import Response

from .exceptions import (
    ArchiveNotInAvailabilityAPIResponse,
    InvalidJSONInAvailabilityAPIResponse,
)
from .utils import (
    DEFAULT_USER_AGENT,
    unix_timestamp_to_wayback_timestamp,
    wayback_timestamp,
)

ResponseJSON = Dict[str, Any]


class WaybackMachineAvailabilityAPI:
    """
    Class that interfaces the Wayback Machine's availability API.
    """

    def __init__(
        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
    ) -> None:
        """
        Store the request configuration; no API call is made here.

        url: the webpage to look up; it is stringified, stripped and has
        its spaces replaced by "%20".
        user_agent: value sent in the User-Agent request header.
        max_tries: upper bound on API calls that archive_url will make
        before giving up.
        """
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
        self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
        self.payload: Dict[str, str] = {"url": self.url}
        self.endpoint: str = "https://archive.org/wayback/available"
        self.max_tries: int = max_tries
        # Number of API calls made so far by this instance.
        self.tries: int = 0
        # NOTE(review): seeded with the construction time, so the very first
        # API call may also wait up to api_call_time_gap seconds. Kept as is
        # because it may be deliberate throttling - confirm.
        self.last_api_call_unix_time: int = int(time.time())
        # Minimum number of seconds between two successive API calls.
        self.api_call_time_gap: int = 5
        self.json: Optional[ResponseJSON] = None
        self.response: Optional[Response] = None

    def __repr__(self) -> str:
        """
        Same as string representation, just return the archive URL as a string.
        """
        return str(self)

    def __str__(self) -> str:
        """
        String representation of the class. If at least one API call was
        successfully made then return the archive URL as a string. Else
        returns "" (empty string literal).
        """
        # __str__ can not return anything other than a string object.
        # So, if a string repr is asked for even before making an API
        # request just return "".
        if not self.json:
            return ""

        return self.archive_url

    def setup_json(self) -> Optional[ResponseJSON]:
        """
        Makes the API call to the availability API, sets the JSON response
        on the json attribute of the instance and also returns it.

        time_diff and sleep_time make sure that you are not making too many
        requests in a short interval of time; making too many requests is
        bad as the Wayback Machine may reject them above a certain threshold.

        The end-user can change the api_call_time_gap attribute of the
        instance to change the default time gap between two successive API
        calls, but lowering it is not recommended as it raises the request
        rate.

        Raises InvalidJSONInAvailabilityAPIResponse if the response body
        can not be decoded as JSON.
        """
        time_diff = int(time.time()) - self.last_api_call_unix_time
        sleep_time = self.api_call_time_gap - time_diff

        if sleep_time > 0:
            time.sleep(sleep_time)

        # NOTE(review): no timeout is passed, so a stalled connection can
        # block indefinitely - consider adding one.
        self.response = requests.get(
            self.endpoint, params=self.payload, headers=self.headers
        )
        self.last_api_call_unix_time = int(time.time())
        self.tries += 1

        try:
            self.json = None if self.response is None else self.response.json()
        except json.decoder.JSONDecodeError as json_decode_error:
            raise InvalidJSONInAvailabilityAPIResponse(
                f"Response data:\n{self.response.text}"
            ) from json_decode_error

        return self.json

    def _closest_snapshot(self) -> Optional[Dict[str, Any]]:
        """
        Return the "closest" entry of "archived_snapshots" from the stored
        JSON response, or None when the response, the "archived_snapshots"
        key or the "closest" key is missing or empty.
        """
        snapshots = (self.json or {}).get("archived_snapshots") or {}
        return snapshots.get("closest") or None

    def timestamp(self) -> datetime:
        """
        Converts the timestamp from the JSON response to a datetime object.

        If the JSON attribute of the instance is None it implies that either
        the last API call failed or one was never made.

        Returns datetime.max when there is no JSON or when the JSON has no
        "archived_snapshots" key.

        Raises ValueError when "archived_snapshots" is present but no
        closest snapshot with a "timestamp" can be found in it.
        """
        if self.json is None or "archived_snapshots" not in self.json:
            return datetime.max

        closest = self._closest_snapshot()
        if closest is not None and "timestamp" in closest:
            return datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S")

        raise ValueError("Timestamp not found in the Availability API's JSON response.")

    @property
    def archive_url(self) -> str:
        """
        Reads the JSON response data and returns the archive URL if found
        (rewritten to use https); if not found, retries the API call until
        max_tries is reached and then raises
        ArchiveNotInAvailabilityAPIResponse.
        """
        # If the user didn't invoke oldest, newest or near but tries to access
        # the archive_url attribute then assume that they are fine with any
        # archive and invoke the oldest method.
        if not self.json:
            self.oldest()

        # Re-read the response after every call so that a successful lookup
        # stops the retries immediately and its URL is actually returned.
        closest = self._closest_snapshot()
        while closest is None and self.tries < self.max_tries:
            self.setup_json()  # It makes a new API call
            closest = self._closest_snapshot()

        # If max_tries is exhausted, then give up and
        # raise ArchiveNotInAvailabilityAPIResponse.
        if closest is None:
            response_text = "" if self.response is None else self.response.text
            raise ArchiveNotInAvailabilityAPIResponse(
                "Archive not found in the availability "
                "API response, the URL you requested may not have any archives "
                "yet. You may retry after some time or archive the webpage now.\n"
                f"Response data:\n{response_text}"
            )

        archive_url: str = closest["url"]
        return archive_url.replace(
            "http://web.archive.org/web/", "https://web.archive.org/web/", 1
        )

    def oldest(self) -> "WaybackMachineAvailabilityAPI":
        """
        Passes the date 1994-01-01 to near which should return the oldest
        archive because the Wayback Machine was started in May, 1996 and it
        is assumed that there would be no archive older than January 1, 1994.
        """
        return self.near(year=1994, month=1, day=1)

    def newest(self) -> "WaybackMachineAvailabilityAPI":
        """
        Passes the current UNIX time to near() for retrieving the newest
        archive from the availability API.

        Remember UNIX time is UTC and the Wayback Machine is also UTC based.
        """
        return self.near(unix_timestamp=int(time.time()))

    def near(
        self,
        year: Optional[int] = None,
        month: Optional[int] = None,
        day: Optional[int] = None,
        hour: Optional[int] = None,
        minute: Optional[int] = None,
        unix_timestamp: Optional[int] = None,
    ) -> "WaybackMachineAvailabilityAPI":
        """
        The most important method of this class; oldest() and newest() are
        dependent on it.

        It generates the timestamp based on the input, either by calling
        unix_timestamp_to_wayback_timestamp (when unix_timestamp is given)
        or wayback_timestamp with the date parts, where every part that was
        not supplied defaults to the current UTC value.

        Adds the timestamp to the payload dictionary, invokes the
        setup_json method to make the API call and finally returns the
        instance.
        """
        # Compare against None so that an explicit unix_timestamp of 0
        # (the epoch) is honoured instead of being treated as "not given".
        if unix_timestamp is not None:
            timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
        else:
            now = datetime.now(timezone.utc).timetuple()
            timestamp = wayback_timestamp(
                year=now.tm_year if year is None else year,
                month=now.tm_mon if month is None else month,
                day=now.tm_mday if day is None else day,
                hour=now.tm_hour if hour is None else hour,
                minute=now.tm_min if minute is None else minute,
            )

        self.payload["timestamp"] = timestamp
        self.setup_json()
        return self