waybackpy/waybackpy/availability_api.py

199 lines
7.1 KiB
Python

import json
import time
from datetime import datetime, timezone

import requests

from .utils import DEFAULT_USER_AGENT
from .exceptions import (
    ArchiveNotInAvailabilityAPIResponse,
    InvalidJSONInAvailabilityAPIResponse,
)
class WaybackMachineAvailabilityAPI:
    """
    Class that interfaces the availability API of the Wayback Machine.

    An instance is bound to a single URL. Calling oldest(), newest() or near()
    queries the API and stores the decoded response on the JSON attribute;
    archive_url and timestamp() then read from that stored response.
    """

    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
        """
        Prepares the request parameters; no API call is made here.

        url: the URL to look up. It is converted to str, stripped, and its
            spaces are replaced by "%20".
        user_agent: value sent in the User-Agent request header.
        max_tries: archive_url stops retrying once the number of API calls
            made through this instance (the tries attribute) reaches this value.
        """
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
        self.headers = {"User-Agent": self.user_agent}
        self.payload = {"url": self.url}
        self.endpoint = "https://archive.org/wayback/available"
        self.max_tries = max_tries
        self.tries = 0
        # json() measures the gap from this value, so the first request is
        # rate limited relative to the moment the instance was created.
        self.last_api_call_unix_time = int(time.time())
        # Minimum number of seconds json() keeps between two API calls.
        self.api_call_time_gap = 5
        # Decoded body of the last successful API call, None until then.
        self.JSON = None
        # Response object of the last API call, None until json() is called.
        self.response = None

    @staticmethod
    def _closest_snapshot(data):
        """
        Returns the "closest" snapshot dictionary of a decoded API response, or
        None when the response is missing, is not a dictionary, or carries no
        snapshot.
        """
        if not isinstance(data, dict):
            return None
        snapshots = data.get("archived_snapshots")
        if not isinstance(snapshots, dict):
            return None
        return snapshots.get("closest") or None

    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
        """
        Converts Unix time to a 14 digit Wayback Machine timestamp
        (YYYYMMDDhhmmss) expressed in UTC.
        """
        # Timezone-aware replacement for the deprecated utcfromtimestamp();
        # the formatted output is the same.
        moment = datetime.fromtimestamp(int(unix_timestamp), tz=timezone.utc)
        return moment.strftime("%Y%m%d%H%M%S")

    def __repr__(self):
        """
        Same as string representation, just return the archive URL as a string.
        """
        return str(self)

    def __str__(self):
        """
        String representation of the class. If a response is stored on the JSON
        attribute then return the archive URL as a string, else return an
        empty string.

        Reading the archive URL may make further API calls and may raise
        ArchiveNotInAvailabilityAPIResponse, see archive_url.
        """
        # __str__ must return a str object, so when nothing has been fetched
        # yet an empty string is returned instead of None.
        if not self.JSON:
            return ""
        return self.archive_url

    def json(self):
        """
        Makes the API call to the availability API, sets the decoded response
        on the JSON attribute of the instance and also returns it.

        Sleeps first when fewer than api_call_time_gap seconds have passed
        since the previous call. Every call increments the tries attribute.

        Raises InvalidJSONInAvailabilityAPIResponse when the response body can
        not be decoded as JSON; the JSON attribute keeps its previous value in
        that case.
        """
        elapsed = int(time.time()) - self.last_api_call_unix_time
        wait = self.api_call_time_gap - elapsed
        if wait > 0:
            time.sleep(wait)

        # NOTE(review): no timeout is passed, so this call can block for as
        # long as the server keeps the connection open.
        self.response = requests.get(
            self.endpoint, params=self.payload, headers=self.headers
        )
        self.last_api_call_unix_time = int(time.time())
        self.tries += 1

        try:
            self.JSON = self.response.json()
        except ValueError as error:
            # ValueError is the common base of json.JSONDecodeError and of the
            # simplejson error raised by requests when simplejson is installed.
            raise InvalidJSONInAvailabilityAPIResponse(
                "Response data:\n{text}".format(text=self.response.text)
            ) from error

        return self.JSON

    def timestamp(self):
        """
        Converts the timestamp from the JSON response to a datetime object.

        If the JSON attribute is None, either the last API call failed or none
        was made. In that case, and also when the response holds no snapshot,
        datetime.max is returned.
        """
        closest = self._closest_snapshot(self.JSON)
        if not closest:
            return datetime.max
        return datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S")

    @property
    def archive_url(self):
        """
        Reads the JSON response data and returns the URL of the closest
        archive, always with the https scheme.

        If no response is stored yet, oldest() is invoked first. While the
        response holds no snapshot the API call is repeated until the tries
        attribute reaches max_tries.

        Raises ArchiveNotInAvailabilityAPIResponse if there is still no
        snapshot after that.
        """
        # If the user didn't use oldest, newest or near but accesses
        # archive_url, assume they are fine with any archive and fetch the
        # oldest one.
        if not self.JSON:
            self.oldest()
        # Read the attribute only after the call above so that a response it
        # just fetched is used instead of being requested a second time.
        data = self.JSON

        closest = self._closest_snapshot(data)
        while closest is None and self.tries < self.max_tries:
            self.json()  # makes a new API call and updates the JSON attribute
            data = self.JSON
            closest = self._closest_snapshot(data)

        if closest is None:
            # Response.__bool__ reflects the HTTP status, so compare with None
            # explicitly to decide whether a response exists at all.
            response_text = self.response.text if self.response is not None else ""
            raise ArchiveNotInAvailabilityAPIResponse(
                "Archive not found in the availability "
                "API response, the URL you requested may not have any "
                "archives yet. You may retry after some time or archive the webpage now."
                "\nResponse data:\n{response}".format(response=response_text)
            )

        return closest["url"].replace(
            "http://web.archive.org/web/", "https://web.archive.org/web/", 1
        )

    def wayback_timestamp(self, **kwargs):
        """
        Joins the year, month, day, hour and minute keyword arguments, each
        left-padded with zeros to at least two characters, into a
        YYYYMMDDhhmm timestamp. Seconds are not part of the result.

        All five keyword arguments are required, a missing one raises KeyError.
        """
        return "".join(
            str(kwargs[key]).zfill(2)
            for key in ("year", "month", "day", "hour", "minute")
        )

    def oldest(self):
        """
        Passing the year 1994 should return the oldest archive because
        wayback machine was started in May, 1996 and there should be no archive
        before the year 1994.
        """
        return self.near(year=1994)

    def newest(self):
        """
        Passing the current UNIX time should be sufficient to get the newest
        archive considering the API request-response time delay and also the
        database lags on Wayback machine.
        """
        return self.near(unix_timestamp=int(time.time()))

    def near(
        self,
        year=None,
        month=None,
        day=None,
        hour=None,
        minute=None,
        unix_timestamp=None,
    ):
        """
        The main method for this Class, oldest and newest methods are dependent
        on this method.

        It generates the timestamp either from unix_timestamp or from the
        calendar fields, where every field that is not given is filled in from
        the current UTC time. The timestamp is added to the payload dictionary,
        the json method is invoked to make the API call and the instance
        itself is returned.
        """
        # A unix_timestamp of 0 is treated like None here and falls through to
        # the calendar fields.
        if unix_timestamp:
            timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
        else:
            now = datetime.now(timezone.utc).timetuple()
            timestamp = self.wayback_timestamp(
                # 0 is not a valid year, month or day, so a falsy value means
                # that the field was not given.
                year=year if year else now.tm_year,
                month=month if month else now.tm_mon,
                day=day if day else now.tm_mday,
                # 0 is a valid hour and minute, only None means "not given".
                hour=hour if hour is not None else now.tm_hour,
                minute=minute if minute is not None else now.tm_min,
            )

        self.payload["timestamp"] = timestamp
        self.json()
        return self