* fix: CI yml name

* add: mypy configuration

* add: type annotation to waybackpy modules

* add: type annotation to test modules

* fix: mypy command

* add: types-requests to dev deps

* fix: disable max-line-length

* fix: move pytest.ini into setup.cfg

* add: urllib3 to deps

* fix: Retry (ref: https://github.com/python/typeshed/issues/6893)

* fix: f-string

* fix: shorten long lines

* add: staticmethod decorator to no-self-use methods

* fix: str(headers)->headers_str

* fix: error message

* fix: revert "str(headers)->headers_str" and ignore assignment CaseInsensitiveDict with str

* fix: mypy error
This commit is contained in:
eggplants
2022-02-05 03:23:36 +09:00
committed by GitHub
parent 320ef30371
commit d8cabdfdb5
22 changed files with 537 additions and 364 deletions

View File

@@ -1,6 +1,7 @@
import json
import time
from datetime import datetime
from typing import Any, Dict, Optional
import requests
@@ -10,37 +11,42 @@ from .exceptions import (
)
from .utils import DEFAULT_USER_AGENT
ResponseJSON = Dict[str, Any]
class WaybackMachineAvailabilityAPI:
class WaybackMachineAvailabilityAPI(object):
"""
Class that interfaces the availability API of the Wayback Machine.
"""
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent
self.headers = {"User-Agent": self.user_agent}
self.payload = {"url": "{url}".format(url=self.url)}
self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
self.payload = {"url": self.url}
self.endpoint = "https://archive.org/wayback/available"
self.max_tries = max_tries
self.tries = 0
self.last_api_call_unix_time = int(time.time())
self.api_call_time_gap = 5
self.JSON = None
self.JSON: Optional[ResponseJSON] = None
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
@staticmethod
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
"""
Converts Unix time to Wayback Machine timestamp.
"""
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def __repr__(self):
def __repr__(self) -> str:
"""
Same as string representation, just return the archive URL as a string.
"""
return str(self)
def __str__(self):
def __str__(self) -> str:
"""
String representation of the class. If at least one API call was successfully
made then return the archive URL as a string. Else returns None.
@@ -54,7 +60,7 @@ class WaybackMachineAvailabilityAPI:
return self.archive_url
def json(self):
def json(self) -> Optional[ResponseJSON]:
"""
Makes the API call to the availability API and sets the JSON response
to the JSON attribute of the instance and also returns the JSON attribute.
@@ -74,12 +80,12 @@ class WaybackMachineAvailabilityAPI:
self.JSON = self.response.json()
except json.decoder.JSONDecodeError:
raise InvalidJSONInAvailabilityAPIResponse(
"Response data:\n{text}".format(text=self.response.text)
f"Response data:\n{self.response.text}"
)
return self.JSON
def timestamp(self):
def timestamp(self) -> datetime:
"""
Converts the timestamp from the JSON response to a datetime object.
If JSON attribute of the instance is None it implies that the either
@@ -91,19 +97,29 @@ class WaybackMachineAvailabilityAPI:
If you get a URL as a response from the availability API it is guaranteed
that you can get the datetime object from the timestamp.
"""
if not self.JSON or not self.JSON["archived_snapshots"]:
if self.JSON is None or "archived_snapshots" not in self.JSON:
return datetime.max
return datetime.strptime(
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
elif (
self.JSON is not None
and "archived_snapshots" in self.JSON
and self.JSON["archived_snapshots"] is not None
and "closest" in self.JSON["archived_snapshots"]
and self.JSON["archived_snapshots"]["closest"] is not None
and "timestamp" in self.JSON["archived_snapshots"]["closest"]
):
return datetime.strptime(
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
else:
raise ValueError("Could not get timestamp from result")
@property
def archive_url(self):
def archive_url(self) -> str:
"""
Reads the JSON response data and tries to get the timestamp and returns
the timestamp if found else returns None.
"""
archive_url = ""
data = self.JSON
# If the user didn't use oldest, newest or near but tries to access the
@@ -127,9 +143,9 @@ class WaybackMachineAvailabilityAPI:
if not data or not data["archived_snapshots"]:
raise ArchiveNotInAvailabilityAPIResponse(
"Archive not found in the availability "
+ "API response, the URL you requested may not have any "
+ "archives yet. You may retry after some time or archive the webpage now."
+ "\nResponse data:\n{response}".format(response=self.response.text)
"API response, the URL you requested may not have any archives "
"yet. You may retry after some time or archive the webpage now.\n"
f"Response data:\n{self.response.text}"
)
else:
archive_url = data["archived_snapshots"]["closest"]["url"]
@@ -138,7 +154,8 @@ class WaybackMachineAvailabilityAPI:
)
return archive_url
def wayback_timestamp(self, **kwargs):
@staticmethod
def wayback_timestamp(**kwargs: int) -> str:
"""
Prepends zero before the year, month, day, hour and minute so that they
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
@@ -148,7 +165,7 @@ class WaybackMachineAvailabilityAPI:
for key in ["year", "month", "day", "hour", "minute"]
)
def oldest(self):
def oldest(self) -> "WaybackMachineAvailabilityAPI":
"""
Passing the year 1994 should return the oldest archive because
wayback machine was started in May, 1996 and there should be no archive
@@ -156,7 +173,7 @@ class WaybackMachineAvailabilityAPI:
"""
return self.near(year=1994)
def newest(self):
def newest(self) -> "WaybackMachineAvailabilityAPI":
"""
Passing the current UNIX time should be sufficient to get the newest
archive considering the API request-response time delay and also the
@@ -166,13 +183,13 @@ class WaybackMachineAvailabilityAPI:
def near(
self,
year=None,
month=None,
day=None,
hour=None,
minute=None,
unix_timestamp=None,
):
year: Optional[int] = None,
month: Optional[int] = None,
day: Optional[int] = None,
hour: Optional[int] = None,
minute: Optional[int] = None,
unix_timestamp: Optional[int] = None,
) -> "WaybackMachineAvailabilityAPI":
"""
The main method for this Class, oldest and newest methods are dependent on this
method.
@@ -181,18 +198,19 @@ class WaybackMachineAvailabilityAPI:
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
appropriate arguments for their respective parameters.
Adds the timestamp to the payload dictionary.
And finally invoking the json method to make the API call then returns the instance.
And finally invoking the json method to make the API call then returns
the instance.
"""
if unix_timestamp:
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
else:
now = datetime.utcnow().timetuple()
timestamp = self.wayback_timestamp(
year=year if year else now.tm_year,
month=month if month else now.tm_mon,
day=day if day else now.tm_mday,
hour=hour if hour else now.tm_hour,
minute=minute if minute else now.tm_min,
year=now.tm_year if year is None else year,
month=now.tm_mon if month is None else month,
day=now.tm_mday if day is None else day,
hour=now.tm_hour if hour is None else hour,
minute=now.tm_min if minute is None else minute,
)
self.payload["timestamp"] = timestamp