created pytest.ini, added test for waybackpy/availability_api.py, new exceptions all of which inherit from the main WaybackError and created requirements-dev.txt

This commit is contained in:
Akash Mahanty 2022-01-23 01:29:07 +05:30
parent 79901ba968
commit 1bacd73002
5 changed files with 152 additions and 4 deletions

11
pytest.ini Normal file
View File

@ -0,0 +1,11 @@
[pytest]
addopts =
# show summary of all tests that did not pass
-ra
# enable all warnings
-Wd
# coverage and html report
--cov=waybackpy
--cov-report=html
testpaths =
tests

3
requirements-dev.txt Normal file
View File

@ -0,0 +1,3 @@
click
requests
pytest
# Required by the --cov / --cov-report options set in pytest.ini
pytest-cov

View File

@ -0,0 +1,93 @@
import pytest
import random
import string
from datetime import datetime, timedelta
from waybackpy.availability_api import WaybackMachineAvailabilityAPI
from waybackpy.exceptions import (
InvalidJSONInAvailabilityAPIResponse,
ArchiveNotInAvailabilityAPIResponse,
)
# Reference instant shared by the timestamp assertions in this module.
# NOTE(review): this is a naive UTC datetime; the tests subtract it from the
# values returned by timestamp(), which presumably are naive as well - confirm
# before switching to a timezone-aware datetime.
now = datetime.utcnow()
url = "https://google.com"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"


def rndstr(n):
    """
    Return a random string made of ``n`` uppercase ASCII letters and digits.

    Used to build hostnames that are very unlikely to have any archive.
    """
    alphabet = string.ascii_uppercase + string.digits
    return "".join(random.choice(alphabet) for _ in range(n))
# Module-level API object for Google.com, shared by test_oldest and test_newest.
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
def test_oldest():
    """
    Fetch the oldest archive of Google.com, which dates from 1998, and check
    the attributes and string representations filled in by that call.
    """
    snapshot = availability_api.oldest()
    assert "1998" in snapshot.archive_url

    # The capture must be more than 8400 days (over 20 years) away from now.
    age = abs(snapshot.timestamp() - now)
    assert age > timedelta(days=8400)

    closest = availability_api.JSON["archived_snapshots"]["closest"]
    assert closest["available"] is True

    assert "google.com" in repr(snapshot)
    assert "1998" in str(snapshot)
def test_newest():
    """
    Google.com is assumed to be archived often enough that its newest
    archive is less than one day (86400 seconds) old.
    """
    latest = availability_api.newest()
    staleness = abs(latest.timestamp() - now)
    assert staleness < timedelta(seconds=86400)
def test_invalid_json():
    """
    When the API is malfunctioning or no URL is passed, the availability API
    may return invalid JSON; reading ``archive_url`` must then raise
    InvalidJSONInAvailabilityAPIResponse.
    """
    # Building the object performs no API request (see the no-call tests in
    # this module), so it stays outside the guarded block: only the attribute
    # access may satisfy the expected exception.
    empty_url_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)
    with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
        _ = empty_url_api.archive_url
def test_no_archive():
    """
    ArchiveNotInAvailabilityAPIResponse is raised when there really are no
    archives for the passed URL, which is what a random 30-character
    hostname is expected to trigger here.

    The same exception may also be raised when the Wayback Machine does not
    reply with an archive even though the site is known to have millions of
    them; the reason for that weird behavior is unknown.
    """
    # Building the object performs no API request (see the no-call tests in
    # this module), so it stays outside the guarded block: only the attribute
    # access may satisfy the expected exception.
    unarchived_api = WaybackMachineAvailabilityAPI(
        url="https://%s.com" % rndstr(30), user_agent=user_agent
    )
    with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
        _ = unarchived_api.archive_url
def test_no_api_call_str_repr():
    """
    Users may ask for the string representation before making any API
    request. str() must not return None, so an empty string is expected.
    """
    random_url = "https://%s.com" % rndstr(30)
    untouched_api = WaybackMachineAvailabilityAPI(
        url=random_url, user_agent=user_agent
    )
    assert str(untouched_api) == ""
def test_no_call_timestamp():
    """
    Without any API request, the bound timestamp() method falls back to
    datetime.max as its default value.
    """
    random_url = "https://%s.com" % rndstr(30)
    untouched_api = WaybackMachineAvailabilityAPI(
        url=random_url, user_agent=user_agent
    )
    assert untouched_api.timestamp() == datetime.max

View File

@ -1,7 +1,12 @@
import time
import json
import requests
from datetime import datetime
from .utils import DEFAULT_USER_AGENT
from .exceptions import (
ArchiveNotInAvailabilityAPIResponse,
InvalidJSONInAvailabilityAPIResponse,
)
class WaybackMachineAvailabilityAPI:
@ -34,8 +39,13 @@ class WaybackMachineAvailabilityAPI:
String representation of the class. If at least one API call was successfully
made then return the archive URL as a string. Otherwise return an empty string.
"""
# __str__ must not return anything other than a string object.
# So, if someone asks for the string repr before making any API request,
# just return ""
if not self.JSON:
return None
return ""
return self.archive_url
def json(self):
@ -46,7 +56,13 @@ class WaybackMachineAvailabilityAPI:
self.response = requests.get(
self.endpoint, params=self.payload, headers=self.headers
)
self.JSON = self.response.json()
try:
self.JSON = self.response.json()
except json.decoder.JSONDecodeError:
raise InvalidJSONInAvailabilityAPIResponse(
"Response data:\n{text}".format(text=self.response.text)
)
return self.JSON
def timestamp(self):
@ -76,8 +92,21 @@ class WaybackMachineAvailabilityAPI:
"""
data = self.JSON
if not data["archived_snapshots"]:
archive_url = None
# If the user didn't use oldest, newest or near but tries to access the
# archive_url attribute, then we assume they are fine with any archive
# and invoke the oldest archive function.
if not data:
self.oldest()
# If data is still empty, or it contains no archived snapshots, then
# there is probably no archive for the requested URL.
if not data or not data["archived_snapshots"]:
raise ArchiveNotInAvailabilityAPIResponse(
"Archive not found in the availability "
+ "API response, maybe the URL you requested does not have any "
+ "archive yet. You may retry after some time or archive the webpage now."
+ "\nResponse data:\n{response}".format(response=self.response.text)
)
else:
archive_url = data["archived_snapshots"]["closest"]["url"]
archive_url = archive_url.replace(

View File

@ -38,3 +38,15 @@ class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
"""
MaximumSaveRetriesExceeded
"""
class ArchiveNotInAvailabilityAPIResponse(WaybackError):
    """
    Raised when no archive could be found in the JSON response
    of the availability API.
    """
class InvalidJSONInAvailabilityAPIResponse(WaybackError):
    """
    Raised when the availability API returns a response
    that is not valid JSON.
    """