diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..9112cc3 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,11 @@ +[pytest] +addopts = + # show summary of all tests that did not pass + -ra + # enable all warnings + -Wd + # coverage and html report + --cov=waybackpy + --cov-report=html +testpaths = + tests diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..943b1a9 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +click +requests +pytest diff --git a/tests/test_availability_api.py b/tests/test_availability_api.py new file mode 100644 index 0000000..df161ea --- /dev/null +++ b/tests/test_availability_api.py @@ -0,0 +1,93 @@ +import pytest +import random +import string +from datetime import datetime, timedelta + +from waybackpy.availability_api import WaybackMachineAvailabilityAPI +from waybackpy.exceptions import ( + InvalidJSONInAvailabilityAPIResponse, + ArchiveNotInAvailabilityAPIResponse, +) + +now = datetime.utcnow() +url = "https://google.com" +user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" + +rndstr = lambda n: "".join( + random.choice(string.ascii_uppercase + string.digits) for _ in range(n) +) + +availability_api = WaybackMachineAvailabilityAPI(url, user_agent) + + +def test_oldest(): + """ + Test the oldest archive of Google.com and also check its attributes. + """ + oldest = availability_api.oldest() + oldest_archive_url = oldest.archive_url + assert "1998" in oldest_archive_url + oldest_timestamp = oldest.timestamp() + assert abs(oldest_timestamp - now) > timedelta(days=8400) # More than 20 years + assert availability_api.JSON["archived_snapshots"]["closest"]["available"] is True + assert "google.com" in repr(oldest) + assert "1998" in str(oldest) + + +def test_newest(): + """ + Assuming that the most recent Google archive was made no earlier than + one day ago, which is 86400 seconds. 
+ """ + newest = availability_api.newest() + newest_timestamp = newest.timestamp() + assert abs(newest_timestamp - now) < timedelta(seconds=86400) + + +def test_invalid_json(): + """ + When the API is malfunctioning or we don't pass a URL it may return invalid JSON data. + """ + with pytest.raises(InvalidJSONInAvailabilityAPIResponse): + availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent) + archive_url = availability_api.archive_url + + +def test_no_archive(): + """ + ArchiveNotInAvailabilityAPIResponse may be raised if Wayback Machine did not + reply with the archive despite the fact that we know the site has millions + of archives. Don't know the reason for this weird behavior. + + This exception is also raised if there really are no archives for the + passed URL. + """ + with pytest.raises(ArchiveNotInAvailabilityAPIResponse): + availability_api = WaybackMachineAvailabilityAPI( + url="https://%s.com" % rndstr(30), user_agent=user_agent + ) + archive_url = availability_api.archive_url + + +def test_no_api_call_str_repr(): + """ + Some users may want to see what the string representation is + if they don't make any API requests. + + str() must not return None so we return "" + """ + availability_api = WaybackMachineAvailabilityAPI( + url="https://%s.com" % rndstr(30), user_agent=user_agent + ) + assert "" == str(availability_api) + + +def test_no_call_timestamp(): + """ + If no API requests were made the bound timestamp() method returns + the datetime.max as a default value. 
+ """ + availability_api = WaybackMachineAvailabilityAPI( + url="https://%s.com" % rndstr(30), user_agent=user_agent + ) + assert datetime.max == availability_api.timestamp() diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 8d19211..3c8dabb 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -1,7 +1,12 @@ import time +import json import requests from datetime import datetime from .utils import DEFAULT_USER_AGENT +from .exceptions import ( + ArchiveNotInAvailabilityAPIResponse, + InvalidJSONInAvailabilityAPIResponse, +) class WaybackMachineAvailabilityAPI: @@ -34,8 +39,13 @@ class WaybackMachineAvailabilityAPI: String representation of the class. If atleast one API call was successfully made then return the archive URL as a string. Else returns None. """ + + # __str__ must not return anything other than a string object. + # So, if someone asks for the string repr before making any API request, + # just return "" if not self.JSON: - return None + return "" + return self.archive_url def json(self): @@ -46,7 +56,13 @@ class WaybackMachineAvailabilityAPI: self.response = requests.get( self.endpoint, params=self.payload, headers=self.headers ) - self.JSON = self.response.json() + try: + self.JSON = self.response.json() + except json.decoder.JSONDecodeError: + raise InvalidJSONInAvailabilityAPIResponse( + "Response data:\n{text}".format(text=self.response.text) + ) + return self.JSON def timestamp(self): @@ -76,8 +92,21 @@ class WaybackMachineAvailabilityAPI: """ data = self.JSON - if not data["archived_snapshots"]: - archive_url = None + # If the user didn't use oldest, newest or near but tries to access the + # archive_url attribute then we assume they are fine with any archive + # and invoke the oldest archive function. + if not data: + self.oldest() + + # If data is still empty or has no archived snapshots then there is + # probably no archive for the requested URL. 
+ if not data or not data["archived_snapshots"]: + raise ArchiveNotInAvailabilityAPIResponse( + "Archive not found in the availability " + + "API response, maybe the URL you requested does not have any " + + "archive yet. You may retry after some time or archive the webpage now." + + "\nResponse data:\n{response}".format(response=self.response.text) + ) else: archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = archive_url.replace( diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 3826629..8e75aea 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -38,3 +38,15 @@ class MaximumSaveRetriesExceeded(MaximumRetriesExceeded): """ MaximumSaveRetriesExceeded """ + + +class ArchiveNotInAvailabilityAPIResponse(WaybackError): + """ + Could not parse the archive in the JSON response of the availability API. + """ + + +class InvalidJSONInAvailabilityAPIResponse(WaybackError): + """ + availability api returned invalid JSON + """