Updated tests for availability_api.py and also added max_tries (default value is 3) with a delay (sleep) between successive API calls. The delay actually improves the performance of the availability_api interface.

This commit is contained in:
Akash Mahanty 2022-01-23 15:05:10 +05:30
parent e7488f3a3e
commit c0252edff2
2 changed files with 43 additions and 14 deletions

View File

@ -11,19 +11,20 @@ from waybackpy.exceptions import (
now = datetime.utcnow() now = datetime.utcnow()
url = "https://google.com" url = "https://google.com"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
rndstr = lambda n: "".join( rndstr = lambda n: "".join(
random.choice(string.ascii_uppercase + string.digits) for _ in range(n) random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
) )
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
def test_oldest(): def test_oldest():
""" """
Test the oldest archive of Google.com and also checks the attributes. Test the oldest archive of Google.com and also checks the attributes.
""" """
url = "http://google.com"
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
oldest = availability_api.oldest() oldest = availability_api.oldest()
oldest_archive_url = oldest.archive_url oldest_archive_url = oldest.archive_url
assert "1998" in oldest_archive_url assert "1998" in oldest_archive_url
@ -39,9 +40,15 @@ def test_newest():
Assuming that the recent most Google Archive was made no more earlier than Assuming that the recent most Google Archive was made no more earlier than
last one day which is 86400 seconds. last one day which is 86400 seconds.
""" """
url = "https://www.youtube.com/"
user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0"
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
newest = availability_api.newest() newest = availability_api.newest()
newest_timestamp = newest.timestamp() newest_timestamp = newest.timestamp()
assert abs(newest_timestamp - now) < timedelta(seconds=86400) # betting in favor that latest youtube archive was not before the last 3 days
# high traffic sites like youtube are archived many times a day, so seems
# very reasonable to me.
assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3)
def test_invalid_json(): def test_invalid_json():
@ -64,7 +71,7 @@ def test_no_archive():
""" """
with pytest.raises(ArchiveNotInAvailabilityAPIResponse): with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
availability_api = WaybackMachineAvailabilityAPI( availability_api = WaybackMachineAvailabilityAPI(
url="https://%s.com" % rndstr(30), user_agent=user_agent url="https://%s.cn" % rndstr(30), user_agent=user_agent
) )
archive_url = availability_api.archive_url archive_url = availability_api.archive_url
@ -77,7 +84,7 @@ def test_no_api_call_str_repr():
str() must not return None so we return "" str() must not return None so we return ""
""" """
availability_api = WaybackMachineAvailabilityAPI( availability_api = WaybackMachineAvailabilityAPI(
url="https://%s.com" % rndstr(30), user_agent=user_agent url="https://%s.gov" % rndstr(30), user_agent=user_agent
) )
assert "" == str(availability_api) assert "" == str(availability_api)
@ -88,6 +95,6 @@ def test_no_call_timestamp():
the datetime.max as a default value. the datetime.max as a default value.
""" """
availability_api = WaybackMachineAvailabilityAPI( availability_api = WaybackMachineAvailabilityAPI(
url="https://%s.com" % rndstr(30), user_agent=user_agent url="https://%s.in" % rndstr(30), user_agent=user_agent
) )
assert datetime.max == availability_api.timestamp() assert datetime.max == availability_api.timestamp()

View File

@ -14,12 +14,16 @@ class WaybackMachineAvailabilityAPI:
Class that interfaces the availability API of the Wayback Machine. Class that interfaces the availability API of the Wayback Machine.
""" """
def __init__(self, url, user_agent=DEFAULT_USER_AGENT): def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
self.url = str(url).strip().replace(" ", "%20") self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent self.user_agent = user_agent
self.headers = {"User-Agent": self.user_agent} self.headers = {"User-Agent": self.user_agent}
self.payload = {"url": "{url}".format(url=self.url)} self.payload = {"url": "{url}".format(url=self.url)}
self.endpoint = "https://archive.org/wayback/available" self.endpoint = "https://archive.org/wayback/available"
self.max_tries = max_tries
self.tries = 0
self.last_api_call_unix_time = int(time.time())
self.api_call_time_gap = 5
self.JSON = None self.JSON = None
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp): def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
@ -53,9 +57,17 @@ class WaybackMachineAvailabilityAPI:
Makes the API call to the availability API can set the JSON response Makes the API call to the availability API can set the JSON response
to the JSON attribute of the instance and also returns the JSON attribute. to the JSON attribute of the instance and also returns the JSON attribute.
""" """
time_diff = int(time.time()) - self.last_api_call_unix_time
sleep_time = self.api_call_time_gap - time_diff
if sleep_time > 0:
time.sleep(sleep_time)
self.response = requests.get( self.response = requests.get(
self.endpoint, params=self.payload, headers=self.headers self.endpoint, params=self.payload, headers=self.headers
) )
self.last_api_call_unix_time = int(time.time())
self.tries += 1
try: try:
self.JSON = self.response.json() self.JSON = self.response.json()
except json.decoder.JSONDecodeError: except json.decoder.JSONDecodeError:
@ -101,12 +113,22 @@ class WaybackMachineAvailabilityAPI:
# If data is still not none then probably there are no # If data is still not none then probably there are no
# archive for the requested URL. # archive for the requested URL.
if not data or not data["archived_snapshots"]: if not data or not data["archived_snapshots"]:
raise ArchiveNotInAvailabilityAPIResponse( while (self.tries < self.max_tries) and (
"Archive not found in the availability " not data or not data["archived_snapshots"]
+ "API response, maybe the URL you requested does not have any " ):
+ "archive yet. You may retry after some time or archive the webpage now." self.json() # It makes a new API call
+ "\nResponse data:\n{response}".format(response=self.response.text) data = self.JSON # json() updated the value of JSON attribute
)
# If there is still no archive even after we have exhausted the max_tries,
# then we give up and raise an exception.
if not data or not data["archived_snapshots"]:
raise ArchiveNotInAvailabilityAPIResponse(
"Archive not found in the availability "
+ "API response, the URL you requested may not have any "
+ "archives yet. You may retry after some time or archive the webpage now."
+ "\nResponse data:\n{response}".format(response=self.response.text)
)
else: else:
archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = data["archived_snapshots"]["closest"]["url"]
archive_url = archive_url.replace( archive_url = archive_url.replace(