From c0252edff284aa5fd9df063b17df8183cc7c9d65 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Sun, 23 Jan 2022 15:05:10 +0530 Subject: [PATCH] updated tests for availability_api.py and also added max_tries(default value is 3) with delay (sleep) between successive API calls. The delay actually improves the performance of the availability_api interface. --- tests/test_availability_api.py | 21 +++++++++++++------- waybackpy/availability_api.py | 36 +++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/tests/test_availability_api.py b/tests/test_availability_api.py index 3177061..2eb7cc2 100644 --- a/tests/test_availability_api.py +++ b/tests/test_availability_api.py @@ -11,19 +11,20 @@ from waybackpy.exceptions import ( now = datetime.utcnow() url = "https://google.com" -user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" +user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" rndstr = lambda n: "".join( random.choice(string.ascii_uppercase + string.digits) for _ in range(n) ) -availability_api = WaybackMachineAvailabilityAPI(url, user_agent) - def test_oldest(): """ Test the oldest archive of Google.com and also checks the attributes. """ + url = "http://google.com" + user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" + availability_api = WaybackMachineAvailabilityAPI(url, user_agent) oldest = availability_api.oldest() oldest_archive_url = oldest.archive_url assert "1998" in oldest_archive_url @@ -39,9 +40,15 @@ def test_newest(): Assuming that the recent most Google Archive was made no more earlier than last one day which is 86400 seconds. 
""" + url = "https://www.youtube.com/" + user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0" + availability_api = WaybackMachineAvailabilityAPI(url, user_agent) newest = availability_api.newest() newest_timestamp = newest.timestamp() - assert abs(newest_timestamp - now) < timedelta(seconds=86400) + # betting in favor that latest youtube archive was not before the last 3 days + # high tarffic sites like youtube are archived mnay times a day, so seems + # very reasonable to me. + assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3) def test_invalid_json(): @@ -64,7 +71,7 @@ def test_no_archive(): """ with pytest.raises(ArchiveNotInAvailabilityAPIResponse): availability_api = WaybackMachineAvailabilityAPI( - url="https://%s.com" % rndstr(30), user_agent=user_agent + url="https://%s.cn" % rndstr(30), user_agent=user_agent ) archive_url = availability_api.archive_url @@ -77,7 +84,7 @@ def test_no_api_call_str_repr(): str() must not return None so we return "" """ availability_api = WaybackMachineAvailabilityAPI( - url="https://%s.com" % rndstr(30), user_agent=user_agent + url="https://%s.gov" % rndstr(30), user_agent=user_agent ) assert "" == str(availability_api) @@ -88,6 +95,6 @@ def test_no_call_timestamp(): the datetime.max as a default value. """ availability_api = WaybackMachineAvailabilityAPI( - url="https://%s.com" % rndstr(30), user_agent=user_agent + url="https://%s.in" % rndstr(30), user_agent=user_agent ) assert datetime.max == availability_api.timestamp() diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 3c8dabb..72f247f 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -14,12 +14,16 @@ class WaybackMachineAvailabilityAPI: Class that interfaces the availability API of the Wayback Machine. 
""" - def __init__(self, url, user_agent=DEFAULT_USER_AGENT): + def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3): self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent self.headers = {"User-Agent": self.user_agent} self.payload = {"url": "{url}".format(url=self.url)} self.endpoint = "https://archive.org/wayback/available" + self.max_tries = max_tries + self.tries = 0 + self.last_api_call_unix_time = int(time.time()) + self.api_call_time_gap = 5 self.JSON = None def unix_timestamp_to_wayback_timestamp(self, unix_timestamp): @@ -53,9 +57,17 @@ class WaybackMachineAvailabilityAPI: Makes the API call to the availability API can set the JSON response to the JSON attribute of the instance and also returns the JSON attribute. """ + time_diff = int(time.time()) - self.last_api_call_unix_time + sleep_time = self.api_call_time_gap - time_diff + + if sleep_time > 0: + time.sleep(sleep_time) + self.response = requests.get( self.endpoint, params=self.payload, headers=self.headers ) + self.last_api_call_unix_time = int(time.time()) + self.tries += 1 try: self.JSON = self.response.json() except json.decoder.JSONDecodeError: @@ -101,12 +113,22 @@ class WaybackMachineAvailabilityAPI: # If data is still not none then probably there are no # archive for the requested URL. if not data or not data["archived_snapshots"]: - raise ArchiveNotInAvailabilityAPIResponse( - "Archive not found in the availability " - + "API response, maybe the URL you requested does not have any " - + "archive yet. You may retry after some time or archive the webpage now." - + "\nResponse data:\n{response}".format(response=self.response.text) - ) + while (self.tries < self.max_tries) and ( + not data or not data["archived_snapshots"] + ): + self.json() # It makes a new API call + data = self.JSON # json() updated the value of JSON attribute + + # Even if after we exhausted teh max_tries, then we give up and + # raise exception. 
+ + if not data or not data["archived_snapshots"]: + raise ArchiveNotInAvailabilityAPIResponse( + "Archive not found in the availability " + + "API response, the URL you requested may not have any " + + "archives yet. You may retry after some time or archive the webpage now." + + "\nResponse data:\n{response}".format(response=self.response.text) + ) else: archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = archive_url.replace(