updated tests for availability_api.py and also added max_tries(default value is 3) with delay (sleep) between successive API calls. The dealy actually improves the performace of the availability_api interface.

2022-01-23 15:05:10 +05:30
parent e7488f3a3e
commit c0252edff2
2 changed files with 43 additions and 14 deletions
--- a/waybackpy/availability_api.py
+++ b/waybackpy/availability_api.py
@@ -14,12 +14,16 @@ class WaybackMachineAvailabilityAPI:
    Class that interfaces the availability API of the Wayback Machine.
    """

-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
+    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
        self.headers = {"User-Agent": self.user_agent}
        self.payload = {"url": "{url}".format(url=self.url)}
        self.endpoint = "https://archive.org/wayback/available"
+        self.max_tries = max_tries
+        self.tries = 0
+        self.last_api_call_unix_time = int(time.time())
+        self.api_call_time_gap = 5
        self.JSON = None

    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
@@ -53,9 +57,17 @@ class WaybackMachineAvailabilityAPI:
        Makes the API call to the availability API can set the JSON response
        to the JSON attribute of the instance and also returns the JSON attribute.
        """
+        time_diff = int(time.time()) - self.last_api_call_unix_time
+        sleep_time = self.api_call_time_gap - time_diff
+
+        if sleep_time > 0:
+            time.sleep(sleep_time)
+
        self.response = requests.get(
            self.endpoint, params=self.payload, headers=self.headers
        )
+        self.last_api_call_unix_time = int(time.time())
+        self.tries += 1
        try:
            self.JSON = self.response.json()
        except json.decoder.JSONDecodeError:
@@ -101,12 +113,22 @@ class WaybackMachineAvailabilityAPI:
        # If data is still not none then probably there are no
        # archive for the requested URL.
        if not data or not data["archived_snapshots"]:
-            raise ArchiveNotInAvailabilityAPIResponse(
-                "Archive not found in the availability "
-                + "API response, maybe the URL you requested does not have any "
-                + "archive yet. You may retry after some time or archive the webpage now."
-                + "\nResponse data:\n{response}".format(response=self.response.text)
-            )
+            while (self.tries < self.max_tries) and (
+                not data or not data["archived_snapshots"]
+            ):
+                self.json()  # It makes a new API call
+                data = self.JSON  # json() updated the value of JSON attribute
+
+            # Even if after we exhausted teh max_tries, then we give up and
+            # raise exception.
+
+            if not data or not data["archived_snapshots"]:
+                raise ArchiveNotInAvailabilityAPIResponse(
+                    "Archive not found in the availability "
+                    + "API response, the URL you requested may not have any "
+                    + "archives yet. You may retry after some time or archive the webpage now."
+                    + "\nResponse data:\n{response}".format(response=self.response.text)
+                )
        else:
            archive_url = data["archived_snapshots"]["closest"]["url"]
            archive_url = archive_url.replace(