Updated tests for availability_api.py and also added max_tries (default value is 3) with a delay (sleep) between successive API calls. The delay actually improves the performance of the availability_api interface.

2022-01-23 15:05:10 +05:30
parent e7488f3a3e
commit c0252edff2
2 changed files with 43 additions and 14 deletions
@@ -11,19 +11,20 @@ from waybackpy.exceptions import (

 now = datetime.utcnow()
 url = "https://google.com"
-user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
+user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"

 rndstr = lambda n: "".join(
    random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
 )

-availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
-

 def test_oldest():
    """
    Test the oldest archive of Google.com and also checks the attributes.
    """
+    url = "http://google.com"
+    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
+    availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
    oldest = availability_api.oldest()
    oldest_archive_url = oldest.archive_url
    assert "1998" in oldest_archive_url
@@ -39,9 +40,15 @@ def test_newest():
    Assuming that the recent most Google Archive was made no more earlier than
    last one day which is 86400 seconds.
    """
+    url = "https://www.youtube.com/"
+    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0"
+    availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
    newest = availability_api.newest()
    newest_timestamp = newest.timestamp()
-    assert abs(newest_timestamp - now) < timedelta(seconds=86400)
+    # Betting that the latest YouTube archive was made within the last 3 days;
+    # high-traffic sites like YouTube are archived many times a day, so this
+    # seems very reasonable.
+    assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3)


 def test_invalid_json():
@@ -64,7 +71,7 @@ def test_no_archive():
    """
    with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
        availability_api = WaybackMachineAvailabilityAPI(
-            url="https://%s.com" % rndstr(30), user_agent=user_agent
+            url="https://%s.cn" % rndstr(30), user_agent=user_agent
        )
        archive_url = availability_api.archive_url

@@ -77,7 +84,7 @@ def test_no_api_call_str_repr():
    str() must not return None so we return ""
    """
    availability_api = WaybackMachineAvailabilityAPI(
-        url="https://%s.com" % rndstr(30), user_agent=user_agent
+        url="https://%s.gov" % rndstr(30), user_agent=user_agent
    )
    assert "" == str(availability_api)

@@ -88,6 +95,6 @@ def test_no_call_timestamp():
    the datetime.max as a default value.
    """
    availability_api = WaybackMachineAvailabilityAPI(
-        url="https://%s.com" % rndstr(30), user_agent=user_agent
+        url="https://%s.in" % rndstr(30), user_agent=user_agent
    )
    assert datetime.max == availability_api.timestamp()
@@ -14,12 +14,16 @@ class WaybackMachineAvailabilityAPI:
    Class that interfaces the availability API of the Wayback Machine.
    """

-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
+    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
        self.headers = {"User-Agent": self.user_agent}
        self.payload = {"url": "{url}".format(url=self.url)}
        self.endpoint = "https://archive.org/wayback/available"
+        self.max_tries = max_tries
+        self.tries = 0
+        self.last_api_call_unix_time = int(time.time())
+        self.api_call_time_gap = 5
        self.JSON = None

    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
@@ -53,9 +57,17 @@ class WaybackMachineAvailabilityAPI:
        Makes the API call to the availability API can set the JSON response
        to the JSON attribute of the instance and also returns the JSON attribute.
        """
+        time_diff = int(time.time()) - self.last_api_call_unix_time
+        sleep_time = self.api_call_time_gap - time_diff
+
+        if sleep_time > 0:
+            time.sleep(sleep_time)
+
        self.response = requests.get(
            self.endpoint, params=self.payload, headers=self.headers
        )
+        self.last_api_call_unix_time = int(time.time())
+        self.tries += 1
        try:
            self.JSON = self.response.json()
        except json.decoder.JSONDecodeError:
@@ -100,11 +112,21 @@ class WaybackMachineAvailabilityAPI:

        # If data is still not none then probably there are no
        # archive for the requested URL.
+        if not data or not data["archived_snapshots"]:
+            while (self.tries < self.max_tries) and (
+                not data or not data["archived_snapshots"]
+            ):
+                self.json()  # It makes a new API call
+                data = self.JSON  # json() updated the value of JSON attribute
+
+            # If there is still no archive after exhausting max_tries, give up
+            # and raise an exception.
+
            if not data or not data["archived_snapshots"]:
                raise ArchiveNotInAvailabilityAPIResponse(
                    "Archive not found in the availability "
-                + "API response, maybe the URL you requested does not have any "
-                + "archive yet. You may retry after some time or archive the webpage now."
+                    + "API response, the URL you requested may not have any "
+                    + "archives yet. You may retry after some time or archive the webpage now."
                    + "\nResponse data:\n{response}".format(response=self.response.text)
                )
        else: