Updated tests for availability_api.py and also added max_tries (default value is 3) with a delay (sleep) between successive API calls. The delay actually improves the performance of the availability_api interface.
This commit is contained in:
parent
e7488f3a3e
commit
c0252edff2
@ -11,19 +11,20 @@ from waybackpy.exceptions import (
|
|||||||
|
|
||||||
now = datetime.utcnow()
|
now = datetime.utcnow()
|
||||||
url = "https://google.com"
|
url = "https://google.com"
|
||||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
||||||
|
|
||||||
rndstr = lambda n: "".join(
|
rndstr = lambda n: "".join(
|
||||||
random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
|
random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
|
||||||
)
|
)
|
||||||
|
|
||||||
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
|
|
||||||
|
|
||||||
|
|
||||||
def test_oldest():
|
def test_oldest():
|
||||||
"""
|
"""
|
||||||
Test the oldest archive of Google.com and also checks the attributes.
|
Test the oldest archive of Google.com and also checks the attributes.
|
||||||
"""
|
"""
|
||||||
|
url = "http://google.com"
|
||||||
|
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
||||||
|
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
|
||||||
oldest = availability_api.oldest()
|
oldest = availability_api.oldest()
|
||||||
oldest_archive_url = oldest.archive_url
|
oldest_archive_url = oldest.archive_url
|
||||||
assert "1998" in oldest_archive_url
|
assert "1998" in oldest_archive_url
|
||||||
@ -39,9 +40,15 @@ def test_newest():
|
|||||||
Assuming that the recent most Google Archive was made no more earlier than
|
Assuming that the recent most Google Archive was made no more earlier than
|
||||||
last one day which is 86400 seconds.
|
last one day which is 86400 seconds.
|
||||||
"""
|
"""
|
||||||
|
url = "https://www.youtube.com/"
|
||||||
|
user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0"
|
||||||
|
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
|
||||||
newest = availability_api.newest()
|
newest = availability_api.newest()
|
||||||
newest_timestamp = newest.timestamp()
|
newest_timestamp = newest.timestamp()
|
||||||
assert abs(newest_timestamp - now) < timedelta(seconds=86400)
|
# betting in favor that latest youtube archive was not before the last 3 days
|
||||||
|
# high traffic sites like youtube are archived many times a day, so seems
|
||||||
|
# very reasonable to me.
|
||||||
|
assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3)
|
||||||
|
|
||||||
|
|
||||||
def test_invalid_json():
|
def test_invalid_json():
|
||||||
@ -64,7 +71,7 @@ def test_no_archive():
|
|||||||
"""
|
"""
|
||||||
with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
|
with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
|
||||||
availability_api = WaybackMachineAvailabilityAPI(
|
availability_api = WaybackMachineAvailabilityAPI(
|
||||||
url="https://%s.com" % rndstr(30), user_agent=user_agent
|
url="https://%s.cn" % rndstr(30), user_agent=user_agent
|
||||||
)
|
)
|
||||||
archive_url = availability_api.archive_url
|
archive_url = availability_api.archive_url
|
||||||
|
|
||||||
@ -77,7 +84,7 @@ def test_no_api_call_str_repr():
|
|||||||
str() must not return None so we return ""
|
str() must not return None so we return ""
|
||||||
"""
|
"""
|
||||||
availability_api = WaybackMachineAvailabilityAPI(
|
availability_api = WaybackMachineAvailabilityAPI(
|
||||||
url="https://%s.com" % rndstr(30), user_agent=user_agent
|
url="https://%s.gov" % rndstr(30), user_agent=user_agent
|
||||||
)
|
)
|
||||||
assert "" == str(availability_api)
|
assert "" == str(availability_api)
|
||||||
|
|
||||||
@ -88,6 +95,6 @@ def test_no_call_timestamp():
|
|||||||
the datetime.max as a default value.
|
the datetime.max as a default value.
|
||||||
"""
|
"""
|
||||||
availability_api = WaybackMachineAvailabilityAPI(
|
availability_api = WaybackMachineAvailabilityAPI(
|
||||||
url="https://%s.com" % rndstr(30), user_agent=user_agent
|
url="https://%s.in" % rndstr(30), user_agent=user_agent
|
||||||
)
|
)
|
||||||
assert datetime.max == availability_api.timestamp()
|
assert datetime.max == availability_api.timestamp()
|
||||||
|
@ -14,12 +14,16 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
Class that interfaces the availability API of the Wayback Machine.
|
Class that interfaces the availability API of the Wayback Machine.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
|
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
|
||||||
self.url = str(url).strip().replace(" ", "%20")
|
self.url = str(url).strip().replace(" ", "%20")
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
self.headers = {"User-Agent": self.user_agent}
|
self.headers = {"User-Agent": self.user_agent}
|
||||||
self.payload = {"url": "{url}".format(url=self.url)}
|
self.payload = {"url": "{url}".format(url=self.url)}
|
||||||
self.endpoint = "https://archive.org/wayback/available"
|
self.endpoint = "https://archive.org/wayback/available"
|
||||||
|
self.max_tries = max_tries
|
||||||
|
self.tries = 0
|
||||||
|
self.last_api_call_unix_time = int(time.time())
|
||||||
|
self.api_call_time_gap = 5
|
||||||
self.JSON = None
|
self.JSON = None
|
||||||
|
|
||||||
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
|
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
|
||||||
@ -53,9 +57,17 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
Makes the API call to the availability API can set the JSON response
|
Makes the API call to the availability API can set the JSON response
|
||||||
to the JSON attribute of the instance and also returns the JSON attribute.
|
to the JSON attribute of the instance and also returns the JSON attribute.
|
||||||
"""
|
"""
|
||||||
|
time_diff = int(time.time()) - self.last_api_call_unix_time
|
||||||
|
sleep_time = self.api_call_time_gap - time_diff
|
||||||
|
|
||||||
|
if sleep_time > 0:
|
||||||
|
time.sleep(sleep_time)
|
||||||
|
|
||||||
self.response = requests.get(
|
self.response = requests.get(
|
||||||
self.endpoint, params=self.payload, headers=self.headers
|
self.endpoint, params=self.payload, headers=self.headers
|
||||||
)
|
)
|
||||||
|
self.last_api_call_unix_time = int(time.time())
|
||||||
|
self.tries += 1
|
||||||
try:
|
try:
|
||||||
self.JSON = self.response.json()
|
self.JSON = self.response.json()
|
||||||
except json.decoder.JSONDecodeError:
|
except json.decoder.JSONDecodeError:
|
||||||
@ -100,11 +112,21 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
|
|
||||||
# If data is still not none then probably there are no
|
# If data is still not none then probably there are no
|
||||||
# archive for the requested URL.
|
# archive for the requested URL.
|
||||||
|
if not data or not data["archived_snapshots"]:
|
||||||
|
while (self.tries < self.max_tries) and (
|
||||||
|
not data or not data["archived_snapshots"]
|
||||||
|
):
|
||||||
|
self.json() # It makes a new API call
|
||||||
|
data = self.JSON # json() updated the value of JSON attribute
|
||||||
|
|
||||||
|
# Once we have exhausted the max_tries, we give up and
|
||||||
|
# raise exception.
|
||||||
|
|
||||||
if not data or not data["archived_snapshots"]:
|
if not data or not data["archived_snapshots"]:
|
||||||
raise ArchiveNotInAvailabilityAPIResponse(
|
raise ArchiveNotInAvailabilityAPIResponse(
|
||||||
"Archive not found in the availability "
|
"Archive not found in the availability "
|
||||||
+ "API response, maybe the URL you requested does not have any "
|
+ "API response, the URL you requested may not have any "
|
||||||
+ "archive yet. You may retry after some time or archive the webpage now."
|
+ "archives yet. You may retry after some time or archive the webpage now."
|
||||||
+ "\nResponse data:\n{response}".format(response=self.response.text)
|
+ "\nResponse data:\n{response}".format(response=self.response.text)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user