2022-01-22 20:59:07 +01:00
|
|
|
|
import random
import string
from datetime import datetime, timedelta, timezone

import pytest

from waybackpy.availability_api import WaybackMachineAvailabilityAPI
from waybackpy.exceptions import (
    ArchiveNotInAvailabilityAPIResponse,
    InvalidJSONInAvailabilityAPIResponse,
)
|
|
|
|
|
|
|
|
|
|
# Reference "current time", captured once when the module is imported.
# It is deliberately kept naive (tzinfo stripped) but expressed in UTC, because
# the tests subtract it from the datetimes returned by timestamp(), which are
# compared against the naive ``datetime.max`` elsewhere in this file.
# ``datetime.utcnow()`` is deprecated since Python 3.12; this is its equivalent.
now = datetime.now(timezone.utc).replace(tzinfo=None)

# Default URL queried by the tests.
url = "https://example.com/"

# Browser-like User-Agent string passed to the availability API client.
user_agent = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
)
|
2022-01-22 20:59:07 +01:00
|
|
|
|
|
2022-02-03 14:43:39 +01:00
|
|
|
|
|
2022-02-04 19:23:36 +01:00
|
|
|
|
def rndstr(n: int) -> str:
    """
    Return a random string of ``n`` characters.

    Every character is drawn independently with ``random.choice`` from the
    uppercase ASCII letters plus the decimal digits. A non-positive ``n``
    produces an empty string.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = [random.choice(alphabet) for _ in range(n)]
    return "".join(picked)
|
2022-01-22 20:59:07 +01:00
|
|
|
|
|
|
|
|
|
|
2022-02-04 19:23:36 +01:00
|
|
|
|
def test_oldest() -> None:
    """
    Test the oldest archive of example.com and also check its attributes.

    Verifies the archive URL, the timestamp, the raw JSON payload and the
    repr()/str() forms of the object returned by ``oldest()``.
    """
    # The module-level ``url`` and ``user_agent`` already hold the values this
    # test needs, so they are reused instead of being redefined locally.
    availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
    oldest = availability_api.oldest()

    # "2002" is expected to appear in the archive URL (presumably the year of
    # the first snapshot encoded in the Wayback timestamp).
    oldest_archive_url = oldest.archive_url
    assert "2002" in oldest_archive_url

    oldest_timestamp = oldest.timestamp()
    assert abs(oldest_timestamp - now) > timedelta(days=7000)  # More than 19 years

    # The raw response must be stored on the client and flag the snapshot
    # as available.
    assert (
        availability_api.json is not None
        and availability_api.json["archived_snapshots"]["closest"]["available"] is True
    )

    assert "example.com" in repr(oldest)
    assert "2002" in str(oldest)
|
2022-01-22 20:59:07 +01:00
|
|
|
|
|
|
|
|
|
|
2022-02-04 19:23:36 +01:00
|
|
|
|
def test_newest() -> None:
    """
    Check that the newest archive of youtube.com is a recent one.

    The newest snapshot's timestamp must lie within three days
    (3 * 86400 seconds) of the module-level ``now``.
    """
    target = "https://www.youtube.com/"
    agent = "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0"
    api = WaybackMachineAvailabilityAPI(target, agent)

    newest_timestamp = api.newest().timestamp()

    # Betting that the latest YouTube archive is not older than three days:
    # high-traffic sites like YouTube are archived many times a day, so this
    # seems a very reasonable assumption.
    max_age = timedelta(days=3)
    assert abs(newest_timestamp - now) < max_age
|
2022-01-22 20:59:07 +01:00
|
|
|
|
|
|
|
|
|
|
2022-02-04 19:23:36 +01:00
|
|
|
|
def test_invalid_json() -> None:
    """
    When the API is malfunctioning or we don't pass a URL,
    it may return invalid JSON data.

    Accessing ``archive_url`` on a client built with an empty URL is expected
    to raise InvalidJSONInAvailabilityAPIResponse.
    """
    # Build the client outside the ``pytest.raises`` block so that only the
    # attribute access can satisfy the expected exception; an error raised
    # while constructing the client must not make the test pass.
    availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)
    with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
        _ = availability_api.archive_url
|
2022-01-22 20:59:07 +01:00
|
|
|
|
|
|
|
|
|
|
2022-02-04 19:23:36 +01:00
|
|
|
|
def test_no_archive() -> None:
    """
    ArchiveNotInAvailabilityAPIResponse may be raised if the Wayback Machine
    did not reply with the archive despite the fact that we know the site has
    millions of archives. The reason for this weird behavior is unknown.

    The same exception is raised when there really are no archives for the
    passed URL, which is the case exercised here with a random host name.
    """
    # Build the client outside the ``pytest.raises`` block so that only the
    # attribute access can satisfy the expected exception; an error raised
    # while constructing the client must not make the test pass.
    availability_api = WaybackMachineAvailabilityAPI(
        url=f"https://{rndstr(30)}.cn", user_agent=user_agent
    )
    with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
        _ = availability_api.archive_url
|
2022-01-22 20:59:07 +01:00
|
|
|
|
|
|
|
|
|
|
2022-02-04 19:23:36 +01:00
|
|
|
|
def test_no_api_call_str_repr() -> None:
    """
    Check the string form of a client that has not made any API request.

    str() must never return None, so an empty string is expected in that
    situation.
    """
    random_url = f"https://{rndstr(30)}.gov"
    api = WaybackMachineAvailabilityAPI(url=random_url, user_agent=user_agent)

    rendered = str(api)
    assert rendered == ""
|
2022-01-22 20:59:07 +01:00
|
|
|
|
|
|
|
|
|
|
2022-02-04 19:23:36 +01:00
|
|
|
|
def test_no_call_timestamp() -> None:
    """
    Check the default value of timestamp() before any API request.

    When no request has been made, the bound timestamp() method is expected
    to return ``datetime.max``.
    """
    random_url = f"https://{rndstr(30)}.in"
    api = WaybackMachineAvailabilityAPI(url=random_url, user_agent=user_agent)

    default_timestamp = api.timestamp()
    assert default_timestamp == datetime.max
|