Cdx based oldest newest and near (#159)

* implement oldest newest and near methods in the cdx interface class, now cli uses the cdx methods instead of availablity api methods.

* handle the closest parameter derivative methods more efficiently and also handle exceptions gracefully.

* update test code
This commit is contained in:
Akash Mahanty
2022-02-18 13:17:40 +05:30
committed by GitHub
parent f990b93f8a
commit 4b218d35cb
9 changed files with 248 additions and 106 deletions

View File

@ -1,4 +1,16 @@
import random
import string
import pytest
from waybackpy.cdx_api import WaybackMachineCDXServerAPI
from waybackpy.exceptions import NoCDXRecordFound
def rndstr(n: int) -> str:
return "".join(
random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
)
def test_a() -> None:
@ -90,3 +102,77 @@ def test_d() -> None:
count += 1
assert str(snapshot.archive_url).find("akamhy.github.io")
assert count > 50
def test_oldest() -> None:
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
cdx = WaybackMachineCDXServerAPI(
url="google.com",
user_agent=user_agent,
filters=["statuscode:200"],
)
oldest = cdx.oldest()
assert "1998" in oldest.timestamp
assert "google" in oldest.urlkey
assert oldest.original.find("google.com") != -1
assert oldest.archive_url.find("google.com") != -1
def test_newest() -> None:
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
cdx = WaybackMachineCDXServerAPI(
url="google.com",
user_agent=user_agent,
filters=["statuscode:200"],
)
newest = cdx.newest()
assert "google" in newest.urlkey
assert newest.original.find("google.com") != -1
assert newest.archive_url.find("google.com") != -1
def test_near() -> None:
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
cdx = WaybackMachineCDXServerAPI(
url="google.com",
user_agent=user_agent,
filters=["statuscode:200"],
)
near = cdx.near(year=2010, month=10, day=10, hour=10, minute=10)
assert "2010101010" in near.timestamp
assert "google" in near.urlkey
assert near.original.find("google.com") != -1
assert near.archive_url.find("google.com") != -1
near = cdx.near(wayback_machine_timestamp="201010101010")
assert "2010101010" in near.timestamp
assert "google" in near.urlkey
assert near.original.find("google.com") != -1
assert near.archive_url.find("google.com") != -1
near = cdx.near(unix_timestamp=1286705410)
assert "2010101010" in near.timestamp
assert "google" in near.urlkey
assert near.original.find("google.com") != -1
assert near.archive_url.find("google.com") != -1
with pytest.raises(NoCDXRecordFound):
dne_url = f"https://{rndstr(30)}.in"
cdx = WaybackMachineCDXServerAPI(
url=dne_url,
user_agent=user_agent,
filters=["statuscode:200"],
)
cdx.near(unix_timestamp=1286705410)

View File

@ -41,3 +41,4 @@ def test_CDXSnapshot() -> None:
)
assert archive_url == snapshot.archive_url
assert sample_input == str(snapshot)
assert sample_input == repr(snapshot)

View File

@ -42,39 +42,6 @@ def test_near() -> None:
)
def test_json() -> None:
runner = CliRunner()
result = runner.invoke(
main,
[
"--url",
" https://apple.com ",
"--near",
"--year",
"2010",
"--month",
"2",
"--day",
"8",
"--hour",
"12",
"--json",
],
)
assert result.exit_code == 0
assert (
result.output.find(
"""Archive URL:\nhttps://web.archive.org/web/2010020812\
5854/http://www.apple.com/\nJSON respons\
e:\n{"url": "https://apple.com", "archived_snapshots": {"close\
st": {"status": "200", "available": true, "url": "http://web.ar\
chive.org/web/20100208125854/http://www.apple.com/", "timest\
amp": "20100208125854"}}, "timestamp":"""
)
!= -1
)
def test_newest() -> None:
runner = CliRunner()
result = runner.invoke(main, ["--url", " https://microsoft.com ", "--newest"])
@ -145,7 +112,7 @@ def test_only_url() -> None:
assert result.exit_code == 0
assert (
result.output
== "Only URL passed, but did not specify what to do with the URL. Use \
== "NoCommandFound: Only URL passed, but did not specify what to do with the URL. Use \
--help flag for help using waybackpy.\n"
)