Cdx based oldest newest and near (#159)
* implement oldest, newest and near methods in the CDX interface class; the CLI now uses the CDX methods instead of the availability API methods. * handle the methods derived from the closest parameter more efficiently and also handle exceptions gracefully. * update test code
This commit is contained in:
@ -1,4 +1,16 @@
|
||||
import random
|
||||
import string
|
||||
|
||||
import pytest
|
||||
|
||||
from waybackpy.cdx_api import WaybackMachineCDXServerAPI
|
||||
from waybackpy.exceptions import NoCDXRecordFound
|
||||
|
||||
|
||||
def rndstr(n: int) -> str:
    """Return a random string of ``n`` uppercase ASCII letters and digits.

    A non-positive ``n`` yields the empty string.
    """
    # Build the alphabet once instead of re-concatenating it per character.
    alphabet = string.ascii_uppercase + string.digits
    return "".join(random.choice(alphabet) for _ in range(n))
|
||||
|
||||
|
||||
def test_a() -> None:
|
||||
@ -90,3 +102,77 @@ def test_d() -> None:
|
||||
count += 1
|
||||
assert str(snapshot.archive_url).find("akamhy.github.io")
|
||||
assert count > 50
|
||||
|
||||
|
||||
def test_oldest() -> None:
    """Exercise ``cdx.oldest()`` for google.com with a ``statuscode:200`` filter.

    Checks the timestamp, urlkey, original URL and archive URL of the
    returned snapshot.
    NOTE(review): presumably this hits the live CDX server - confirm.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=user_agent,
        filters=["statuscode:200"],
    )
    oldest = cdx.oldest()
    assert "1998" in oldest.timestamp
    assert "google" in oldest.urlkey
    # Membership tests instead of ``.find(...) != -1``.
    assert "google.com" in oldest.original
    assert "google.com" in oldest.archive_url
|
||||
|
||||
|
||||
def test_newest() -> None:
    """Exercise ``cdx.newest()`` for google.com with a ``statuscode:200`` filter.

    Checks the urlkey, original URL and archive URL of the returned snapshot.
    The timestamp is not asserted.
    NOTE(review): presumably this hits the live CDX server - confirm.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=user_agent,
        filters=["statuscode:200"],
    )
    newest = cdx.newest()
    assert "google" in newest.urlkey
    # Membership tests instead of ``.find(...) != -1``.
    assert "google.com" in newest.original
    assert "google.com" in newest.archive_url
|
||||
|
||||
|
||||
def test_near() -> None:
    """Exercise ``cdx.near()`` with each supported way of giving the target time.

    The target time is passed as date components, as a Wayback Machine
    timestamp string, and as a Unix timestamp; every call must return a
    google.com snapshot whose timestamp contains ``2010101010``.  Finally,
    ``near()`` on a randomly generated URL must raise ``NoCDXRecordFound``.
    NOTE(review): presumably this hits the live CDX server - confirm.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=user_agent,
        filters=["statuscode:200"],
    )

    # Keyword-argument sets for near(), checked with identical assertions.
    near_queries = (
        {"year": 2010, "month": 10, "day": 10, "hour": 10, "minute": 10},
        {"wayback_machine_timestamp": "201010101010"},
        {"unix_timestamp": 1286705410},
    )
    for query in near_queries:
        near = cdx.near(**query)
        assert "2010101010" in near.timestamp
        assert "google" in near.urlkey
        assert "google.com" in near.original
        assert "google.com" in near.archive_url

    dne_url = f"https://{rndstr(30)}.in"
    missing_cdx = WaybackMachineCDXServerAPI(
        url=dne_url,
        user_agent=user_agent,
        filters=["statuscode:200"],
    )
    # Only the lookup itself is expected to raise; keeping setup outside the
    # context manager stops an unrelated failure from satisfying the check.
    with pytest.raises(NoCDXRecordFound):
        missing_cdx.near(unix_timestamp=1286705410)
|
||||
|
@ -41,3 +41,4 @@ def test_CDXSnapshot() -> None:
|
||||
)
|
||||
assert archive_url == snapshot.archive_url
|
||||
assert sample_input == str(snapshot)
|
||||
assert sample_input == repr(snapshot)
|
||||
|
@ -42,39 +42,6 @@ def test_near() -> None:
|
||||
)
|
||||
|
||||
|
||||
def test_json() -> None:
    """Run the CLI with ``--near ... --json`` for apple.com and check its output.

    The command must exit with code 0 and its output must contain the
    expected archive URL followed by the start of the JSON response.
    """
    runner = CliRunner()
    result = runner.invoke(
        main,
        [
            "--url",
            " https://apple.com ",
            "--near",
            "--year",
            "2010",
            "--month",
            "2",
            "--day",
            "8",
            "--hour",
            "12",
            "--json",
        ],
    )
    assert result.exit_code == 0
    # Continuation lines start at column 0 so that no indentation leaks into
    # the expected text.
    expected_fragment = """Archive URL:\nhttps://web.archive.org/web/2010020812\
5854/http://www.apple.com/\nJSON respons\
e:\n{"url": "https://apple.com", "archived_snapshots": {"close\
st": {"status": "200", "available": true, "url": "http://web.ar\
chive.org/web/20100208125854/http://www.apple.com/", "timest\
amp": "20100208125854"}}, "timestamp":"""
    # Membership test instead of ``.find(...) != -1``.
    assert expected_fragment in result.output
|
||||
|
||||
|
||||
def test_newest() -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["--url", " https://microsoft.com ", "--newest"])
|
||||
@ -145,7 +112,7 @@ def test_only_url() -> None:
|
||||
assert result.exit_code == 0
|
||||
assert (
|
||||
result.output
|
||||
== "Only URL passed, but did not specify what to do with the URL. Use \
|
||||
== "NoCommandFound: Only URL passed, but did not specify what to do with the URL. Use \
|
||||
--help flag for help using waybackpy.\n"
|
||||
)
|
||||
|
||||
|
Reference in New Issue
Block a user