Before and After methods (#175)

* Added before and after functions

* add tests

* formatting
This commit is contained in:
ArztKlein 2022-11-17 15:28:46 +13:00 committed by GitHub
parent 0202efd39d
commit 3b3e78d901
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 118 additions and 0 deletions

View File

@ -176,3 +176,39 @@ def test_near() -> None:
filters=["statuscode:200"], filters=["statuscode:200"],
) )
cdx.near(unix_timestamp=1286705410) cdx.near(unix_timestamp=1286705410)
def test_before() -> None:
    """
    Exercise ``WaybackMachineCDXServerAPI.before`` for http://www.google.com/
    (filtered to status code 200) and check the fields of the returned
    snapshot against the expected capture.
    """
    agent = " ".join(
        (
            "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4)",
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1",
        )
    )
    api = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=agent,
        filters=["statuscode:200"],
    )

    snapshot = api.before(wayback_machine_timestamp=20160731235949)

    # The capture expected immediately before 2016-07-31 23:59:49.
    assert "20160731233347" in snapshot.timestamp
    assert "google" in snapshot.urlkey
    assert "google.com" in snapshot.original
    assert "google.com" in snapshot.archive_url
def test_after() -> None:
    """
    Exercise ``WaybackMachineCDXServerAPI.after`` for http://www.google.com/
    (filtered to status code 200) and check the fields of the returned
    snapshot against the expected capture.
    """
    agent = " ".join(
        (
            "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4)",
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1",
        )
    )
    api = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=agent,
        filters=["statuscode:200"],
    )

    snapshot = api.after(wayback_machine_timestamp=20160731235949)

    # The capture expected immediately after 2016-07-31 23:59:49; the actual
    # timestamp is surfaced in the failure message to ease debugging.
    assert "20160801000917" in snapshot.timestamp, snapshot.timestamp
    assert "google" in snapshot.urlkey
    assert "google.com" in snapshot.original
    assert "google.com" in snapshot.archive_url

View File

@ -191,6 +191,88 @@ class WaybackMachineCDXServerAPI:
payload["url"] = self.url payload["url"] = self.url
def before(
    self,
    year: Optional[int] = None,
    month: Optional[int] = None,
    day: Optional[int] = None,
    hour: Optional[int] = None,
    minute: Optional[int] = None,
    unix_timestamp: Optional[int] = None,
    wayback_machine_timestamp: Optional[Union[int, str]] = None,
) -> CDXSnapshot:
    """
    Gets the nearest archive before the given datetime.

    The reference timestamp is resolved in this order of precedence:

    1. ``unix_timestamp`` when it is not ``None`` (``0``, the epoch itself,
       is accepted), converted with ``unix_timestamp_to_wayback_timestamp``.
    2. ``wayback_machine_timestamp`` when truthy, converted with ``str()``.
    3. ``year``/``month``/``day``/``hour``/``minute``; every field left as
       ``None`` is taken from the current UTC time.

    Side effects: overwrites ``self.closest`` (with the reference
    timestamp), ``self.sort`` (``"closest"``) and ``self.limit`` (``25000``)
    before iterating ``self.snapshots()``.

    Returns the first snapshot yielded by ``self.snapshots()`` whose
    ``timestamp`` compares strictly less than the reference timestamp.

    Raises ``NoCDXRecordFound`` when no yielded snapshot satisfies that
    comparison.
    """
    # ``is not None`` rather than truthiness: 0 is a valid Unix time and
    # must not silently fall through to the other branches.
    if unix_timestamp is not None:
        timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
    elif wayback_machine_timestamp:
        timestamp = str(wayback_machine_timestamp)
    else:
        now = datetime.utcnow().timetuple()
        timestamp = wayback_timestamp(
            year=now.tm_year if year is None else year,
            month=now.tm_mon if month is None else month,
            day=now.tm_mday if day is None else day,
            hour=now.tm_hour if hour is None else hour,
            minute=now.tm_min if minute is None else minute,
        )

    # NOTE(review): presumably "closest" makes the CDX server return
    # captures ordered by distance from ``self.closest``, so the first
    # match below is the nearest earlier one - confirm against CDX docs.
    self.closest = timestamp
    self.sort = "closest"
    self.limit = 25000

    for snapshot in self.snapshots():
        if snapshot.timestamp < timestamp:
            return snapshot

    # If a snapshot isn't returned, then none were found.
    raise NoCDXRecordFound(
        "No records were found before the given date for the query. "
        "Either there are no archives before the given date,"
        " the URL may not have any archives, or the URL may have been"
        " recently archived and is still not available on the CDX server."
    )
def after(
    self,
    year: Optional[int] = None,
    month: Optional[int] = None,
    day: Optional[int] = None,
    hour: Optional[int] = None,
    minute: Optional[int] = None,
    unix_timestamp: Optional[int] = None,
    wayback_machine_timestamp: Optional[Union[int, str]] = None,
) -> CDXSnapshot:
    """
    Gets the nearest archive after the given datetime.

    The reference timestamp is resolved in this order of precedence:

    1. ``unix_timestamp`` when it is not ``None`` (``0``, the epoch itself,
       is accepted), converted with ``unix_timestamp_to_wayback_timestamp``.
    2. ``wayback_machine_timestamp`` when truthy, converted with ``str()``.
    3. ``year``/``month``/``day``/``hour``/``minute``; every field left as
       ``None`` is taken from the current UTC time.

    Side effects: overwrites ``self.closest`` (with the reference
    timestamp), ``self.sort`` (``"closest"``) and ``self.limit`` (``25000``)
    before iterating ``self.snapshots()``.

    Returns the first snapshot yielded by ``self.snapshots()`` whose
    ``timestamp`` compares strictly greater than the reference timestamp.

    Raises ``NoCDXRecordFound`` when no yielded snapshot satisfies that
    comparison.
    """
    # ``is not None`` rather than truthiness: 0 is a valid Unix time and
    # must not silently fall through to the other branches.
    if unix_timestamp is not None:
        timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
    elif wayback_machine_timestamp:
        timestamp = str(wayback_machine_timestamp)
    else:
        now = datetime.utcnow().timetuple()
        timestamp = wayback_timestamp(
            year=now.tm_year if year is None else year,
            month=now.tm_mon if month is None else month,
            day=now.tm_mday if day is None else day,
            hour=now.tm_hour if hour is None else hour,
            minute=now.tm_min if minute is None else minute,
        )

    # NOTE(review): presumably "closest" makes the CDX server return
    # captures ordered by distance from ``self.closest``, so the first
    # match below is the nearest later one - confirm against CDX docs.
    self.closest = timestamp
    self.sort = "closest"
    self.limit = 25000

    for snapshot in self.snapshots():
        if snapshot.timestamp > timestamp:
            return snapshot

    # If a snapshot isn't returned, then none were found.
    raise NoCDXRecordFound(
        "No records were found after the given date for the query. "
        "Either there are no archives after the given date,"
        " the URL may not have any archives, or the URL may have been"
        " recently archived and is still not available on the CDX server."
    )
def near( def near(
self, self,
year: Optional[int] = None, year: Optional[int] = None,