Cdx based oldest newest and near (#159)

* implement oldest, newest and near methods in the CDX interface class; the CLI now uses the CDX methods instead of the availability API methods.

* handle the closest parameter derivative methods more efficiently and also handle exceptions gracefully.

* update test code
This commit is contained in:
Akash Mahanty 2022-02-18 13:17:40 +05:30 committed by GitHub
parent f990b93f8a
commit 4b218d35cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 248 additions and 106 deletions

View File

@ -1,4 +1,16 @@
import random
import string
import pytest
from waybackpy.cdx_api import WaybackMachineCDXServerAPI from waybackpy.cdx_api import WaybackMachineCDXServerAPI
from waybackpy.exceptions import NoCDXRecordFound
def rndstr(n: int) -> str:
    """
    Build a random string of ``n`` characters drawn from the uppercase
    ASCII letters and the decimal digits.
    """
    alphabet = string.ascii_uppercase + string.digits
    # One independent draw per output character.
    picked = [random.choice(alphabet) for _ in range(n)]
    return "".join(picked)
def test_a() -> None: def test_a() -> None:
@ -90,3 +102,77 @@ def test_d() -> None:
count += 1 count += 1
assert str(snapshot.archive_url).find("akamhy.github.io") assert str(snapshot.archive_url).find("akamhy.github.io")
assert count > 50 assert count > 50
def test_oldest() -> None:
    """
    oldest() on google.com (status 200 only) must yield a snapshot whose
    timestamp contains "1998" and whose URL fields point at google.com.
    """
    agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    api = WaybackMachineCDXServerAPI(
        url="google.com", user_agent=agent, filters=["statuscode:200"]
    )

    snapshot = api.oldest()

    assert "1998" in snapshot.timestamp
    assert "google" in snapshot.urlkey
    assert "google.com" in snapshot.original
    assert "google.com" in snapshot.archive_url
def test_newest() -> None:
    """
    newest() on google.com (status 200 only) must yield a snapshot whose
    urlkey and URL fields point at google.
    """
    agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    api = WaybackMachineCDXServerAPI(
        url="google.com", user_agent=agent, filters=["statuscode:200"]
    )

    snapshot = api.newest()

    assert "google" in snapshot.urlkey
    assert "google.com" in snapshot.original
    assert "google.com" in snapshot.archive_url
def test_near() -> None:
    """
    near() must resolve the same 2010-10-10 10:10 target whether it is given
    as date components, as a Wayback Machine timestamp or as a Unix
    timestamp, and must raise NoCDXRecordFound for a random URL.
    """
    agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    api = WaybackMachineCDXServerAPI(
        url="google.com", user_agent=agent, filters=["statuscode:200"]
    )

    # Three spellings of the same target moment, queried in this order.
    queries = (
        {"year": 2010, "month": 10, "day": 10, "hour": 10, "minute": 10},
        {"wayback_machine_timestamp": "201010101010"},
        {"unix_timestamp": 1286705410},
    )
    for query in queries:
        hit = api.near(**query)
        assert "2010101010" in hit.timestamp
        assert "google" in hit.urlkey
        assert "google.com" in hit.original
        assert "google.com" in hit.archive_url

    with pytest.raises(NoCDXRecordFound):
        # A random 30-character host name is used as a URL with no archives.
        missing_url = f"https://{rndstr(30)}.in"
        missing_api = WaybackMachineCDXServerAPI(
            url=missing_url, user_agent=agent, filters=["statuscode:200"]
        )
        missing_api.near(unix_timestamp=1286705410)

View File

@ -41,3 +41,4 @@ def test_CDXSnapshot() -> None:
) )
assert archive_url == snapshot.archive_url assert archive_url == snapshot.archive_url
assert sample_input == str(snapshot) assert sample_input == str(snapshot)
assert sample_input == repr(snapshot)

View File

@ -42,39 +42,6 @@ def test_near() -> None:
) )
def test_json() -> None:
runner = CliRunner()
result = runner.invoke(
main,
[
"--url",
" https://apple.com ",
"--near",
"--year",
"2010",
"--month",
"2",
"--day",
"8",
"--hour",
"12",
"--json",
],
)
assert result.exit_code == 0
assert (
result.output.find(
"""Archive URL:\nhttps://web.archive.org/web/2010020812\
5854/http://www.apple.com/\nJSON respons\
e:\n{"url": "https://apple.com", "archived_snapshots": {"close\
st": {"status": "200", "available": true, "url": "http://web.ar\
chive.org/web/20100208125854/http://www.apple.com/", "timest\
amp": "20100208125854"}}, "timestamp":"""
)
!= -1
)
def test_newest() -> None: def test_newest() -> None:
runner = CliRunner() runner = CliRunner()
result = runner.invoke(main, ["--url", " https://microsoft.com ", "--newest"]) result = runner.invoke(main, ["--url", " https://microsoft.com ", "--newest"])
@ -145,7 +112,7 @@ def test_only_url() -> None:
assert result.exit_code == 0 assert result.exit_code == 0
assert ( assert (
result.output result.output
== "Only URL passed, but did not specify what to do with the URL. Use \ == "NoCommandFound: Only URL passed, but did not specify what to do with the URL. Use \
--help flag for help using waybackpy.\n" --help flag for help using waybackpy.\n"
) )

View File

@ -32,7 +32,11 @@ from .exceptions import (
ArchiveNotInAvailabilityAPIResponse, ArchiveNotInAvailabilityAPIResponse,
InvalidJSONInAvailabilityAPIResponse, InvalidJSONInAvailabilityAPIResponse,
) )
from .utils import DEFAULT_USER_AGENT from .utils import (
DEFAULT_USER_AGENT,
unix_timestamp_to_wayback_timestamp,
wayback_timestamp,
)
ResponseJSON = Dict[str, Any] ResponseJSON = Dict[str, Any]
@ -58,14 +62,6 @@ class WaybackMachineAvailabilityAPI:
self.json: Optional[ResponseJSON] = None self.json: Optional[ResponseJSON] = None
self.response: Optional[Response] = None self.response: Optional[Response] = None
@staticmethod
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
"""
Converts Unix time to Wayback Machine timestamp, Wayback Machine
timestamp format is yyyyMMddhhmmss.
"""
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def __repr__(self) -> str: def __repr__(self) -> str:
""" """
Same as string representation, just return the archive URL as a string. Same as string representation, just return the archive URL as a string.
@ -194,17 +190,6 @@ class WaybackMachineAvailabilityAPI:
) )
return archive_url return archive_url
@staticmethod
def wayback_timestamp(**kwargs: int) -> str:
"""
Prepends zero before the year, month, day, hour and minute so that they
are conformable with the YYYYMMDDhhmmss Wayback Machine timestamp format.
"""
return "".join(
str(kwargs[key]).zfill(2)
for key in ["year", "month", "day", "hour", "minute"]
)
def oldest(self) -> "WaybackMachineAvailabilityAPI": def oldest(self) -> "WaybackMachineAvailabilityAPI":
""" """
Passes the date 1994-01-01 to near which should return the oldest archive Passes the date 1994-01-01 to near which should return the oldest archive
@ -245,10 +230,10 @@ class WaybackMachineAvailabilityAPI:
finally returns the instance. finally returns the instance.
""" """
if unix_timestamp: if unix_timestamp:
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
else: else:
now = datetime.utcnow().timetuple() now = datetime.utcnow().timetuple()
timestamp = self.wayback_timestamp( timestamp = wayback_timestamp(
year=now.tm_year if year is None else year, year=now.tm_year if year is None else year,
month=now.tm_mon if month is None else month, month=now.tm_mon if month is None else month,
day=now.tm_mday if day is None else day, day=now.tm_mday if day is None else day,

View File

@ -9,7 +9,9 @@ the snapshots are yielded as instances of the CDXSnapshot class.
""" """
from typing import Dict, Generator, List, Optional, cast import time
from datetime import datetime
from typing import Dict, Generator, List, Optional, Union, cast
from .cdx_snapshot import CDXSnapshot from .cdx_snapshot import CDXSnapshot
from .cdx_utils import ( from .cdx_utils import (
@ -21,8 +23,12 @@ from .cdx_utils import (
get_response, get_response,
get_total_pages, get_total_pages,
) )
from .exceptions import WaybackError from .exceptions import NoCDXRecordFound, WaybackError
from .utils import DEFAULT_USER_AGENT from .utils import (
DEFAULT_USER_AGENT,
unix_timestamp_to_wayback_timestamp,
wayback_timestamp,
)
class WaybackMachineCDXServerAPI: class WaybackMachineCDXServerAPI:
@ -185,6 +191,69 @@ class WaybackMachineCDXServerAPI:
payload["url"] = self.url payload["url"] = self.url
def near(
    self,
    year: Optional[int] = None,
    month: Optional[int] = None,
    day: Optional[int] = None,
    hour: Optional[int] = None,
    minute: Optional[int] = None,
    unix_timestamp: Optional[int] = None,
    wayback_machine_timestamp: Optional[Union[int, str]] = None,
) -> CDXSnapshot:
    """
    Fetch the single snapshot closest to a point in time.

    The target moment is taken from the first of these that is supplied:
    ``unix_timestamp``, then ``wayback_machine_timestamp``, then the
    ``year``/``month``/``day``/``hour``/``minute`` components. Any
    component left as None is filled in from the current UTC time.

    This method overwrites ``self.closest``, ``self.sort`` and
    ``self.limit`` on the instance before querying, and it can only
    return one snapshot. To get more than one, iterate ``snapshots()``
    instead.

    Raises NoCDXRecordFound when the query yields no snapshot at all.
    """
    # ``is not None`` so that the epoch itself (0) is honoured as a real
    # timestamp instead of silently falling through to "now".
    if unix_timestamp is not None:
        timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
    elif wayback_machine_timestamp:
        # An empty or zero value is treated as "not supplied".
        timestamp = str(wayback_machine_timestamp)
    else:
        # time.gmtime() gives the current UTC time as a struct_time
        # without going through the deprecated datetime.utcnow().
        now = time.gmtime()
        timestamp = wayback_timestamp(
            year=now.tm_year if year is None else year,
            month=now.tm_mon if month is None else month,
            day=now.tm_mday if day is None else day,
            hour=now.tm_hour if hour is None else hour,
            minute=now.tm_min if minute is None else minute,
        )

    self.closest = timestamp
    self.sort = "closest"
    self.limit = 1

    # Only the first yielded snapshot is needed.
    first_snapshot = next(self.snapshots(), None)
    if first_snapshot is None:
        raise NoCDXRecordFound(
            "Wayback Machine's CDX server did not return any records "
            + "for the query. The URL may not have any archives "
            + " on the Wayback Machine or the URL may have been recently "
            + "archived and is still not available on the CDX server."
        )
    return first_snapshot
def newest(self) -> CDXSnapshot:
    """
    Retrieve the newest archive by asking near() for the snapshot closest
    to the current UNIX time.
    Remember UNIX time is UTC and Wayback Machine is also UTC based.
    """
    current_unix_time = int(time.time())
    return self.near(unix_timestamp=current_unix_time)
def oldest(self) -> CDXSnapshot:
    """
    Retrieve the oldest archive by asking near() for the snapshot closest
    to 1994-01-01.

    The Wayback Machine was started in May, 1996, so it is assumed that no
    archive is older than January 1, 1994. The hour and minute are not
    given here, so near() fills them in from the current UTC time.
    """
    earliest_date = {"year": 1994, "month": 1, "day": 1}
    return self.near(**earliest_date)
def snapshots(self) -> Generator[CDXSnapshot, None, None]: def snapshots(self) -> Generator[CDXSnapshot, None, None]:
""" """
This function yields the CDX data lines as snapshots. This function yields the CDX data lines as snapshots.

View File

@ -73,6 +73,12 @@ class CDXSnapshot:
f"https://web.archive.org/web/{self.timestamp}/{self.original}" f"https://web.archive.org/web/{self.timestamp}/{self.original}"
) )
def __repr__(self) -> str:
    """
    The repr is identical to the string form produced by __str__().
    """
    return f"{self!s}"
def __str__(self) -> str: def __str__(self) -> str:
""" """
The string representation is same as the line returned by the The string representation is same as the line returned by the

View File

@ -6,47 +6,48 @@ import os
import random import random
import re import re
import string import string
from json import dumps from typing import Any, Dict, Generator, List, Optional
from typing import Any, Generator, List, Optional
import click import click
import requests import requests
from . import __version__ from . import __version__
from .availability_api import WaybackMachineAvailabilityAPI
from .cdx_api import WaybackMachineCDXServerAPI from .cdx_api import WaybackMachineCDXServerAPI
from .exceptions import ArchiveNotInAvailabilityAPIResponse from .exceptions import BlockedSiteError, NoCDXRecordFound
from .save_api import WaybackMachineSaveAPI from .save_api import WaybackMachineSaveAPI
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
from .wrapper import Url from .wrapper import Url
def echo_availability_api( def handle_cdx_closest_derivative_methods(
availability_api_instance: WaybackMachineAvailabilityAPI, json: bool cdx_api: "WaybackMachineCDXServerAPI",
oldest: bool,
near: bool,
newest: bool,
near_args: Optional[Dict[str, int]] = None,
) -> None: ) -> None:
""" """
Output for method that use the availability API. Handles the closest parameter derivative methods.
Near, oldest and newest output via this function.
near, newest and oldest use the closest parameter with active
closest based sorting.
""" """
try: try:
if availability_api_instance.archive_url: if near:
archive_url = availability_api_instance.archive_url if near_args:
except ArchiveNotInAvailabilityAPIResponse as error: archive_url = cdx_api.near(**near_args).archive_url
message = ( else:
"NO ARCHIVE FOUND - The requested URL is probably " archive_url = cdx_api.near().archive_url
+ "not yet archived or if the URL was recently archived then it is " elif newest:
+ "not yet available via the Wayback Machine's availability API " archive_url = cdx_api.newest().archive_url
+ "because of database lag and should be available after some time." elif oldest:
) archive_url = cdx_api.oldest().archive_url
click.echo(message + "\nJSON response:\n" + str(error), err=True)
return
click.echo("Archive URL:") click.echo("Archive URL:")
click.echo(archive_url) click.echo(archive_url)
if json: except NoCDXRecordFound as exc:
click.echo("JSON response:") click.echo(click.style("NoCDXRecordFound: ", fg="red") + str(exc), err=True)
click.echo(dumps(availability_api_instance.json)) except BlockedSiteError as exc:
click.echo(click.style("BlockedSiteError: ", fg="red") + str(exc), err=True)
def handle_cdx(data: List[Any]) -> None: def handle_cdx(data: List[Any]) -> None:
@ -145,7 +146,8 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
file_name = f"{domain}-urls-{uid}.txt" file_name = f"{domain}-urls-{uid}.txt"
file_path = os.path.join(os.getcwd(), file_name) file_path = os.path.join(os.getcwd(), file_name)
if not os.path.isfile(file_path): if not os.path.isfile(file_path):
open(file_path, "w+", encoding="utf-8").close() with open(file_path, "w+", encoding="utf-8") as file:
file.close()
with open(file_path, "a", encoding="utf-8") as file: with open(file_path, "a", encoding="utf-8") as file:
file.write(f"{url}\n") file.write(f"{url}\n")
@ -199,13 +201,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
is_flag=True, is_flag=True,
help="Retrieve the oldest archive of URL.", help="Retrieve the oldest archive of URL.",
) )
@click.option(
"-j",
"--json",
default=False,
is_flag=True,
help="JSON data returned by the availability API.",
)
@click.option( @click.option(
"-N", "-N",
"--near", "--near",
@ -343,7 +338,6 @@ def main( # pylint: disable=no-value-for-parameter
show_license: bool, show_license: bool,
newest: bool, newest: bool,
oldest: bool, oldest: bool,
json: bool,
near: bool, near: bool,
save: bool, save: bool,
headers: bool, headers: bool,
@ -400,28 +394,32 @@ def main( # pylint: disable=no-value-for-parameter
).text ).text
) )
elif url is None: elif url is None:
click.echo("No URL detected. Please provide an URL.", err=True) click.echo(
click.style("NoURLDetected: ", fg="red")
+ "No URL detected. "
+ "Please provide an URL.",
err=True,
)
elif oldest: elif oldest:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
availability_api.oldest() handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest)
echo_availability_api(availability_api, json)
elif newest: elif newest:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
availability_api.newest() handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest)
echo_availability_api(availability_api, json)
elif near: elif near:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
near_args = {} near_args = {}
keys = ["year", "month", "day", "hour", "minute"] keys = ["year", "month", "day", "hour", "minute"]
args_arr = [year, month, day, hour, minute] args_arr = [year, month, day, hour, minute]
for key, arg in zip(keys, args_arr): for key, arg in zip(keys, args_arr):
if arg: if arg:
near_args[key] = arg near_args[key] = arg
availability_api.near(**near_args) handle_cdx_closest_derivative_methods(
echo_availability_api(availability_api, json) cdx_api, oldest, near, newest, near_args=near_args
)
elif save: elif save:
save_api = WaybackMachineSaveAPI(url, user_agent=user_agent) save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
@ -463,9 +461,11 @@ def main( # pylint: disable=no-value-for-parameter
handle_cdx(data) handle_cdx(data)
else: else:
click.echo( click.echo(
"Only URL passed, but did not specify what to do with the URL. " click.style("NoCommandFound: ", fg="red")
"Use --help flag for help using waybackpy.", + "Only URL passed, but did not specify what to do with the URL. "
+ "Use --help flag for help using waybackpy.",
err=True, err=True,
) )

View File

@ -16,6 +16,14 @@ class WaybackError(Exception):
""" """
class NoCDXRecordFound(WaybackError):
    """
    The CDX server returned no records for a query.

    Raised by the near(), newest() and oldest() methods when the URL
    has no archives to return.
    """
class BlockedSiteError(WaybackError): class BlockedSiteError(WaybackError):
""" """
Raised when the archives for website/URLs that was excluded from Wayback Raised when the archives for website/URLs that was excluded from Wayback

View File

@ -2,8 +2,28 @@
Utility functions and shared variables like DEFAULT_USER_AGENT are here. Utility functions and shared variables like DEFAULT_USER_AGENT are here.
""" """
from datetime import datetime
from . import __version__ from . import __version__
DEFAULT_USER_AGENT: str = ( DEFAULT_USER_AGENT: str = (
f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
) )
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
    """
    Convert Unix time to a Wayback Machine timestamp.

    The Wayback Machine timestamp format is yyyyMMddhhmmss. The input is
    coerced with int(), and the result is expressed in UTC.
    """
    # Local import so this function does not rely on a file-level change.
    from datetime import timezone

    # Timezone-aware replacement for the deprecated
    # datetime.utcfromtimestamp(); the formatted digits are the same.
    moment = datetime.fromtimestamp(int(unix_timestamp), tz=timezone.utc)
    return moment.strftime("%Y%m%d%H%M%S")
def wayback_timestamp(**kwargs: int) -> str:
    """
    Zero-pad the year, month, day, hour and minute and join them into a
    YYYYMMDDhhmm string, the leading part of the YYYYMMDDhhmmss Wayback
    Machine timestamp format (seconds are not included).

    All five keyword arguments are required; a missing one raises KeyError.
    """
    # The year takes four digits and every other field takes two, so a
    # year below 1000 still produces a fixed-width timestamp.
    widths = (("year", 4), ("month", 2), ("day", 2), ("hour", 2), ("minute", 2))
    return "".join(str(kwargs[key]).zfill(width) for key, width in widths)