Implement the oldest, newest and near methods in the CDX interface class; the CLI now uses the CDX methods instead of the availability API methods.

This commit is contained in:
Akash Mahanty
2022-02-18 11:38:58 +05:30
parent f990b93f8a
commit 407d95cc24
9 changed files with 211 additions and 107 deletions

View File

@@ -6,49 +6,18 @@ import os
import random
import re
import string
from json import dumps
from typing import Any, Generator, List, Optional
import click
import requests
from . import __version__
from .availability_api import WaybackMachineAvailabilityAPI
from .cdx_api import WaybackMachineCDXServerAPI
from .exceptions import ArchiveNotInAvailabilityAPIResponse
from .save_api import WaybackMachineSaveAPI
from .utils import DEFAULT_USER_AGENT
from .wrapper import Url
def echo_availability_api(
    availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
) -> None:
    """
    Print the archive URL held by an availability API instance.

    Shared output routine for the near, oldest and newest lookups.

    When reading ``archive_url`` raises
    ``ArchiveNotInAvailabilityAPIResponse``, a "NO ARCHIVE FOUND" notice
    followed by the text of the error is written to stderr and nothing
    else is printed. Otherwise the archive URL is echoed and, if ``json``
    is true, the instance's ``json`` attribute is echoed as well after
    being serialised with ``dumps``.
    """
    api = availability_api_instance

    try:
        # The attribute access itself is what may raise, so it lives
        # inside the ``try``; the local is bound only for a truthy value.
        if api.archive_url:
            archive_url = api.archive_url
    except ArchiveNotInAvailabilityAPIResponse as error:
        not_found_notice = (
            "NO ARCHIVE FOUND - The requested URL is probably "
            "not yet archived or if the URL was recently archived then it is "
            "not yet available via the Wayback Machine's availability API "
            "because of database lag and should be available after some time."
        )
        click.echo(f"{not_found_notice}\nJSON response:\n{error!s}", err=True)
        return

    click.echo("Archive URL:")
    click.echo(archive_url)

    # Guard clause: the raw JSON is printed only on request.
    if not json:
        return
    click.echo("JSON response:")
    click.echo(dumps(api.json))
def handle_cdx(data: List[Any]) -> None:
"""
Handles the CDX CLI options and output format.
@@ -145,7 +114,8 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
file_name = f"{domain}-urls-{uid}.txt"
file_path = os.path.join(os.getcwd(), file_name)
if not os.path.isfile(file_path):
open(file_path, "w+", encoding="utf-8").close()
with open(file_path, "w+", encoding="utf-8") as file:
file.close()
with open(file_path, "a", encoding="utf-8") as file:
file.write(f"{url}\n")
@@ -199,13 +169,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
is_flag=True,
help="Retrieve the oldest archive of URL.",
)
@click.option(
"-j",
"--json",
default=False,
is_flag=True,
help="JSON data returned by the availability API.",
)
@click.option(
"-N",
"--near",
@@ -343,7 +306,6 @@ def main( # pylint: disable=no-value-for-parameter
show_license: bool,
newest: bool,
oldest: bool,
json: bool,
near: bool,
save: bool,
headers: bool,
@@ -403,25 +365,25 @@ def main( # pylint: disable=no-value-for-parameter
click.echo("No URL detected. Please provide an URL.", err=True)
elif oldest:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
availability_api.oldest()
echo_availability_api(availability_api, json)
cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
click.echo("Archive URL:")
click.echo(cdx_api.oldest().archive_url)
elif newest:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
availability_api.newest()
echo_availability_api(availability_api, json)
cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
click.echo("Archive URL:")
click.echo(cdx_api.newest().archive_url)
elif near:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
near_args = {}
keys = ["year", "month", "day", "hour", "minute"]
args_arr = [year, month, day, hour, minute]
for key, arg in zip(keys, args_arr):
if arg:
near_args[key] = arg
availability_api.near(**near_args)
echo_availability_api(availability_api, json)
click.echo("Archive URL:")
click.echo(cdx_api.near(**near_args).archive_url)
elif save:
save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)