* Update README.md * Update README.md * update asciinema link * v3.0.4 * update video link
475 lines
13 KiB
Python
475 lines
13 KiB
Python
"""
|
|
Module responsible for enabling waybackpy to function as a CLI tool.
|
|
"""
|
|
|
|
import os
|
|
import random
|
|
import re
|
|
import string
|
|
from typing import Any, Dict, Generator, List, Optional
|
|
|
|
import click
|
|
import requests
|
|
|
|
from . import __version__
|
|
from .cdx_api import WaybackMachineCDXServerAPI
|
|
from .exceptions import BlockedSiteError, NoCDXRecordFound
|
|
from .save_api import WaybackMachineSaveAPI
|
|
from .utils import DEFAULT_USER_AGENT
|
|
from .wrapper import Url
|
|
|
|
|
|
def handle_cdx_closest_derivative_methods(
    cdx_api: "WaybackMachineCDXServerAPI",
    oldest: bool,
    near: bool,
    newest: bool,
    near_args: Optional[Dict[str, int]] = None,
) -> None:
    """
    Handles the closest parameter derivative methods.

    near, newest and oldest use the closest parameter with active
    closest based sorting.

    Exactly one lookup is made on ``cdx_api``; when several flags are set
    the precedence is ``near``, then ``newest``, then ``oldest``. The
    ``archive_url`` of the result is echoed to stdout. ``NoCDXRecordFound``
    and ``BlockedSiteError`` are reported on stderr instead of propagating.

    :param cdx_api: object providing ``near()``, ``newest()`` and ``oldest()``.
    :param oldest: select ``cdx_api.oldest()``.
    :param near: select ``cdx_api.near()``.
    :param newest: select ``cdx_api.newest()``.
    :param near_args: keyword arguments forwarded to ``cdx_api.near()``;
        ignored when empty or ``None``.
    :raises ValueError: if none of ``near``, ``newest`` or ``oldest`` is set.
    """
    # Without this guard no branch below would bind a result and the function
    # would fail later with an unhelpful UnboundLocalError.
    if not (near or newest or oldest):
        raise ValueError("One of 'near', 'newest' or 'oldest' must be set.")

    try:
        if near:
            snapshot = cdx_api.near(**near_args) if near_args else cdx_api.near()
        elif newest:
            snapshot = cdx_api.newest()
        else:
            snapshot = cdx_api.oldest()
        archive_url = snapshot.archive_url
    except NoCDXRecordFound as exc:
        click.echo(click.style("NoCDXRecordFound: ", fg="red") + str(exc), err=True)
    except BlockedSiteError as exc:
        click.echo(click.style("BlockedSiteError: ", fg="red") + str(exc), err=True)
    else:
        # Only reached when the lookup succeeded.
        click.echo("Archive URL:")
        click.echo(archive_url)
|
|
|
|
|
|
def handle_cdx(data: List[Any]) -> None:
    """
    Handles the CDX CLI options and output format.

    ``data`` is a positional list whose first 13 items are, in order: url,
    user_agent, start_timestamp, end_timestamp, cdx_filter, collapse,
    cdx_print, limit, gzip, match_type, sort, use_pagination and closest.
    A list shorter than that raises ``ValueError``.

    Every snapshot returned by the CDX API object is echoed to stdout. When
    ``cdx_print`` is empty the snapshot itself is echoed; otherwise only the
    requested fields are echoed, space separated, always in the fixed order
    urlkey, timestamp, original, mimetype, statuscode, digest, length,
    archive_url (the order given on the command line is not used).
    """
    (
        url,
        user_agent,
        start_timestamp,
        end_timestamp,
        cdx_filter,
        collapse,
        cdx_print,
        limit,
        gzip,
        match_type,
        sort,
        use_pagination,
        closest,
    ) = data[:13]

    cdx_api = WaybackMachineCDXServerAPI(
        url,
        user_agent=user_agent,
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        closest=closest,
        filters=list(cdx_filter),
        match_type=match_type,
        sort=sort,
        use_pagination=use_pagination,
        gzip=gzip,
        collapses=list(collapse),
        limit=limit,
    )

    requested = list(cdx_print)

    # Snapshot attribute -> spellings accepted for it in ``cdx_print``.
    # The tuple order defines the output column order.
    field_aliases = (
        ("urlkey", ("urlkey", "url-key", "url_key")),
        ("timestamp", ("timestamp", "time-stamp", "time_stamp")),
        ("original", ("original",)),
        ("mimetype", ("mimetype", "mime-type", "mime_type")),
        ("statuscode", ("statuscode", "status-code", "status_code")),
        ("digest", ("digest",)),
        ("length", ("length",)),
        ("archive_url", ("archiveurl", "archive-url", "archive_url")),
    )

    # The selection does not depend on the snapshot, so resolve it once
    # instead of re-checking every alias for every record.
    selected_attrs = [
        attr
        for attr, aliases in field_aliases
        if any(alias in requested for alias in aliases)
    ]

    for snapshot in cdx_api.snapshots():
        if not requested:
            click.echo(snapshot)
        else:
            # Unknown names in ``cdx_print`` select nothing, which yields an
            # empty line per snapshot.
            click.echo(" ".join(getattr(snapshot, attr) for attr in selected_attrs))
|
|
|
|
|
|
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
    """
    Save output of CDX API on file.

    Mainly here because of backwards compatibility.

    Every URL produced by ``url_gen`` is appended, one per line, to
    ``<domain>-urls-<uid>.txt`` in the current working directory and echoed
    to stdout. ``<domain>`` is the host captured from the first URL
    (``domain-unknown`` when the first URL does not match) and ``<uid>`` is
    six random lowercase letters/digits. A summary line is echoed at the
    end; when ``url_gen`` yields nothing, no file is created and a hint is
    echoed instead.
    """
    urls = iter(url_gen)
    first_url = next(urls, None)
    if first_url is None:
        click.echo("No known URLs found. Please try a different input!")
        return

    # The file name is derived from the first URL only.
    match = re.search(r"https?://([A-Za-z_0-9.-]+).*", first_url)
    domain = match.group(1) if match else "domain-unknown"

    sys_random = random.SystemRandom()
    uid = "".join(
        sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )
    file_name = f"{domain}-urls-{uid}.txt"
    file_path = os.path.join(os.getcwd(), file_name)

    url_count = 0
    # Open once instead of once per URL. Mode "a" creates the file when it is
    # missing, and line buffering (buffering=1) pushes each URL out as soon
    # as it is written, so an interrupted run still leaves the URLs seen so
    # far on disk.
    with open(file_path, "a", encoding="utf-8", buffering=1) as out_file:
        # First the URL already pulled from the generator, then the rest.
        for batch in ((first_url,), urls):
            for url in batch:
                out_file.write(f"{url}\n")
                click.echo(url)
                url_count += 1

    click.echo(
        f"\n\n{url_count} URLs saved inside '{file_name}' in the current "
        + "working directory."
    )
|
|
|
|
|
|
# CLI entry point. The docstring of ``main`` is user-facing text (click shows
# it for --help), so developer notes live in comments instead.
#
# Exactly one action runs per invocation, chosen in this order:
# --version, --show-license, (error when no URL), --oldest / --newest,
# --near, --save, --known-urls, --cdx, otherwise a "NoCommandFound" error.
@click.command()
@click.option(
    "-u", "--url", help="URL on which Wayback machine operations are to be performed."
)
@click.option(
    "-ua",
    "--user-agent",
    "--user_agent",
    default=DEFAULT_USER_AGENT,
    help=f"User agent, default value is '{DEFAULT_USER_AGENT}'.",
)
@click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.")
# NOTE(review): "-l" is declared again by --limit further down; confirm which
# of the two options the short flag actually reaches.
@click.option(
    "-l",
    "--show-license",
    "--show_license",
    "--license",
    is_flag=True,
    default=False,
    help="Show license of Waybackpy.",
)
@click.option(
    "-n",
    "--newest",
    "-au",
    "--archive_url",
    "--archive-url",
    default=False,
    is_flag=True,
    help="Retrieve the newest archive of URL.",
)
@click.option(
    "-o",
    "--oldest",
    default=False,
    is_flag=True,
    help="Retrieve the oldest archive of URL.",
)
@click.option(
    "-N",
    "--near",
    default=False,
    is_flag=True,
    help="Archive close to a specified time.",
)
@click.option("-Y", "--year", type=click.IntRange(1994, 9999), help="Year in integer.")
@click.option("-M", "--month", type=click.IntRange(1, 12), help="Month in integer.")
@click.option("-D", "--day", type=click.IntRange(1, 31), help="Day in integer.")
# NOTE(review): the upper bounds below admit hour 24 and minute 60; confirm
# whether 23 and 59 were intended before tightening them.
@click.option("-H", "--hour", type=click.IntRange(0, 24), help="Hour in integer.")
@click.option("-MIN", "--minute", type=click.IntRange(0, 60), help="Minute in integer.")
@click.option(
    "-s",
    "--save",
    default=False,
    is_flag=True,
    help="Save the specified URL's webpage and print the archive URL.",
)
@click.option(
    "-h",
    "--headers",
    default=False,
    is_flag=True,
    help="Headers data of the SavePageNow API.",
)
@click.option(
    "-ku",
    "--known-urls",
    "--known_urls",
    default=False,
    is_flag=True,
    help="List known URLs. Uses CDX API.",
)
@click.option(
    "-sub",
    "--subdomain",
    default=False,
    is_flag=True,
    help="Use with '--known_urls' to include known URLs for subdomains.",
)
# NOTE(review): "-f" is declared again by --cdx-filter further down; confirm
# which of the two options the short flag actually reaches.
@click.option(
    "-f",
    "--file",
    default=False,
    is_flag=True,
    help="Use with '--known_urls' to save the URLs in file at current directory.",
)
@click.option(
    "--cdx",
    default=False,
    is_flag=True,
    help="Flag for using CDX API.",
)
# NOTE(review): "-st" is declared again by --sort further down; confirm which
# of the two options the short flag actually reaches.
@click.option(
    "-st",
    "--start-timestamp",
    "--start_timestamp",
    "--from",
    help="Start timestamp for CDX API in yyyyMMddhhmmss format.",
)
@click.option(
    "-et",
    "--end-timestamp",
    "--end_timestamp",
    "--to",
    help="End timestamp for CDX API in yyyyMMddhhmmss format.",
)
@click.option(
    "-C",
    "--closest",
    help="Archive that are closest the timestamp passed as arguments to this "
    + "parameter.",
)
@click.option(
    "-f",
    "--cdx-filter",
    "--cdx_filter",
    "--filter",
    multiple=True,
    help="Filter on a specific field or all the CDX fields.",
)
@click.option(
    "-mt",
    "--match-type",
    "--match_type",
    help="The default behavior is to return matches for an exact URL. "
    + "However, the CDX server can also return results matching a certain prefix, "
    + "a certain host, or all sub-hosts by using the match_type",
)
@click.option(
    "-st",
    "--sort",
    help="Choose one from default, closest or reverse. It returns sorted CDX entries "
    + "in the response.",
)
@click.option(
    "-up",
    "--use-pagination",
    "--use_pagination",
    default=False,
    is_flag=True,
    help="Use the pagination API of the CDX server instead of the default one.",
)
@click.option(
    "-gz",
    "--gzip",
    help="To disable gzip compression pass false as argument to this parameter. "
    + "The default behavior is gzip compression enabled.",
)
@click.option(
    "-c",
    "--collapse",
    multiple=True,
    help="Filtering or 'collapse' results based on a field, or a substring of a field.",
)
@click.option(
    "-l",
    "--limit",
    help="Number of maximum record that CDX API is asked to return per API call, "
    + "default value is 25000 records.",
)
@click.option(
    "-cp",
    "--cdx-print",
    "--cdx_print",
    multiple=True,
    help="Print only certain fields of the CDX API response, "
    + "if this parameter is not used then the plain text response of the CDX API "
    + "will be printed.",
)
def main(  # pylint: disable=no-value-for-parameter
    user_agent: str,
    version: bool,
    show_license: bool,
    newest: bool,
    oldest: bool,
    near: bool,
    save: bool,
    headers: bool,
    known_urls: bool,
    subdomain: bool,
    file: bool,
    cdx: bool,
    use_pagination: bool,
    cdx_filter: List[str],
    collapse: List[str],
    cdx_print: List[str],
    url: Optional[str] = None,
    year: Optional[int] = None,
    month: Optional[int] = None,
    day: Optional[int] = None,
    hour: Optional[int] = None,
    minute: Optional[int] = None,
    start_timestamp: Optional[str] = None,
    end_timestamp: Optional[str] = None,
    closest: Optional[str] = None,
    match_type: Optional[str] = None,
    sort: Optional[str] = None,
    gzip: Optional[str] = None,
    limit: Optional[str] = None,
) -> None:
    """\b
    _ _
    | | | |
    __ ____ _ _ _| |__ __ _ ___| | ___ __ _ _
    \\ \\ /\\ / / _` | | | | '_ \\ / _` |/ __| |/ / '_ \\| | | |
    \\ V V / (_| | |_| | |_) | (_| | (__| <| |_) | |_| |
    \\_/\\_/ \\__,_|\\__, |_.__/ \\__,_|\\___|_|\\_\\ .__/ \\__, |
    __/ | | | __/ |
    |___/ |_| |___/

    Python package & CLI tool that interfaces the Wayback Machine APIs

    Repository: https://github.com/akamhy/waybackpy

    Documentation: https://github.com/akamhy/waybackpy/wiki/CLI-docs

    waybackpy - CLI usage(Demo video): https://asciinema.org/a/469890

    Released under the MIT License. Use the flag --license for license.

    """
    if version:
        click.echo(f"waybackpy version {__version__}")

    elif show_license:
        # The license text is fetched from GitHub; the timeout keeps the CLI
        # from hanging indefinitely when the network stalls.
        click.echo(
            requests.get(
                url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE",
                timeout=30,
            ).text
        )

    elif url is None:
        # Every remaining action operates on a URL.
        click.echo(
            click.style("NoURLDetected: ", fg="red")
            + "No URL detected. "
            + "Please provide an URL.",
            err=True,
        )

    elif oldest or newest:
        # All three flags are forwarded unchanged; the handler decides which
        # lookup to perform.
        cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
        handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest)

    elif near:
        cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
        keys = ["year", "month", "day", "hour", "minute"]
        args_arr = [year, month, day, hour, minute]
        # Test against None rather than truthiness: 0 is a legitimate value
        # for --hour and --minute and must not be dropped.
        near_args = {
            key: arg for key, arg in zip(keys, args_arr) if arg is not None
        }
        handle_cdx_closest_derivative_methods(
            cdx_api, oldest, near, newest, near_args=near_args
        )

    elif save:
        save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
        save_api.save()
        click.echo("Archive URL:")
        click.echo(save_api.archive_url)
        click.echo("Cached save:")
        click.echo(save_api.cached_save)
        if headers:
            click.echo("Save API headers:")
            click.echo(save_api.headers)

    elif known_urls:
        wayback = Url(url, user_agent)
        url_gen = wayback.known_urls(subdomain=subdomain)

        if file:
            save_urls_on_file(url_gen)
        else:
            for url_ in url_gen:
                click.echo(url_)

    elif cdx:
        # handle_cdx reads these values by position; keep the order intact.
        data = [
            url,
            user_agent,
            start_timestamp,
            end_timestamp,
            cdx_filter,
            collapse,
            cdx_print,
            limit,
            gzip,
            match_type,
            sort,
            use_pagination,
            closest,
        ]
        handle_cdx(data)

    else:
        click.echo(
            click.style("NoCommandFound: ", fg="red")
            + "Only URL passed, but did not specify what to do with the URL. "
            + "Use --help flag for help using waybackpy.",
            err=True,
        )
|
|
|
|
|
|
# Run the CLI when this module is executed directly rather than imported.
if __name__ == "__main__":
    # main is wrapped by @click.command(), so it is called without arguments
    # here; the pylint hint silences the resulting missing-argument warning.
    main()  # pylint: disable=no-value-for-parameter
|