""" Module responsible for enabling waybackpy to function as a CLI tool. """ import os import random import re import string from typing import Any, Dict, Generator, List, Optional import click import requests from . import __version__ from .cdx_api import WaybackMachineCDXServerAPI from .exceptions import BlockedSiteError, NoCDXRecordFound from .save_api import WaybackMachineSaveAPI from .utils import DEFAULT_USER_AGENT from .wrapper import Url def handle_cdx_closest_derivative_methods( cdx_api: "WaybackMachineCDXServerAPI", oldest: bool, near: bool, newest: bool, near_args: Optional[Dict[str, int]] = None, ) -> None: """ Handles the closest parameter derivative methods. near, newest and oldest use the closest parameter with active closest based sorting. """ try: if near: if near_args: archive_url = cdx_api.near(**near_args).archive_url else: archive_url = cdx_api.near().archive_url elif newest: archive_url = cdx_api.newest().archive_url elif oldest: archive_url = cdx_api.oldest().archive_url click.echo("Archive URL:") click.echo(archive_url) except NoCDXRecordFound as exc: click.echo(click.style("NoCDXRecordFound: ", fg="red") + str(exc), err=True) except BlockedSiteError as exc: click.echo(click.style("BlockedSiteError: ", fg="red") + str(exc), err=True) def handle_cdx(data: List[Any]) -> None: """ Handles the CDX CLI options and output format. """ url = data[0] user_agent = data[1] start_timestamp = data[2] end_timestamp = data[3] cdx_filter = data[4] collapse = data[5] cdx_print = data[6] limit = data[7] gzip = data[8] match_type = data[9] sort = data[10] use_pagination = data[11] closest = data[12] filters = list(cdx_filter) collapses = list(collapse) cdx_print = list(cdx_print) cdx_api = WaybackMachineCDXServerAPI( url, user_agent=user_agent, start_timestamp=start_timestamp, end_timestamp=end_timestamp, closest=closest, filters=filters, match_type=match_type, sort=sort, use_pagination=use_pagination, gzip=gzip, collapses=collapses, limit=limit, ) snapshots = cdx_api.snapshots() for snapshot in snapshots: if len(cdx_print) == 0: click.echo(snapshot) else: output_string = [] if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]): output_string.append(snapshot.urlkey) if any( val in cdx_print for val in ["timestamp", "time-stamp", "time_stamp"] ): output_string.append(snapshot.timestamp) if "original" in cdx_print: output_string.append(snapshot.original) if any(val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]): output_string.append(snapshot.mimetype) if any( val in cdx_print for val in ["statuscode", "status-code", "status_code"] ): output_string.append(snapshot.statuscode) if "digest" in cdx_print: output_string.append(snapshot.digest) if "length" in cdx_print: output_string.append(snapshot.length) if any( val in cdx_print for val in ["archiveurl", "archive-url", "archive_url"] ): output_string.append(snapshot.archive_url) click.echo(" ".join(output_string)) def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: """ Save output of CDX API on file. Mainly here because of backwards compatibility. """ domain = None sys_random = random.SystemRandom() uid = "".join( sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6) ) url_count = 0 file_name = None for url in url_gen: url_count += 1 if not domain: match = re.search("https?://([A-Za-z_0-9.-]+).*", url) domain = "domain-unknown" if match: domain = match.group(1) file_name = f"{domain}-urls-{uid}.txt" file_path = os.path.join(os.getcwd(), file_name) if not os.path.isfile(file_path): with open(file_path, "w+", encoding="utf-8") as file: file.close() with open(file_path, "a", encoding="utf-8") as file: file.write(f"{url}\n") click.echo(url) if url_count > 0: click.echo( f"\n\n{url_count} URLs saved inside '{file_name}' in the current " + "working directory." ) else: click.echo("No known URLs found. Please try a diffrent input!") @click.command() @click.option( "-u", "--url", help="URL on which Wayback machine operations are to be performed." ) @click.option( "-ua", "--user-agent", "--user_agent", default=DEFAULT_USER_AGENT, help=f"User agent, default value is '{DEFAULT_USER_AGENT}'.", ) @click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.") @click.option( "-l", "--show-license", "--show_license", "--license", is_flag=True, default=False, help="Show license of Waybackpy.", ) @click.option( "-n", "--newest", "-au", "--archive_url", "--archive-url", default=False, is_flag=True, help="Retrieve the newest archive of URL.", ) @click.option( "-o", "--oldest", default=False, is_flag=True, help="Retrieve the oldest archive of URL.", ) @click.option( "-N", "--near", default=False, is_flag=True, help="Archive close to a specified time.", ) @click.option("-Y", "--year", type=click.IntRange(1994, 9999), help="Year in integer.") @click.option("-M", "--month", type=click.IntRange(1, 12), help="Month in integer.") @click.option("-D", "--day", type=click.IntRange(1, 31), help="Day in integer.") @click.option("-H", "--hour", type=click.IntRange(0, 24), help="Hour in integer.") @click.option("-MIN", "--minute", type=click.IntRange(0, 60), help="Minute in integer.") @click.option( "-s", "--save", default=False, is_flag=True, help="Save the specified URL's webpage and print the archive URL.", ) @click.option( "-h", "--headers", default=False, is_flag=True, help="Headers data of the SavePageNow API.", ) @click.option( "-ku", "--known-urls", "--known_urls", default=False, is_flag=True, help="List known URLs. Uses CDX API.", ) @click.option( "-sub", "--subdomain", default=False, is_flag=True, help="Use with '--known_urls' to include known URLs for subdomains.", ) @click.option( "-f", "--file", default=False, is_flag=True, help="Use with '--known_urls' to save the URLs in file at current directory.", ) @click.option( "--cdx", default=False, is_flag=True, help="Flag for using CDX API.", ) @click.option( "-st", "--start-timestamp", "--start_timestamp", "--from", help="Start timestamp for CDX API in yyyyMMddhhmmss format.", ) @click.option( "-et", "--end-timestamp", "--end_timestamp", "--to", help="End timestamp for CDX API in yyyyMMddhhmmss format.", ) @click.option( "-C", "--closest", help="Archive that are closest the timestamp passed as arguments to this " + "parameter.", ) @click.option( "-f", "--cdx-filter", "--cdx_filter", "--filter", multiple=True, help="Filter on a specific field or all the CDX fields.", ) @click.option( "-mt", "--match-type", "--match_type", help="The default behavior is to return matches for an exact URL. " + "However, the CDX server can also return results matching a certain prefix, " + "a certain host, or all sub-hosts by using the match_type", ) @click.option( "-st", "--sort", help="Choose one from default, closest or reverse. It returns sorted CDX entries " + "in the response.", ) @click.option( "-up", "--use-pagination", "--use_pagination", default=False, is_flag=True, help="Use the pagination API of the CDX server instead of the default one.", ) @click.option( "-gz", "--gzip", help="To disable gzip compression pass false as argument to this parameter. " + "The default behavior is gzip compression enabled.", ) @click.option( "-c", "--collapse", multiple=True, help="Filtering or 'collapse' results based on a field, or a substring of a field.", ) @click.option( "-l", "--limit", help="Number of maximum record that CDX API is asked to return per API call, " + "default value is 25000 records.", ) @click.option( "-cp", "--cdx-print", "--cdx_print", multiple=True, help="Print only certain fields of the CDX API response, " + "if this parameter is not used then the plain text response of the CDX API " + "will be printed.", ) def main( # pylint: disable=no-value-for-parameter user_agent: str, version: bool, show_license: bool, newest: bool, oldest: bool, near: bool, save: bool, headers: bool, known_urls: bool, subdomain: bool, file: bool, cdx: bool, use_pagination: bool, cdx_filter: List[str], collapse: List[str], cdx_print: List[str], url: Optional[str] = None, year: Optional[int] = None, month: Optional[int] = None, day: Optional[int] = None, hour: Optional[int] = None, minute: Optional[int] = None, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None, closest: Optional[str] = None, match_type: Optional[str] = None, sort: Optional[str] = None, gzip: Optional[str] = None, limit: Optional[str] = None, ) -> None: """\b _ _ | | | | __ ____ _ _ _| |__ __ _ ___| | ___ __ _ _ \\ \\ /\\ / / _` | | | | '_ \\ / _` |/ __| |/ / '_ \\| | | | \\ V V / (_| | |_| | |_) | (_| | (__| <| |_) | |_| | \\_/\\_/ \\__,_|\\__, |_.__/ \\__,_|\\___|_|\\_\\ .__/ \\__, | __/ | | | __/ | |___/ |_| |___/ Python package & CLI tool that interfaces the Wayback Machine APIs Repository: https://github.com/akamhy/waybackpy Documentation: https://github.com/akamhy/waybackpy/wiki/CLI-docs waybackpy - CLI usage(Demo video): https://asciinema.org/a/469890 Released under the MIT License. Use the flag --license for license. """ if version: click.echo(f"waybackpy version {__version__}") elif show_license: click.echo( requests.get( url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE" ).text ) elif url is None: click.echo( click.style("NoURLDetected: ", fg="red") + "No URL detected. " + "Please provide an URL.", err=True, ) elif oldest: cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent) handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest) elif newest: cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent) handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest) elif near: cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent) near_args = {} keys = ["year", "month", "day", "hour", "minute"] args_arr = [year, month, day, hour, minute] for key, arg in zip(keys, args_arr): if arg: near_args[key] = arg handle_cdx_closest_derivative_methods( cdx_api, oldest, near, newest, near_args=near_args ) elif save: save_api = WaybackMachineSaveAPI(url, user_agent=user_agent) save_api.save() click.echo("Archive URL:") click.echo(save_api.archive_url) click.echo("Cached save:") click.echo(save_api.cached_save) if headers: click.echo("Save API headers:") click.echo(save_api.headers) elif known_urls: wayback = Url(url, user_agent) url_gen = wayback.known_urls(subdomain=subdomain) if file: save_urls_on_file(url_gen) else: for url_ in url_gen: click.echo(url_) elif cdx: data = [ url, user_agent, start_timestamp, end_timestamp, cdx_filter, collapse, cdx_print, limit, gzip, match_type, sort, use_pagination, closest, ] handle_cdx(data) else: click.echo( click.style("NoCommandFound: ", fg="red") + "Only URL passed, but did not specify what to do with the URL. " + "Use --help flag for help using waybackpy.", err=True, ) if __name__ == "__main__": main() # pylint: disable=no-value-for-parameter