import click import re import os import json as JSON import random import string from .__version__ import __version__ from .utils import DEFAULT_USER_AGENT from .cdx_api import WaybackMachineCDXServerAPI from .save_api import WaybackMachineSaveAPI from .availability_api import WaybackMachineAvailabilityAPI from .wrapper import Url @click.command() @click.option( "-u", "--url", help="URL on which Wayback machine operations are to be performed." ) @click.option( "-ua", "--user-agent", "--user_agent", default=DEFAULT_USER_AGENT, help="User agent, default user agent is '%s' " % DEFAULT_USER_AGENT, ) @click.option( "-v", "--version", is_flag=True, default=False, help="Print waybackpy version." ) @click.option( "-n", "--newest", "-au", "--archive_url", "--archive-url", default=False, is_flag=True, help="Fetch the newest archive of the specified URL", ) @click.option( "-o", "--oldest", default=False, is_flag=True, help="Fetch the oldest archive of the specified URL", ) @click.option( "-j", "--json", default=False, is_flag=True, help="Spit out the JSON data for availability_api commands.", ) @click.option( "-N", "--near", default=False, is_flag=True, help="Archive near specified time." ) @click.option("-Y", "--year", type=click.IntRange(1994, 9999), help="Year in integer.") @click.option("-M", "--month", type=click.IntRange(1, 12), help="Month in integer.") @click.option("-D", "--day", type=click.IntRange(1, 31), help="Day in integer.") @click.option("-H", "--hour", type=click.IntRange(0, 24), help="Hour in integer.") @click.option("-MIN", "--minute", type=click.IntRange(0, 60), help="Minute in integer.") @click.option( "-s", "--save", default=False, is_flag=True, help="Save the specified URL's webpage and print the archive URL.", ) @click.option( "-h", "--headers", default=False, is_flag=True, help="Spit out the headers data for save_api commands.", ) @click.option( "-ku", "--known-urls", "--known_urls", default=False, is_flag=True, help="List known URLs. Uses CDX API.", ) @click.option( "-sub", "--subdomain", default=False, is_flag=True, help="Use with '--known_urls' to include known URLs for subdomains.", ) @click.option( "-f", "--file", default=False, is_flag=True, help="Use with '--known_urls' to save the URLs in file at current directory.", ) @click.option( "-c", "--cdx", default=False, is_flag=True, help="Spit out the headers data for save_api commands.", ) @click.option( "-st", "--start-timestamp", "--start_timestamp", ) @click.option( "-et", "--end-timestamp", "--end_timestamp", ) @click.option( "-f", "--filters", multiple=True, ) @click.option( "-mt", "--match-type", "--match_type", ) @click.option( "-gz", "--gzip", ) @click.option( "-c", "--collapses", multiple=True, ) @click.option( "-l", "--limit", ) @click.option( "-cp", "--cdx-print", "--cdx_print", multiple=True, ) def main( url, user_agent, version, newest, oldest, json, near, year, month, day, hour, minute, save, headers, known_urls, subdomain, file, cdx, start_timestamp, end_timestamp, filters, match_type, gzip, collapses, limit, cdx_print, ): """ ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━ ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━ ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓ ┃┗┛┗┛┃┗━┓┃━┃┃━┃┃┃┏━┓┃┗━┓┃━┃┏━┛┃┗┛┛┃┏━━┛┃┃━┃┃ ┗┓┏┓┏┛┃┗┛┗┓┃┗━┛┃┃┗━┛┃┃┗┛┗┓┃┗━┓┃┏┓┓┃┃━━━┃┗━┛┃ ━┗┛┗┛━┗━━━┛┗━┓┏┛┗━━━┛┗━━━┛┗━━┛┗┛┗┛┗┛━━━┗━┓┏┛ ━━━━━━━━━━━┏━┛┃━━━━━━━━━━━━━━━━━━━━━━━━┏━┛┃━ ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━ waybackpy : Python package & CLI tool that interfaces the Wayback Machine API Released under the MIT License. License @ https://github.com/akamhy/waybackpy/blob/master/LICENSE Copyright (c) 2020 waybackpy contributors. Contributors list @ https://github.com/akamhy/waybackpy/graphs/contributors https://github.com/akamhy/waybackpy https://pypi.org/project/waybackpy """ if version: click.echo("waybackpy version %s" % __version__) return if not url: click.echo("No URL detected. Please pass an URL.") return def echo_availability_api(availability_api_instance): click.echo("Archive URL:") if not availability_api_instance.archive_url: archive_url = ( "NO ARCHIVE FOUND - The requested URL is probably " + "not yet archived or if the URL was recently archived then it is " + "not yet available via the Wayback Machine's availability API " + "because of database lag and should be available after some time." ) else: archive_url = availability_api_instance.archive_url click.echo(archive_url) if json: click.echo("JSON response:") click.echo(JSON.dumps(availability_api_instance.JSON)) availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) if oldest: availability_api.oldest() echo_availability_api(availability_api) return if newest: availability_api.newest() echo_availability_api(availability_api) return if near: near_args = {} keys = ["year", "month", "day", "hour", "minute"] args_arr = [year, month, day, hour, minute] for key, arg in zip(keys, args_arr): if arg: near_args[key] = arg availability_api.near(**near_args) echo_availability_api(availability_api) return if save: save_api = WaybackMachineSaveAPI(url, user_agent=user_agent) save_api.save() click.echo("Archive URL:") click.echo(save_api.archive_url) click.echo("Cached save:") click.echo(save_api.cached_save) if headers: click.echo("Save API headers:") click.echo(save_api.headers) return def save_urls_on_file(url_gen): domain = None sys_random = random.SystemRandom() uid = "".join( sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6) ) url_count = 0 for url in url_gen: url_count += 1 if not domain: match = re.search("https?://([A-Za-z_0-9.-]+).*", url) domain = "domain-unknown" if match: domain = match.group(1) file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid) file_path = os.path.join(os.getcwd(), file_name) if not os.path.isfile(file_path): open(file_path, "w+").close() with open(file_path, "a") as f: f.write("{url}\n".format(url=url)) click.echo(url) if url_count > 0: click.echo( "\n\n'{file_name}' saved in current working directory".format( file_name=file_name ) ) else: click.echo("No known URLs found. Please try a diffrent input!") if known_urls: wayback = Url(url, user_agent) url_gen = wayback.known_urls(subdomain=subdomain) if file: return save_urls_on_file(url_gen) else: for url in url_gen: click.echo(url) if cdx: filters = list(filters) collapses = list(collapses) cdx_print = list(cdx_print) cdx_api = WaybackMachineCDXServerAPI( url, user_agent=user_agent, start_timestamp=start_timestamp, end_timestamp=end_timestamp, filters=filters, match_type=match_type, gzip=gzip, collapses=collapses, limit=limit, ) snapshots = cdx_api.snapshots() for snapshot in snapshots: if len(cdx_print) == 0: click.echo(snapshot) else: output_string = "" if "urlkey" or "url-key" or "url_key" in cdx_print: output_string = output_string + snapshot.urlkey + " " if "timestamp" or "time-stamp" or "time_stamp" in cdx_print: output_string = output_string + snapshot.timestamp + " " if "original" in cdx_print: output_string = output_string + snapshot.original + " " if "original" in cdx_print: output_string = output_string + snapshot.original + " " if "mimetype" or "mime-type" or "mime_type" in cdx_print: output_string = output_string + snapshot.mimetype + " " if "statuscode" or "status-code" or "status_code" in cdx_print: output_string = output_string + snapshot.statuscode + " " if "digest" in cdx_print: output_string = output_string + snapshot.digest + " " if "length" in cdx_print: output_string = output_string + snapshot.length + " " if "archiveurl" or "archive-url" or "archive_url" in cdx_print: output_string = output_string + snapshot.archive_url + " " click.echo(output_string) if __name__ == "__main__": main()