Create separate modules for the 3 different APIs; CDX is now also supported by the CLI.

2022-01-02 14:14:45 +05:30
parent a7b805292d
commit 4e68cd5743
25 changed files with 755 additions and 2337 deletions
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -1,334 +1,270 @@
-import os
-import re
-import sys
-import json
-import random
-import string
-import argparse
-
-from .wrapper import Url
-from .exceptions import WaybackError
+import click
+import json as JSON
 from .__version__ import __version__
+from .utils import DEFAULT_USER_AGENT
+from .cdx_api import WaybackMachineCDXServerAPI
+from .save_api import WaybackMachineSaveAPI
+from .availability_api import WaybackMachineAvailabilityAPI


-def _save(obj):
-    try:
-        return obj.save()
-    except Exception as err:
-        e = str(err)
-        m = re.search(r"Header:\n(.*)", e)
-        if m:
-            header = m.group(1)
-        if "No archive URL found in the API response" in e:
-            return (
-                "\n[waybackpy] Can not save/archive your link.\n[waybackpy] This "
-                "could happen because either your waybackpy ({version}) is likely out of "
-                "date or Wayback Machine is malfunctioning.\n[waybackpy] Visit "
-                "https://github.com/akamhy/waybackpy for the latest version of "
-                "waybackpy.\n[waybackpy] API response Header :\n{header}".format(
-                    version=__version__, header=header
-                )
-            )
-        if "URL cannot be archived by wayback machine as it is a redirect" in e:
-            return ("URL cannot be archived by wayback machine as it is a redirect")
-        raise WaybackError(err)
-
-
-def _archive_url(obj):
-    return obj.archive_url
-
-
-def _json(obj):
-    return json.dumps(obj.JSON)
-
-
-def no_archive_handler(e, obj):
-    m = re.search(r"archive\sfor\s\'(.*?)\'\stry", str(e))
-    if m:
-        url = m.group(1)
-        ua = obj.user_agent
-        if "github.com/akamhy/waybackpy" in ua:
-            ua = "YOUR_USER_AGENT_HERE"
-        return (
-            "\n[Waybackpy] Can not find archive for '{url}'.\n[Waybackpy] You can"
-            " save the URL using the following command:\n[Waybackpy] waybackpy --"
-            'user_agent "{user_agent}" --url "{url}" --save'.format(
-                url=url, user_agent=ua
-            )
-        )
-    raise WaybackError(e)
-
-
-def _oldest(obj):
-    try:
-        return obj.oldest()
-    except Exception as e:
-        return no_archive_handler(e, obj)
-
-
-def _newest(obj):
-    try:
-        return obj.newest()
-    except Exception as e:
-        return no_archive_handler(e, obj)
-
-
-def _total_archives(obj):
-    return obj.total_archives()
-
-
-def _near(obj, args):
-    _near_args = {}
-    args_arr = [args.year, args.month, args.day, args.hour, args.minute]
-    keys = ["year", "month", "day", "hour", "minute"]
-
-    for key, arg in zip(keys, args_arr):
-        if arg:
-            _near_args[key] = arg
-
-    try:
-        return obj.near(**_near_args)
-    except Exception as e:
-        return no_archive_handler(e, obj)
-
-
-def _save_urls_on_file(url_gen):
-    domain = None
-    sys_random = random.SystemRandom()
-    uid = "".join(
-        sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
-    )
-    url_count = 0
-
-    for url in url_gen:
-        url_count += 1
-        if not domain:
-            m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
-
-            domain = "domain-unknown"
-
-            if m:
-                domain = m.group(1)
-
-            file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
-            file_path = os.path.join(os.getcwd(), file_name)
-            if not os.path.isfile(file_path):
-                open(file_path, "w+").close()
-
-        with open(file_path, "a") as f:
-            f.write("{url}\n".format(url=url))
-
-        print(url)
-
-    if url_count > 0:
-        return "\n\n'{file_name}' saved in current working directory".format(
-            file_name=file_name
-        )
-    else:
-        return "No known URLs found. Please try a diffrent input!"
-
-
-def _known_urls(obj, args):
+@click.command()
+@click.option(
+    "-u", "--url", help="URL on which Wayback machine operations are to be performed."
+)
+@click.option(
+    "-ua",
+    "--user-agent",
+    "--user_agent",
+    default=DEFAULT_USER_AGENT,
+    help="User agent, default user agent is '%s' " % DEFAULT_USER_AGENT,
+)
+@click.option(
+    "-v", "--version", is_flag=True, default=False, help="Print waybackpy version."
+)
+@click.option(
+    "-n",
+    "--newest",
+    "-au",
+    "--archive_url",
+    "--archive-url",
+    default=False,
+    is_flag=True,
+    help="Fetch the newest archive of the specified URL",
+)
+@click.option(
+    "-o",
+    "--oldest",
+    default=False,
+    is_flag=True,
+    help="Fetch the oldest archive of the specified URL",
+)
+@click.option(
+    "-j",
+    "--json",
+    default=False,
+    is_flag=True,
+    help="Spit out the JSON data for availability_api commands.",
+)
+@click.option(
+    "-N", "--near", default=False, is_flag=True, help="Archive near specified time."
+)
+@click.option("-Y", "--year", type=click.IntRange(1994, 9999), help="Year in integer.")
+@click.option("-M", "--month", type=click.IntRange(1, 12), help="Month in integer.")
+@click.option("-D", "--day", type=click.IntRange(1, 31), help="Day in integer.")
+@click.option("-H", "--hour", type=click.IntRange(0, 24), help="Hour in integer.")
+@click.option("-MIN", "--minute", type=click.IntRange(0, 60), help="Minute in integer.")
+@click.option(
+    "-s",
+    "--save",
+    default=False,
+    is_flag=True,
+    help="Save the specified URL's webpage and print the archive URL.",
+)
+@click.option(
+    "-h",
+    "--headers",
+    default=False,
+    is_flag=True,
+    help="Spit out the headers data for save_api commands.",
+)
+@click.option(
+    "-c",
+    "--cdx",
+    default=False,
+    is_flag=True,
+    help="Use the CDX server API to list snapshots of the specified URL.",
+)
+@click.option(
+    "-st",
+    "--start-timestamp",
+    "--start_timestamp",
+)
+@click.option(
+    "-et",
+    "--end-timestamp",
+    "--end_timestamp",
+)
+@click.option(
+    "-f",
+    "--filters",
+    multiple=True,
+)
+@click.option(
+    "-mt",
+    "--match-type",
+    "--match_type",
+)
+@click.option(
+    "-gz",
+    "--gzip",
+)
+@click.option(
+    "-c",
+    "--collapses",
+    multiple=True,
+)
+@click.option(
+    "-l",
+    "--limit",
+)
+@click.option(
+    "-cp",
+    "--cdx-print",
+    "--cdx_print",
+    multiple=True,
+)
+def main(
+    url,
+    user_agent,
+    version,
+    newest,
+    oldest,
+    json,
+    near,
+    year,
+    month,
+    day,
+    hour,
+    minute,
+    save,
+    headers,
+    cdx,
+    start_timestamp,
+    end_timestamp,
+    filters,
+    match_type,
+    gzip,
+    collapses,
+    limit,
+    cdx_print,
+):
    """
-    Known urls for a domain.
+    ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
+    ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
+    ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
+    ┃┗┛┗┛┃┗━┓┃━┃┃━┃┃┃┏━┓┃┗━┓┃━┃┏━┛┃┗┛┛┃┏━━┛┃┃━┃┃
+    ┗┓┏┓┏┛┃┗┛┗┓┃┗━┛┃┃┗━┛┃┃┗┛┗┓┃┗━┓┃┏┓┓┃┃━━━┃┗━┛┃
+    ━┗┛┗┛━┗━━━┛┗━┓┏┛┗━━━┛┗━━━┛┗━━┛┗┛┗┛┗┛━━━┗━┓┏┛
+    ━━━━━━━━━━━┏━┛┃━━━━━━━━━━━━━━━━━━━━━━━━┏━┛┃━
+    ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
+
+    waybackpy : Python package & CLI tool that interfaces the Wayback Machine API
+
+    Released under the MIT License.
+    License @ https://github.com/akamhy/waybackpy/blob/master/LICENSE
+
+    Copyright (c) 2020 waybackpy contributors. Contributors list @
+    https://github.com/akamhy/waybackpy/graphs/contributors
+
+    https://github.com/akamhy/waybackpy
+
+    https://pypi.org/project/waybackpy
+
    """

-    subdomain = True if args.subdomain else False
+    if version:
+        click.echo("waybackpy version %s" % __version__)
+        return

-    url_gen = obj.known_urls(subdomain=subdomain)
+    if not url:
+        click.echo("No URL detected. Please pass an URL.")
+        return

-    if args.file:
-        return _save_urls_on_file(url_gen)
-    else:
-        for url in url_gen:
-            print(url)
-        return "\n"
+    def echo_availability_api(availability_api_instance):
+        click.echo("Archive URL:")
+        if not availability_api_instance.archive_url:
+            archive_url = (
+                "NO ARCHIVE FOUND - The requested URL is probably "
+                + "not yet archived or if the URL was recently archived then it is "
+                + "not yet available via the Wayback Machine's availability API "
+                + "because of database lag and should be available after some time."
+            )
+        else:
+            archive_url = availability_api_instance.archive_url
+        click.echo(archive_url)
+        if json:
+            click.echo("JSON response:")
+            click.echo(JSON.dumps(availability_api_instance.JSON))

+    availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)

-def _get(obj, args):
-    if args.get.lower() == "url":
-        return obj.get()
-    if args.get.lower() == "archive_url":
-        return obj.get(obj.archive_url)
-    if args.get.lower() == "oldest":
-        return obj.get(obj.oldest())
-    if args.get.lower() == "latest" or args.get.lower() == "newest":
-        return obj.get(obj.newest())
-    if args.get.lower() == "save":
-        return obj.get(obj.save())
-    return "Use get as \"--get 'source'\", 'source' can be one of the followings: \
-        \n1) url - get the source code of the url specified using --url/-u.\
-        \n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\
-        \n3) oldest - get the source code of the oldest archive for the supplied url.\
-        \n4) newest - get the source code of the newest archive for the supplied url.\
-        \n5) save - Create a new archive and get the source code of this new archive for the supplied url."
+    if oldest:
+        availability_api.oldest()
+        echo_availability_api(availability_api)
+        return

+    if newest:
+        availability_api.newest()
+        echo_availability_api(availability_api)
+        return

-def args_handler(args):
-    if args.version:
-        return "waybackpy version {version}".format(version=__version__)
+    if near:
+        near_args = {}
+        keys = ["year", "month", "day", "hour", "minute"]
+        args_arr = [year, month, day, hour, minute]
+        for key, arg in zip(keys, args_arr):
+            if arg:
+                near_args[key] = arg
+        availability_api.near(**near_args)
+        echo_availability_api(availability_api)
+        return

-    if not args.url:
-        return "waybackpy {version} \nSee 'waybackpy --help' for help using this tool.".format(
-            version=__version__
+    if save:
+        save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
+        save_api.save()
+        click.echo("Archive URL:")
+        click.echo(save_api.archive_url)
+        click.echo("Cached save:")
+        click.echo(save_api.cached_save)
+        if headers:
+            click.echo("Save API headers:")
+            click.echo(save_api.headers)
+        return
+
+    if cdx:
+        filters = list(filters)
+        collapses = list(collapses)
+        cdx_print = list(cdx_print)
+
+        cdx_api = WaybackMachineCDXServerAPI(
+            url,
+            user_agent=user_agent,
+            start_timestamp=start_timestamp,
+            end_timestamp=end_timestamp,
+            filters=filters,
+            match_type=match_type,
+            gzip=gzip,
+            collapses=collapses,
+            limit=limit,
        )

-    obj = Url(args.url)
-    if args.user_agent:
-        obj = Url(args.url, args.user_agent)
+        snapshots = cdx_api.snapshots()

-    if args.save:
-        output = _save(obj)
-    elif args.archive_url:
-        output = _archive_url(obj)
-    elif args.json:
-        output = _json(obj)
-    elif args.oldest:
-        output = _oldest(obj)
-    elif args.newest:
-        output = _newest(obj)
-    elif args.known_urls:
-        output = _known_urls(obj, args)
-    elif args.total:
-        output = _total_archives(obj)
-    elif args.near:
-        return _near(obj, args)
-    elif args.get:
-        output = _get(obj, args)
-    else:
-        output = (
-            "You only specified the URL. But you also need to specify the operation."
-            "\nSee 'waybackpy --help' for help using this tool."
-        )
-    return output
-
-
-def add_requiredArgs(requiredArgs):
-    requiredArgs.add_argument(
-        "--url", "-u", help="URL on which Wayback machine operations would occur"
-    )
-
-
-def add_userAgentArg(userAgentArg):
-    help_text = 'User agent, default user_agent is "waybackpy python package - https://github.com/akamhy/waybackpy"'
-    userAgentArg.add_argument("--user_agent", "-ua", help=help_text)
-
-
-def add_saveArg(saveArg):
-    saveArg.add_argument(
-        "--save", "-s", action="store_true", help="Save the URL on the Wayback machine"
-    )
-
-
-def add_auArg(auArg):
-    auArg.add_argument(
-        "--archive_url",
-        "-au",
-        action="store_true",
-        help="Get the latest archive URL, alias for --newest",
-    )
-
-
-def add_jsonArg(jsonArg):
-    jsonArg.add_argument(
-        "--json",
-        "-j",
-        action="store_true",
-        help="JSON data of the availability API request",
-    )
-
-
-def add_oldestArg(oldestArg):
-    oldestArg.add_argument(
-        "--oldest",
-        "-o",
-        action="store_true",
-        help="Oldest archive for the specified URL",
-    )
-
-
-def add_newestArg(newestArg):
-    newestArg.add_argument(
-        "--newest",
-        "-n",
-        action="store_true",
-        help="Newest archive for the specified URL",
-    )
-
-
-def add_totalArg(totalArg):
-    totalArg.add_argument(
-        "--total",
-        "-t",
-        action="store_true",
-        help="Total number of archives for the specified URL",
-    )
-
-
-def add_getArg(getArg):
-    getArg.add_argument(
-        "--get",
-        "-g",
-        help="Prints the source code of the supplied url. Use '--get help' for extended usage",
-    )
-
-
-def add_knownUrlArg(knownUrlArg):
-    knownUrlArg.add_argument(
-        "--known_urls", "-ku", action="store_true", help="URLs known for the domain."
-    )
-    help_text = "Use with '--known_urls' to include known URLs for subdomains."
-    knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
-    knownUrlArg.add_argument(
-        "--file",
-        "-f",
-        action="store_true",
-        help="Save the URLs in file at current directory.",
-    )
-
-
-def add_nearArg(nearArg):
-    nearArg.add_argument(
-        "--near", "-N", action="store_true", help="Archive near specified time"
-    )
-
-
-def add_nearArgs(nearArgs):
-    nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer")
-    nearArgs.add_argument("--month", "-M", type=int, help="Month in integer")
-    nearArgs.add_argument("--day", "-D", type=int, help="Day in integer.")
-    nearArgs.add_argument("--hour", "-H", type=int, help="Hour in intege")
-    nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
-
-
-def parse_args(argv):
-    parser = argparse.ArgumentParser()
-    add_requiredArgs(parser.add_argument_group("URL argument (required)"))
-    add_userAgentArg(parser.add_argument_group("User Agent"))
-    add_saveArg(parser.add_argument_group("Create new archive/save URL"))
-    add_auArg(parser.add_argument_group("Get the latest Archive"))
-    add_jsonArg(parser.add_argument_group("Get the JSON data"))
-    add_oldestArg(parser.add_argument_group("Oldest archive"))
-    add_newestArg(parser.add_argument_group("Newest archive"))
-    add_totalArg(parser.add_argument_group("Total number of archives"))
-    add_getArg(parser.add_argument_group("Get source code"))
-    add_knownUrlArg(
-        parser.add_argument_group(
-            "URLs known and archived to Wayback Machine for the site."
-        )
-    )
-    add_nearArg(parser.add_argument_group("Archive close to time specified"))
-    add_nearArgs(parser.add_argument_group("Arguments that are used only with --near"))
-    parser.add_argument(
-        "--version", "-v", action="store_true", help="Waybackpy version"
-    )
-    return parser.parse_args(argv[1:])
-
-
-def main(argv=None):
-    argv = sys.argv if argv is None else argv
-    print(args_handler(parse_args(argv)))
+        wanted = set(cdx_print)
+        for snapshot in snapshots:
+            if not wanted:
+                click.echo(snapshot)
+            else:
+                # Test each alias for membership; `"a" or "b" in x` is always true.
+                output_string = ""
+                if {"urlkey", "url-key", "url_key"} & wanted:
+                    output_string = output_string + snapshot.urlkey + " "
+                if {"timestamp", "time-stamp", "time_stamp"} & wanted:
+                    output_string = output_string + snapshot.timestamp + " "
+                if "original" in wanted:
+                    output_string = output_string + snapshot.original + " "
+                if {"mimetype", "mime-type", "mime_type"} & wanted:
+                    output_string = output_string + snapshot.mimetype + " "
+                if {"statuscode", "status-code", "status_code"} & wanted:
+                    output_string = output_string + snapshot.statuscode + " "
+                if "digest" in wanted:
+                    output_string = output_string + snapshot.digest + " "
+                if "length" in wanted:
+                    output_string = output_string + snapshot.length + " "
+                if {"archiveurl", "archive-url", "archive_url"} & wanted:
+                    output_string = output_string + snapshot.archive_url + " "
+                click.echo(output_string)


 if __name__ == "__main__":
-    sys.exit(main(sys.argv))
+    main()