Implement known_urls for the CLI using the newer interface. Although use of CDX is recommended, backward compatibility matters.

This commit is contained in:
Akash Mahanty 2022-01-18 20:07:12 +05:30
parent 9bbd056268
commit 7adc01bff2

View File

@ -1,10 +1,15 @@
import click import click
import re
import os
import json as JSON import json as JSON
import random
import string
from .__version__ import __version__ from .__version__ import __version__
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
from .cdx_api import WaybackMachineCDXServerAPI from .cdx_api import WaybackMachineCDXServerAPI
from .save_api import WaybackMachineSaveAPI from .save_api import WaybackMachineSaveAPI
from .availability_api import WaybackMachineAvailabilityAPI from .availability_api import WaybackMachineAvailabilityAPI
from .wrapper import Url
@click.command() @click.command()
@ -67,6 +72,28 @@ from .availability_api import WaybackMachineAvailabilityAPI
is_flag=True, is_flag=True,
help="Spit out the headers data for save_api commands.", help="Spit out the headers data for save_api commands.",
) )
@click.option(
"-ku",
"--known-urls",
"--known_urls",
default=False,
is_flag=True,
help="List known URLs. Uses CDX API.",
)
@click.option(
"-sub",
"--subdomain",
default=False,
is_flag=True,
help="Use with '--known_urls' to include known URLs for subdomains.",
)
@click.option(
"-f",
"--file",
default=False,
is_flag=True,
help="Use with '--known_urls' to save the URLs in file at current directory.",
)
@click.option( @click.option(
"-c", "-c",
"--cdx", "--cdx",
@ -128,6 +155,9 @@ def main(
minute, minute,
save, save,
headers, headers,
known_urls,
subdomain,
file,
cdx, cdx,
start_timestamp, start_timestamp,
end_timestamp, end_timestamp,
@ -221,6 +251,44 @@ def main(
click.echo(save_api.headers) click.echo(save_api.headers)
return return
def save_urls_on_file(url_gen):
    """Write every URL from *url_gen* to a text file and echo it to stdout.

    The file is created in the current working directory and is named
    ``{domain}-urls-{uid}.txt``. ``domain`` is the host part extracted from
    the first URL yielded (``domain-unknown`` when that URL does not look
    like an http(s) URL) and ``uid`` is a random 6-character string of
    lowercase letters and digits.

    The file is opened lazily on the first URL, so an empty iterable
    creates no file at all. If a file with the same name already exists,
    the URLs are appended to it.

    :param url_gen: iterable yielding URL strings.
    :return: None
    """
    alphabet = string.ascii_lowercase + string.digits
    sys_random = random.SystemRandom()
    uid = "".join(sys_random.choice(alphabet) for _ in range(6))

    out_file = None
    try:
        for url in url_gen:
            if out_file is None:
                # The first URL decides the file name for the whole run.
                match = re.search(r"https?://([A-Za-z_0-9.-]+).*", url)
                domain = match.group(1) if match else "domain-unknown"
                file_name = "{domain}-urls-{uid}.txt".format(
                    domain=domain, uid=uid
                )
                file_path = os.path.join(os.getcwd(), file_name)
                # Append mode creates the file when missing, so no separate
                # existence check is needed; the handle is kept open for
                # all URLs instead of being re-opened once per URL.
                out_file = open(file_path, "a")
            out_file.write("{url}\n".format(url=url))
            print(url)
    finally:
        # Close (and flush) even if the generator raises part-way through.
        if out_file is not None:
            out_file.close()
if known_urls:
wayback = Url(url, user_agent)
url_gen = wayback.known_urls(subdomain=subdomain)
if file:
return save_urls_on_file(url_gen)
else:
for url in url_gen:
click.echo(url)
if cdx: if cdx:
filters = list(filters) filters = list(filters)
collapses = list(collapses) collapses = list(collapses)