Implement known_urls for the CLI from the newer interface. Use of the CDX API is recommended, but backward compatibility matters.
parent 9bbd056268
commit 7adc01bff2
@@ -1,10 +1,15 @@
 import click
+import re
+import os
 import json as JSON
+import random
+import string
 from .__version__ import __version__
 from .utils import DEFAULT_USER_AGENT
 from .cdx_api import WaybackMachineCDXServerAPI
 from .save_api import WaybackMachineSaveAPI
 from .availability_api import WaybackMachineAvailabilityAPI
+from .wrapper import Url
 
 
 @click.command()
@@ -67,6 +72,28 @@ from .availability_api import WaybackMachineAvailabilityAPI
     is_flag=True,
     help="Spit out the headers data for save_api commands.",
 )
+@click.option(
+    "-ku",
+    "--known-urls",
+    "--known_urls",
+    default=False,
+    is_flag=True,
+    help="List known URLs. Uses CDX API.",
+)
+@click.option(
+    "-sub",
+    "--subdomain",
+    default=False,
+    is_flag=True,
+    help="Use with '--known_urls' to include known URLs for subdomains.",
+)
+@click.option(
+    "-f",
+    "--file",
+    default=False,
+    is_flag=True,
+    help="Use with '--known_urls' to save the URLs in file at current directory.",
+)
 @click.option(
     "-c",
     "--cdx",
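The first new option above registers two long spellings for one flag: "--known-urls" is the dash-style name and "--known_urls" keeps older invocations working. A minimal, standalone sketch of that pattern (the demo command below is made up for illustration, not part of this commit) shows click folding both spellings into a single known_urls parameter:

import click


@click.command()
@click.option(
    "-ku",
    "--known-urls",
    "--known_urls",  # legacy underscore spelling, kept for backward compatibility
    default=False,
    is_flag=True,
    help="List known URLs. Uses CDX API.",
)
def demo(known_urls):
    # Either long spelling (or -ku) sets this one boolean parameter.
    click.echo("known_urls: {}".format(known_urls))


if __name__ == "__main__":
    demo()

Running this sketch with either --known-urls or --known_urls prints "known_urls: True", which is why the old spelling can be kept without a separate code path.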
@@ -128,6 +155,9 @@ def main(
     minute,
     save,
     headers,
+    known_urls,
+    subdomain,
+    file,
     cdx,
     start_timestamp,
     end_timestamp,
@@ -221,6 +251,44 @@ def main(
         click.echo(save_api.headers)
         return
 
+    def save_urls_on_file(url_gen):
+        domain = None
+        sys_random = random.SystemRandom()
+        uid = "".join(
+            sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
+        )
+        url_count = 0
+
+        for url in url_gen:
+            url_count += 1
+            if not domain:
+                match = re.search("https?://([A-Za-z_0-9.-]+).*", url)
+
+                domain = "domain-unknown"
+
+                if match:
+                    domain = match.group(1)
+
+            file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
+            file_path = os.path.join(os.getcwd(), file_name)
+            if not os.path.isfile(file_path):
+                open(file_path, "w+").close()
+
+            with open(file_path, "a") as f:
+                f.write("{url}\n".format(url=url))
+
+            print(url)
+
+    if known_urls:
+        wayback = Url(url, user_agent)
+        url_gen = wayback.known_urls(subdomain=subdomain)
+
+        if file:
+            return save_urls_on_file(url_gen)
+        else:
+            for url in url_gen:
+                click.echo(url)
+
     if cdx:
         filters = list(filters)
         collapses = list(collapses)
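The known_urls branch above keeps the legacy Url wrapper alive for backward compatibility, while the commit message recommends the CDX API. As a rough, hedged sketch of that recommended route, the already-imported WaybackMachineCDXServerAPI can produce a similar listing of archived URLs; the constructor arguments and the snapshots() generator below follow the newer interface as I understand it, the example URL and user agent are placeholders, and the subdomain/file behaviour of the CLI flags is not reproduced here.

from waybackpy.cdx_api import WaybackMachineCDXServerAPI

# Placeholder values, not taken from the diff above.
url = "https://example.com"
user_agent = "Mozilla/5.0 (compatible; example-agent)"

# Ask the CDX server for the captures it knows about for this URL and print
# the originally archived URL of each one, similar in spirit to what
# --known_urls prints via the legacy wrapper.
cdx_api = WaybackMachineCDXServerAPI(url, user_agent)
for snapshot in cdx_api.snapshots():
    print(snapshot.original)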