known_urls now yields, making it more reliable. The file is saved in chunks with respect to the response. The --file arg can be used to create an output file; if --file is not used, no output will be saved to any file. (#88)
This commit is contained in:
parent
a3bc6aad2b
commit
36b936820b
@ -18,6 +18,7 @@ def test_save():
|
||||
url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=True,
|
||||
json=False,
|
||||
@ -38,6 +39,7 @@ def test_json():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=True,
|
||||
@ -58,6 +60,7 @@ def test_archive_url():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -78,6 +81,7 @@ def test_oldest():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=True,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -100,6 +104,7 @@ def test_oldest():
|
||||
url=url,
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=True,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -121,6 +126,7 @@ def test_newest():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -143,6 +149,7 @@ def test_newest():
|
||||
url=url,
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -164,6 +171,7 @@ def test_total_archives():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=True,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -185,6 +193,7 @@ def test_known_urls():
|
||||
url="https://www.keybr.com",
|
||||
total=False,
|
||||
version=False,
|
||||
file=True,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -198,25 +207,6 @@ def test_known_urls():
|
||||
reply = cli.args_handler(args)
|
||||
assert "keybr" in str(reply)
|
||||
|
||||
args = argparse.Namespace(
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
|
||||
url="https://akfyfufyjcujfufu6576r76r6amhy.gitd6r67r6u6hub.yfjyfjio",
|
||||
total=False,
|
||||
version=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
subdomain=True,
|
||||
known_urls=True,
|
||||
get=None,
|
||||
)
|
||||
reply = cli.args_handler(args)
|
||||
assert "No known URLs found" in str(reply)
|
||||
|
||||
|
||||
def test_near():
|
||||
args = argparse.Namespace(
|
||||
@ -225,6 +215,7 @@ def test_near():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -252,6 +243,7 @@ def test_near():
|
||||
url=url,
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -278,6 +270,7 @@ def test_get():
|
||||
url="https://github.com/akamhy",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -297,6 +290,7 @@ def test_get():
|
||||
url="https://github.com/akamhy/waybackpy",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -316,6 +310,7 @@ def test_get():
|
||||
url="https://akamhy.github.io/waybackpy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -335,25 +330,7 @@ def test_get():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get="save",
|
||||
)
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in str(reply)
|
||||
|
||||
args = argparse.Namespace(
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
|
@ -17,26 +17,6 @@ def test_url_check():
|
||||
Url(broken_url, user_agent)
|
||||
|
||||
|
||||
def test_save():
|
||||
|
||||
url_list = [
|
||||
"en.wikipedia.org",
|
||||
"akamhy.github.io",
|
||||
"www.wiktionary.org",
|
||||
"www.w3schools.com",
|
||||
"youtube.com",
|
||||
]
|
||||
x = random.randint(0, len(url_list) - 1)
|
||||
url1 = url_list[x]
|
||||
target = Url(
|
||||
url1,
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
|
||||
)
|
||||
archived_url1 = str(target.save())
|
||||
assert url1 in archived_url1
|
||||
|
||||
|
||||
def test_near():
|
||||
with pytest.raises(Exception):
|
||||
NeverArchivedUrl = (
|
||||
|
@ -89,28 +89,40 @@ def _near(obj, args):
|
||||
return no_archive_handler(e, obj)
|
||||
|
||||
|
||||
def _save_urls_on_file(input_list, live_url_count):
|
||||
m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
|
||||
|
||||
domain = "domain-unknown"
|
||||
if m:
|
||||
domain = m.group(1)
|
||||
|
||||
def _save_urls_on_file(url_gen):
|
||||
domain = None
|
||||
sys_random = random.SystemRandom()
|
||||
uid = "".join(
|
||||
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
||||
)
|
||||
url_count = 0
|
||||
|
||||
file_name = "{domain}-{live_url_count}-urls-{uid}.txt".format(
|
||||
domain=domain, live_url_count=live_url_count, uid=uid
|
||||
)
|
||||
file_content = "\n".join(input_list)
|
||||
file_path = os.path.join(os.getcwd(), file_name)
|
||||
with open(file_path, "w+") as f:
|
||||
f.write(file_content)
|
||||
return "{file_content}\n\n'{file_name}' saved in current working directory".format(
|
||||
file_content=file_content, file_name=file_name
|
||||
)
|
||||
for url in url_gen:
|
||||
url_count += 1
|
||||
if not domain:
|
||||
m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
|
||||
|
||||
domain = "domain-unknown"
|
||||
|
||||
if m:
|
||||
domain = m.group(1)
|
||||
|
||||
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
|
||||
file_path = os.path.join(os.getcwd(), file_name)
|
||||
if not os.path.isfile(file_path):
|
||||
open(file_path, "w+").close()
|
||||
|
||||
with open(file_path, "a") as f:
|
||||
f.write("{url}\n".format(url=url))
|
||||
|
||||
print(url)
|
||||
|
||||
if url_count > 0:
|
||||
return "\n\n'{file_name}' saved in current working directory".format(
|
||||
file_name=file_name
|
||||
)
|
||||
else:
|
||||
return "No known URLs found. Please try a diffrent input!"
|
||||
|
||||
|
||||
def _known_urls(obj, args):
|
||||
@ -118,17 +130,16 @@ def _known_urls(obj, args):
|
||||
Known urls for a domain.
|
||||
"""
|
||||
|
||||
subdomain = False
|
||||
if args.subdomain:
|
||||
subdomain = True
|
||||
subdomain = True if args.subdomain else False
|
||||
|
||||
url_list = obj.known_urls(subdomain=subdomain)
|
||||
total_urls = len(url_list)
|
||||
url_gen = obj.known_urls(subdomain=subdomain)
|
||||
|
||||
if total_urls > 0:
|
||||
return _save_urls_on_file(url_list, total_urls)
|
||||
|
||||
return "No known URLs found. Please try a diffrent domain!"
|
||||
if args.file:
|
||||
return _save_urls_on_file(url_gen)
|
||||
else:
|
||||
for url in url_gen:
|
||||
print(url)
|
||||
return "\n"
|
||||
|
||||
|
||||
def _get(obj, args):
|
||||
@ -265,6 +276,12 @@ def add_knownUrlArg(knownUrlArg):
|
||||
)
|
||||
help_text = "Use with '--known_urls' to include known URLs for subdomains."
|
||||
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
|
||||
knownUrlArg.add_argument(
|
||||
"--file",
|
||||
"-f",
|
||||
action="store_true",
|
||||
help="Save the URLs in file at current directory.",
|
||||
)
|
||||
|
||||
|
||||
def add_nearArg(nearArg):
|
||||
|
@ -308,41 +308,40 @@ class Url:
|
||||
i = i + 1
|
||||
return i
|
||||
|
||||
def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
|
||||
def known_urls(
|
||||
self,
|
||||
subdomain=False,
|
||||
host=False,
|
||||
start_timestamp=None,
|
||||
end_timestamp=None,
|
||||
match_type="prefix",
|
||||
):
|
||||
"""
|
||||
Returns list of URLs known to exist for given domain name
|
||||
because these URLs were crawled by WayBack Machine spider.
|
||||
Useful for pen-testing.
|
||||
Yields list of URLs known to exist for given input.
|
||||
Defaults to input URL as prefix.
|
||||
|
||||
This method is kept for compatibility, use the Cdx class instead.
|
||||
This method itself depends on Cdx.
|
||||
|
||||
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
||||
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||
"""
|
||||
|
||||
# Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
||||
# https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||
|
||||
url_list = []
|
||||
|
||||
if subdomain:
|
||||
cdx = Cdx(
|
||||
_cleaned_url(self.url),
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
match_type="domain",
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
else:
|
||||
cdx = Cdx(
|
||||
_cleaned_url(self.url),
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
match_type="host",
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
match_type = "domain"
|
||||
if host:
|
||||
match_type = "host"
|
||||
|
||||
cdx = Cdx(
|
||||
_cleaned_url(self.url),
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
match_type=match_type,
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
|
||||
snapshots = cdx.snapshots()
|
||||
|
||||
url_list = []
|
||||
for snapshot in snapshots:
|
||||
url_list.append(snapshot.original)
|
||||
|
||||
return url_list
|
||||
yield (snapshot.original)
|
||||
|
Loading…
Reference in New Issue
Block a user