known urls now yields, making it more reliable. The file is saved in chunks with respect to the response. The --file arg can be used to create an output file; if --file is not used, no output will be saved to any file. (#88)
This commit is contained in:
parent
a3bc6aad2b
commit
36b936820b
@ -18,6 +18,7 @@ def test_save():
|
|||||||
url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
|
url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=True,
|
save=True,
|
||||||
json=False,
|
json=False,
|
||||||
@ -38,6 +39,7 @@ def test_json():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=True,
|
json=True,
|
||||||
@ -58,6 +60,7 @@ def test_archive_url():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -78,6 +81,7 @@ def test_oldest():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=True,
|
oldest=True,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -100,6 +104,7 @@ def test_oldest():
|
|||||||
url=url,
|
url=url,
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=True,
|
oldest=True,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -121,6 +126,7 @@ def test_newest():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -143,6 +149,7 @@ def test_newest():
|
|||||||
url=url,
|
url=url,
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -164,6 +171,7 @@ def test_total_archives():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=True,
|
total=True,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -185,6 +193,7 @@ def test_known_urls():
|
|||||||
url="https://www.keybr.com",
|
url="https://www.keybr.com",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=True,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -198,25 +207,6 @@ def test_known_urls():
|
|||||||
reply = cli.args_handler(args)
|
reply = cli.args_handler(args)
|
||||||
assert "keybr" in str(reply)
|
assert "keybr" in str(reply)
|
||||||
|
|
||||||
args = argparse.Namespace(
|
|
||||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
|
||||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
|
|
||||||
url="https://akfyfufyjcujfufu6576r76r6amhy.gitd6r67r6u6hub.yfjyfjio",
|
|
||||||
total=False,
|
|
||||||
version=False,
|
|
||||||
oldest=False,
|
|
||||||
save=False,
|
|
||||||
json=False,
|
|
||||||
archive_url=False,
|
|
||||||
newest=False,
|
|
||||||
near=False,
|
|
||||||
subdomain=True,
|
|
||||||
known_urls=True,
|
|
||||||
get=None,
|
|
||||||
)
|
|
||||||
reply = cli.args_handler(args)
|
|
||||||
assert "No known URLs found" in str(reply)
|
|
||||||
|
|
||||||
|
|
||||||
def test_near():
|
def test_near():
|
||||||
args = argparse.Namespace(
|
args = argparse.Namespace(
|
||||||
@ -225,6 +215,7 @@ def test_near():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -252,6 +243,7 @@ def test_near():
|
|||||||
url=url,
|
url=url,
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -278,6 +270,7 @@ def test_get():
|
|||||||
url="https://github.com/akamhy",
|
url="https://github.com/akamhy",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -297,6 +290,7 @@ def test_get():
|
|||||||
url="https://github.com/akamhy/waybackpy",
|
url="https://github.com/akamhy/waybackpy",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -316,6 +310,7 @@ def test_get():
|
|||||||
url="https://akamhy.github.io/waybackpy/",
|
url="https://akamhy.github.io/waybackpy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -335,25 +330,7 @@ def test_get():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
oldest=False,
|
file=False,
|
||||||
save=False,
|
|
||||||
json=False,
|
|
||||||
archive_url=False,
|
|
||||||
newest=False,
|
|
||||||
near=False,
|
|
||||||
subdomain=False,
|
|
||||||
known_urls=False,
|
|
||||||
get="save",
|
|
||||||
)
|
|
||||||
reply = cli.args_handler(args)
|
|
||||||
assert "waybackpy" in str(reply)
|
|
||||||
|
|
||||||
args = argparse.Namespace(
|
|
||||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
|
||||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
|
|
||||||
url="https://pypi.org/user/akamhy/",
|
|
||||||
total=False,
|
|
||||||
version=False,
|
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
|
@ -17,26 +17,6 @@ def test_url_check():
|
|||||||
Url(broken_url, user_agent)
|
Url(broken_url, user_agent)
|
||||||
|
|
||||||
|
|
||||||
def test_save():
|
|
||||||
|
|
||||||
url_list = [
|
|
||||||
"en.wikipedia.org",
|
|
||||||
"akamhy.github.io",
|
|
||||||
"www.wiktionary.org",
|
|
||||||
"www.w3schools.com",
|
|
||||||
"youtube.com",
|
|
||||||
]
|
|
||||||
x = random.randint(0, len(url_list) - 1)
|
|
||||||
url1 = url_list[x]
|
|
||||||
target = Url(
|
|
||||||
url1,
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
|
|
||||||
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
|
|
||||||
)
|
|
||||||
archived_url1 = str(target.save())
|
|
||||||
assert url1 in archived_url1
|
|
||||||
|
|
||||||
|
|
||||||
def test_near():
|
def test_near():
|
||||||
with pytest.raises(Exception):
|
with pytest.raises(Exception):
|
||||||
NeverArchivedUrl = (
|
NeverArchivedUrl = (
|
||||||
|
@ -89,28 +89,40 @@ def _near(obj, args):
|
|||||||
return no_archive_handler(e, obj)
|
return no_archive_handler(e, obj)
|
||||||
|
|
||||||
|
|
||||||
def _save_urls_on_file(input_list, live_url_count):
|
def _save_urls_on_file(url_gen):
|
||||||
m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
|
domain = None
|
||||||
|
|
||||||
domain = "domain-unknown"
|
|
||||||
if m:
|
|
||||||
domain = m.group(1)
|
|
||||||
|
|
||||||
sys_random = random.SystemRandom()
|
sys_random = random.SystemRandom()
|
||||||
uid = "".join(
|
uid = "".join(
|
||||||
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
||||||
)
|
)
|
||||||
|
url_count = 0
|
||||||
|
|
||||||
file_name = "{domain}-{live_url_count}-urls-{uid}.txt".format(
|
for url in url_gen:
|
||||||
domain=domain, live_url_count=live_url_count, uid=uid
|
url_count += 1
|
||||||
)
|
if not domain:
|
||||||
file_content = "\n".join(input_list)
|
m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
|
||||||
file_path = os.path.join(os.getcwd(), file_name)
|
|
||||||
with open(file_path, "w+") as f:
|
domain = "domain-unknown"
|
||||||
f.write(file_content)
|
|
||||||
return "{file_content}\n\n'{file_name}' saved in current working directory".format(
|
if m:
|
||||||
file_content=file_content, file_name=file_name
|
domain = m.group(1)
|
||||||
)
|
|
||||||
|
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
|
||||||
|
file_path = os.path.join(os.getcwd(), file_name)
|
||||||
|
if not os.path.isfile(file_path):
|
||||||
|
open(file_path, "w+").close()
|
||||||
|
|
||||||
|
with open(file_path, "a") as f:
|
||||||
|
f.write("{url}\n".format(url=url))
|
||||||
|
|
||||||
|
print(url)
|
||||||
|
|
||||||
|
if url_count > 0:
|
||||||
|
return "\n\n'{file_name}' saved in current working directory".format(
|
||||||
|
file_name=file_name
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return "No known URLs found. Please try a diffrent input!"
|
||||||
|
|
||||||
|
|
||||||
def _known_urls(obj, args):
|
def _known_urls(obj, args):
|
||||||
@ -118,17 +130,16 @@ def _known_urls(obj, args):
|
|||||||
Known urls for a domain.
|
Known urls for a domain.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
subdomain = False
|
subdomain = True if args.subdomain else False
|
||||||
if args.subdomain:
|
|
||||||
subdomain = True
|
|
||||||
|
|
||||||
url_list = obj.known_urls(subdomain=subdomain)
|
url_gen = obj.known_urls(subdomain=subdomain)
|
||||||
total_urls = len(url_list)
|
|
||||||
|
|
||||||
if total_urls > 0:
|
if args.file:
|
||||||
return _save_urls_on_file(url_list, total_urls)
|
return _save_urls_on_file(url_gen)
|
||||||
|
else:
|
||||||
return "No known URLs found. Please try a diffrent domain!"
|
for url in url_gen:
|
||||||
|
print(url)
|
||||||
|
return "\n"
|
||||||
|
|
||||||
|
|
||||||
def _get(obj, args):
|
def _get(obj, args):
|
||||||
@ -265,6 +276,12 @@ def add_knownUrlArg(knownUrlArg):
|
|||||||
)
|
)
|
||||||
help_text = "Use with '--known_urls' to include known URLs for subdomains."
|
help_text = "Use with '--known_urls' to include known URLs for subdomains."
|
||||||
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
|
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
|
||||||
|
knownUrlArg.add_argument(
|
||||||
|
"--file",
|
||||||
|
"-f",
|
||||||
|
action="store_true",
|
||||||
|
help="Save the URLs in file at current directory.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def add_nearArg(nearArg):
|
def add_nearArg(nearArg):
|
||||||
|
@ -308,41 +308,40 @@ class Url:
|
|||||||
i = i + 1
|
i = i + 1
|
||||||
return i
|
return i
|
||||||
|
|
||||||
def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
|
def known_urls(
|
||||||
|
self,
|
||||||
|
subdomain=False,
|
||||||
|
host=False,
|
||||||
|
start_timestamp=None,
|
||||||
|
end_timestamp=None,
|
||||||
|
match_type="prefix",
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Returns list of URLs known to exist for given domain name
|
Yields list of URLs known to exist for given input.
|
||||||
because these URLs were crawled by WayBack Machine spider.
|
Defaults to input URL as prefix.
|
||||||
Useful for pen-testing.
|
|
||||||
|
This method is kept for compatibility, use the Cdx class instead.
|
||||||
|
This method itself depends on Cdx.
|
||||||
|
|
||||||
|
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
||||||
|
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
|
||||||
# https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
|
||||||
|
|
||||||
url_list = []
|
|
||||||
|
|
||||||
if subdomain:
|
if subdomain:
|
||||||
cdx = Cdx(
|
match_type = "domain"
|
||||||
_cleaned_url(self.url),
|
if host:
|
||||||
user_agent=self.user_agent,
|
match_type = "host"
|
||||||
start_timestamp=start_timestamp,
|
|
||||||
end_timestamp=end_timestamp,
|
cdx = Cdx(
|
||||||
match_type="domain",
|
_cleaned_url(self.url),
|
||||||
collapses=["urlkey"],
|
user_agent=self.user_agent,
|
||||||
)
|
start_timestamp=start_timestamp,
|
||||||
else:
|
end_timestamp=end_timestamp,
|
||||||
cdx = Cdx(
|
match_type=match_type,
|
||||||
_cleaned_url(self.url),
|
collapses=["urlkey"],
|
||||||
user_agent=self.user_agent,
|
)
|
||||||
start_timestamp=start_timestamp,
|
|
||||||
end_timestamp=end_timestamp,
|
|
||||||
match_type="host",
|
|
||||||
collapses=["urlkey"],
|
|
||||||
)
|
|
||||||
|
|
||||||
snapshots = cdx.snapshots()
|
snapshots = cdx.snapshots()
|
||||||
|
|
||||||
url_list = []
|
|
||||||
for snapshot in snapshots:
|
for snapshot in snapshots:
|
||||||
url_list.append(snapshot.original)
|
yield (snapshot.original)
|
||||||
|
|
||||||
return url_list
|
|
||||||
|
Loading…
Reference in New Issue
Block a user