known urls now yields, more reliable. And save the file in chunks with respect to the response. The --file arg can be used to create an output file; if --file is not used, no output will be saved to any file. (#88)

This commit is contained in:
Akash Mahanty 2021-01-24 16:11:39 +05:30 committed by GitHub
parent a3bc6aad2b
commit 36b936820b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 87 additions and 114 deletions

View File

@ -18,6 +18,7 @@ def test_save():
url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm", url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=True, save=True,
json=False, json=False,
@ -38,6 +39,7 @@ def test_json():
url="https://pypi.org/user/akamhy/", url="https://pypi.org/user/akamhy/",
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=True, json=True,
@ -58,6 +60,7 @@ def test_archive_url():
url="https://pypi.org/user/akamhy/", url="https://pypi.org/user/akamhy/",
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -78,6 +81,7 @@ def test_oldest():
url="https://pypi.org/user/akamhy/", url="https://pypi.org/user/akamhy/",
total=False, total=False,
version=False, version=False,
file=False,
oldest=True, oldest=True,
save=False, save=False,
json=False, json=False,
@ -100,6 +104,7 @@ def test_oldest():
url=url, url=url,
total=False, total=False,
version=False, version=False,
file=False,
oldest=True, oldest=True,
save=False, save=False,
json=False, json=False,
@ -121,6 +126,7 @@ def test_newest():
url="https://pypi.org/user/akamhy/", url="https://pypi.org/user/akamhy/",
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -143,6 +149,7 @@ def test_newest():
url=url, url=url,
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -164,6 +171,7 @@ def test_total_archives():
url="https://pypi.org/user/akamhy/", url="https://pypi.org/user/akamhy/",
total=True, total=True,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -185,6 +193,7 @@ def test_known_urls():
url="https://www.keybr.com", url="https://www.keybr.com",
total=False, total=False,
version=False, version=False,
file=True,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -198,25 +207,6 @@ def test_known_urls():
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "keybr" in str(reply) assert "keybr" in str(reply)
args = argparse.Namespace(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://akfyfufyjcujfufu6576r76r6amhy.gitd6r67r6u6hub.yfjyfjio",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
subdomain=True,
known_urls=True,
get=None,
)
reply = cli.args_handler(args)
assert "No known URLs found" in str(reply)
def test_near(): def test_near():
args = argparse.Namespace( args = argparse.Namespace(
@ -225,6 +215,7 @@ def test_near():
url="https://pypi.org/user/akamhy/", url="https://pypi.org/user/akamhy/",
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -252,6 +243,7 @@ def test_near():
url=url, url=url,
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -278,6 +270,7 @@ def test_get():
url="https://github.com/akamhy", url="https://github.com/akamhy",
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -297,6 +290,7 @@ def test_get():
url="https://github.com/akamhy/waybackpy", url="https://github.com/akamhy/waybackpy",
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -316,6 +310,7 @@ def test_get():
url="https://akamhy.github.io/waybackpy/", url="https://akamhy.github.io/waybackpy/",
total=False, total=False,
version=False, version=False,
file=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,
@ -335,25 +330,7 @@ def test_get():
url="https://pypi.org/user/akamhy/", url="https://pypi.org/user/akamhy/",
total=False, total=False,
version=False, version=False,
oldest=False, file=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
subdomain=False,
known_urls=False,
get="save",
)
reply = cli.args_handler(args)
assert "waybackpy" in str(reply)
args = argparse.Namespace(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False, oldest=False,
save=False, save=False,
json=False, json=False,

View File

@ -17,26 +17,6 @@ def test_url_check():
Url(broken_url, user_agent) Url(broken_url, user_agent)
def test_save():
url_list = [
"en.wikipedia.org",
"akamhy.github.io",
"www.wiktionary.org",
"www.w3schools.com",
"youtube.com",
]
x = random.randint(0, len(url_list) - 1)
url1 = url_list[x]
target = Url(
url1,
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
)
archived_url1 = str(target.save())
assert url1 in archived_url1
def test_near(): def test_near():
with pytest.raises(Exception): with pytest.raises(Exception):
NeverArchivedUrl = ( NeverArchivedUrl = (

View File

@ -89,28 +89,40 @@ def _near(obj, args):
return no_archive_handler(e, obj) return no_archive_handler(e, obj)
def _save_urls_on_file(input_list, live_url_count): def _save_urls_on_file(url_gen):
m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0]) domain = None
domain = "domain-unknown"
if m:
domain = m.group(1)
sys_random = random.SystemRandom() sys_random = random.SystemRandom()
uid = "".join( uid = "".join(
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6) sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
) )
url_count = 0
file_name = "{domain}-{live_url_count}-urls-{uid}.txt".format( for url in url_gen:
domain=domain, live_url_count=live_url_count, uid=uid url_count += 1
) if not domain:
file_content = "\n".join(input_list) m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
file_path = os.path.join(os.getcwd(), file_name)
with open(file_path, "w+") as f: domain = "domain-unknown"
f.write(file_content)
return "{file_content}\n\n'{file_name}' saved in current working directory".format( if m:
file_content=file_content, file_name=file_name domain = m.group(1)
)
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
file_path = os.path.join(os.getcwd(), file_name)
if not os.path.isfile(file_path):
open(file_path, "w+").close()
with open(file_path, "a") as f:
f.write("{url}\n".format(url=url))
print(url)
if url_count > 0:
return "\n\n'{file_name}' saved in current working directory".format(
file_name=file_name
)
else:
return "No known URLs found. Please try a diffrent input!"
def _known_urls(obj, args): def _known_urls(obj, args):
@ -118,17 +130,16 @@ def _known_urls(obj, args):
Known urls for a domain. Known urls for a domain.
""" """
subdomain = False subdomain = True if args.subdomain else False
if args.subdomain:
subdomain = True
url_list = obj.known_urls(subdomain=subdomain) url_gen = obj.known_urls(subdomain=subdomain)
total_urls = len(url_list)
if total_urls > 0: if args.file:
return _save_urls_on_file(url_list, total_urls) return _save_urls_on_file(url_gen)
else:
return "No known URLs found. Please try a diffrent domain!" for url in url_gen:
print(url)
return "\n"
def _get(obj, args): def _get(obj, args):
@ -265,6 +276,12 @@ def add_knownUrlArg(knownUrlArg):
) )
help_text = "Use with '--known_urls' to include known URLs for subdomains." help_text = "Use with '--known_urls' to include known URLs for subdomains."
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text) knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
knownUrlArg.add_argument(
"--file",
"-f",
action="store_true",
help="Save the URLs in file at current directory.",
)
def add_nearArg(nearArg): def add_nearArg(nearArg):

View File

@ -308,41 +308,40 @@ class Url:
i = i + 1 i = i + 1
return i return i
def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None): def known_urls(
self,
subdomain=False,
host=False,
start_timestamp=None,
end_timestamp=None,
match_type="prefix",
):
""" """
Returns list of URLs known to exist for given domain name Yields list of URLs known to exist for given input.
because these URLs were crawled by WayBack Machine spider. Defaults to input URL as prefix.
Useful for pen-testing.
This method is kept for compatibility, use the Cdx class instead.
This method itself depends on Cdx.
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
""" """
# Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
# https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
url_list = []
if subdomain: if subdomain:
cdx = Cdx( match_type = "domain"
_cleaned_url(self.url), if host:
user_agent=self.user_agent, match_type = "host"
start_timestamp=start_timestamp,
end_timestamp=end_timestamp, cdx = Cdx(
match_type="domain", _cleaned_url(self.url),
collapses=["urlkey"], user_agent=self.user_agent,
) start_timestamp=start_timestamp,
else: end_timestamp=end_timestamp,
cdx = Cdx( match_type=match_type,
_cleaned_url(self.url), collapses=["urlkey"],
user_agent=self.user_agent, )
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
match_type="host",
collapses=["urlkey"],
)
snapshots = cdx.snapshots() snapshots = cdx.snapshots()
url_list = []
for snapshot in snapshots: for snapshot in snapshots:
url_list.append(snapshot.original) yield (snapshot.original)
return url_list