known_urls now yields, making it more reliable, and the file is saved in chunks with respect to the response. The --file arg can be used to create an output file; if --file is not used, no output will be saved to any file. (#88)

This commit is contained in:
Akash Mahanty
2021-01-24 16:11:39 +05:30
committed by GitHub
parent a3bc6aad2b
commit 36b936820b
4 changed files with 87 additions and 114 deletions

View File

@@ -308,41 +308,40 @@ class Url:
i = i + 1
return i
def known_urls(
    self,
    subdomain=False,
    host=False,
    start_timestamp=None,
    end_timestamp=None,
    match_type="prefix",
):
    """Yield URLs known to the Wayback Machine for the given input.

    Defaults to treating the input URL as a prefix match.

    Kept for backward compatibility — prefer using the Cdx class
    directly; this method is a thin generator wrapper around it.

    Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
    https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
    """
    # The boolean flags override any explicitly supplied match_type;
    # host takes precedence over subdomain when both are set.
    if subdomain:
        match_type = "domain"
    if host:
        match_type = "host"

    cdx = Cdx(
        _cleaned_url(self.url),
        user_agent=self.user_agent,
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        match_type=match_type,
        collapses=["urlkey"],  # collapse on urlkey so each URL appears once
    )

    # Stream results lazily instead of building a list in memory.
    for snapshot in cdx.snapshots():
        yield snapshot.original