diff --git a/tests/test_cli.py b/tests/test_cli.py
index 8866aeb..d8593c7 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -18,6 +18,7 @@ def test_save():
         url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=True,
         json=False,
@@ -38,6 +39,7 @@ def test_json():
         url="https://pypi.org/user/akamhy/",
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=True,
@@ -58,6 +60,7 @@ def test_archive_url():
         url="https://pypi.org/user/akamhy/",
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -78,6 +81,7 @@ def test_oldest():
         url="https://pypi.org/user/akamhy/",
         total=False,
         version=False,
+        file=False,
         oldest=True,
         save=False,
         json=False,
@@ -100,6 +104,7 @@ def test_oldest():
         url=url,
         total=False,
         version=False,
+        file=False,
         oldest=True,
         save=False,
         json=False,
@@ -121,6 +126,7 @@ def test_newest():
         url="https://pypi.org/user/akamhy/",
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -143,6 +149,7 @@ def test_newest():
         url=url,
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -164,6 +171,7 @@ def test_total_archives():
         url="https://pypi.org/user/akamhy/",
         total=True,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -185,6 +193,7 @@ def test_known_urls():
         url="https://www.keybr.com",
         total=False,
         version=False,
+        file=True,
         oldest=False,
         save=False,
         json=False,
@@ -198,25 +207,6 @@ def test_known_urls():
     reply = cli.args_handler(args)
     assert "keybr" in str(reply)

-    args = argparse.Namespace(
-        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
-        (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
-        url="https://akfyfufyjcujfufu6576r76r6amhy.gitd6r67r6u6hub.yfjyfjio",
-        total=False,
-        version=False,
-        oldest=False,
-        save=False,
-        json=False,
-        archive_url=False,
-        newest=False,
-        near=False,
-        subdomain=True,
-        known_urls=True,
-        get=None,
-    )
-    reply = cli.args_handler(args)
-    assert "No known URLs found" in str(reply)
-

 def test_near():
     args = argparse.Namespace(
@@ -225,6 +215,7 @@ def test_near():
         url="https://pypi.org/user/akamhy/",
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -252,6 +243,7 @@ def test_near():
         url=url,
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -278,6 +270,7 @@ def test_get():
         url="https://github.com/akamhy",
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -297,6 +290,7 @@ def test_get():
         url="https://github.com/akamhy/waybackpy",
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -316,6 +310,7 @@ def test_get():
         url="https://akamhy.github.io/waybackpy/",
         total=False,
         version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
@@ -335,25 +330,7 @@ def test_get():
         url="https://pypi.org/user/akamhy/",
         total=False,
         version=False,
-        oldest=False,
-        save=False,
-        json=False,
-        archive_url=False,
-        newest=False,
-        near=False,
-        subdomain=False,
-        known_urls=False,
-        get="save",
-    )
-    reply = cli.args_handler(args)
-    assert "waybackpy" in str(reply)
-
-    args = argparse.Namespace(
-        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
-        (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
-        url="https://pypi.org/user/akamhy/",
-        total=False,
-        version=False,
+        file=False,
         oldest=False,
         save=False,
         json=False,
diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py
index cf4d743..359ba91 100644
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@@ -17,26 +17,6 @@ def test_url_check():
         Url(broken_url, user_agent)


-def test_save():
-
-    url_list = [
-        "en.wikipedia.org",
-        "akamhy.github.io",
-        "www.wiktionary.org",
-        "www.w3schools.com",
-        "youtube.com",
-    ]
-    x = random.randint(0, len(url_list) - 1)
-    url1 = url_list[x]
-    target = Url(
-        url1,
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
-        "(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
-    )
-    archived_url1 = str(target.save())
-    assert url1 in archived_url1
-
-
 def test_near():
     with pytest.raises(Exception):
         NeverArchivedUrl = (
diff --git a/waybackpy/cli.py b/waybackpy/cli.py
index 63280f3..adbf1aa 100644
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -89,28 +89,40 @@ def _near(obj, args):
         return no_archive_handler(e, obj)


-def _save_urls_on_file(input_list, live_url_count):
-    m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
-
-    domain = "domain-unknown"
-    if m:
-        domain = m.group(1)
-
+def _save_urls_on_file(url_gen):
+    domain = None
     sys_random = random.SystemRandom()
     uid = "".join(
         sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
     )
+    url_count = 0

-    file_name = "{domain}-{live_url_count}-urls-{uid}.txt".format(
-        domain=domain, live_url_count=live_url_count, uid=uid
-    )
-    file_content = "\n".join(input_list)
-    file_path = os.path.join(os.getcwd(), file_name)
-    with open(file_path, "w+") as f:
-        f.write(file_content)
-    return "{file_content}\n\n'{file_name}' saved in current working directory".format(
-        file_content=file_content, file_name=file_name
-    )
+    for url in url_gen:
+        url_count += 1
+        if not domain:
+            m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
+
+            domain = "domain-unknown"
+
+            if m:
+                domain = m.group(1)
+
+            file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
+            file_path = os.path.join(os.getcwd(), file_name)
+            if not os.path.isfile(file_path):
+                open(file_path, "w+").close()
+
+        with open(file_path, "a") as f:
+            f.write("{url}\n".format(url=url))
+
+        print(url)
+
+    if url_count > 0:
+        return "\n\n'{file_name}' saved in the current working directory".format(
+            file_name=file_name
+        )
+    else:
+        return "No known URLs found. Please try a different input!"


 def _known_urls(obj, args):
@@ -118,17 +130,16 @@ def _known_urls(obj, args):
     """
     Known urls for a domain.
     """
-    subdomain = False
-    if args.subdomain:
-        subdomain = True
+    subdomain = True if args.subdomain else False

-    url_list = obj.known_urls(subdomain=subdomain)
-    total_urls = len(url_list)
+    url_gen = obj.known_urls(subdomain=subdomain)

-    if total_urls > 0:
-        return _save_urls_on_file(url_list, total_urls)
-
-    return "No known URLs found. Please try a diffrent domain!"
+    if args.file:
+        return _save_urls_on_file(url_gen)
+    else:
+        for url in url_gen:
+            print(url)
+        return "\n"


 def _get(obj, args):
@@ -265,6 +276,12 @@ def add_knownUrlArg(knownUrlArg):
     )
     help_text = "Use with '--known_urls' to include known URLs for subdomains."
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text) + knownUrlArg.add_argument( + "--file", + "-f", + action="store_true", + help="Save the URLs in file at current directory.", + ) def add_nearArg(nearArg): diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 067a75c..bea9a7a 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -308,41 +308,40 @@ class Url: i = i + 1 return i - def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None): + def known_urls( + self, + subdomain=False, + host=False, + start_timestamp=None, + end_timestamp=None, + match_type="prefix", + ): """ - Returns list of URLs known to exist for given domain name - because these URLs were crawled by WayBack Machine spider. - Useful for pen-testing. + Yields list of URLs known to exist for given input. + Defaults to input URL as prefix. + + This method is kept for compatibility, use the Cdx class instead. + This method itself depends on Cdx. + + Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: + https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050 """ - # Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: - # https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050 - - url_list = [] - if subdomain: - cdx = Cdx( - _cleaned_url(self.url), - user_agent=self.user_agent, - start_timestamp=start_timestamp, - end_timestamp=end_timestamp, - match_type="domain", - collapses=["urlkey"], - ) - else: - cdx = Cdx( - _cleaned_url(self.url), - user_agent=self.user_agent, - start_timestamp=start_timestamp, - end_timestamp=end_timestamp, - match_type="host", - collapses=["urlkey"], - ) + match_type = "domain" + if host: + match_type = "host" + + cdx = Cdx( + _cleaned_url(self.url), + user_agent=self.user_agent, + start_timestamp=start_timestamp, + end_timestamp=end_timestamp, + match_type=match_type, + collapses=["urlkey"], + ) snapshots = cdx.snapshots() - url_list = [] for snapshot in snapshots: - url_list.append(snapshot.original) - - return url_list + yield (snapshot.original)