known urls now yields, making it more reliable. The file is saved in chunks with respect to the response. The --file arg can be used to create an output file; if --file is not used, no output will be saved to any file. (#88)
This commit is contained in:
parent
a3bc6aad2b
commit
36b936820b
@ -18,6 +18,7 @@ def test_save():
|
|||||||
url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
|
url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=True,
|
save=True,
|
||||||
json=False,
|
json=False,
|
||||||
@ -38,6 +39,7 @@ def test_json():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=True,
|
json=True,
|
||||||
@ -58,6 +60,7 @@ def test_archive_url():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -78,6 +81,7 @@ def test_oldest():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=True,
|
oldest=True,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -100,6 +104,7 @@ def test_oldest():
|
|||||||
url=url,
|
url=url,
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=True,
|
oldest=True,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -121,6 +126,7 @@ def test_newest():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -143,6 +149,7 @@ def test_newest():
|
|||||||
url=url,
|
url=url,
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -164,6 +171,7 @@ def test_total_archives():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=True,
|
total=True,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -185,6 +193,7 @@ def test_known_urls():
|
|||||||
url="https://www.keybr.com",
|
url="https://www.keybr.com",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=True,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -198,25 +207,6 @@ def test_known_urls():
|
|||||||
reply = cli.args_handler(args)
|
reply = cli.args_handler(args)
|
||||||
assert "keybr" in str(reply)
|
assert "keybr" in str(reply)
|
||||||
|
|
||||||
args = argparse.Namespace(
|
|
||||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
|
||||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
|
|
||||||
url="https://akfyfufyjcujfufu6576r76r6amhy.gitd6r67r6u6hub.yfjyfjio",
|
|
||||||
total=False,
|
|
||||||
version=False,
|
|
||||||
oldest=False,
|
|
||||||
save=False,
|
|
||||||
json=False,
|
|
||||||
archive_url=False,
|
|
||||||
newest=False,
|
|
||||||
near=False,
|
|
||||||
subdomain=True,
|
|
||||||
known_urls=True,
|
|
||||||
get=None,
|
|
||||||
)
|
|
||||||
reply = cli.args_handler(args)
|
|
||||||
assert "No known URLs found" in str(reply)
|
|
||||||
|
|
||||||
|
|
||||||
def test_near():
|
def test_near():
|
||||||
args = argparse.Namespace(
|
args = argparse.Namespace(
|
||||||
@ -225,6 +215,7 @@ def test_near():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -252,6 +243,7 @@ def test_near():
|
|||||||
url=url,
|
url=url,
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -278,6 +270,7 @@ def test_get():
|
|||||||
url="https://github.com/akamhy",
|
url="https://github.com/akamhy",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -297,6 +290,7 @@ def test_get():
|
|||||||
url="https://github.com/akamhy/waybackpy",
|
url="https://github.com/akamhy/waybackpy",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -316,6 +310,7 @@ def test_get():
|
|||||||
url="https://akamhy.github.io/waybackpy/",
|
url="https://akamhy.github.io/waybackpy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
|
file=False,
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
@ -335,25 +330,7 @@ def test_get():
|
|||||||
url="https://pypi.org/user/akamhy/",
|
url="https://pypi.org/user/akamhy/",
|
||||||
total=False,
|
total=False,
|
||||||
version=False,
|
version=False,
|
||||||
oldest=False,
|
file=False,
|
||||||
save=False,
|
|
||||||
json=False,
|
|
||||||
archive_url=False,
|
|
||||||
newest=False,
|
|
||||||
near=False,
|
|
||||||
subdomain=False,
|
|
||||||
known_urls=False,
|
|
||||||
get="save",
|
|
||||||
)
|
|
||||||
reply = cli.args_handler(args)
|
|
||||||
assert "waybackpy" in str(reply)
|
|
||||||
|
|
||||||
args = argparse.Namespace(
|
|
||||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
|
||||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
|
|
||||||
url="https://pypi.org/user/akamhy/",
|
|
||||||
total=False,
|
|
||||||
version=False,
|
|
||||||
oldest=False,
|
oldest=False,
|
||||||
save=False,
|
save=False,
|
||||||
json=False,
|
json=False,
|
||||||
|
@ -17,26 +17,6 @@ def test_url_check():
|
|||||||
Url(broken_url, user_agent)
|
Url(broken_url, user_agent)
|
||||||
|
|
||||||
|
|
||||||
def test_save():
|
|
||||||
|
|
||||||
url_list = [
|
|
||||||
"en.wikipedia.org",
|
|
||||||
"akamhy.github.io",
|
|
||||||
"www.wiktionary.org",
|
|
||||||
"www.w3schools.com",
|
|
||||||
"youtube.com",
|
|
||||||
]
|
|
||||||
x = random.randint(0, len(url_list) - 1)
|
|
||||||
url1 = url_list[x]
|
|
||||||
target = Url(
|
|
||||||
url1,
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
|
|
||||||
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
|
|
||||||
)
|
|
||||||
archived_url1 = str(target.save())
|
|
||||||
assert url1 in archived_url1
|
|
||||||
|
|
||||||
|
|
||||||
def test_near():
|
def test_near():
|
||||||
with pytest.raises(Exception):
|
with pytest.raises(Exception):
|
||||||
NeverArchivedUrl = (
|
NeverArchivedUrl = (
|
||||||
|
@ -89,28 +89,40 @@ def _near(obj, args):
|
|||||||
return no_archive_handler(e, obj)
|
return no_archive_handler(e, obj)
|
||||||
|
|
||||||
|
|
||||||
def _save_urls_on_file(input_list, live_url_count):
|
def _save_urls_on_file(url_gen):
|
||||||
m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
|
domain = None
|
||||||
|
|
||||||
domain = "domain-unknown"
|
|
||||||
if m:
|
|
||||||
domain = m.group(1)
|
|
||||||
|
|
||||||
sys_random = random.SystemRandom()
|
sys_random = random.SystemRandom()
|
||||||
uid = "".join(
|
uid = "".join(
|
||||||
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
||||||
)
|
)
|
||||||
|
url_count = 0
|
||||||
|
|
||||||
file_name = "{domain}-{live_url_count}-urls-{uid}.txt".format(
|
for url in url_gen:
|
||||||
domain=domain, live_url_count=live_url_count, uid=uid
|
url_count += 1
|
||||||
)
|
if not domain:
|
||||||
file_content = "\n".join(input_list)
|
m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
|
||||||
file_path = os.path.join(os.getcwd(), file_name)
|
|
||||||
with open(file_path, "w+") as f:
|
domain = "domain-unknown"
|
||||||
f.write(file_content)
|
|
||||||
return "{file_content}\n\n'{file_name}' saved in current working directory".format(
|
if m:
|
||||||
file_content=file_content, file_name=file_name
|
domain = m.group(1)
|
||||||
)
|
|
||||||
|
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
|
||||||
|
file_path = os.path.join(os.getcwd(), file_name)
|
||||||
|
if not os.path.isfile(file_path):
|
||||||
|
open(file_path, "w+").close()
|
||||||
|
|
||||||
|
with open(file_path, "a") as f:
|
||||||
|
f.write("{url}\n".format(url=url))
|
||||||
|
|
||||||
|
print(url)
|
||||||
|
|
||||||
|
if url_count > 0:
|
||||||
|
return "\n\n'{file_name}' saved in current working directory".format(
|
||||||
|
file_name=file_name
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return "No known URLs found. Please try a diffrent input!"
|
||||||
|
|
||||||
|
|
||||||
def _known_urls(obj, args):
|
def _known_urls(obj, args):
|
||||||
@ -118,17 +130,16 @@ def _known_urls(obj, args):
|
|||||||
Known urls for a domain.
|
Known urls for a domain.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
subdomain = False
|
subdomain = True if args.subdomain else False
|
||||||
if args.subdomain:
|
|
||||||
subdomain = True
|
|
||||||
|
|
||||||
url_list = obj.known_urls(subdomain=subdomain)
|
url_gen = obj.known_urls(subdomain=subdomain)
|
||||||
total_urls = len(url_list)
|
|
||||||
|
|
||||||
if total_urls > 0:
|
if args.file:
|
||||||
return _save_urls_on_file(url_list, total_urls)
|
return _save_urls_on_file(url_gen)
|
||||||
|
else:
|
||||||
return "No known URLs found. Please try a diffrent domain!"
|
for url in url_gen:
|
||||||
|
print(url)
|
||||||
|
return "\n"
|
||||||
|
|
||||||
|
|
||||||
def _get(obj, args):
|
def _get(obj, args):
|
||||||
@ -265,6 +276,12 @@ def add_knownUrlArg(knownUrlArg):
|
|||||||
)
|
)
|
||||||
help_text = "Use with '--known_urls' to include known URLs for subdomains."
|
help_text = "Use with '--known_urls' to include known URLs for subdomains."
|
||||||
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
|
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
|
||||||
|
knownUrlArg.add_argument(
|
||||||
|
"--file",
|
||||||
|
"-f",
|
||||||
|
action="store_true",
|
||||||
|
help="Save the URLs in file at current directory.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def add_nearArg(nearArg):
|
def add_nearArg(nearArg):
|
||||||
|
@ -308,41 +308,40 @@ class Url:
|
|||||||
i = i + 1
|
i = i + 1
|
||||||
return i
|
return i
|
||||||
|
|
||||||
def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
|
def known_urls(
|
||||||
|
self,
|
||||||
|
subdomain=False,
|
||||||
|
host=False,
|
||||||
|
start_timestamp=None,
|
||||||
|
end_timestamp=None,
|
||||||
|
match_type="prefix",
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Returns list of URLs known to exist for given domain name
|
Yields list of URLs known to exist for given input.
|
||||||
because these URLs were crawled by WayBack Machine spider.
|
Defaults to input URL as prefix.
|
||||||
Useful for pen-testing.
|
|
||||||
|
This method is kept for compatibility, use the Cdx class instead.
|
||||||
|
This method itself depends on Cdx.
|
||||||
|
|
||||||
|
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
||||||
|
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
|
||||||
# https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
|
||||||
|
|
||||||
url_list = []
|
|
||||||
|
|
||||||
if subdomain:
|
if subdomain:
|
||||||
cdx = Cdx(
|
match_type = "domain"
|
||||||
_cleaned_url(self.url),
|
if host:
|
||||||
user_agent=self.user_agent,
|
match_type = "host"
|
||||||
start_timestamp=start_timestamp,
|
|
||||||
end_timestamp=end_timestamp,
|
cdx = Cdx(
|
||||||
match_type="domain",
|
_cleaned_url(self.url),
|
||||||
collapses=["urlkey"],
|
user_agent=self.user_agent,
|
||||||
)
|
start_timestamp=start_timestamp,
|
||||||
else:
|
end_timestamp=end_timestamp,
|
||||||
cdx = Cdx(
|
match_type=match_type,
|
||||||
_cleaned_url(self.url),
|
collapses=["urlkey"],
|
||||||
user_agent=self.user_agent,
|
)
|
||||||
start_timestamp=start_timestamp,
|
|
||||||
end_timestamp=end_timestamp,
|
|
||||||
match_type="host",
|
|
||||||
collapses=["urlkey"],
|
|
||||||
)
|
|
||||||
|
|
||||||
snapshots = cdx.snapshots()
|
snapshots = cdx.snapshots()
|
||||||
|
|
||||||
url_list = []
|
|
||||||
for snapshot in snapshots:
|
for snapshot in snapshots:
|
||||||
url_list.append(snapshot.original)
|
yield (snapshot.original)
|
||||||
|
|
||||||
return url_list
|
|
||||||
|
Loading…
Reference in New Issue
Block a user