known_urls now yields, making it more reliable. The file is saved in chunks with respect to the response. The --file arg can be used to create an output file; if --file is not used, no output will be saved to any file. (#88)
This commit is contained in:
parent
a3bc6aad2b
commit
36b936820b
@ -18,6 +18,7 @@ def test_save():
|
||||
url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=True,
|
||||
json=False,
|
||||
@ -38,6 +39,7 @@ def test_json():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=True,
|
||||
@ -58,6 +60,7 @@ def test_archive_url():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -78,6 +81,7 @@ def test_oldest():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=True,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -100,6 +104,7 @@ def test_oldest():
|
||||
url=url,
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=True,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -121,6 +126,7 @@ def test_newest():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -143,6 +149,7 @@ def test_newest():
|
||||
url=url,
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -164,6 +171,7 @@ def test_total_archives():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=True,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -185,6 +193,7 @@ def test_known_urls():
|
||||
url="https://www.keybr.com",
|
||||
total=False,
|
||||
version=False,
|
||||
file=True,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -198,25 +207,6 @@ def test_known_urls():
|
||||
reply = cli.args_handler(args)
|
||||
assert "keybr" in str(reply)
|
||||
|
||||
args = argparse.Namespace(
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
|
||||
url="https://akfyfufyjcujfufu6576r76r6amhy.gitd6r67r6u6hub.yfjyfjio",
|
||||
total=False,
|
||||
version=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
subdomain=True,
|
||||
known_urls=True,
|
||||
get=None,
|
||||
)
|
||||
reply = cli.args_handler(args)
|
||||
assert "No known URLs found" in str(reply)
|
||||
|
||||
|
||||
def test_near():
|
||||
args = argparse.Namespace(
|
||||
@ -225,6 +215,7 @@ def test_near():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -252,6 +243,7 @@ def test_near():
|
||||
url=url,
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -278,6 +270,7 @@ def test_get():
|
||||
url="https://github.com/akamhy",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -297,6 +290,7 @@ def test_get():
|
||||
url="https://github.com/akamhy/waybackpy",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -316,6 +310,7 @@ def test_get():
|
||||
url="https://akamhy.github.io/waybackpy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
@ -335,25 +330,7 @@ def test_get():
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get="save",
|
||||
)
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in str(reply)
|
||||
|
||||
args = argparse.Namespace(
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
|
||||
url="https://pypi.org/user/akamhy/",
|
||||
total=False,
|
||||
version=False,
|
||||
file=False,
|
||||
oldest=False,
|
||||
save=False,
|
||||
json=False,
|
||||
|
@ -17,26 +17,6 @@ def test_url_check():
|
||||
Url(broken_url, user_agent)
|
||||
|
||||
|
||||
def test_save():
|
||||
|
||||
url_list = [
|
||||
"en.wikipedia.org",
|
||||
"akamhy.github.io",
|
||||
"www.wiktionary.org",
|
||||
"www.w3schools.com",
|
||||
"youtube.com",
|
||||
]
|
||||
x = random.randint(0, len(url_list) - 1)
|
||||
url1 = url_list[x]
|
||||
target = Url(
|
||||
url1,
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
|
||||
)
|
||||
archived_url1 = str(target.save())
|
||||
assert url1 in archived_url1
|
||||
|
||||
|
||||
def test_near():
|
||||
with pytest.raises(Exception):
|
||||
NeverArchivedUrl = (
|
||||
|
@ -89,28 +89,40 @@ def _near(obj, args):
|
||||
return no_archive_handler(e, obj)
|
||||
|
||||
|
||||
def _save_urls_on_file(input_list, live_url_count):
|
||||
m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
|
||||
|
||||
domain = "domain-unknown"
|
||||
if m:
|
||||
domain = m.group(1)
|
||||
|
||||
def _save_urls_on_file(url_gen):
|
||||
domain = None
|
||||
sys_random = random.SystemRandom()
|
||||
uid = "".join(
|
||||
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
||||
)
|
||||
url_count = 0
|
||||
|
||||
file_name = "{domain}-{live_url_count}-urls-{uid}.txt".format(
|
||||
domain=domain, live_url_count=live_url_count, uid=uid
|
||||
)
|
||||
file_content = "\n".join(input_list)
|
||||
file_path = os.path.join(os.getcwd(), file_name)
|
||||
with open(file_path, "w+") as f:
|
||||
f.write(file_content)
|
||||
return "{file_content}\n\n'{file_name}' saved in current working directory".format(
|
||||
file_content=file_content, file_name=file_name
|
||||
)
|
||||
for url in url_gen:
|
||||
url_count += 1
|
||||
if not domain:
|
||||
m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
|
||||
|
||||
domain = "domain-unknown"
|
||||
|
||||
if m:
|
||||
domain = m.group(1)
|
||||
|
||||
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
|
||||
file_path = os.path.join(os.getcwd(), file_name)
|
||||
if not os.path.isfile(file_path):
|
||||
open(file_path, "w+").close()
|
||||
|
||||
with open(file_path, "a") as f:
|
||||
f.write("{url}\n".format(url=url))
|
||||
|
||||
print(url)
|
||||
|
||||
if url_count > 0:
|
||||
return "\n\n'{file_name}' saved in current working directory".format(
|
||||
file_name=file_name
|
||||
)
|
||||
else:
|
||||
return "No known URLs found. Please try a diffrent input!"
|
||||
|
||||
|
||||
def _known_urls(obj, args):
|
||||
@ -118,17 +130,16 @@ def _known_urls(obj, args):
|
||||
Known urls for a domain.
|
||||
"""
|
||||
|
||||
subdomain = False
|
||||
if args.subdomain:
|
||||
subdomain = True
|
||||
subdomain = True if args.subdomain else False
|
||||
|
||||
url_list = obj.known_urls(subdomain=subdomain)
|
||||
total_urls = len(url_list)
|
||||
url_gen = obj.known_urls(subdomain=subdomain)
|
||||
|
||||
if total_urls > 0:
|
||||
return _save_urls_on_file(url_list, total_urls)
|
||||
|
||||
return "No known URLs found. Please try a diffrent domain!"
|
||||
if args.file:
|
||||
return _save_urls_on_file(url_gen)
|
||||
else:
|
||||
for url in url_gen:
|
||||
print(url)
|
||||
return "\n"
|
||||
|
||||
|
||||
def _get(obj, args):
|
||||
@ -265,6 +276,12 @@ def add_knownUrlArg(knownUrlArg):
|
||||
)
|
||||
help_text = "Use with '--known_urls' to include known URLs for subdomains."
|
||||
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
|
||||
knownUrlArg.add_argument(
|
||||
"--file",
|
||||
"-f",
|
||||
action="store_true",
|
||||
help="Save the URLs in file at current directory.",
|
||||
)
|
||||
|
||||
|
||||
def add_nearArg(nearArg):
|
||||
|
@ -308,41 +308,40 @@ class Url:
|
||||
i = i + 1
|
||||
return i
|
||||
|
||||
def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
|
||||
def known_urls(
|
||||
self,
|
||||
subdomain=False,
|
||||
host=False,
|
||||
start_timestamp=None,
|
||||
end_timestamp=None,
|
||||
match_type="prefix",
|
||||
):
|
||||
"""
|
||||
Returns list of URLs known to exist for given domain name
|
||||
because these URLs were crawled by WayBack Machine spider.
|
||||
Useful for pen-testing.
|
||||
Yields list of URLs known to exist for given input.
|
||||
Defaults to input URL as prefix.
|
||||
|
||||
This method is kept for compatibility, use the Cdx class instead.
|
||||
This method itself depends on Cdx.
|
||||
|
||||
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
||||
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||
"""
|
||||
|
||||
# Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
||||
# https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||
|
||||
url_list = []
|
||||
|
||||
if subdomain:
|
||||
cdx = Cdx(
|
||||
_cleaned_url(self.url),
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
match_type="domain",
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
else:
|
||||
cdx = Cdx(
|
||||
_cleaned_url(self.url),
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
match_type="host",
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
match_type = "domain"
|
||||
if host:
|
||||
match_type = "host"
|
||||
|
||||
cdx = Cdx(
|
||||
_cleaned_url(self.url),
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
match_type=match_type,
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
|
||||
snapshots = cdx.snapshots()
|
||||
|
||||
url_list = []
|
||||
for snapshot in snapshots:
|
||||
url_list.append(snapshot.original)
|
||||
|
||||
return url_list
|
||||
yield (snapshot.original)
|
||||
|
Loading…
Reference in New Issue
Block a user