known_urls now yields URLs instead of returning a list, which is more reliable, and saves the file in chunks as the response is consumed. The --file argument can be used to create an output file; if --file is not used, no output is saved to any file. (#88)
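In outline: instead of collecting every URL and writing the file once at the end, the generator is consumed lazily and each URL is appended to the output file as it arrives. A minimal sketch of that pattern (the `fetch_urls` generator below is a hypothetical stand-in for the CDX-backed `known_urls` in this diff):

```python
# Minimal sketch of the streaming save pattern this commit adopts.
# fetch_urls is a hypothetical stand-in for the CDX-backed known_urls generator.
def fetch_urls():
    yield "https://example.com/index.html"
    yield "https://example.com/about.html"

def save_in_chunks(url_gen, file_path):
    url_count = 0
    for url in url_gen:
        # Append each URL as it arrives instead of buffering the whole list.
        with open(file_path, "a") as f:
            f.write(url + "\n")
        url_count += 1
    return url_count

print(save_in_chunks(fetch_urls(), "example.com-urls.txt"))  # prints 2
```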
@@ -89,28 +89,40 @@ def _near(obj, args):
         return no_archive_handler(e, obj)
 
 
-def _save_urls_on_file(input_list, live_url_count):
-    m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
-    domain = "domain-unknown"
-    if m:
-        domain = m.group(1)
+def _save_urls_on_file(url_gen):
+    domain = None
     sys_random = random.SystemRandom()
     uid = "".join(
         sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
     )
+    url_count = 0
 
-    file_name = "{domain}-{live_url_count}-urls-{uid}.txt".format(
-        domain=domain, live_url_count=live_url_count, uid=uid
-    )
-    file_content = "\n".join(input_list)
-    file_path = os.path.join(os.getcwd(), file_name)
-    with open(file_path, "w+") as f:
-        f.write(file_content)
-    return "{file_content}\n\n'{file_name}' saved in current working directory".format(
-        file_content=file_content, file_name=file_name
-    )
+    for url in url_gen:
+        url_count += 1
+        if not domain:
+            m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
+            domain = "domain-unknown"
+            if m:
+                domain = m.group(1)
+            file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
+            file_path = os.path.join(os.getcwd(), file_name)
+            if not os.path.isfile(file_path):
+                open(file_path, "w+").close()
+
+        with open(file_path, "a") as f:
+            f.write("{url}\n".format(url=url))
+
+        print(url)
+
+    if url_count > 0:
+        return "\n\n'{file_name}' saved in current working directory".format(
+            file_name=file_name
+        )
+    else:
+        return "No known URLs found. Please try a different input!"
 
 
 def _known_urls(obj, args):
@@ -118,17 +130,16 @@ def _known_urls(obj, args):
     Known urls for a domain.
     """
 
-    subdomain = False
-    if args.subdomain:
-        subdomain = True
+    subdomain = True if args.subdomain else False
 
-    url_list = obj.known_urls(subdomain=subdomain)
-    total_urls = len(url_list)
+    url_gen = obj.known_urls(subdomain=subdomain)
 
-    if total_urls > 0:
-        return _save_urls_on_file(url_list, total_urls)
-
-    return "No known URLs found. Please try a diffrent domain!"
+    if args.file:
+        return _save_urls_on_file(url_gen)
+    else:
+        for url in url_gen:
+            print(url)
+        return "\n"
 
 
 def _get(obj, args):
@@ -265,6 +276,12 @@ def add_knownUrlArg(knownUrlArg):
     )
     help_text = "Use with '--known_urls' to include known URLs for subdomains."
    knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
+    knownUrlArg.add_argument(
+        "--file",
+        "-f",
+        action="store_true",
+        help="Save the URLs in a file in the current directory.",
+    )
 
 
 def add_nearArg(nearArg):
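For context on the new flag: with `action="store_true"` the flag takes no value; argparse sets `args.file` to True when `--file`/`-f` is present and False otherwise, which is exactly what the `if args.file:` branch in `_known_urls` checks. A small self-contained sketch (the parser here is a stand-in, not waybackpy's real CLI setup):

```python
import argparse

# Stand-in parser to show how a store_true flag behaves.
parser = argparse.ArgumentParser()
parser.add_argument("--file", "-f", action="store_true",
                    help="Save the URLs in a file in the current directory.")

assert parser.parse_args([]).file is False          # flag absent -> False
assert parser.parse_args(["--file"]).file is True   # flag present -> True
assert parser.parse_args(["-f"]).file is True       # short form works too
```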
@@ -308,41 +308,40 @@ class Url:
             i = i + 1
         return i
 
-    def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
+    def known_urls(
+        self,
+        subdomain=False,
+        host=False,
+        start_timestamp=None,
+        end_timestamp=None,
+        match_type="prefix",
+    ):
         """
-        Returns list of URLs known to exist for given domain name
-        because these URLs were crawled by WayBack Machine spider.
-        Useful for pen-testing.
+        Yields URLs known to exist for the given input.
+        Defaults to matching the input URL as a prefix.
+
+        This method is kept for compatibility; use the Cdx class instead.
+        This method itself depends on Cdx.
+
+        Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
+        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
         """
 
-        # Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
-        # https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
-
-        url_list = []
-
         if subdomain:
-            cdx = Cdx(
-                _cleaned_url(self.url),
-                user_agent=self.user_agent,
-                start_timestamp=start_timestamp,
-                end_timestamp=end_timestamp,
-                match_type="domain",
-                collapses=["urlkey"],
-            )
-        else:
-            cdx = Cdx(
-                _cleaned_url(self.url),
-                user_agent=self.user_agent,
-                start_timestamp=start_timestamp,
-                end_timestamp=end_timestamp,
-                match_type="host",
-                collapses=["urlkey"],
-            )
+            match_type = "domain"
+        if host:
+            match_type = "host"
+
+        cdx = Cdx(
+            _cleaned_url(self.url),
+            user_agent=self.user_agent,
+            start_timestamp=start_timestamp,
+            end_timestamp=end_timestamp,
+            match_type=match_type,
+            collapses=["urlkey"],
+        )
 
         snapshots = cdx.snapshots()
 
-        url_list = []
         for snapshot in snapshots:
-            url_list.append(snapshot.original)
-
-        return url_list
+            yield (snapshot.original)
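With `known_urls` now a generator, callers iterate over it rather than taking a list. A hedged usage sketch, assuming the `Url` class in this diff is importable as `waybackpy.Url` and takes a URL plus a user agent:

```python
from waybackpy import Url  # assumption: this diff's Url class is exported here

url = Url("https://example.com", "my-user-agent/1.0")

# URLs are yielded one at a time; nothing is buffered into a list up front.
for known_url in url.known_urls(subdomain=True):
    print(known_url)
```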