known_urls now yields URLs instead of returning a list, which is more reliable, and saves the file in chunks as the response is consumed. The --file argument can be used to create an output file; if --file is not used, no output is saved to any file. (#88)
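In outline: instead of collecting every URL and writing the file once at the end, the generator is consumed lazily and each URL is appended to the output file as it arrives. A minimal sketch of that pattern (the `fetch_urls` generator below is a hypothetical stand-in for the CDX-backed `known_urls` in this diff):

```python
# Minimal sketch of the streaming save pattern this commit adopts.
# fetch_urls is a hypothetical stand-in for the CDX-backed known_urls generator.
def fetch_urls():
    yield "https://example.com/index.html"
    yield "https://example.com/about.html"

def save_in_chunks(url_gen, file_path):
    url_count = 0
    for url in url_gen:
        # Append each URL as it arrives instead of buffering the whole list.
        with open(file_path, "a") as f:
            f.write(url + "\n")
        url_count += 1
    return url_count

print(save_in_chunks(fetch_urls(), "example.com-urls.txt"))  # prints 2
```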
@@ -89,28 +89,40 @@ def _near(obj, args):
         return no_archive_handler(e, obj)
 
 
-def _save_urls_on_file(input_list, live_url_count):
-    m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
-    domain = "domain-unknown"
-    if m:
-        domain = m.group(1)
+def _save_urls_on_file(url_gen):
+    domain = None
     sys_random = random.SystemRandom()
     uid = "".join(
         sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
     )
+    url_count = 0
 
-    file_name = "{domain}-{live_url_count}-urls-{uid}.txt".format(
-        domain=domain, live_url_count=live_url_count, uid=uid
-    )
-    file_content = "\n".join(input_list)
-    file_path = os.path.join(os.getcwd(), file_name)
-    with open(file_path, "w+") as f:
-        f.write(file_content)
-    return "{file_content}\n\n'{file_name}' saved in current working directory".format(
-        file_content=file_content, file_name=file_name
-    )
+    for url in url_gen:
+        url_count += 1
+        if not domain:
+            m = re.search("https?://([A-Za-z_0-9.-]+).*", url)
+            domain = "domain-unknown"
+            if m:
+                domain = m.group(1)
+            file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
+            file_path = os.path.join(os.getcwd(), file_name)
+            if not os.path.isfile(file_path):
+                open(file_path, "w+").close()
+
+        with open(file_path, "a") as f:
+            f.write("{url}\n".format(url=url))
+
+        print(url)
+
+    if url_count > 0:
+        return "\n\n'{file_name}' saved in current working directory".format(
+            file_name=file_name
+        )
+    else:
+        return "No known URLs found. Please try a different input!"
 
 
 def _known_urls(obj, args):
@@ -118,17 +130,16 @@ def _known_urls(obj, args):
     Known urls for a domain.
     """
 
-    subdomain = False
-    if args.subdomain:
-        subdomain = True
+    subdomain = True if args.subdomain else False
 
-    url_list = obj.known_urls(subdomain=subdomain)
-    total_urls = len(url_list)
+    url_gen = obj.known_urls(subdomain=subdomain)
 
-    if total_urls > 0:
-        return _save_urls_on_file(url_list, total_urls)
-
-    return "No known URLs found. Please try a diffrent domain!"
+    if args.file:
+        return _save_urls_on_file(url_gen)
+    else:
+        for url in url_gen:
+            print(url)
+        return "\n"
 
 
 def _get(obj, args):
@@ -265,6 +276,12 @@ def add_knownUrlArg(knownUrlArg):
     )
     help_text = "Use with '--known_urls' to include known URLs for subdomains."
    knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
+    knownUrlArg.add_argument(
+        "--file",
+        "-f",
+        action="store_true",
+        help="Save the URLs in a file in the current directory.",
+    )
 
 
 def add_nearArg(nearArg):
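For context on the new flag: with `action="store_true"` the flag takes no value; argparse sets `args.file` to True when `--file`/`-f` is present and False otherwise, which is exactly what the `if args.file:` branch in `_known_urls` checks. A small self-contained sketch (the parser here is a stand-in, not waybackpy's real CLI setup):

```python
import argparse

# Stand-in parser to show how a store_true flag behaves.
parser = argparse.ArgumentParser()
parser.add_argument("--file", "-f", action="store_true",
                    help="Save the URLs in a file in the current directory.")

assert parser.parse_args([]).file is False          # flag absent -> False
assert parser.parse_args(["--file"]).file is True   # flag present -> True
assert parser.parse_args(["-f"]).file is True       # short form works too
```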
@@ -308,41 +308,40 @@ class Url:
             i = i + 1
         return i
 
-    def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
+    def known_urls(
+        self,
+        subdomain=False,
+        host=False,
+        start_timestamp=None,
+        end_timestamp=None,
+        match_type="prefix",
+    ):
         """
-        Returns list of URLs known to exist for given domain name
-        because these URLs were crawled by WayBack Machine spider.
-        Useful for pen-testing.
+        Yields URLs known to exist for the given input.
+        Defaults to matching the input URL as a prefix.
+
+        This method is kept for compatibility; use the Cdx class instead.
+        This method itself depends on Cdx.
+
+        Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
+        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
         """
 
-        # Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
-        # https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
-
-        url_list = []
-
         if subdomain:
-            cdx = Cdx(
-                _cleaned_url(self.url),
-                user_agent=self.user_agent,
-                start_timestamp=start_timestamp,
-                end_timestamp=end_timestamp,
-                match_type="domain",
-                collapses=["urlkey"],
-            )
-        else:
-            cdx = Cdx(
-                _cleaned_url(self.url),
-                user_agent=self.user_agent,
-                start_timestamp=start_timestamp,
-                end_timestamp=end_timestamp,
-                match_type="host",
-                collapses=["urlkey"],
-            )
+            match_type = "domain"
+        if host:
+            match_type = "host"
+
+        cdx = Cdx(
+            _cleaned_url(self.url),
+            user_agent=self.user_agent,
+            start_timestamp=start_timestamp,
+            end_timestamp=end_timestamp,
+            match_type=match_type,
+            collapses=["urlkey"],
+        )
 
         snapshots = cdx.snapshots()
 
-        url_list = []
         for snapshot in snapshots:
-            url_list.append(snapshot.original)
-
-        return url_list
+            yield (snapshot.original)
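With `known_urls` now a generator, callers iterate over it rather than taking a list. A hedged usage sketch, assuming the `Url` class in this diff is importable as `waybackpy.Url` and takes a URL plus a user agent:

```python
from waybackpy import Url  # assumption: this diff's Url class is exported here

url = Url("https://example.com", "my-user-agent/1.0")

# URLs are yielded one at a time; nothing is buffered into a list up front.
for known_url in url.known_urls(subdomain=True):
    print(known_url)
```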