Implemented new feature: known URLs for a domain.

Akash Mahanty 2020-10-02 20:27:28 +05:30
parent c9fa114d2e
commit ce7294d990
2 changed files with 126 additions and 17 deletions

File: waybackpy/cli.py

@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 import sys
+import os
+import re
 import argparse
 from waybackpy.wrapper import Url
 from waybackpy.__version__ import __version__
@@ -31,6 +33,36 @@ def _near(obj, args):
     _near_args["minute"] = args.minute
     return (obj.near(**_near_args))
 
 
+def _known_urls(obj, args):
+    sd = False
+    al = False
+    if args.subdomain:
+        sd = True
+    if args.alive:
+        al = True
+    url_list = obj.known_urls(alive=al, subdomain=sd)
+    total_urls = len(url_list)
+
+    if total_urls > 0:
+        m = re.search('https?://([A-Za-z_0-9.-]+).*', url_list[0])
+        if m:
+            domain = m.group(1)
+        else:
+            domain = "waybackpy-known"
+        dir_path = os.path.abspath(os.getcwd())
+        file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls)
+        text = "\n".join(url_list) + "\n"
+        with open(file_name, "a+") as f:
+            f.write(text)
+        text = text + "%d URLs found and saved in ./%s-%d-urls.txt" % (
+            total_urls, domain, total_urls
+        )
+    else:
+        text = "No known URLs found. Please try a different domain!"
+    return text
+
+
 def _get(obj, args):
     if args.get.lower() == "url":
         return (obj.get())
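
A quick way to sanity-check this handler without going through the console script is to build a bare Namespace and call it directly. A minimal sketch, assuming this CLI module is importable as waybackpy.cli (the diff view does not show the file path):

    # Sketch: drive _known_urls directly; argparse.Namespace stands in
    # for the parsed CLI flags the handler expects.
    from argparse import Namespace
    from waybackpy.cli import _known_urls
    from waybackpy.wrapper import Url

    obj = Url("https://example.com", "waybackpy sketch user agent")
    print(_known_urls(obj, Namespace(subdomain=False, alive=False)))
    # On success: the URL list, then a line like
    # "42 URLs found and saved in ./example.com-42-urls.txt"

Note that the handler opens the output file with "a+", so repeated runs append to an existing file of the same name rather than overwrite it.
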
@@ -52,10 +84,10 @@ def _get(obj, args):
 def args_handler(args):
     if args.version:
-        return (__version__)
+        return ("waybackpy version %s" % __version__)
     if not args.url:
-        return ("Specify an URL. See --help for help using waybackpy.")
+        return ("waybackpy %s \nSee 'waybackpy --help' for help using this tool." % __version__)
     if args.user_agent:
         obj = Url(args.url, args.user_agent)
@@ -72,26 +104,54 @@ def args_handler(args):
         return _total_archives(obj)
     if args.near:
         return _near(obj, args)
+    if args.known_urls:
+        return _known_urls(obj, args)
     if args.get:
         return _get(obj, args)
-    return ("Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help for help using waybackpy.")
+    return ("You only specified the URL. But you also need to specify the operation.\nSee 'waybackpy --help' for help using this tool.")
 
 
 def parse_args(argv):
     parser = argparse.ArgumentParser()
-    parser.add_argument("-u", "--url", help="URL on which Wayback machine operations would occur.")
-    parser.add_argument("-ua", "--user_agent", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\".")
-    parser.add_argument("-s", "--save", action='store_true', help="Save the URL on the Wayback machine.")
-    parser.add_argument("-o", "--oldest", action='store_true', help="Oldest archive for the specified URL.")
-    parser.add_argument("-n", "--newest", action='store_true', help="Newest archive for the specified URL.")
-    parser.add_argument("-t", "--total", action='store_true', help="Total number of archives for the specified URL.")
-    parser.add_argument("-g", "--get", help="Prints the source code of the supplied url. Use '--get help' for extended usage.")
-    parser.add_argument("-v", "--version", action='store_true', help="Prints the waybackpy version.")
-    parser.add_argument("-N", "--near", action='store_true', help="Latest/Newest archive for the specified URL.")
-    parser.add_argument("-Y", "--year", type=int, help="Year in integer. For use with --near.")
-    parser.add_argument("-M", "--month", type=int, help="Month in integer. For use with --near.")
-    parser.add_argument("-D", "--day", type=int, help="Day in integer. For use with --near.")
-    parser.add_argument("-H", "--hour", type=int, help="Hour in integer. For use with --near.")
-    parser.add_argument("-MIN", "--minute", type=int, help="Minute in integer. For use with --near.")
+    requiredArgs = parser.add_argument_group('URL argument (required)')
+    requiredArgs.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur")
+
+    userAgentArg = parser.add_argument_group('User Agent')
+    userAgentArg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"")
+
+    saveArg = parser.add_argument_group("Create new archive/save URL")
+    saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine")
+
+    oldestArg = parser.add_argument_group("Oldest archive")
+    oldestArg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL")
+
+    newestArg = parser.add_argument_group("Newest archive")
+    newestArg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL")
+
+    totalArg = parser.add_argument_group("Total number of archives")
+    totalArg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL")
+
+    getArg = parser.add_argument_group("Get source code")
+    getArg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage")
+
+    knownUrlArg = parser.add_argument_group("URLs known and archived to Wayback Machine for the site")
+    knownUrlArg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain")
+    knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help="Use with '--known_urls' to include known URLs for subdomains")
+    knownUrlArg.add_argument("--alive", "-a", action='store_true', help="Only include live URLs. Will not include dead links")
+
+    nearArg = parser.add_argument_group('Archive close to time specified')
+    nearArg.add_argument("--near", "-N", action='store_true', help="Archive near specified time")
+
+    nearArgs = parser.add_argument_group('Arguments that are used only with --near')
+    nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer")
+    nearArgs.add_argument("--month", "-M", type=int, help="Month in integer")
+    nearArgs.add_argument("--day", "-D", type=int, help="Day in integer")
+    nearArgs.add_argument("--hour", "-H", type=int, help="Hour in integer")
+    nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
+
+    parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version")
+
     return parser.parse_args(argv[1:])
 
 
 def main(argv=None):
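
Worth noting for reviewers: parser.add_argument_group() changes only how --help is rendered; it neither namespaces the options nor enforces anything, so a group titled 'URL argument (required)' still leaves --url optional to argparse. That is why args_handler keeps its explicit `if not args.url` check. A minimal standalone sketch:

    # Sketch: argument groups affect help layout only; parsing and the
    # resulting namespace are identical to plain add_argument calls.
    import argparse

    parser = argparse.ArgumentParser(prog="demo")
    group = parser.add_argument_group("URL argument (required)")
    group.add_argument("--url", "-u", help="URL to operate on")

    args = parser.parse_args(["--url", "example.com"])
    print(args.url)      # -> example.com
    parser.print_help()  # --url is listed under its group heading
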

File: waybackpy/wrapper.py

@@ -100,8 +100,10 @@ class Url:
         """Return the source code of the supplied URL.
         If encoding is not supplied, it is auto-detected from the response.
         """
         if not url:
             url = self._clean_url()
         if not user_agent:
             user_agent = self.user_agent
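
For context, a hedged usage sketch of the get() method documented above; the parameter names and their optionality are inferred from the body shown here, since the signature line falls outside the hunk:

    # Sketch: fetch the page source for the Url object's own URL.
    from waybackpy.wrapper import Url

    obj = Url("https://example.com", "waybackpy sketch user agent")
    source = obj.get()  # falls back to self._clean_url() and self.user_agent
    print(source[:80])
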
@@ -173,3 +175,50 @@ class Url:
         response = _get_response(req)
         # Most efficient method to count number of archives (yet)
         return str(response.read()).count(",")
+
+    def known_urls(self, alive=False, subdomain=False):
+        """Returns list of URLs known to exist for the given domain name
+        because these URLs were crawled by Wayback Machine bots.
+
+        Useful for pen-testers and others.
+
+        Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
+        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
+        """
+        url_list = []
+
+        if subdomain:
+            request_url = (
+                "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
+                % self._clean_url()
+            )
+        else:
+            request_url = (
+                "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
+                % self._clean_url()
+            )
+
+        hdr = {"User-Agent": "%s" % self.user_agent}
+        req = Request(request_url, headers=hdr)  # nosec
+        response = _get_response(req)
+
+        data = json.loads(response.read().decode("UTF-8"))
+        url_list = [y[0] for y in data if y[0] != "original"]
+
+        # Remove all dead URLs from url_list if alive=True
+        if alive:
+            tmp_url_list = []
+            for url in url_list:
+                try:
+                    urlopen(url)  # nosec
+                except Exception:
+                    continue
+                tmp_url_list.append(url)
+            url_list = tmp_url_list
+
+        return url_list
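
The CDX query above asks for fl=original (only the original-URL column) and collapse=urlkey (one row per unique URL); with output=json the response's first row is the header ["original"], which the list comprehension filters out. A hedged usage sketch of the new method:

    # Sketch: list every URL the Wayback Machine knows under a domain.
    from waybackpy.wrapper import Url

    obj = Url("https://example.com", "waybackpy sketch user agent")
    urls = obj.known_urls()                # unique URLs from the CDX index
    subs = obj.known_urls(subdomain=True)  # also match *.example.com
    live = obj.known_urls(alive=True)      # slow: one urlopen() per URL
    print(len(urls), urls[:3])

Since alive=True fetches every candidate URL once, expect a run time roughly proportional to the number of known URLs.
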