Implemented new feature: known URLs for domain.
commit ce7294d990
parent c9fa114d2e
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 import sys
+import os
+import re
 import argparse
 from waybackpy.wrapper import Url
 from waybackpy.__version__ import __version__
@@ -31,6 +33,36 @@ def _near(obj, args):
         _near_args["minute"] = args.minute
     return (obj.near(**_near_args))

+
+def _known_urls(obj, args):
+    sd = False
+    al = False
+    if args.subdomain:
+        sd = True
+    if args.alive:
+        al = True
+    url_list = obj.known_urls(alive=al, subdomain=sd)
+    total_urls = len(url_list)
+
+    if total_urls > 0:
+        m = re.search('https?://([A-Za-z_0-9.-]+).*', url_list[0])
+        if m:
+            domain = m.group(1)
+        else:
+            domain = "waybackpy-known"
+        dir_path = os.path.abspath(os.getcwd())
+        file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls)
+        text = "\n".join(url_list) + "\n"
+        with open(file_name, "a+") as f:
+            f.write(text)
+        text = text + "%d URLs found and saved in ./%s-%d-urls.txt" % (
+            total_urls, domain, total_urls
+        )
+
+    else:
+        text = "No known URLs found. Please try a different domain!"
+
+    return text
+
 def _get(obj, args):
     if args.get.lower() == "url":
         return (obj.get())
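For reference, a minimal sketch (not part of the commit) of driving the new handler directly. It assumes _known_urls() above is in scope; the Namespace stands in for the argparse result and the Url(url, user_agent) call mirrors what args_handler() builds.

from argparse import Namespace
from waybackpy.wrapper import Url

# Hypothetical inputs: _known_urls() only reads args.subdomain and args.alive.
obj = Url("https://example.com", "my-user-agent")
args = Namespace(subdomain=False, alive=False)

# Writes ./<domain>-<count>-urls.txt in the working directory as a side
# effect and returns the URL list plus a one-line summary.
print(_known_urls(obj, args))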
@@ -52,10 +84,10 @@ def _get(obj, args):
 
 def args_handler(args):
     if args.version:
-        return (__version__)
+        return ("waybackpy version %s" % __version__)
 
     if not args.url:
-        return ("Specify an URL. See --help for help using waybackpy.")
+        return ("waybackpy %s \nSee 'waybackpy --help' for help using this tool." % __version__)
 
     if args.user_agent:
         obj = Url(args.url, args.user_agent)
@@ -72,26 +104,54 @@ def args_handler(args):
         return _total_archives(obj)
     if args.near:
         return _near(obj, args)
+    if args.known_urls:
+        return _known_urls(obj, args)
     if args.get:
         return _get(obj, args)
-    return ("Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help for help using waybackpy.")
+    return ("You only specified the URL. But you also need to specify the operation.\nSee 'waybackpy --help' for help using this tool.")
 
 def parse_args(argv):
     parser = argparse.ArgumentParser()
-    parser.add_argument("-u", "--url", help="URL on which Wayback machine operations would occur.")
-    parser.add_argument("-ua", "--user_agent", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\".")
-    parser.add_argument("-s", "--save", action='store_true', help="Save the URL on the Wayback machine.")
-    parser.add_argument("-o", "--oldest", action='store_true', help="Oldest archive for the specified URL.")
-    parser.add_argument("-n", "--newest", action='store_true', help="Newest archive for the specified URL.")
-    parser.add_argument("-t", "--total", action='store_true', help="Total number of archives for the specified URL.")
-    parser.add_argument("-g", "--get", help="Prints the source code of the supplied url. Use '--get help' for extended usage.")
-    parser.add_argument("-v", "--version", action='store_true', help="Prints the waybackpy version.")
-    parser.add_argument("-N", "--near", action='store_true', help="Latest/Newest archive for the specified URL.")
-    parser.add_argument("-Y", "--year", type=int, help="Year in integer. For use with --near.")
-    parser.add_argument("-M", "--month", type=int, help="Month in integer. For use with --near.")
-    parser.add_argument("-D", "--day", type=int, help="Day in integer. For use with --near.")
-    parser.add_argument("-H", "--hour", type=int, help="Hour in integer. For use with --near.")
-    parser.add_argument("-MIN", "--minute", type=int, help="Minute in integer. For use with --near.")
+    requiredArgs = parser.add_argument_group('URL argument (required)')
+    requiredArgs.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur")
+
+    userAgentArg = parser.add_argument_group('User Agent')
+    userAgentArg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"")
+
+    saveArg = parser.add_argument_group("Create new archive/save URL")
+    saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine")
+
+    oldestArg = parser.add_argument_group("Oldest archive")
+    oldestArg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL")
+
+    newestArg = parser.add_argument_group("Newest archive")
+    newestArg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL")
+
+    totalArg = parser.add_argument_group("Total number of archives")
+    totalArg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL")
+
+    getArg = parser.add_argument_group("Get source code")
+    getArg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage")
+
+    knownUrlArg = parser.add_argument_group("URLs known and archived by the Wayback Machine for the site.")
+    knownUrlArg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.")
+    knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help="Use with '--known_urls' to include known URLs for subdomains.")
+    knownUrlArg.add_argument("--alive", "-a", action='store_true', help="Only include live URLs. Will not include dead links.")
+
+
+    nearArg = parser.add_argument_group('Archive close to time specified')
+    nearArg.add_argument("--near", "-N", action='store_true', help="Archive near specified time")
+
+    nearArgs = parser.add_argument_group('Arguments that are used only with --near')
+    nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer")
+    nearArgs.add_argument("--month", "-M", type=int, help="Month in integer")
+    nearArgs.add_argument("--day", "-D", type=int, help="Day in integer")
+    nearArgs.add_argument("--hour", "-H", type=int, help="Hour in integer")
+    nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
+
+    parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version")
+
     return parser.parse_args(argv[1:])
 
 def main(argv=None):
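A quick sanity check of the regrouped parser (a sketch only, assuming parse_args() above is in scope; parse_args() drops argv[0] itself):

args = parse_args(["waybackpy", "--url", "https://example.com", "--known_urls", "--subdomain"])
print(args.url, args.known_urls, args.subdomain, args.alive)
# expected: https://example.com True True False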
@@ -100,8 +100,10 @@ class Url:
         """Return the source code of the supplied URL.
         If encoding is not supplied, it is auto-detected from the response.
         """
+
         if not url:
             url = self._clean_url()
+
         if not user_agent:
             user_agent = self.user_agent
 
@@ -173,3 +175,50 @@ class Url:
         response = _get_response(req)
         # Most efficient method to count number of archives (yet)
         return str(response.read()).count(",")
+
+    def known_urls(self, alive=False, subdomain=False):
+        """Returns list of URLs known to exist for given domain name
+        because these URLs were crawled by WayBack Machine bots.
+
+        Useful for pen-testers and others.
+
+        Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
+        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
+        """
+
+        url_list = []
+
+        if subdomain:
+            request_url = (
+                "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
+                % self._clean_url()
+            )
+
+        else:
+            request_url = (
+                "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
+                % self._clean_url()
+            )
+
+        hdr = {"User-Agent": "%s" % self.user_agent}
+        req = Request(request_url, headers=hdr)  # nosec
+        response = _get_response(req)
+
+        data = json.loads(response.read().decode("UTF-8"))
+        url_list = [y[0] for y in data if y[0] != "original"]
+
+        # Remove all dead URLs from url_list if alive=True
+        if alive:
+            tmp_url_list = []
+            for url in url_list:
+
+                try:
+                    urlopen(url)
+                except:
+                    continue
+
+                tmp_url_list.append(url)
+
+            url_list = tmp_url_list
+
+        return url_list
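And a minimal usage sketch for the new Url.known_urls() API (the domain and user agent below are illustrative, not taken from the commit):

from waybackpy.wrapper import Url

obj = Url("akamhy.github.io", "my-user-agent")

# All URLs the Wayback Machine CDX API has recorded for the domain.
urls = obj.known_urls()

# Slower variant: include subdomains and keep only URLs that still respond,
# since alive=True opens every candidate URL.
live_urls = obj.known_urls(alive=True, subdomain=True)

print(len(urls), urls[:3])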