better error messages (str), check the latest version before asking for an upgrade, and remove alive checking
parent dcd7b03302 · commit 712471176b
@@ -23,7 +23,6 @@ def test_save():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -42,7 +41,6 @@ def test_save():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -63,7 +61,6 @@ def test_json():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -84,7 +81,6 @@ def test_archive_url():
         archive_url=True,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -105,7 +101,6 @@ def test_oldest():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -128,7 +123,6 @@ def test_oldest():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -150,7 +144,6 @@ def test_newest():
         archive_url=False,
         newest=True,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -173,7 +166,6 @@ def test_newest():
         archive_url=False,
         newest=True,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -195,7 +187,6 @@ def test_total_archives():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -217,7 +208,6 @@ def test_known_urls():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=True,
         get=None,
@@ -237,7 +227,6 @@ def test_known_urls():
         archive_url=False,
         newest=False,
         near=False,
-        alive=True,
         subdomain=True,
         known_urls=True,
         get=None,
@@ -259,7 +248,6 @@ def test_near():
         archive_url=False,
         newest=False,
         near=True,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -287,7 +275,6 @@ def test_near():
         archive_url=False,
         newest=False,
         near=True,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -314,7 +301,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="url",
@@ -334,7 +320,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="oldest",
@@ -354,7 +339,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="newest",
@@ -374,7 +358,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="save",
@@ -394,7 +377,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="foobar",
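Every hunk above makes the same one-line change: the `alive` flag disappears from the argument namespace that each CLI test builds. A minimal sketch of the post-commit fixture pattern (the `argparse.Namespace` construction is an assumption inferred from the flag names; only the listed flags appear in the diff):

```python
import argparse

# Hypothetical reconstruction of one test fixture after this commit:
# the namespace mirrors waybackpy's CLI flags, now without `alive`.
args = argparse.Namespace(
    archive_url=False,
    newest=False,
    near=False,
    subdomain=False,
    known_urls=False,
    get=None,
)
assert not hasattr(args, "alive")  # the flag is gone everywhere
```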
@@ -128,8 +128,5 @@ def test_total_archives():
 
 def test_known_urls():
 
-    target = Url("akamhy.github.io", user_agent)
-    assert len(target.known_urls(alive=True, subdomain=False)) > 2
-
     target = Url("akamhy.github.io", user_agent)
     assert len(target.known_urls()) > 3
@@ -120,11 +120,7 @@ def _known_urls(obj, args):
     if args.subdomain:
         subdomain = True
 
-    alive = False
-    if args.alive:
-        alive = True
-
-    url_list = obj.known_urls(alive=alive, subdomain=subdomain)
+    url_list = obj.known_urls(subdomain=subdomain)
     total_urls = len(url_list)
 
     if total_urls > 0:
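With the `alive` branch gone, `_known_urls` passes a single keyword through to `Url.known_urls`. The CLI surface shrinks accordingly; a plausible invocation after this commit (`--known_urls` and `--subdomain` are confirmed by the next hunk, while `--url` and `--user_agent` are assumptions about the rest of the CLI):

```console
$ waybackpy --url "akamhy.github.io" --user_agent "my-agent" --known_urls --subdomain
```

Note that `--alive`/`-a`, removed in the next hunk, no longer exists.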
@@ -267,8 +263,6 @@ def add_knownUrlArg(knownUrlArg):
     )
     help_text = "Use with '--known_urls' to include known URLs for subdomains."
     knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
-    help_text = "Only include live URLs. Will not inlclude dead links."
-    knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)
 
 
 def add_nearArg(nearArg):
@@ -11,6 +11,12 @@ quote = requests.utils.quote
 default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
 
 
+def _latest_version(package_name, headers):
+    endpoint = "https://pypi.org/pypi/" + package_name + "/json"
+    json = _get_response(endpoint, headers=headers).json()
+    return json["info"]["version"]
+
+
 def _unix_ts_to_wayback_ts(unix_ts):
     return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
 
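The new `_latest_version` helper queries PyPI's JSON API, which reports the newest release of a package under `info.version`. A self-contained equivalent using plain `requests` (the real helper routes through the module's `_get_response`, so it inherits that function's retry behavior):

```python
import requests

def latest_pypi_version(package_name):
    # PyPI's JSON API: https://pypi.org/pypi/<package>/json
    endpoint = "https://pypi.org/pypi/" + package_name + "/json"
    response = requests.get(endpoint, timeout=10)
    response.raise_for_status()
    return response.json()["info"]["version"]

print(latest_pypi_version("waybackpy"))  # prints the newest published version
```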
@@ -183,7 +189,7 @@ def _get_total_pages(url, user_agent):
     return int((_get_response(total_pages_url, headers=headers).text).strip())
 
 
-def _archive_url_parser(header, url):
+def _archive_url_parser(header, url, latest_version=__version__):
     """
     The wayback machine's save API doesn't
     return JSON response, we are required
@@ -226,16 +232,26 @@ def _archive_url_parser(header, url):
     if m:
         return m.group(1)
 
-    raise WaybackError(
-        "No archive URL found in the API response. "
-        "If '{url}' can be accessed via your web browser then either "
-        "this version of waybackpy ({version}) is out of date or WayBack Machine is malfunctioning. Visit "
-        "'https://github.com/akamhy/waybackpy' for the latest version "
-        "of waybackpy.\nHeader:\n{header}".format(
-            url=url, version=__version__, header=header
-        )
-    )
+    if __version__ == latest_version:
+        exc_message = (
+            "No archive URL found in the API response. "
+            "If '{url}' can be accessed via your web browser then either "
+            "Wayback Machine is malfunctioning or it refused to archive your URL."
+            "\nHeader:\n{header}".format(url=url, header=header)
+        )
+    else:
+        exc_message = (
+            "No archive URL found in the API response. "
+            "If '{url}' can be accessed via your web browser then either "
+            "this version of waybackpy ({version}) is out of date or WayBack "
+            "Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' "
+            "for the latest version of waybackpy.\nHeader:\n{header}".format(
+                url=url, version=__version__, header=header
+            )
+        )
+
+    raise WaybackError(exc_message)
 
 
 def _wayback_timestamp(**kwargs):
     """
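The rewritten parser only tells users to upgrade when an upgrade could actually help: it compares the running `__version__` against the PyPI version fetched by `_latest_version`. A condensed sketch of that policy (the function name and trimmed strings are illustrative, not the module's API):

```python
def build_error_message(url, header, installed, latest):
    # Same message prefix either way; the advice depends on staleness.
    prefix = (
        "No archive URL found in the API response. "
        "If '{url}' can be accessed via your web browser then either ".format(url=url)
    )
    if installed == latest:
        advice = "Wayback Machine is malfunctioning or it refused to archive your URL."
    else:
        advice = (
            "this version of waybackpy ({version}) is out of date or WayBack "
            "Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' "
            "for the latest version of waybackpy.".format(version=installed)
        )
    return prefix + advice + "\nHeader:\n{header}".format(header=header)
```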
@@ -292,20 +308,27 @@ def _get_response(
 
     # From https://stackoverflow.com/a/35504626
     # By https://stackoverflow.com/users/401467/datashaman
+
     s = requests.Session()
+
     retries = Retry(
         total=retries,
         backoff_factor=backoff_factor,
         status_forcelist=[500, 502, 503, 504],
     )
+
     s.mount("https://", HTTPAdapter(max_retries=retries))
+
     url = _full_url(endpoint, params)
+
     try:
         if not return_full_url:
             return s.get(url, headers=headers)
         return (url, s.get(url, headers=headers))
     except Exception as e:
-        exc_message = "Error while retrieving {url}".format(url=url)
+        exc_message = "Error while retrieving {url}.\n{reason}".format(
+            url=url, reason=str(e)
+        )
         exc = WaybackError(exc_message)
         exc.__cause__ = e
         raise exc
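The retry scaffolding that this hunk leaves untouched is the standard `requests` + `urllib3` pattern; the substantive change is at the end, where the caught exception's text now rides along in the `WaybackError` message. A standalone sketch of both ideas (generic names, not the module's code):

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry transient 5xx responses with exponential backoff.
retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

try:
    response = session.get("https://pypi.org/pypi/waybackpy/json", timeout=10)
except Exception as e:
    # Embedding str(e) means callers see the underlying reason,
    # not just a bare "Error while retrieving {url}".
    raise RuntimeError("Error while retrieving URL.\n" + str(e)) from e
```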
@@ -12,6 +12,7 @@ from .utils import (
     _cleaned_url,
     _ts,
     _unix_ts_to_wayback_ts,
+    _latest_version,
 )
 
 
@@ -23,7 +24,7 @@ class Url:
         self._archive_url = None
         self.timestamp = None
         self._JSON = None
-        self._alive_url_list = []
+        self.latest_version = None
 
     def __repr__(self):
         return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
@@ -141,7 +142,11 @@ class Url:
         response = _get_response(
             request_url, params=None, headers=headers, backoff_factor=2
         )
-        self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
+        if not self.latest_version:
+            self.latest_version = _latest_version("waybackpy", headers=headers)
+        self._archive_url = "https://" + _archive_url_parser(
+            response.headers, self.url, self.latest_version
+        )
         self.timestamp = datetime.utcnow()
         return self
 
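`save()` pays for the PyPI lookup at most once per `Url` instance: `latest_version` starts as `None` in `__init__` (see the earlier hunk) and is filled lazily on the first call. The same fetch-once pattern in isolation (an illustrative class, not part of waybackpy):

```python
class VersionCache:
    def __init__(self, fetch):
        self._fetch = fetch          # callable that performs the network lookup
        self.latest_version = None   # mirrors Url.latest_version

    def get(self):
        if not self.latest_version:  # only the first call hits the network
            self.latest_version = self._fetch()
        return self.latest_version
```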
@@ -291,26 +296,7 @@ class Url:
             i = i + 1
         return i
 
-    def live_urls_finder(self, url):
-        """
-        This method is used to check if supplied url
-        is >= 400.
-        """
-
-        try:
-            response_code = requests.get(url).status_code
-        except Exception:
-            return  # we don't care if Exception
-
-        # 200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300
-        if response_code >= 400:
-            return
-
-        self._alive_url_list.append(url)
-
-    def known_urls(
-        self, alive=False, subdomain=False, start_timestamp=None, end_timestamp=None
-    ):
+    def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
         """
         Returns list of URLs known to exist for given domain name
         because these URLs were crawled by WayBack Machine spider.
@@ -347,10 +333,4 @@ class Url:
         for snapshot in snapshots:
             url_list.append(snapshot.original)
 
-        # Remove all deadURLs from url_list if alive=True
-        if alive:
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                executor.map(self.live_urls_finder, url_list)
-            url_list = self._alive_url_list
-
         return url_list
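Liveness checking is gone from the library, but callers who relied on `known_urls(alive=True)` can reproduce it in a few lines. This sketch mirrors the removed `live_urls_finder` logic, treating any status code below 400 as alive (the helper name and timeout are illustrative):

```python
import concurrent.futures
import requests

def filter_alive(urls):
    def is_alive(url):
        try:
            # 200s are OK and 300s are redirects, as the removed comment noted.
            return requests.get(url, timeout=10).status_code < 400
        except Exception:
            return False

    with concurrent.futures.ThreadPoolExecutor() as executor:
        flags = list(executor.map(is_alive, urls))
    return [url for url, ok in zip(urls, flags) if ok]

# Usage: alive_urls = filter_alive(Url("akamhy.github.io", user_agent).known_urls())
```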