Better error messages (include the exception's str), check the latest version before asking for an upgrade, and remove alive checking
This commit is contained in:
parent
dcd7b03302
commit
712471176b
@ -23,7 +23,6 @@ def test_save():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -42,7 +41,6 @@ def test_save():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -63,7 +61,6 @@ def test_json():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -84,7 +81,6 @@ def test_archive_url():
|
||||
archive_url=True,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -105,7 +101,6 @@ def test_oldest():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -128,7 +123,6 @@ def test_oldest():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -150,7 +144,6 @@ def test_newest():
|
||||
archive_url=False,
|
||||
newest=True,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -173,7 +166,6 @@ def test_newest():
|
||||
archive_url=False,
|
||||
newest=True,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -195,7 +187,6 @@ def test_total_archives():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -217,7 +208,6 @@ def test_known_urls():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=True,
|
||||
get=None,
|
||||
@ -237,7 +227,6 @@ def test_known_urls():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=True,
|
||||
subdomain=True,
|
||||
known_urls=True,
|
||||
get=None,
|
||||
@ -259,7 +248,6 @@ def test_near():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=True,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -287,7 +275,6 @@ def test_near():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=True,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get=None,
|
||||
@ -314,7 +301,6 @@ def test_get():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get="url",
|
||||
@ -334,7 +320,6 @@ def test_get():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get="oldest",
|
||||
@ -354,7 +339,6 @@ def test_get():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get="newest",
|
||||
@ -374,7 +358,6 @@ def test_get():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get="save",
|
||||
@ -394,7 +377,6 @@ def test_get():
|
||||
archive_url=False,
|
||||
newest=False,
|
||||
near=False,
|
||||
alive=False,
|
||||
subdomain=False,
|
||||
known_urls=False,
|
||||
get="foobar",
|
||||
|
@ -128,8 +128,5 @@ def test_total_archives():
|
||||
|
||||
def test_known_urls():
|
||||
|
||||
target = Url("akamhy.github.io", user_agent)
|
||||
assert len(target.known_urls(alive=True, subdomain=False)) > 2
|
||||
|
||||
target = Url("akamhy.github.io", user_agent)
|
||||
assert len(target.known_urls()) > 3
|
||||
|
@ -120,11 +120,7 @@ def _known_urls(obj, args):
|
||||
if args.subdomain:
|
||||
subdomain = True
|
||||
|
||||
alive = False
|
||||
if args.alive:
|
||||
alive = True
|
||||
|
||||
url_list = obj.known_urls(alive=alive, subdomain=subdomain)
|
||||
url_list = obj.known_urls(subdomain=subdomain)
|
||||
total_urls = len(url_list)
|
||||
|
||||
if total_urls > 0:
|
||||
@ -267,8 +263,6 @@ def add_knownUrlArg(knownUrlArg):
|
||||
)
|
||||
help_text = "Use with '--known_urls' to include known URLs for subdomains."
|
||||
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
|
||||
help_text = "Only include live URLs. Will not inlclude dead links."
|
||||
knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)
|
||||
|
||||
|
||||
def add_nearArg(nearArg):
|
||||
|
@ -11,6 +11,12 @@ quote = requests.utils.quote
|
||||
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
||||
|
||||
|
||||
def _latest_version(package_name, headers):
    """
    Query the PyPI JSON endpoint for package_name and return
    the version string found under info -> version in the response.

    headers is forwarded unchanged to _get_response.
    """
    pypi_url = "https://pypi.org/pypi/" + package_name + "/json"
    response = _get_response(pypi_url, headers=headers)
    # Named package_info (not json) so the stdlib module name is not shadowed.
    package_info = response.json()
    return package_info["info"]["version"]
|
||||
|
||||
|
||||
def _unix_ts_to_wayback_ts(unix_ts):
|
||||
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
|
||||
|
||||
@ -183,7 +189,7 @@ def _get_total_pages(url, user_agent):
|
||||
return int((_get_response(total_pages_url, headers=headers).text).strip())
|
||||
|
||||
|
||||
def _archive_url_parser(header, url):
|
||||
def _archive_url_parser(header, url, latest_version=__version__):
|
||||
"""
|
||||
The wayback machine's save API doesn't
|
||||
return JSON response, we are required
|
||||
@ -226,16 +232,26 @@ def _archive_url_parser(header, url):
|
||||
if m:
|
||||
return m.group(1)
|
||||
|
||||
raise WaybackError(
|
||||
if __version__ == latest_version:
|
||||
exc_message = (
|
||||
"No archive URL found in the API response. "
|
||||
"If '{url}' can be accessed via your web browser then either "
|
||||
"this version of waybackpy ({version}) is out of date or WayBack Machine is malfunctioning. Visit "
|
||||
"'https://github.com/akamhy/waybackpy' for the latest version "
|
||||
"of waybackpy.\nHeader:\n{header}".format(
|
||||
"Wayback Machine is malfunctioning or it refused to archive your URL."
|
||||
"\nHeader:\n{header}".format(url=url, header=header)
|
||||
)
|
||||
else:
|
||||
exc_message = (
|
||||
"No archive URL found in the API response. "
|
||||
"If '{url}' can be accessed via your web browser then either "
|
||||
"this version of waybackpy ({version}) is out of date or WayBack "
|
||||
"Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' "
|
||||
"for the latest version of waybackpy.\nHeader:\n{header}".format(
|
||||
url=url, version=__version__, header=header
|
||||
)
|
||||
)
|
||||
|
||||
raise WaybackError(exc_message)
|
||||
|
||||
|
||||
def _wayback_timestamp(**kwargs):
|
||||
"""
|
||||
@ -292,20 +308,27 @@ def _get_response(
|
||||
|
||||
# From https://stackoverflow.com/a/35504626
|
||||
# By https://stackoverflow.com/users/401467/datashaman
|
||||
|
||||
s = requests.Session()
|
||||
|
||||
retries = Retry(
|
||||
total=retries,
|
||||
backoff_factor=backoff_factor,
|
||||
status_forcelist=[500, 502, 503, 504],
|
||||
)
|
||||
|
||||
s.mount("https://", HTTPAdapter(max_retries=retries))
|
||||
|
||||
url = _full_url(endpoint, params)
|
||||
|
||||
try:
|
||||
if not return_full_url:
|
||||
return s.get(url, headers=headers)
|
||||
return (url, s.get(url, headers=headers))
|
||||
except Exception as e:
|
||||
exc_message = "Error while retrieving {url}".format(url=url)
|
||||
exc_message = "Error while retrieving {url}.\n{reason}".format(
|
||||
url=url, reason=str(e)
|
||||
)
|
||||
exc = WaybackError(exc_message)
|
||||
exc.__cause__ = e
|
||||
raise exc
|
||||
|
@ -12,6 +12,7 @@ from .utils import (
|
||||
_cleaned_url,
|
||||
_ts,
|
||||
_unix_ts_to_wayback_ts,
|
||||
_latest_version,
|
||||
)
|
||||
|
||||
|
||||
@ -23,7 +24,7 @@ class Url:
|
||||
self._archive_url = None
|
||||
self.timestamp = None
|
||||
self._JSON = None
|
||||
self._alive_url_list = []
|
||||
self.latest_version = None
|
||||
|
||||
def __repr__(self):
|
||||
return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
|
||||
@ -141,7 +142,11 @@ class Url:
|
||||
response = _get_response(
|
||||
request_url, params=None, headers=headers, backoff_factor=2
|
||||
)
|
||||
self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
|
||||
if not self.latest_version:
|
||||
self.latest_version = _latest_version("waybackpy", headers=headers)
|
||||
self._archive_url = "https://" + _archive_url_parser(
|
||||
response.headers, self.url, self.latest_version
|
||||
)
|
||||
self.timestamp = datetime.utcnow()
|
||||
return self
|
||||
|
||||
@ -291,26 +296,7 @@ class Url:
|
||||
i = i + 1
|
||||
return i
|
||||
|
||||
def live_urls_finder(self, url):
    """
    Append url to self._alive_url_list when a GET request
    to it returns a status code below 400.

    URLs whose request raises, or that answer with a
    status code >= 400, are skipped silently.
    """
    try:
        status = requests.get(url).status_code
    except Exception:
        # Best effort: a URL that cannot be fetched is simply not recorded.
        return

    # 200s are OK and 300s are usually redirects; to leave out redirects
    # as well, use 300 instead of 400 as the cut-off.
    if status < 400:
        self._alive_url_list.append(url)
|
||||
|
||||
def known_urls(
|
||||
self, alive=False, subdomain=False, start_timestamp=None, end_timestamp=None
|
||||
):
|
||||
def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
|
||||
"""
|
||||
Returns list of URLs known to exist for given domain name
|
||||
because these URLs were crawled by WayBack Machine spider.
|
||||
@ -347,10 +333,4 @@ class Url:
|
||||
for snapshot in snapshots:
|
||||
url_list.append(snapshot.original)
|
||||
|
||||
# Remove all deadURLs from url_list if alive=True
|
||||
if alive:
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
executor.map(self.live_urls_finder, url_list)
|
||||
url_list = self._alive_url_list
|
||||
|
||||
return url_list
|
||||
|
Loading…
Reference in New Issue
Block a user