better error messages (str), check the latest version before asking for an upgrade, and remove alive checking

Akash Mahanty 2021-01-15 16:47:26 +05:30
parent dcd7b03302
commit 712471176b
5 changed files with 42 additions and 66 deletions

View File

@@ -23,7 +23,6 @@ def test_save():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -42,7 +41,6 @@ def test_save():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -63,7 +61,6 @@ def test_json():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -84,7 +81,6 @@ def test_archive_url():
archive_url=True,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -105,7 +101,6 @@ def test_oldest():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -128,7 +123,6 @@ def test_oldest():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -150,7 +144,6 @@ def test_newest():
archive_url=False,
newest=True,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -173,7 +166,6 @@ def test_newest():
archive_url=False,
newest=True,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -195,7 +187,6 @@ def test_total_archives():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -217,7 +208,6 @@ def test_known_urls():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=True,
get=None,
@@ -237,7 +227,6 @@ def test_known_urls():
archive_url=False,
newest=False,
near=False,
alive=True,
subdomain=True,
known_urls=True,
get=None,
@@ -259,7 +248,6 @@ def test_near():
archive_url=False,
newest=False,
near=True,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -287,7 +275,6 @@ def test_near():
archive_url=False,
newest=False,
near=True,
alive=False,
subdomain=False,
known_urls=False,
get=None,
@@ -314,7 +301,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="url",
@@ -334,7 +320,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="oldest",
@@ -354,7 +339,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="newest",
@@ -374,7 +358,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="save",
@@ -394,7 +377,6 @@ def test_get():
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="foobar",

View File

@@ -128,8 +128,5 @@ def test_total_archives():
def test_known_urls():
target = Url("akamhy.github.io", user_agent)
assert len(target.known_urls(alive=True, subdomain=False)) > 2
target = Url("akamhy.github.io", user_agent)
assert len(target.known_urls()) > 3

View File

@@ -120,11 +120,7 @@ def _known_urls(obj, args):
if args.subdomain:
subdomain = True
alive = False
if args.alive:
alive = True
url_list = obj.known_urls(alive=alive, subdomain=subdomain)
url_list = obj.known_urls(subdomain=subdomain)
total_urls = len(url_list)
if total_urls > 0:
@@ -267,8 +263,6 @@ def add_knownUrlArg(knownUrlArg):
)
help_text = "Use with '--known_urls' to include known URLs for subdomains."
knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
help_text = "Only include live URLs. Will not include dead links."
knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)
def add_nearArg(nearArg):
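With the --alive/-a flag and its help text removed, listing known URLs is driven by --subdomain alone. A hedged example of the updated invocation, assuming waybackpy's usual --url and --user_agent options:

waybackpy --url "akamhy.github.io" --user_agent "my user agent" --known_urls --subdomain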

View File

@@ -11,6 +11,12 @@ quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _latest_version(package_name, headers):
endpoint = "https://pypi.org/pypi/" + package_name + "/json"
json = _get_response(endpoint, headers=headers).json()
return json["info"]["version"]
def _unix_ts_to_wayback_ts(unix_ts):
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
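The new _latest_version helper is what enables the smarter error message below: it asks PyPI's JSON API for the newest release of a package. A standalone sketch of the same lookup (the user agent is a placeholder):

import requests

headers = {"User-Agent": "waybackpy example"}
endpoint = "https://pypi.org/pypi/waybackpy/json"
# PyPI's JSON API exposes the newest release under info -> version.
latest = requests.get(endpoint, headers=headers).json()["info"]["version"]
print(latest)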
@@ -183,7 +189,7 @@ def _get_total_pages(url, user_agent):
return int((_get_response(total_pages_url, headers=headers).text).strip())
def _archive_url_parser(header, url):
def _archive_url_parser(header, url, latest_version=__version__):
"""
The Wayback Machine's save API doesn't
return a JSON response; we are required
@@ -226,15 +232,25 @@ def _archive_url_parser(header, url):
if m:
return m.group(1)
raise WaybackError(
"No archive URL found in the API response. "
"If '{url}' can be accessed via your web browser then either "
"this version of waybackpy ({version}) is out of date or WayBack Machine is malfunctioning. Visit "
"'https://github.com/akamhy/waybackpy' for the latest version "
"of waybackpy.\nHeader:\n{header}".format(
url=url, version=__version__, header=header
if __version__ == latest_version:
exc_message = (
"No archive URL found in the API response. "
"If '{url}' can be accessed via your web browser then either "
"Wayback Machine is malfunctioning or it refused to archive your URL."
"\nHeader:\n{header}".format(url=url, header=header)
)
)
else:
exc_message = (
"No archive URL found in the API response. "
"If '{url}' can be accessed via your web browser then either "
"this version of waybackpy ({version}) is out of date or WayBack "
"Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' "
"for the latest version of waybackpy.\nHeader:\n{header}".format(
url=url, version=__version__, header=header
)
)
raise WaybackError(exc_message)
def _wayback_timestamp(**kwargs):
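Note the latest_version=__version__ default: a caller that never fetched the real latest version compares equal and the upgrade hint is suppressed. A toy illustration of the selection, not library code:

def pick_message(installed, latest):
    # Same branch as _archive_url_parser above, reduced to its essence.
    if installed == latest:
        return "the Wayback Machine is malfunctioning or refused to archive your URL"
    return "this version of waybackpy ({}) may be out of date".format(installed)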
@@ -292,20 +308,27 @@ def _get_response(
# From https://stackoverflow.com/a/35504626
# By https://stackoverflow.com/users/401467/datashaman
s = requests.Session()
retries = Retry(
total=retries,
backoff_factor=backoff_factor,
status_forcelist=[500, 502, 503, 504],
)
s.mount("https://", HTTPAdapter(max_retries=retries))
url = _full_url(endpoint, params)
try:
if not return_full_url:
return s.get(url, headers=headers)
return (url, s.get(url, headers=headers))
except Exception as e:
exc_message = "Error while retrieving {url}".format(url=url)
exc_message = "Error while retrieving {url}.\n{reason}".format(
url=url, reason=str(e)
)
exc = WaybackError(exc_message)
exc.__cause__ = e
raise exc
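For reference, the retry wiring above follows the cited Stack Overflow pattern: mount an HTTPAdapter whose Retry policy re-attempts 5xx responses with exponential backoff. A self-contained sketch (the counts are placeholders, not waybackpy's defaults):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

s = requests.Session()
retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
# Any 500/502/503/504 response is retried up to 5 times before failing.
response = s.get("https://pypi.org/pypi/waybackpy/json", headers={"User-Agent": "example"})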

View File

@@ -12,6 +12,7 @@ from .utils import (
_cleaned_url,
_ts,
_unix_ts_to_wayback_ts,
_latest_version,
)
@@ -23,7 +24,7 @@ class Url:
self._archive_url = None
self.timestamp = None
self._JSON = None
self._alive_url_list = []
self.latest_version = None
def __repr__(self):
return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
@@ -141,7 +142,11 @@ class Url:
response = _get_response(
request_url, params=None, headers=headers, backoff_factor=2
)
self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
if not self.latest_version:
self.latest_version = _latest_version("waybackpy", headers=headers)
self._archive_url = "https://" + _archive_url_parser(
response.headers, self.url, self.latest_version
)
self.timestamp = datetime.utcnow()
return self
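save() now fetches the latest version lazily, once per Url instance, and threads it into _archive_url_parser so the eventual WaybackError can say whether upgrading would help. A hedged end-to-end example of how that error surfaces, assuming waybackpy's public Url class and WaybackError exception:

from waybackpy import Url
from waybackpy.exceptions import WaybackError

try:
    Url("https://example.com", "my user agent").save()
except WaybackError as err:
    # The message now distinguishes "upgrade waybackpy" from
    # "the Wayback Machine itself is misbehaving".
    print(err)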
@@ -291,26 +296,7 @@ class Url:
i = i + 1
return i
def live_urls_finder(self, url):
"""
This method is used to check whether the supplied url
is dead, i.e. unreachable or returning a status code >= 400.
"""
try:
response_code = requests.get(url).status_code
except Exception:
return  # ignore exceptions; treat the URL as dead
# 200s are OK and 300s are usually redirects; if you don't want redirects, replace 400 with 300
if response_code >= 400:
return
self._alive_url_list.append(url)
def known_urls(
self, alive=False, subdomain=False, start_timestamp=None, end_timestamp=None
):
def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
"""
Returns a list of URLs known to exist for the given domain name,
because these URLs were crawled by the Wayback Machine spider.
@@ -347,10 +333,4 @@ class Url:
for snapshot in snapshots:
url_list.append(snapshot.original)
# Remove all dead URLs from url_list if alive=True
if alive:
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(self.live_urls_finder, url_list)
url_list = self._alive_url_list
return url_list
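Callers who relied on alive=True can reproduce the deleted filtering themselves. A sketch mirroring the removed thread-pool code:

import concurrent.futures
import requests
from waybackpy import Url

url_list = Url("akamhy.github.io", "my user agent").known_urls(subdomain=True)

def is_alive(url):
    # Mirrors the removed live_urls_finder: errors and status >= 400 count as dead.
    try:
        return requests.get(url).status_code < 400
    except Exception:
        return False

with concurrent.futures.ThreadPoolExecutor() as executor:
    alive_urls = [u for u, ok in zip(url_list, executor.map(is_alive, url_list)) if ok]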