better error messages (include str(e) in the message), check the latest version before suggesting an upgrade, and remove alive checking

Akash Mahanty 2021-01-15 16:47:26 +05:30
parent dcd7b03302
commit 712471176b
5 changed files with 42 additions and 66 deletions

View File

@@ -23,7 +23,6 @@ def test_save():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -42,7 +41,6 @@ def test_save():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -63,7 +61,6 @@ def test_json():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -84,7 +81,6 @@ def test_archive_url():
         archive_url=True,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -105,7 +101,6 @@ def test_oldest():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -128,7 +123,6 @@ def test_oldest():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -150,7 +144,6 @@ def test_newest():
         archive_url=False,
         newest=True,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -173,7 +166,6 @@ def test_newest():
         archive_url=False,
         newest=True,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -195,7 +187,6 @@ def test_total_archives():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -217,7 +208,6 @@ def test_known_urls():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=True,
         get=None,
@@ -237,7 +227,6 @@ def test_known_urls():
         archive_url=False,
         newest=False,
         near=False,
-        alive=True,
         subdomain=True,
         known_urls=True,
         get=None,
@@ -259,7 +248,6 @@ def test_near():
         archive_url=False,
         newest=False,
         near=True,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -287,7 +275,6 @@ def test_near():
         archive_url=False,
         newest=False,
         near=True,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get=None,
@@ -314,7 +301,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="url",
@@ -334,7 +320,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="oldest",
@@ -354,7 +339,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="newest",
@@ -374,7 +358,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="save",
@@ -394,7 +377,6 @@ def test_get():
         archive_url=False,
         newest=False,
         near=False,
-        alive=False,
         subdomain=False,
         known_urls=False,
         get="foobar",

View File

@@ -128,8 +128,5 @@ def test_total_archives():
 def test_known_urls():
-    target = Url("akamhy.github.io", user_agent)
-    assert len(target.known_urls(alive=True, subdomain=False)) > 2
-
     target = Url("akamhy.github.io", user_agent)
     assert len(target.known_urls()) > 3

View File

@@ -120,11 +120,7 @@ def _known_urls(obj, args):
     if args.subdomain:
         subdomain = True
-    alive = False
-    if args.alive:
-        alive = True
-    url_list = obj.known_urls(alive=alive, subdomain=subdomain)
+    url_list = obj.known_urls(subdomain=subdomain)
 
     total_urls = len(url_list)
     if total_urls > 0:
@@ -267,8 +263,6 @@ def add_knownUrlArg(knownUrlArg):
     )
     help_text = "Use with '--known_urls' to include known URLs for subdomains."
     knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
-    help_text = "Only include live URLs. Will not inlclude dead links."
-    knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)
 
 
 def add_nearArg(nearArg):

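With the `--alive` / `-a` flag gone, the CLI forwards only the subdomain switch to `Url.known_urls`. A minimal usage sketch of the simplified path, assuming an argparse-style namespace (the field values and the user agent string are illustrative, not from this commit):

import argparse

from waybackpy import Url

# Hypothetical stand-in for the parsed CLI arguments after this commit;
# note there is no longer an `alive` attribute to consult.
args = argparse.Namespace(subdomain=True)

obj = Url("akamhy.github.io", "waybackpy docs example user agent")
subdomain = True if args.subdomain else False
url_list = obj.known_urls(subdomain=subdomain)
print(len(url_list), "known URLs")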
View File

@@ -11,6 +11,12 @@ quote = requests.utils.quote
 default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
 
 
+def _latest_version(package_name, headers):
+    endpoint = "https://pypi.org/pypi/" + package_name + "/json"
+    json = _get_response(endpoint, headers=headers).json()
+    return json["info"]["version"]
+
+
 def _unix_ts_to_wayback_ts(unix_ts):
     return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
@@ -183,7 +189,7 @@ def _get_total_pages(url, user_agent):
     return int((_get_response(total_pages_url, headers=headers).text).strip())
 
 
-def _archive_url_parser(header, url):
+def _archive_url_parser(header, url, latest_version=__version__):
     """
     The wayback machine's save API doesn't
     return JSON response, we are required
@@ -226,16 +232,26 @@ def _archive_url_parser(header, url):
     if m:
         return m.group(1)
 
-    raise WaybackError(
-        "No archive URL found in the API response. "
-        "If '{url}' can be accessed via your web browser then either "
-        "this version of waybackpy ({version}) is out of date or WayBack Machine is malfunctioning. Visit "
-        "'https://github.com/akamhy/waybackpy' for the latest version "
-        "of waybackpy.\nHeader:\n{header}".format(
-            url=url, version=__version__, header=header
-        )
-    )
+    if __version__ == latest_version:
+        exc_message = (
+            "No archive URL found in the API response. "
+            "If '{url}' can be accessed via your web browser then either "
+            "Wayback Machine is malfunctioning or it refused to archive your URL."
+            "\nHeader:\n{header}".format(url=url, header=header)
+        )
+    else:
+        exc_message = (
+            "No archive URL found in the API response. "
+            "If '{url}' can be accessed via your web browser then either "
+            "this version of waybackpy ({version}) is out of date or WayBack "
+            "Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' "
+            "for the latest version of waybackpy.\nHeader:\n{header}".format(
+                url=url, version=__version__, header=header
+            )
+        )
+    raise WaybackError(exc_message)
 
 
 def _wayback_timestamp(**kwargs):
     """
@@ -292,20 +308,27 @@ def _get_response(
     # From https://stackoverflow.com/a/35504626
     # By https://stackoverflow.com/users/401467/datashaman
     s = requests.Session()
     retries = Retry(
         total=retries,
         backoff_factor=backoff_factor,
         status_forcelist=[500, 502, 503, 504],
     )
     s.mount("https://", HTTPAdapter(max_retries=retries))
     url = _full_url(endpoint, params)
     try:
         if not return_full_url:
             return s.get(url, headers=headers)
         return (url, s.get(url, headers=headers))
     except Exception as e:
-        exc_message = "Error while retrieving {url}".format(url=url)
+        exc_message = "Error while retrieving {url}.\n{reason}".format(
+            url=url, reason=str(e)
+        )
         exc = WaybackError(exc_message)
         exc.__cause__ = e
         raise exc

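The new `_latest_version` helper leans on PyPI's JSON API, which reports the newest release under `info.version`. A standalone sketch of the same check with plain `requests` (the function name and the hard-coded installed version are assumptions for illustration):

import requests

def latest_pypi_version(package_name, user_agent="version-check example"):
    # PyPI's JSON API exposes package metadata at /pypi/<name>/json.
    endpoint = "https://pypi.org/pypi/" + package_name + "/json"
    response = requests.get(endpoint, headers={"User-Agent": user_agent})
    return response.json()["info"]["version"]

installed = "2.4.0"  # assumed installed version, for illustration only
if installed == latest_pypi_version("waybackpy"):
    print("Up to date: a failed archive is not a version problem.")
else:
    print("Out of date: upgrading waybackpy may fix the failure.")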
View File

@@ -12,6 +12,7 @@ from .utils import (
     _cleaned_url,
     _ts,
     _unix_ts_to_wayback_ts,
+    _latest_version,
 )
@@ -23,7 +24,7 @@ class Url:
         self._archive_url = None
         self.timestamp = None
         self._JSON = None
-        self._alive_url_list = []
+        self.latest_version = None
 
     def __repr__(self):
         return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
@@ -141,7 +142,11 @@ class Url:
         response = _get_response(
             request_url, params=None, headers=headers, backoff_factor=2
         )
-        self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
+        if not self.latest_version:
+            self.latest_version = _latest_version("waybackpy", headers=headers)
+        self._archive_url = "https://" + _archive_url_parser(
+            response.headers, self.url, self.latest_version
+        )
         self.timestamp = datetime.utcnow()
         return self
@@ -291,26 +296,7 @@ class Url:
         i = i + 1
         return i
 
-    def live_urls_finder(self, url):
-        """
-        This method is used to check if supplied url
-        is >= 400.
-        """
-
-        try:
-            response_code = requests.get(url).status_code
-        except Exception:
-            return  # we don't care if Exception
-
-        # 200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300
-        if response_code >= 400:
-            return
-
-        self._alive_url_list.append(url)
-
-    def known_urls(
-        self, alive=False, subdomain=False, start_timestamp=None, end_timestamp=None
-    ):
+    def known_urls(self, subdomain=False, start_timestamp=None, end_timestamp=None):
         """
         Returns list of URLs known to exist for given domain name
         because these URLs were crawled by WayBack Machine spider.
@@ -347,10 +333,4 @@ class Url:
         for snapshot in snapshots:
             url_list.append(snapshot.original)
 
-        # Remove all deadURLs from url_list if alive=True
-        if alive:
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                executor.map(self.live_urls_finder, url_list)
-            url_list = self._alive_url_list
-
         return url_list
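Callers who relied on the removed alive filtering can reproduce it outside the library. A minimal sketch, assuming `urls` came from `Url.known_urls()`; the helper name is hypothetical and mirrors the check in the deleted `live_urls_finder`:

import requests

def filter_alive(urls, timeout=10):
    alive = []
    for url in urls:
        try:
            # As in the removed code, status codes below 400 count as alive.
            if requests.get(url, timeout=timeout).status_code < 400:
                alive.append(url)
        except requests.RequestException:
            continue  # unreachable URLs are treated as dead
    return alive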