From 8d2cc112c19138de2aa17e85af8a03e001e8a260 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Fri, 2 Apr 2021 09:55:11 +0530 Subject: [PATCH] check if url is redirect before throwing exceptions, res.url is the redirect url if redirected at all --- waybackpy/utils.py | 14 +++++++++++++- waybackpy/wrapper.py | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 0fa79b5..c9f3c55 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -302,7 +302,9 @@ def _get_total_pages(url, user_agent): return int((_get_response(total_pages_url, headers=headers).text).strip()) -def _archive_url_parser(header, url, latest_version=__version__, instance=None): +def _archive_url_parser( + header, url, latest_version=__version__, instance=None, response=None +): """Returns the archive after parsing it from the response header. Parameters @@ -388,6 +390,16 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None): if m: return m.group(1) + if response: + if response.url: + if "web.archive.org/web" in response.url: + m = re.search( + r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$", + str(response.url).strip(), + ) + if m: + return m.group(0) + if instance: newest_archive = None try: diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index ef24a81..6b68d1c 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -225,6 +225,7 @@ class Url: self.url, latest_version=self.latest_version, instance=self, + response=response, ) m = re.search(