From dd1917c77eab744706ed13d4232c4413ddce6c8b Mon Sep 17 00:00:00 2001
From: Akash Mahanty
Date: Fri, 2 Apr 2021 10:38:17 +0530
Subject: [PATCH] =?UTF-8?q?added=20RedirectSaveError=20-=20for=20failed=20?=
 =?UTF-8?q?saves=20if=20the=20URL=20is=20a=20permanent=20=E2=80=A6=20(#93)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* added RedirectSaveError - for failed saves if the URL is a permanent redirect.

* check if url is redirect before throwing exceptions, res.url is the redirect url if redirected at all

* update tests and cli errors
---
Reviewer notes (this area between "---" and the diff is ignored by git am):

 * This copy was re-flowed from a whitespace-collapsed patch. The leading
   indentation of context lines is reconstructed - TODO confirm against the
   tree before applying.
 * tests/test_cli.py differs from the original commit. The added assertion
   was `"A" or "B" in str(reply)`, which Python evaluates as
   `"A" or ("B" in str(reply))`; a non-empty literal is always truthy, so the
   test could never fail. Each substring is now tested against str(reply).
   The post-image hash on that file's "index" line therefore no longer
   matches the resulting blob.

 tests/test_cli.py       |  2 +-
 waybackpy/cli.py        |  2 ++
 waybackpy/exceptions.py |  7 +++++++
 waybackpy/utils.py      | 23 +++++++++++++++++++++--
 waybackpy/wrapper.py    |  1 +
 5 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index f788c2e..2a90973 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -29,7 +29,7 @@ def test_save():
         get=None,
     )
     reply = cli.args_handler(args)
-    assert "could happen because either your waybackpy" in str(reply)
+    assert "could happen because either your waybackpy" in str(reply) or "cannot be archived by wayback machine as it is a redirect" in str(reply)
 
 
 def test_json():
diff --git a/waybackpy/cli.py b/waybackpy/cli.py
index 45f305a..80445b8 100644
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -29,6 +29,8 @@ def _save(obj):
                     version=__version__, header=header
                 )
             )
+        if "URL cannot be archived by wayback machine as it is a redirect" in e:
+            return ("URL cannot be archived by wayback machine as it is a redirect")
         raise WaybackError(err)
 
 
diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py
index 71e62ec..eb3ef67 100644
--- a/waybackpy/exceptions.py
+++ b/waybackpy/exceptions.py
@@ -13,6 +13,13 @@ class WaybackError(Exception):
     """
 
 
+class RedirectSaveError(WaybackError):
+    """
+    Raised when the original URL is redirected and the
+    redirect URL is archived but not the original URL.
+    """
+
+
 class URLError(Exception):
     """
     Raised when malformed URLs are passed as arguments.
diff --git a/waybackpy/utils.py b/waybackpy/utils.py
index 7c6958d..c9f3c55 100644
--- a/waybackpy/utils.py
+++ b/waybackpy/utils.py
@@ -3,7 +3,7 @@ import time
 import requests
 from datetime import datetime
 
-from .exceptions import WaybackError, URLError
+from .exceptions import WaybackError, URLError, RedirectSaveError
 from .__version__ import __version__
 
 from urllib3.util.retry import Retry
@@ -302,7 +302,9 @@ def _get_total_pages(url, user_agent):
     return int((_get_response(total_pages_url, headers=headers).text).strip())
 
 
-def _archive_url_parser(header, url, latest_version=__version__, instance=None):
+def _archive_url_parser(
+    header, url, latest_version=__version__, instance=None, response=None
+):
     """Returns the archive after parsing it from the response header.
 
     Parameters
@@ -388,6 +390,16 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
     if m:
         return m.group(1)
 
+    if response:
+        if response.url:
+            if "web.archive.org/web" in response.url:
+                m = re.search(
+                    r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$",
+                    str(response.url).strip(),
+                )
+                if m:
+                    return m.group(0)
+
     if instance:
         newest_archive = None
         try:
@@ -414,6 +426,13 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
             "Wayback Machine is malfunctioning or it refused to archive your URL."
             "\nHeader:\n{header}".format(url=url, header=header)
         )
+
+        if "save redirected" == header.strip():
+            raise RedirectSaveError(
+                "URL cannot be archived by wayback machine as it is a redirect.\nHeader:\n{header}".format(
+                    header=header
+                )
+            )
     else:
         exc_message = (
             "No archive URL found in the API response. "
diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index ef24a81..6b68d1c 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -225,6 +225,7 @@ class Url:
             self.url,
             latest_version=self.latest_version,
             instance=self,
+            response=response,
         )
 
         m = re.search(