From ed6160c54faf498ba5355efba32ea9694d57a700 Mon Sep 17 00:00:00 2001 From: eggplants Date: Sat, 5 Feb 2022 06:19:02 +0900 Subject: [PATCH 1/2] add: TooManyRequestsError --- waybackpy/exceptions.py | 26 ++++++++++---------------- waybackpy/save_api.py | 14 ++++++++++---- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 53f00c2..02ee953 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -8,14 +8,13 @@ This module contains the set of Waybackpy's exceptions. class WaybackError(Exception): """ Raised when Waybackpy can not return what you asked for. - 1) Wayback Machine API Service is unreachable/down. - 2) You passed illegal arguments. - All other exceptions are inherited from this class. + 1) Wayback Machine API Service is unreachable/down. + 2) You passed illegal arguments. + + All other exceptions are inherited from this class. """ - pass - class RedirectSaveError(WaybackError): """ @@ -23,15 +22,18 @@ class RedirectSaveError(WaybackError): redirect URL is archived but not the original URL. """ - pass - class URLError(Exception): """ Raised when malformed URLs are passed as arguments. """ - pass + +class TooManyRequestsError(WaybackError): + """ + Raised when you make more than 15 requests per + minute and the Wayback Machine returns 429. + """ class MaximumRetriesExceeded(WaybackError): @@ -39,28 +41,20 @@ class MaximumRetriesExceeded(WaybackError): MaximumRetriesExceeded """ - pass - class MaximumSaveRetriesExceeded(MaximumRetriesExceeded): """ MaximumSaveRetriesExceeded """ - pass - class ArchiveNotInAvailabilityAPIResponse(WaybackError): """ Could not parse the archive in the JSON response of the availability API. """ - pass - class InvalidJSONInAvailabilityAPIResponse(WaybackError): """ availability api returned invalid JSON """ - - pass diff --git a/waybackpy/save_api.py b/waybackpy/save_api.py index c0ca55c..a3f77fa 100644 --- a/waybackpy/save_api.py +++ b/waybackpy/save_api.py @@ -8,7 +8,7 @@ from requests.adapters import HTTPAdapter from requests.structures import CaseInsensitiveDict from urllib3.util.retry import Retry -from .exceptions import MaximumSaveRetriesExceeded +from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError from .utils import DEFAULT_USER_AGENT @@ -79,6 +79,12 @@ class WaybackMachineSaveAPI(object): self.status_code = self.response.status_code self.response_url = self.response.url session.close() + if self.status_code == 429: + raise TooManyRequestsError( + "Seem to be refused to request by the server. " + "Save Page Now receives up to 15 URLs per minutes. " + "Wait a moment and run again." + ) def archive_url_parser(self) -> Optional[str]: """ @@ -105,9 +111,9 @@ class WaybackMachineSaveAPI(object): if self.response_url: self.response_url = self.response_url.strip() if "web.archive.org/web" in self.response_url: - regex = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$" - match = re.search(regex, self.response_url) - if match: + regex4 = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$" + match = re.search(regex4, self.response_url) + if match is not None: return "https://" + match.group(0) return None From 7b6401d59b4e73bb076a6d1f9e58720fa6c220b0 Mon Sep 17 00:00:00 2001 From: eggplants Date: Sat, 5 Feb 2022 06:20:03 +0900 Subject: [PATCH 2/2] fix: delete useless conds --- waybackpy/save_api.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/waybackpy/save_api.py b/waybackpy/save_api.py index a3f77fa..29fb2a3 100644 --- a/waybackpy/save_api.py +++ b/waybackpy/save_api.py @@ -108,13 +108,11 @@ class WaybackMachineSaveAPI(object): if match is not None and len(match.groups()) == 1: return "https" + match.group(1) - if self.response_url: - self.response_url = self.response_url.strip() - if "web.archive.org/web" in self.response_url: - regex4 = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$" - match = re.search(regex4, self.response_url) - if match is not None: - return "https://" + match.group(0) + self.response_url = self.response_url.strip() + regex4 = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$" + match = re.search(regex4, self.response_url) + if match is not None: + return "https://" + match.group(0) return None