From 76205d9cf6ccb4fcc8d03db2ad6587d5d25f6dbe Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 13 Jan 2021 10:13:16 +0530 Subject: [PATCH] backoff_factor=2 for save, incr success by 25% --- tests/test_wrapper.py | 10 ---------- waybackpy/cdx.py | 2 +- waybackpy/utils.py | 11 +++++++++-- waybackpy/wrapper.py | 7 ++++--- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py index 28b2bce..54a6b4a 100644 --- a/tests/test_wrapper.py +++ b/tests/test_wrapper.py @@ -41,16 +41,6 @@ def test_save(): with pytest.raises(Exception): url2 = "ha ha ha ha" Url(url2, user_agent) -# url3 = "http://www.archive.is/faq.html" - -# with pytest.raises(Exception): -# target = Url( -# url3, -# "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) " -# "AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 " -# "Safari/533.20.27", -# ) -# target.save() def test_near(): diff --git a/waybackpy/cdx.py b/waybackpy/cdx.py index e4d1a4d..fc53266 100644 --- a/waybackpy/cdx.py +++ b/waybackpy/cdx.py @@ -84,7 +84,7 @@ class Cdx: endpoint = "https://web.archive.org/cdx/search/cdx" total_pages = _get_total_pages(self.url, self.user_agent) - #If we only have two or less pages of archives then we care for accuracy + # If we only have two or less pages of archives then we care for accuracy # pagination API can be lagged sometimes if use_page == True and total_pages >= 2: blank_pages = 0 diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 780d0a5..9876578 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -252,7 +252,12 @@ def _wayback_timestamp(**kwargs): def _get_response( - endpoint, params=None, headers=None, retries=5, return_full_url=False + endpoint, + params=None, + headers=None, + return_full_url=False, + retries=5, + backoff_factor=0.5, ): """ This function is used make get request. @@ -276,7 +281,9 @@ def _get_response( # By https://stackoverflow.com/users/401467/datashaman s = requests.Session() retries = Retry( - total=retries, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504] + total=retries, + backoff_factor=backoff_factor, + status_forcelist=[500, 502, 503, 504], ) s.mount("https://", HTTPAdapter(max_retries=retries)) url = _full_url(endpoint, params) diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 218bc55..cba1741 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -129,15 +129,16 @@ class Url: And to get the archive URL we are required to read the header of the API response. - _get_response() takes care of the get requests. It uses requests - package. + _get_response() takes care of the get requests. _archive_url_parser() parses the archive from the header. """ request_url = "https://web.archive.org/save/" + _cleaned_url(self.url) headers = {"User-Agent": self.user_agent} - response = _get_response(request_url, params=None, headers=headers) + response = _get_response( + request_url, params=None, headers=headers, backoff_factor=2 + ) self._archive_url = "https://" + _archive_url_parser(response.headers, self.url) self.timestamp = datetime.utcnow() return self