backoff_factor=2 for save, incr success by 25%

This commit is contained in:
Akash Mahanty 2021-01-13 10:13:16 +05:30
parent ec0a0d04cc
commit 76205d9cf6
4 changed files with 14 additions and 16 deletions

View File

@ -41,16 +41,6 @@ def test_save():
with pytest.raises(Exception): with pytest.raises(Exception):
url2 = "ha ha ha ha" url2 = "ha ha ha ha"
Url(url2, user_agent) Url(url2, user_agent)
# url3 = "http://www.archive.is/faq.html"
# with pytest.raises(Exception):
# target = Url(
# url3,
# "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) "
# "AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 "
# "Safari/533.20.27",
# )
# target.save()
def test_near(): def test_near():

View File

@ -84,7 +84,7 @@ class Cdx:
endpoint = "https://web.archive.org/cdx/search/cdx" endpoint = "https://web.archive.org/cdx/search/cdx"
total_pages = _get_total_pages(self.url, self.user_agent) total_pages = _get_total_pages(self.url, self.user_agent)
#If we only have two or less pages of archives then we care for accuracy # If we only have two or less pages of archives then we care for accuracy
# pagination API can be lagged sometimes # pagination API can be lagged sometimes
if use_page == True and total_pages >= 2: if use_page == True and total_pages >= 2:
blank_pages = 0 blank_pages = 0

View File

@ -252,7 +252,12 @@ def _wayback_timestamp(**kwargs):
def _get_response( def _get_response(
endpoint, params=None, headers=None, retries=5, return_full_url=False endpoint,
params=None,
headers=None,
return_full_url=False,
retries=5,
backoff_factor=0.5,
): ):
""" """
This function is used make get request. This function is used make get request.
@ -276,7 +281,9 @@ def _get_response(
# By https://stackoverflow.com/users/401467/datashaman # By https://stackoverflow.com/users/401467/datashaman
s = requests.Session() s = requests.Session()
retries = Retry( retries = Retry(
total=retries, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504] total=retries,
backoff_factor=backoff_factor,
status_forcelist=[500, 502, 503, 504],
) )
s.mount("https://", HTTPAdapter(max_retries=retries)) s.mount("https://", HTTPAdapter(max_retries=retries))
url = _full_url(endpoint, params) url = _full_url(endpoint, params)

View File

@ -129,15 +129,16 @@ class Url:
And to get the archive URL we are required to read the And to get the archive URL we are required to read the
header of the API response. header of the API response.
_get_response() takes care of the get requests. It uses requests _get_response() takes care of the get requests.
package.
_archive_url_parser() parses the archive from the header. _archive_url_parser() parses the archive from the header.
""" """
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url) request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
response = _get_response(request_url, params=None, headers=headers) response = _get_response(
request_url, params=None, headers=headers, backoff_factor=2
)
self._archive_url = "https://" + _archive_url_parser(response.headers, self.url) self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
self.timestamp = datetime.utcnow() self.timestamp = datetime.utcnow()
return self return self