backoff_factor=2 for save, incr success by 25%
parent ec0a0d04cc
commit 76205d9cf6
@@ -41,16 +41,6 @@ def test_save():
     with pytest.raises(Exception):
         url2 = "ha ha ha ha"
         Url(url2, user_agent)
-    # url3 = "http://www.archive.is/faq.html"
-
-    # with pytest.raises(Exception):
-    #     target = Url(
-    #         url3,
-    #         "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) "
-    #         "AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 "
-    #         "Safari/533.20.27",
-    #     )
-    #     target.save()
 
 
 def test_near():
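The assertion that survives this hunk checks that constructing a Url with a malformed address raises. A minimal self-contained sketch of that pattern, assuming a waybackpy-style Url class that validates its input on construction (the import path and user-agent string are assumptions, not shown in the diff):

# Sketch of the surviving assertion; Url is assumed to validate its
# first argument in __init__ (import path is an assumption).
import pytest

from waybackpy import Url

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # any UA string

def test_save_rejects_malformed_url():
    # A string with spaces is not a valid URL, so Url() should raise.
    with pytest.raises(Exception):
        Url("ha ha ha ha", user_agent)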
@@ -84,7 +84,7 @@ class Cdx:
 
         endpoint = "https://web.archive.org/cdx/search/cdx"
         total_pages = _get_total_pages(self.url, self.user_agent)
-        #If we only have two or less pages of archives then we care for accuracy
+        # If we only have two or less pages of archives then we care for accuracy
         # pagination API can be lagged sometimes
         if use_page == True and total_pages >= 2:
             blank_pages = 0
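The total_pages guard above relies on the CDX server's pagination API. A hedged sketch of what a _get_total_pages-style helper plausibly does, using the CDX showNumPages parameter (the project's actual implementation is not shown in this diff):

# Assumed behavior of a _get_total_pages-style helper: the CDX server
# returns the page count as plain text when showNumPages=true is passed.
import requests

def get_total_pages(url, user_agent):
    endpoint = "https://web.archive.org/cdx/search/cdx"
    params = {"url": url, "showNumPages": "true"}
    headers = {"User-Agent": user_agent}
    response = requests.get(endpoint, params=params, headers=headers)
    return int(response.text.strip())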
@@ -252,7 +252,12 @@ def _wayback_timestamp(**kwargs):
 
 
 def _get_response(
-    endpoint, params=None, headers=None, retries=5, return_full_url=False
+    endpoint,
+    params=None,
+    headers=None,
+    return_full_url=False,
+    retries=5,
+    backoff_factor=0.5,
 ):
     """
     This function is used make get request.
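With the reflowed signature, callers can tune retry behaviour per request through keyword arguments. An illustrative call site after this change (the endpoint, params, and user agent here are made up for the example):

# Illustrative call: only the keywords a caller cares about need to be
# spelled out; retries and backoff_factor keep their defaults otherwise.
response = _get_response(
    "https://web.archive.org/cdx/search/cdx",
    params={"url": "example.com"},
    headers={"User-Agent": "my-agent"},
    backoff_factor=1,  # overrides the new 0.5 default for this call only
)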
@@ -276,7 +281,9 @@ def _get_response(
     # By https://stackoverflow.com/users/401467/datashaman
     s = requests.Session()
     retries = Retry(
-        total=retries, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504]
+        total=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=[500, 502, 503, 504],
     )
     s.mount("https://", HTTPAdapter(max_retries=retries))
     url = _full_url(endpoint, params)
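The hunk above parameterises the well-known requests-plus-urllib3 retry recipe. A self-contained sketch of that pattern, matching the calls used in the diff (note that exact sleep behaviour before the first retry varies between urllib3 versions):

# Self-contained version of the retry recipe used in the hunk above.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def session_with_retries(retries=5, backoff_factor=0.5):
    s = requests.Session()
    retry = Retry(
        total=retries,                          # overall retry budget
        backoff_factor=backoff_factor,          # grows sleeps exponentially
        status_forcelist=[500, 502, 503, 504],  # retry only these statuses
    )
    s.mount("https://", HTTPAdapter(max_retries=retry))
    return s

# Usage: behaves like requests.get, but retries transient 5xx errors.
s = session_with_retries(backoff_factor=2)
# r = s.get("https://web.archive.org/save/https://example.com")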
@@ -129,15 +129,16 @@ class Url:
         And to get the archive URL we are required to read the
         header of the API response.
 
-        _get_response() takes care of the get requests. It uses requests
-        package.
+        _get_response() takes care of the get requests.
 
         _archive_url_parser() parses the archive from the header.
 
         """
         request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
         headers = {"User-Agent": self.user_agent}
-        response = _get_response(request_url, params=None, headers=headers)
+        response = _get_response(
+            request_url, params=None, headers=headers, backoff_factor=2
+        )
         self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
         self.timestamp = datetime.utcnow()
         return self
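Why backoff_factor=2 helps save(): urllib3 sleeps roughly backoff_factor * 2 ** (n - 1) seconds before the n-th retry, so raising the factor from 0.5 to 2 gives the Save Page Now endpoint far longer to recover between attempts. A quick sketch of the resulting schedules (approximate; some urllib3 versions skip the sleep before the very first retry):

# Approximate urllib3 sleep schedule: backoff_factor * 2 ** (retry - 1).
# Some urllib3 versions apply no sleep before the very first retry.
def backoff_schedule(backoff_factor, retries=5):
    return [backoff_factor * 2 ** (n - 1) for n in range(1, retries + 1)]

print(backoff_schedule(0.5))  # [0.5, 1.0, 2.0, 4.0, 8.0]  -> old default
print(backoff_schedule(2))    # [2, 4, 8, 16, 32]          -> used by save()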