Improve the functions get_total_pages and get_response, and lint check_filters, check_collapses and check_match_type
get_total_pages: the default user agent is now DEFAULT_USER_AGENT, and instead of string formatting we now pass the payload as params to full_url to generate the request URL; get_response makes the request instead of calling requests.get() directly. get_response: get_response no longer takes params as keyword arguments — the invoker is expected to pass the full URL (which may be generated by the full_url function), so return_full_url=False is deprecated as well. The session is now closed via session.close(). There is no need to check for 'Exceeded 30 redirects' because the save API uses a different method. check_filters: no longer assigning the return values of the match groups to variables, because we won't be using them and the linter flags these unused assignments. check_collapses: same reason as for check_filters, but also removed a pointless test that checked equality between objects guaranteed to be the same. check_match_type: updated the message text of the WaybackError.
This commit is contained in:
parent
d1a1cf2546
commit
9262f5da21
@ -3,16 +3,16 @@ import requests
|
||||
from urllib3.util.retry import Retry
|
||||
from requests.adapters import HTTPAdapter
|
||||
from .exceptions import WaybackError
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
|
||||
|
||||
def get_total_pages(url, user_agent):
|
||||
request_url = (
|
||||
"https://web.archive.org/cdx/search/cdx?url={url}&showNumPages=true".format(
|
||||
url=url
|
||||
)
|
||||
)
|
||||
def get_total_pages(url, user_agent=DEFAULT_USER_AGENT):
|
||||
endpoint = "https://web.archive.org/cdx/search/cdx?"
|
||||
payload = {"showNumPages": "true", "url": str(url)}
|
||||
headers = {"User-Agent": user_agent}
|
||||
return int((requests.get(request_url, headers=headers).text).strip())
|
||||
request_url = full_url(endpoint, params=payload)
|
||||
response = get_response(request_url, headers=headers)
|
||||
return int(response.text.strip())
|
||||
|
||||
|
||||
def full_url(endpoint, params):
|
||||
@ -32,47 +32,29 @@ def full_url(endpoint, params):
|
||||
|
||||
|
||||
def get_response(
|
||||
endpoint,
|
||||
params=None,
|
||||
url,
|
||||
headers=None,
|
||||
return_full_url=False,
|
||||
retries=5,
|
||||
backoff_factor=0.5,
|
||||
no_raise_on_redirects=False,
|
||||
):
|
||||
|
||||
s = requests.Session()
|
||||
|
||||
session = requests.Session()
|
||||
retries = Retry(
|
||||
total=retries,
|
||||
backoff_factor=backoff_factor,
|
||||
status_forcelist=[500, 502, 503, 504],
|
||||
)
|
||||
|
||||
s.mount("https://", HTTPAdapter(max_retries=retries))
|
||||
|
||||
# The URL with parameters required for the get request
|
||||
url = full_url(endpoint, params)
|
||||
session.mount("https://", HTTPAdapter(max_retries=retries))
|
||||
|
||||
try:
|
||||
|
||||
if not return_full_url:
|
||||
return s.get(url, headers=headers)
|
||||
|
||||
return (url, s.get(url, headers=headers))
|
||||
|
||||
response = session.get(url, headers=headers)
|
||||
session.close()
|
||||
return response
|
||||
except Exception as e:
|
||||
|
||||
reason = str(e)
|
||||
|
||||
if no_raise_on_redirects:
|
||||
if "Exceeded 30 redirects" in reason:
|
||||
return
|
||||
|
||||
exc_message = "Error while retrieving {url}.\n{reason}".format(
|
||||
url=url, reason=reason
|
||||
)
|
||||
|
||||
exc = WaybackError(exc_message)
|
||||
exc.__cause__ = e
|
||||
raise exc
|
||||
@ -91,8 +73,8 @@ def check_filters(filters):
|
||||
_filter,
|
||||
)
|
||||
|
||||
key = match.group(1)
|
||||
val = match.group(2)
|
||||
match.group(1)
|
||||
match.group(2)
|
||||
|
||||
except Exception:
|
||||
|
||||
@ -118,19 +100,9 @@ def check_collapses(collapses):
|
||||
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
|
||||
collapse,
|
||||
)
|
||||
field = match.group(1)
|
||||
|
||||
N = None
|
||||
match.group(1)
|
||||
if 2 == len(match.groups()):
|
||||
N = match.group(2)
|
||||
|
||||
if N:
|
||||
if not (field + N == collapse):
|
||||
raise Exception
|
||||
else:
|
||||
if not (field == collapse):
|
||||
raise Exception
|
||||
|
||||
match.group(2)
|
||||
except Exception:
|
||||
exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
|
||||
collapse=collapse
|
||||
@ -143,7 +115,9 @@ def check_match_type(match_type, url):
|
||||
return
|
||||
|
||||
if "*" in url:
|
||||
raise WaybackError("Can not use wildcard with match_type argument")
|
||||
raise WaybackError(
|
||||
"Can not use wildcard in the URL along with the match_type arguments."
|
||||
)
|
||||
|
||||
legal_match_type = ["exact", "prefix", "host", "domain"]
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user