From 9262f5da21956aa1106795e0f46c8a49c362efb1 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Mon, 24 Jan 2022 22:57:20 +0530 Subject: [PATCH] improve functions get_total_pages, get_response and lint check_filters, check_collapses and check_match_type get_total_pages : the default user agent is now DEFAULT_USER_AGENT; instead of str formatting, the payload is now passed as params to full_url to generate the request URL, and get_response makes the request instead of calling requests.get() directly. get_response : get_response no longer takes params as a keyword argument; instead the invoker is supposed to pass the full URL, which may be generated by the full_url function, and therefore return_full_url=False is deprecated as well. The session is now also closed via session.close(). No need to check for 'Exceeded 30 redirects' as the save API uses a different method. check_filters : Not assigning the return values of the match groups to variables because we won't be using them and the linter flags these unused assignments. check_collapses : Same reason as for check_filters, but also removed a foolish test that checks equality between objects that are guaranteed to be the same. check_match_type : Updated the text of the WaybackError message. --- waybackpy/cdx_utils.py | 66 +++++++++++++----------------------------- 1 file changed, 20 insertions(+), 46 deletions(-) diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py index 84b3928..ff31918 100644 --- a/waybackpy/cdx_utils.py +++ b/waybackpy/cdx_utils.py @@ -3,16 +3,16 @@ import requests from urllib3.util.retry import Retry from requests.adapters import HTTPAdapter from .exceptions import WaybackError +from .utils import DEFAULT_USER_AGENT -def get_total_pages(url, user_agent): - request_url = ( - "https://web.archive.org/cdx/search/cdx?url={url}&showNumPages=true".format( - url=url - ) - ) +def get_total_pages(url, user_agent=DEFAULT_USER_AGENT): + endpoint = "https://web.archive.org/cdx/search/cdx?" 
+ payload = {"showNumPages": "true", "url": str(url)} headers = {"User-Agent": user_agent} - return int((requests.get(request_url, headers=headers).text).strip()) + request_url = full_url(endpoint, params=payload) + response = get_response(request_url, headers=headers) + return int(response.text.strip()) def full_url(endpoint, params): @@ -32,47 +32,29 @@ def full_url(endpoint, params): def get_response( - endpoint, - params=None, + url, headers=None, - return_full_url=False, retries=5, backoff_factor=0.5, no_raise_on_redirects=False, ): - - s = requests.Session() - + session = requests.Session() retries = Retry( total=retries, backoff_factor=backoff_factor, status_forcelist=[500, 502, 503, 504], ) - - s.mount("https://", HTTPAdapter(max_retries=retries)) - - # The URL with parameters required for the get request - url = full_url(endpoint, params) + session.mount("https://", HTTPAdapter(max_retries=retries)) try: - - if not return_full_url: - return s.get(url, headers=headers) - - return (url, s.get(url, headers=headers)) - + response = session.get(url, headers=headers) + session.close() + return response except Exception as e: - reason = str(e) - - if no_raise_on_redirects: - if "Exceeded 30 redirects" in reason: - return - exc_message = "Error while retrieving {url}.\n{reason}".format( url=url, reason=reason ) - exc = WaybackError(exc_message) exc.__cause__ = e raise exc @@ -91,8 +73,8 @@ def check_filters(filters): _filter, ) - key = match.group(1) - val = match.group(2) + match.group(1) + match.group(2) except Exception: @@ -118,19 +100,9 @@ def check_collapses(collapses): r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?", collapse, ) - field = match.group(1) - - N = None + match.group(1) if 2 == len(match.groups()): - N = match.group(2) - - if N: - if not (field + N == collapse): - raise Exception - else: - if not (field == collapse): - raise Exception - + match.group(2) except Exception: exc_message = "collapse argument 
'{collapse}' is not following the cdx collapse syntax.".format( collapse=collapse @@ -143,7 +115,9 @@ def check_match_type(match_type, url): return if "*" in url: - raise WaybackError("Can not use wildcard with match_type argument") + raise WaybackError( + "Can not use wildcard in the URL along with the match_type arguments." + ) legal_match_type = ["exact", "prefix", "host", "domain"]