waybackpy/cdx_api.py: the default user agent is now DEFAULT_USER_AGENT; get_response now takes url and headers as arguments, and the request URL is generated by the full_url function. max_tries added as a parameter of the WaybackMachineCDXServerAPI class, with a default value of 3.

This commit is contained in:
Akash Mahanty 2022-01-24 23:20:49 +05:30
parent c49fe971fd
commit 06fc7855bf

View File

@ -6,6 +6,7 @@ from .cdx_utils import (
check_filters, check_filters,
check_collapses, check_collapses,
check_match_type, check_match_type,
full_url,
) )
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
@ -19,17 +20,18 @@ class WaybackMachineCDXServerAPI:
def __init__( def __init__(
self, self,
url, url,
user_agent=None, user_agent=DEFAULT_USER_AGENT,
start_timestamp=None, start_timestamp=None, # from, can not use from as it's a keyword
end_timestamp=None, end_timestamp=None, # to, not using to as can not use from
filters=[], filters=[],
match_type=None, match_type=None,
gzip=None, gzip=None,
collapses=[], collapses=[],
limit=None, limit=None,
max_tries=3,
): ):
self.url = str(url).strip().replace(" ", "%20") self.url = str(url).strip().replace(" ", "%20")
self.user_agent = str(user_agent) if user_agent else DEFAULT_USER_AGENT self.user_agent = user_agent
self.start_timestamp = str(start_timestamp) if start_timestamp else None self.start_timestamp = str(start_timestamp) if start_timestamp else None
self.end_timestamp = str(end_timestamp) if end_timestamp else None self.end_timestamp = str(end_timestamp) if end_timestamp else None
self.filters = filters self.filters = filters
@ -40,6 +42,7 @@ class WaybackMachineCDXServerAPI:
self.collapses = collapses self.collapses = collapses
check_collapses(self.collapses) check_collapses(self.collapses)
self.limit = limit if limit else 5000 self.limit = limit if limit else 5000
self.max_tries = max_tries
self.last_api_request_url = None self.last_api_request_url = None
self.use_page = False self.use_page = False
self.endpoint = "https://web.archive.org/cdx/search/cdx" self.endpoint = "https://web.archive.org/cdx/search/cdx"
@ -47,16 +50,15 @@ class WaybackMachineCDXServerAPI:
def cdx_api_manager(self, payload, headers, use_page=False): def cdx_api_manager(self, payload, headers, use_page=False):
total_pages = get_total_pages(self.url, self.user_agent) total_pages = get_total_pages(self.url, self.user_agent)
# If we only have two or less pages of archives then we care for accuracy # If we only have two or less pages of archives then we care for more accuracy
# pagination API can be lagged sometimes # pagination API is lagged sometimes
if use_page is True and total_pages >= 2: if use_page is True and total_pages >= 2:
blank_pages = 0 blank_pages = 0
for i in range(total_pages): for i in range(total_pages):
payload["page"] = str(i) payload["page"] = str(i)
url, res = get_response( url = full_url(endpoint, params)
self.endpoint, params=payload, headers=headers, return_full_url=True res = get_response(url, headers=headers)
)
self.last_api_request_url = url self.last_api_request_url = url
text = res.text text = res.text
@ -79,9 +81,8 @@ class WaybackMachineCDXServerAPI:
if resumeKey: if resumeKey:
payload["resumeKey"] = resumeKey payload["resumeKey"] = resumeKey
url, res = get_response( url = full_url(endpoint, params)
self.endpoint, params=payload, headers=headers, return_full_url=True res = get_response(url, headers=headers)
)
self.last_api_request_url = url self.last_api_request_url = url