From 0e7255c1d867facfb94b3fd437a68e7b0bbad0f6 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Thu, 17 Feb 2022 21:13:42 +0530 Subject: [PATCH] add attrs use_pagination and closest, which can be used to use the pagination API and to look up the archive closest to a timestamp, respectively. And now, to get out of the infinite blank pages loop, just check for two successive blank pages rather than two blank pages in total while using the CDX server API. --- waybackpy/cdx_api.py | 75 +++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index db02bf5..7e24db3 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -50,6 +50,8 @@ class WaybackMachineCDXServerAPI: collapses: Optional[List[str]] = None, limit: Optional[str] = None, max_tries: int = 3, + use_pagination: bool = False, + closest: Optional[str] = None, ) -> None: self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent @@ -66,60 +68,58 @@ class WaybackMachineCDXServerAPI: check_collapses(self.collapses) self.limit = 25000 if limit is None else limit self.max_tries = max_tries + self.use_pagination = use_pagination + self.closest = None if closest is None else str(closest) self.last_api_request_url: Optional[str] = None - self.use_page = False self.endpoint = "https://web.archive.org/cdx/search/cdx" def cdx_api_manager( - self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False + self, payload: Dict[str, str], headers: Dict[str, str] ) -> Generator[str, None, None]: """ - Manages the API calls for the instance, it automatically selects the best - parameters by looking as the query of the end-user. For bigger queries - automatically use the CDX pagination API and for smaller queries use the - normal API. 
- - CDX Server API is a complex API and to make it easy for the end user to - consume it the CDX manager(this method) handles the selection of the - API output, whether to use the pagination API or not. - - For doing large/bulk queries, the use of the Pagination API is - recommended by the Wayback Machine authors. And it determines if the - query would be large or not by using the showNumPages=true parameter, - this tells the number of pages of CDX DATA that the pagination API - will return. - - If the number of page is less than 2 we use the normal non-pagination - API as the pagination API is known to lag and for big queries it should - not matter but for queries where the number of pages are less this - method chooses accuracy over the pagination API. + This method uses the pagination API of the CDX server if + use_pagination attribute is True else uses the standard + CDX server response data. """ - # number of pages that will returned by the pagination API. - # get_total_pages adds the showNumPages=true param to pagination API - # requests. - # This is a special query that will return a single number indicating - # the number of pages. - total_pages = get_total_pages(self.url, self.user_agent) - if use_page is True and total_pages >= 2: - blank_pages = 0 + # When using the pagination API of the CDX server. + if self.use_pagination is True: + + total_pages = get_total_pages(self.url, self.user_agent) + successive_blank_pages = 0 + for i in range(total_pages): payload["page"] = str(i) url = full_url(self.endpoint, params=payload) res = get_response(url, headers=headers) + if isinstance(res, Exception): raise res self.last_api_request_url = url text = res.text - if len(text) == 0: - blank_pages += 1 - if blank_pages >= 2: + # Reset the counter if the last page was blank + # but the current page is not. + if successive_blank_pages == 1: + if len(text) != 0: + successive_blank_pages = 0 + + # Increase the successive blank page counter on encountering + # a blank page. 
+ if len(text) == 0: + successive_blank_pages += 1 + + # If two successive pages are blank + # then we don't have any more pages left to + # iterate. + if successive_blank_pages >= 2: break yield text + + # When not using the pagination API of the CDX server else: payload["showResumeKey"] = "true" payload["limit"] = str(self.limit) @@ -166,6 +166,9 @@ class WaybackMachineCDXServerAPI: if self.gzip is None: payload["gzip"] = "false" + if self.closest: + payload["closest"] = self.closest + if self.match_type: payload["matchType"] = self.match_type @@ -206,13 +209,7 @@ class WaybackMachineCDXServerAPI: self.add_payload(payload) - if not self.start_timestamp or self.end_timestamp: - self.use_page = True - - if self.collapses != []: - self.use_page = False - - entries = self.cdx_api_manager(payload, headers, use_page=self.use_page) + entries = self.cdx_api_manager(payload, headers) for entry in entries: