From f990b93f8a262f1b045c4ba09326693dde515ba7 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Fri, 18 Feb 2022 00:24:14 +0530 Subject: [PATCH] Add sort, use_pagination and closest (#158) * add sort param support in CDX API class see https://nla.github.io/outbackcdx/api.html#operation/query sort takes string input which must be one of the following: - default - closest - reverse This commit shall help in closing issue at https://github.com/akamhy/waybackpy/issues/155 * add BlockedSiteError for cases when archiving is blocked by site's robots.txt * create check_for_blocked_site for handling the BlockedSiteError for sites that are blocking wayback machine by their robots.txt policy * add attrs use_pagination and closest, which can be used to use the pagination API and to look up archives close to a timestamp respectively. And now, to get out of an infinite loop of blank pages, just check for two successive blank pages rather than two blank pages in total while using the CDX server API. * added cli support for sort, use-pagination and closest * added tests * fix codeql warnings, nothing to worry about here. 
* fix save test for archive_url --- tests/test_cdx_api.py | 52 +++++++++++++++++++++++++++- tests/test_save_api.py | 1 + tests/test_wrapper.py | 9 ++++- waybackpy/cdx_api.py | 75 ++++++++++++++++++++--------------------- waybackpy/cdx_utils.py | 31 +++++++++++++++-- waybackpy/cli.py | 33 +++++++++++++++++- waybackpy/exceptions.py | 7 ++++ 7 files changed, 164 insertions(+), 44 deletions(-) diff --git a/tests/test_cdx_api.py b/tests/test_cdx_api.py index 410a318..b7f28c2 100644 --- a/tests/test_cdx_api.py +++ b/tests/test_cdx_api.py @@ -32,7 +32,11 @@ def test_b() -> None: url = "https://www.google.com" wayback = WaybackMachineCDXServerAPI( - url=url, user_agent=user_agent, start_timestamp="202101", end_timestamp="202112" + url=url, + user_agent=user_agent, + start_timestamp="202101", + end_timestamp="202112", + collapses=["urlkey"], ) # timeframe bound prefix matching enabled along with active urlkey based collapsing @@ -40,3 +44,49 @@ def test_b() -> None: for snapshot in snapshots: assert snapshot.timestamp.startswith("2021") + + +def test_c() -> None: + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) + url = "https://www.google.com" + + cdx = WaybackMachineCDXServerAPI( + url=url, + user_agent=user_agent, + closest="201010101010", + sort="closest", + limit="1", + ) + snapshots = cdx.snapshots() + for snapshot in snapshots: + archive_url = snapshot.archive_url + timestamp = snapshot.timestamp + break + + assert str(archive_url).find("google.com") + assert "20101010" in timestamp + + +def test_d() -> None: + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) + + cdx = WaybackMachineCDXServerAPI( + url="akamhy.github.io", + user_agent=user_agent, + match_type="prefix", + use_pagination=True, + filters=["statuscode:200"], + ) + snapshots = cdx.snapshots() + + count = 0 + for snapshot in 
snapshots: + count += 1 + assert str(snapshot.archive_url).find("akamhy.github.io") + assert count > 50 diff --git a/tests/test_save_api.py b/tests/test_save_api.py index ab2e4fc..a6ff0b1 100644 --- a/tests/test_save_api.py +++ b/tests/test_save_api.py @@ -219,4 +219,5 @@ def test_archive_url() -> None: save_api.saved_archive = ( "https://web.archive.org/web/20220124063056/https://example.com/" ) + save_api._archive_url = save_api.saved_archive assert save_api.archive_url == save_api.saved_archive diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py index 02f886d..0e5f824 100644 --- a/tests/test_wrapper.py +++ b/tests/test_wrapper.py @@ -35,4 +35,11 @@ def test_total_archives() -> None: def test_known_urls() -> None: wayback = Url("akamhy.github.io") - assert len(list(wayback.known_urls())) > 40 + assert len(list(wayback.known_urls(subdomain=True))) > 40 + + +def test_Save() -> None: + wayback = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property") + wayback.save() + archive_url = str(wayback.archive_url) + assert archive_url.find("Asymptotic_equipartition_property") != -1 diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index db02bf5..7e24db3 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -50,6 +50,8 @@ class WaybackMachineCDXServerAPI: collapses: Optional[List[str]] = None, limit: Optional[str] = None, max_tries: int = 3, + use_pagination: bool = False, + closest: Optional[str] = None, ) -> None: self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent @@ -66,60 +68,58 @@ class WaybackMachineCDXServerAPI: check_collapses(self.collapses) self.limit = 25000 if limit is None else limit self.max_tries = max_tries + self.use_pagination = use_pagination + self.closest = None if closest is None else str(closest) self.last_api_request_url: Optional[str] = None - self.use_page = False self.endpoint = "https://web.archive.org/cdx/search/cdx" def cdx_api_manager( - self, payload: Dict[str, str], headers: 
Dict[str, str], use_page: bool = False + self, payload: Dict[str, str], headers: Dict[str, str] ) -> Generator[str, None, None]: """ - Manages the API calls for the instance, it automatically selects the best - parameters by looking as the query of the end-user. For bigger queries - automatically use the CDX pagination API and for smaller queries use the - normal API. - - CDX Server API is a complex API and to make it easy for the end user to - consume it the CDX manager(this method) handles the selection of the - API output, whether to use the pagination API or not. - - For doing large/bulk queries, the use of the Pagination API is - recommended by the Wayback Machine authors. And it determines if the - query would be large or not by using the showNumPages=true parameter, - this tells the number of pages of CDX DATA that the pagination API - will return. - - If the number of page is less than 2 we use the normal non-pagination - API as the pagination API is known to lag and for big queries it should - not matter but for queries where the number of pages are less this - method chooses accuracy over the pagination API. + This method uses the pagination API of the CDX server if + use_pagination attribute is True else uses the standard + CDX server response data. """ - # number of pages that will returned by the pagination API. - # get_total_pages adds the showNumPages=true param to pagination API - # requests. - # This is a special query that will return a single number indicating - # the number of pages. - total_pages = get_total_pages(self.url, self.user_agent) - if use_page is True and total_pages >= 2: - blank_pages = 0 + # When using the pagination API of the CDX server. 
+ if self.use_pagination is True: + + total_pages = get_total_pages(self.url, self.user_agent) + successive_blank_pages = 0 + for i in range(total_pages): payload["page"] = str(i) url = full_url(self.endpoint, params=payload) res = get_response(url, headers=headers) + if isinstance(res, Exception): raise res self.last_api_request_url = url text = res.text - if len(text) == 0: - blank_pages += 1 - if blank_pages >= 2: + # Reset the counter if the last page was blank + # but the current page is not. + if successive_blank_pages == 1: + if len(text) != 0: + successive_blank_pages = 0 + + # Increase the succesive page counter on encountering + # blank page. + if len(text) == 0: + successive_blank_pages += 1 + + # If two succesive pages are blank + # then we don't have any more pages left to + # iterate. + if successive_blank_pages >= 2: break yield text + + # When not using the pagination API of the CDX server else: payload["showResumeKey"] = "true" payload["limit"] = str(self.limit) @@ -166,6 +166,9 @@ class WaybackMachineCDXServerAPI: if self.gzip is None: payload["gzip"] = "false" + if self.closest: + payload["closest"] = self.closest + if self.match_type: payload["matchType"] = self.match_type @@ -206,13 +209,7 @@ class WaybackMachineCDXServerAPI: self.add_payload(payload) - if not self.start_timestamp or self.end_timestamp: - self.use_page = True - - if self.collapses != []: - self.use_page = False - - entries = self.cdx_api_manager(payload, headers, use_page=self.use_page) + entries = self.cdx_api_manager(payload, headers) for entry in entries: diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py index 79d222e..2dc291d 100644 --- a/waybackpy/cdx_utils.py +++ b/waybackpy/cdx_utils.py @@ -13,7 +13,7 @@ import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from .exceptions import WaybackError +from .exceptions import BlockedSiteError, WaybackError from .utils import DEFAULT_USER_AGENT @@ -28,12 +28,38 @@ def 
get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int: headers = {"User-Agent": user_agent} request_url = full_url(endpoint, params=payload) response = get_response(request_url, headers=headers) - + check_for_blocked_site(response, url) if isinstance(response, requests.Response): return int(response.text.strip()) raise response +def check_for_blocked_site( + response: Union[requests.Response, Exception], url: Optional[str] = None +) -> None: + """ + Checks that the URL can be archived by wayback machine or not. + robots.txt policy of the site may prevent the wayback machine. + """ + # see https://github.com/akamhy/waybackpy/issues/157 + + # the following if block is to make mypy happy. + if isinstance(response, Exception): + raise response + + if not url: + url = "The requested content" + if ( + "org.archive.util.io.RuntimeIOException: " + + "org.archive.wayback.exception.AdministrativeAccessControlException: " + + "Blocked Site Error" + in response.text.strip() + ): + raise BlockedSiteError( + f"{url} is excluded from Wayback Machine by the site's robots.txt policy." 
+ ) + + def full_url(endpoint: str, params: Dict[str, Any]) -> str: """ As the function's name already implies that it returns @@ -76,6 +102,7 @@ def get_response( session.mount("https://", HTTPAdapter(max_retries=retries_)) response = session.get(url, headers=headers) session.close() + check_for_blocked_site(response) return response diff --git a/waybackpy/cli.py b/waybackpy/cli.py index f8eb424..c805243 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -63,6 +63,9 @@ def handle_cdx(data: List[Any]) -> None: limit = data[7] gzip = data[8] match_type = data[9] + sort = data[10] + use_pagination = data[11] + closest = data[12] filters = list(cdx_filter) collapses = list(collapse) @@ -73,8 +76,11 @@ def handle_cdx(data: List[Any]) -> None: user_agent=user_agent, start_timestamp=start_timestamp, end_timestamp=end_timestamp, + closest=closest, filters=filters, match_type=match_type, + sort=sort, + use_pagination=use_pagination, gzip=gzip, collapses=collapses, limit=limit, @@ -249,7 +255,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: help="Use with '--known_urls' to save the URLs in file at current directory.", ) @click.option( - "-c", "--cdx", default=False, is_flag=True, @@ -269,6 +274,12 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: "--to", help="End timestamp for CDX API in yyyyMMddhhmmss format.", ) +@click.option( + "-C", + "--closest", + help="Archive that are closest the timestamp passed as arguments to this " + + "parameter.", +) @click.option( "-f", "--cdx-filter", @@ -285,6 +296,20 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: + "However, the CDX server can also return results matching a certain prefix, " + "a certain host, or all sub-hosts by using the match_type", ) +@click.option( + "-st", + "--sort", + help="Choose one from default, closest or reverse. 
It returns sorted CDX entries " + + "in the response.", +) +@click.option( + "-up", + "--use-pagination", + "--use_pagination", + default=False, + is_flag=True, + help="Use the pagination API of the CDX server instead of the default one.", +) @click.option( "-gz", "--gzip", @@ -326,6 +351,7 @@ def main( # pylint: disable=no-value-for-parameter subdomain: bool, file: bool, cdx: bool, + use_pagination: bool, cdx_filter: List[str], collapse: List[str], cdx_print: List[str], @@ -337,7 +363,9 @@ def main( # pylint: disable=no-value-for-parameter minute: Optional[int] = None, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None, + closest: Optional[str] = None, match_type: Optional[str] = None, + sort: Optional[str] = None, gzip: Optional[str] = None, limit: Optional[str] = None, ) -> None: @@ -428,6 +456,9 @@ def main( # pylint: disable=no-value-for-parameter limit, gzip, match_type, + sort, + use_pagination, + closest, ] handle_cdx(data) diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 3e8d347..fb6ad86 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -16,6 +16,13 @@ class WaybackError(Exception): """ +class BlockedSiteError(WaybackError): + """ + Raised when the archives for website/URLs that was excluded from Wayback + Machine are requested via the CDX server API. + """ + + class TooManyRequestsError(WaybackError): """ Raised when you make more than 15 requests per