From f990b93f8a262f1b045c4ba09326693dde515ba7 Mon Sep 17 00:00:00 2001
From: Akash Mahanty <akamhy@yahoo.com>
Date: Fri, 18 Feb 2022 00:24:14 +0530
Subject: [PATCH] Add sort, use_pagination and closest (#158)

* add sort param support in CDX API class

see https://nla.github.io/outbackcdx/api.html#operation/query

sort takes string input which must be one of the following:
- default
- closest
- reverse

This commit shall help in closing issue at https://github.com/akamhy/waybackpy/issues/155

* add BlockedSiteError for cases when archiving is blocked by site's robots.txt

* create check_for_blocked_site for handling the BlockedSiteError for sites that are blocking wayback machine by their robots.txt policy

* add attrs use_pagination and closest, which can be used to use the pagination API and to look up the archive closest to a timestamp, respectively. Also, to get out of an infinite loop of blank pages, now check for two successive blank pages rather than two blank pages in total while using the CDX server API.

* added cli support for sort, use-pagination and closest

* added tests

* fix codeql warnings, nothing to worry about here.

* fix save test for archive_url
---
 tests/test_cdx_api.py   | 52 +++++++++++++++++++++++++++-
 tests/test_save_api.py  |  1 +
 tests/test_wrapper.py   |  9 ++++-
 waybackpy/cdx_api.py    | 75 ++++++++++++++++++++---------------------
 waybackpy/cdx_utils.py  | 31 +++++++++++++++--
 waybackpy/cli.py        | 33 +++++++++++++++++-
 waybackpy/exceptions.py |  7 ++++
 7 files changed, 164 insertions(+), 44 deletions(-)

diff --git a/tests/test_cdx_api.py b/tests/test_cdx_api.py
index 410a318..b7f28c2 100644
--- a/tests/test_cdx_api.py
+++ b/tests/test_cdx_api.py
@@ -32,7 +32,11 @@ def test_b() -> None:
     url = "https://www.google.com"
 
     wayback = WaybackMachineCDXServerAPI(
-        url=url, user_agent=user_agent, start_timestamp="202101", end_timestamp="202112"
+        url=url,
+        user_agent=user_agent,
+        start_timestamp="202101",
+        end_timestamp="202112",
+        collapses=["urlkey"],
     )
     #  timeframe bound prefix matching enabled along with active urlkey based collapsing
 
@@ -40,3 +44,49 @@ def test_b() -> None:
 
     for snapshot in snapshots:
         assert snapshot.timestamp.startswith("2021")
+
+
+def test_c() -> None:
+    """Closest lookup must yield a google.com capture dated 2010-10-10."""
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+    url = "https://www.google.com"
+
+    cdx = WaybackMachineCDXServerAPI(
+        url=url,
+        user_agent=user_agent,
+        closest="201010101010",
+        sort="closest",
+        limit="1",
+    )
+    # Only the first (nearest) snapshot matters; the default of None makes an
+    # empty result fail the assertion below instead of raising NameError.
+    snapshot = next(iter(cdx.snapshots()), None)
+
+    assert snapshot is not None
+    assert "google.com" in str(snapshot.archive_url)
+    assert "20101010" in snapshot.timestamp
+
+
+def test_d() -> None:
+    """Paginated prefix query must yield more than 50 akamhy.github.io captures."""
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+
+    cdx = WaybackMachineCDXServerAPI(
+        url="akamhy.github.io",
+        user_agent=user_agent,
+        match_type="prefix",
+        use_pagination=True,
+        filters=["statuscode:200"],
+    )
+
+    count = 0
+    for snapshot in cdx.snapshots():
+        count += 1
+        assert "akamhy.github.io" in str(snapshot.archive_url)
+    assert count > 50
diff --git a/tests/test_save_api.py b/tests/test_save_api.py
index ab2e4fc..a6ff0b1 100644
--- a/tests/test_save_api.py
+++ b/tests/test_save_api.py
@@ -219,4 +219,5 @@ def test_archive_url() -> None:
     save_api.saved_archive = (
         "https://web.archive.org/web/20220124063056/https://example.com/"
     )
+    save_api._archive_url = save_api.saved_archive
     assert save_api.archive_url == save_api.saved_archive
diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py
index 02f886d..0e5f824 100644
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@@ -35,4 +35,11 @@ def test_total_archives() -> None:
 
 def test_known_urls() -> None:
     wayback = Url("akamhy.github.io")
-    assert len(list(wayback.known_urls())) > 40
+    assert len(list(wayback.known_urls(subdomain=True))) > 40
+
+
+def test_Save() -> None:
+    """After save(), the archive URL must contain the saved page's name."""
+    page = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property")
+    page.save()
+    assert "Asymptotic_equipartition_property" in str(page.archive_url)
diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py
index db02bf5..7e24db3 100644
--- a/waybackpy/cdx_api.py
+++ b/waybackpy/cdx_api.py
@@ -50,6 +50,8 @@ class WaybackMachineCDXServerAPI:
         collapses: Optional[List[str]] = None,
         limit: Optional[str] = None,
         max_tries: int = 3,
+        use_pagination: bool = False,
+        closest: Optional[str] = None,
     ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.user_agent = user_agent
@@ -66,60 +68,58 @@ class WaybackMachineCDXServerAPI:
         check_collapses(self.collapses)
         self.limit = 25000 if limit is None else limit
         self.max_tries = max_tries
+        self.use_pagination = use_pagination
+        self.closest = None if closest is None else str(closest)
         self.last_api_request_url: Optional[str] = None
-        self.use_page = False
         self.endpoint = "https://web.archive.org/cdx/search/cdx"
 
     def cdx_api_manager(
-        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
+        self, payload: Dict[str, str], headers: Dict[str, str]
     ) -> Generator[str, None, None]:
         """
-        Manages the API calls for the instance, it automatically selects the best
-        parameters by looking as the query of the end-user. For bigger queries
-        automatically use the CDX pagination API and for smaller queries use the
-        normal API.
-
-        CDX Server API is a complex API and to make it easy for the end user to
-        consume it the CDX manager(this method) handles the selection of the
-        API output, whether to use the pagination API or not.
-
-        For doing large/bulk queries, the use of the Pagination API is
-        recommended by the Wayback Machine authors. And it determines if the
-        query would be large or not by using the showNumPages=true parameter,
-        this tells the number of pages of CDX DATA that the pagination API
-        will return.
-
-        If the number of page is less than 2 we use the normal non-pagination
-        API as the pagination API is known to lag and for big queries it should
-        not matter but for queries where the number of pages are less this
-        method chooses accuracy over the pagination API.
+        This method uses the pagination API of the CDX server if
+        use_pagination attribute is True else uses the standard
+        CDX server response data.
         """
-        # number of pages that will returned by the pagination API.
-        # get_total_pages adds the showNumPages=true param to pagination API
-        # requests.
-        # This is a special query that will return a single number indicating
-        # the number of pages.
-        total_pages = get_total_pages(self.url, self.user_agent)
 
-        if use_page is True and total_pages >= 2:
-            blank_pages = 0
+        # When using the pagination API of the CDX server.
+        if self.use_pagination is True:
+
+            total_pages = get_total_pages(self.url, self.user_agent)
+            successive_blank_pages = 0
+
             for i in range(total_pages):
                 payload["page"] = str(i)
 
                 url = full_url(self.endpoint, params=payload)
                 res = get_response(url, headers=headers)
+
                 if isinstance(res, Exception):
                     raise res
 
                 self.last_api_request_url = url
                 text = res.text
-                if len(text) == 0:
-                    blank_pages += 1
 
-                if blank_pages >= 2:
+                # Reset the counter if the last page was blank
+                # but the current page is not.
+                if successive_blank_pages == 1:
+                    if len(text) != 0:
+                        successive_blank_pages = 0
+
+                # Increase the successive blank page counter on encountering
+                # a blank page.
+                if len(text) == 0:
+                    successive_blank_pages += 1
+
+                # If two successive pages are blank
+                # then we don't have any more pages left to
+                # iterate.
+                if successive_blank_pages >= 2:
                     break
 
                 yield text
+
+        # When not using the pagination API of the CDX server
         else:
             payload["showResumeKey"] = "true"
             payload["limit"] = str(self.limit)
@@ -166,6 +166,9 @@ class WaybackMachineCDXServerAPI:
         if self.gzip is None:
             payload["gzip"] = "false"
 
+        if self.closest:
+            payload["closest"] = self.closest
+
         if self.match_type:
             payload["matchType"] = self.match_type
 
@@ -206,13 +209,7 @@ class WaybackMachineCDXServerAPI:
 
         self.add_payload(payload)
 
-        if not self.start_timestamp or self.end_timestamp:
-            self.use_page = True
-
-        if self.collapses != []:
-            self.use_page = False
-
-        entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
+        entries = self.cdx_api_manager(payload, headers)
 
         for entry in entries:
 
diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py
index 79d222e..2dc291d 100644
--- a/waybackpy/cdx_utils.py
+++ b/waybackpy/cdx_utils.py
@@ -13,7 +13,7 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
-from .exceptions import WaybackError
+from .exceptions import BlockedSiteError, WaybackError
 from .utils import DEFAULT_USER_AGENT
 
 
@@ -28,12 +28,38 @@ def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
     headers = {"User-Agent": user_agent}
     request_url = full_url(endpoint, params=payload)
     response = get_response(request_url, headers=headers)
-
+    check_for_blocked_site(response, url)
     if isinstance(response, requests.Response):
         return int(response.text.strip())
     raise response
 
 
+def check_for_blocked_site(
+    response: Union[requests.Response, Exception], url: Optional[str] = None
+) -> None:
+    """
+    Raise BlockedSiteError when the response text carries the Wayback
+    "Blocked Site Error" marker; an Exception passed in is re-raised.
+    """
+    # see https://github.com/akamhy/waybackpy/issues/157
+
+    # Narrowing for mypy: past this point response is a requests.Response.
+    if isinstance(response, Exception):
+        raise response
+
+    blocked_marker = (
+        "org.archive.util.io.RuntimeIOException: "
+        "org.archive.wayback.exception.AdministrativeAccessControlException: "
+        "Blocked Site Error"
+    )
+    if blocked_marker not in response.text.strip():
+        return
+    subject = url if url else "The requested content"
+    raise BlockedSiteError(
+        f"{subject} is excluded from Wayback Machine by the site's robots.txt policy."
+    )
+
+
 def full_url(endpoint: str, params: Dict[str, Any]) -> str:
     """
     As the function's name already implies that it returns
@@ -76,6 +102,7 @@ def get_response(
     session.mount("https://", HTTPAdapter(max_retries=retries_))
     response = session.get(url, headers=headers)
     session.close()
+    check_for_blocked_site(response)
     return response
 
 
diff --git a/waybackpy/cli.py b/waybackpy/cli.py
index f8eb424..c805243 100644
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -63,6 +63,9 @@ def handle_cdx(data: List[Any]) -> None:
     limit = data[7]
     gzip = data[8]
     match_type = data[9]
+    sort = data[10]
+    use_pagination = data[11]
+    closest = data[12]
 
     filters = list(cdx_filter)
     collapses = list(collapse)
@@ -73,8 +76,11 @@ def handle_cdx(data: List[Any]) -> None:
         user_agent=user_agent,
         start_timestamp=start_timestamp,
         end_timestamp=end_timestamp,
+        closest=closest,
         filters=filters,
         match_type=match_type,
+        sort=sort,
+        use_pagination=use_pagination,
         gzip=gzip,
         collapses=collapses,
         limit=limit,
@@ -249,7 +255,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     help="Use with '--known_urls' to save the URLs in file at current directory.",
 )
 @click.option(
-    "-c",
     "--cdx",
     default=False,
     is_flag=True,
@@ -269,6 +274,12 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     "--to",
     help="End timestamp for CDX API in yyyyMMddhhmmss format.",
 )
+@click.option(
+    "-C",
+    "--closest",
+    help="Archive that are closest the timestamp passed as arguments to this "
+    + "parameter.",
+)
 @click.option(
     "-f",
     "--cdx-filter",
@@ -285,6 +296,20 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     + "However, the CDX server can also return results matching a certain prefix, "
     + "a certain host, or all sub-hosts by using the match_type",
 )
+@click.option(
+    "-st",
+    "--sort",
+    help="Choose one from default, closest or reverse. It returns sorted CDX entries "
+    + "in the response.",
+)
+@click.option(
+    "-up",
+    "--use-pagination",
+    "--use_pagination",
+    default=False,
+    is_flag=True,
+    help="Use the pagination API of the CDX server instead of the default one.",
+)
 @click.option(
     "-gz",
     "--gzip",
@@ -326,6 +351,7 @@ def main(  # pylint: disable=no-value-for-parameter
     subdomain: bool,
     file: bool,
     cdx: bool,
+    use_pagination: bool,
     cdx_filter: List[str],
     collapse: List[str],
     cdx_print: List[str],
@@ -337,7 +363,9 @@ def main(  # pylint: disable=no-value-for-parameter
     minute: Optional[int] = None,
     start_timestamp: Optional[str] = None,
     end_timestamp: Optional[str] = None,
+    closest: Optional[str] = None,
     match_type: Optional[str] = None,
+    sort: Optional[str] = None,
     gzip: Optional[str] = None,
     limit: Optional[str] = None,
 ) -> None:
@@ -428,6 +456,9 @@ def main(  # pylint: disable=no-value-for-parameter
             limit,
             gzip,
             match_type,
+            sort,
+            use_pagination,
+            closest,
         ]
         handle_cdx(data)
 
diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py
index 3e8d347..fb6ad86 100644
--- a/waybackpy/exceptions.py
+++ b/waybackpy/exceptions.py
@@ -16,6 +16,13 @@ class WaybackError(Exception):
     """
 
 
+class BlockedSiteError(WaybackError):
+    """
+    Raised when archives are requested via the CDX server API for a
+    website/URL that has been excluded from the Wayback Machine.
+    """
+
+
 class TooManyRequestsError(WaybackError):
     """
     Raised when you make more than 15 requests per