Add sort, use_pagination and closest (#158)

* add sort param support in the CDX API class (see https://nla.github.io/outbackcdx/api.html#operation/query); sort takes a string input which must be one of the following: - default - closest - reverse. This commit shall help in closing the issue at https://github.com/akamhy/waybackpy/issues/155 * add BlockedSiteError for cases when archiving is blocked by the site's robots.txt * create check_for_blocked_site for handling the BlockedSiteError for sites that are blocking the Wayback Machine by their robots.txt policy * add attrs use_pagination and closest, which can be used to use the pagination API and to look up the archive closest to a timestamp, respectively. Also, to get out of the infinite blank-pages loop, now check for two successive blank pages rather than two blank pages in total while using the CDX server API. * added cli support for sort, use-pagination and closest * added tests * fix codeql warnings, nothing to worry about here. * fix save test for archive_url
2022-02-18 00:24:14 +05:30 · 2022-02-18 00:24:14 +05:30 · f990b93f8a
commit f990b93f8a
parent 3a44a710d3
7 changed files with 164 additions and 44 deletions
--- a/tests/test_cdx_api.py
+++ b/tests/test_cdx_api.py
@ -32,7 +32,11 @@ def test_b() -> None:
    url = "https://www.google.com"

    wayback = WaybackMachineCDXServerAPI(
-        url=url, user_agent=user_agent, start_timestamp="202101", end_timestamp="202112"
+        url=url,
+        user_agent=user_agent,
+        start_timestamp="202101",
+        end_timestamp="202112",
+        collapses=["urlkey"],
    )
    #  timeframe bound prefix matching enabled along with active urlkey based collapsing

@ -40,3 +44,49 @@ def test_b() -> None:

    for snapshot in snapshots:
        assert snapshot.timestamp.startswith("2021")
+
+
+def test_c() -> None:
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+    url = "https://www.google.com"
+
+    cdx = WaybackMachineCDXServerAPI(
+        url=url,
+        user_agent=user_agent,
+        closest="201010101010",
+        sort="closest",
+        limit="1",
+    )
+    snapshots = cdx.snapshots()
+    for snapshot in snapshots:
+        archive_url = snapshot.archive_url
+        timestamp = snapshot.timestamp
+        break
+
+    assert str(archive_url).find("google.com")
+    assert "20101010" in timestamp
+
+
+def test_d() -> None:
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+
+    cdx = WaybackMachineCDXServerAPI(
+        url="akamhy.github.io",
+        user_agent=user_agent,
+        match_type="prefix",
+        use_pagination=True,
+        filters=["statuscode:200"],
+    )
+    snapshots = cdx.snapshots()
+
+    count = 0
+    for snapshot in snapshots:
+        count += 1
+        assert str(snapshot.archive_url).find("akamhy.github.io")
+    assert count > 50
--- a/tests/test_save_api.py
+++ b/tests/test_save_api.py
@ -219,4 +219,5 @@ def test_archive_url() -> None:
    save_api.saved_archive = (
        "https://web.archive.org/web/20220124063056/https://example.com/"
    )
+    save_api._archive_url = save_api.saved_archive
    assert save_api.archive_url == save_api.saved_archive
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@ -35,4 +35,11 @@ def test_total_archives() -> None:

 def test_known_urls() -> None:
    wayback = Url("akamhy.github.io")
-    assert len(list(wayback.known_urls())) > 40
+    assert len(list(wayback.known_urls(subdomain=True))) > 40
+
+
+def test_Save() -> None:
+    wayback = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property")
+    wayback.save()
+    archive_url = str(wayback.archive_url)
+    assert archive_url.find("Asymptotic_equipartition_property") != -1
--- a/waybackpy/cdx_api.py
+++ b/waybackpy/cdx_api.py
@ -50,6 +50,8 @@ class WaybackMachineCDXServerAPI:
        collapses: Optional[List[str]] = None,
        limit: Optional[str] = None,
        max_tries: int = 3,
+        use_pagination: bool = False,
+        closest: Optional[str] = None,
    ) -> None:
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
@ -66,60 +68,58 @@ class WaybackMachineCDXServerAPI:
        check_collapses(self.collapses)
        self.limit = 25000 if limit is None else limit
        self.max_tries = max_tries
+        self.use_pagination = use_pagination
+        self.closest = None if closest is None else str(closest)
        self.last_api_request_url: Optional[str] = None
-        self.use_page = False
        self.endpoint = "https://web.archive.org/cdx/search/cdx"

    def cdx_api_manager(
-        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
+        self, payload: Dict[str, str], headers: Dict[str, str]
    ) -> Generator[str, None, None]:
        """
-        Manages the API calls for the instance, it automatically selects the best
-        parameters by looking as the query of the end-user. For bigger queries
-        automatically use the CDX pagination API and for smaller queries use the
-        normal API.
-
-        CDX Server API is a complex API and to make it easy for the end user to
-        consume it the CDX manager(this method) handles the selection of the
-        API output, whether to use the pagination API or not.
-
-        For doing large/bulk queries, the use of the Pagination API is
-        recommended by the Wayback Machine authors. And it determines if the
-        query would be large or not by using the showNumPages=true parameter,
-        this tells the number of pages of CDX DATA that the pagination API
-        will return.
-
-        If the number of page is less than 2 we use the normal non-pagination
-        API as the pagination API is known to lag and for big queries it should
-        not matter but for queries where the number of pages are less this
-        method chooses accuracy over the pagination API.
+        This method uses the pagination API of the CDX server if
+        use_pagination attribute is True else uses the standard
+        CDX server response data.
        """
-        # number of pages that will returned by the pagination API.
-        # get_total_pages adds the showNumPages=true param to pagination API
-        # requests.
-        # This is a special query that will return a single number indicating
-        # the number of pages.
-        total_pages = get_total_pages(self.url, self.user_agent)

-        if use_page is True and total_pages >= 2:
-            blank_pages = 0
+        # When using the pagination API of the CDX server.
+        if self.use_pagination is True:
+
+            total_pages = get_total_pages(self.url, self.user_agent)
+            successive_blank_pages = 0
+
            for i in range(total_pages):
                payload["page"] = str(i)

                url = full_url(self.endpoint, params=payload)
                res = get_response(url, headers=headers)
+
                if isinstance(res, Exception):
                    raise res

                self.last_api_request_url = url
                text = res.text
-                if len(text) == 0:
-                    blank_pages += 1

-                if blank_pages >= 2:
+                # Reset the counter if the last page was blank
+                # but the current page is not.
+                if successive_blank_pages == 1:
+                    if len(text) != 0:
+                        successive_blank_pages = 0
+
+                # Increase the successive blank page counter on
+                # encountering a blank page.
+                if len(text) == 0:
+                    successive_blank_pages += 1
+
+                # If two successive pages are blank
+                # then we don't have any more pages left to
+                # iterate.
+                if successive_blank_pages >= 2:
                    break

                yield text
+
+        # When not using the pagination API of the CDX server
        else:
            payload["showResumeKey"] = "true"
            payload["limit"] = str(self.limit)
@ -166,6 +166,9 @@ class WaybackMachineCDXServerAPI:
        if self.gzip is None:
            payload["gzip"] = "false"

+        if self.closest:
+            payload["closest"] = self.closest
+
        if self.match_type:
            payload["matchType"] = self.match_type

@ -206,13 +209,7 @@ class WaybackMachineCDXServerAPI:

        self.add_payload(payload)

-        if not self.start_timestamp or self.end_timestamp:
-            self.use_page = True
-
-        if self.collapses != []:
-            self.use_page = False
-
-        entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
+        entries = self.cdx_api_manager(payload, headers)

        for entry in entries:

--- a/waybackpy/cdx_utils.py
+++ b/waybackpy/cdx_utils.py
@ -13,7 +13,7 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry

-from .exceptions import WaybackError
+from .exceptions import BlockedSiteError, WaybackError
 from .utils import DEFAULT_USER_AGENT


@ -28,12 +28,38 @@ def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
    headers = {"User-Agent": user_agent}
    request_url = full_url(endpoint, params=payload)
    response = get_response(request_url, headers=headers)
-
+    check_for_blocked_site(response, url)
    if isinstance(response, requests.Response):
        return int(response.text.strip())
    raise response


+def check_for_blocked_site(
+    response: Union[requests.Response, Exception], url: Optional[str] = None
+) -> None:
+    """
+    Checks whether the response reports that the URL is excluded from the
+    Wayback Machine; the robots.txt policy of the site may block archiving.
+    """
+    # see https://github.com/akamhy/waybackpy/issues/157
+
+    # the following if block is to make mypy happy.
+    if isinstance(response, Exception):
+        raise response
+
+    if not url:
+        url = "The requested content"
+    if (
+        "org.archive.util.io.RuntimeIOException: "
+        + "org.archive.wayback.exception.AdministrativeAccessControlException: "
+        + "Blocked Site Error"
+        in response.text.strip()
+    ):
+        raise BlockedSiteError(
+            f"{url} is excluded from Wayback Machine by the site's robots.txt policy."
+        )
+
+
 def full_url(endpoint: str, params: Dict[str, Any]) -> str:
    """
    As the function's name already implies that it returns
@ -76,6 +102,7 @@ def get_response(
    session.mount("https://", HTTPAdapter(max_retries=retries_))
    response = session.get(url, headers=headers)
    session.close()
+    check_for_blocked_site(response)
    return response


--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@ -63,6 +63,9 @@ def handle_cdx(data: List[Any]) -> None:
    limit = data[7]
    gzip = data[8]
    match_type = data[9]
+    sort = data[10]
+    use_pagination = data[11]
+    closest = data[12]

    filters = list(cdx_filter)
    collapses = list(collapse)
@ -73,8 +76,11 @@ def handle_cdx(data: List[Any]) -> None:
        user_agent=user_agent,
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
+        closest=closest,
        filters=filters,
        match_type=match_type,
+        sort=sort,
+        use_pagination=use_pagination,
        gzip=gzip,
        collapses=collapses,
        limit=limit,
@ -249,7 +255,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
    help="Use with '--known_urls' to save the URLs in file at current directory.",
 )
@click.option(
-    "-c",
    "--cdx",
    default=False,
    is_flag=True,
@ -269,6 +274,12 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
    "--to",
    help="End timestamp for CDX API in yyyyMMddhhmmss format.",
 )
+@click.option(
+    "-C",
+    "--closest",
+    help="Archive that are closest the timestamp passed as arguments to this "
+    + "parameter.",
+)
@click.option(
    "-f",
    "--cdx-filter",
@ -285,6 +296,20 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
    + "However, the CDX server can also return results matching a certain prefix, "
    + "a certain host, or all sub-hosts by using the match_type",
 )
+@click.option(
+    "-st",
+    "--sort",
+    help="Choose one from default, closest or reverse. It returns sorted CDX entries "
+    + "in the response.",
+)
+@click.option(
+    "-up",
+    "--use-pagination",
+    "--use_pagination",
+    default=False,
+    is_flag=True,
+    help="Use the pagination API of the CDX server instead of the default one.",
+)
@click.option(
    "-gz",
    "--gzip",
@ -326,6 +351,7 @@ def main(  # pylint: disable=no-value-for-parameter
    subdomain: bool,
    file: bool,
    cdx: bool,
+    use_pagination: bool,
    cdx_filter: List[str],
    collapse: List[str],
    cdx_print: List[str],
@ -337,7 +363,9 @@ def main(  # pylint: disable=no-value-for-parameter
    minute: Optional[int] = None,
    start_timestamp: Optional[str] = None,
    end_timestamp: Optional[str] = None,
+    closest: Optional[str] = None,
    match_type: Optional[str] = None,
+    sort: Optional[str] = None,
    gzip: Optional[str] = None,
    limit: Optional[str] = None,
 ) -> None:
@ -428,6 +456,9 @@ def main(  # pylint: disable=no-value-for-parameter
            limit,
            gzip,
            match_type,
+            sort,
+            use_pagination,
+            closest,
        ]
        handle_cdx(data)

--- a/waybackpy/exceptions.py
+++ b/waybackpy/exceptions.py
@ -16,6 +16,13 @@ class WaybackError(Exception):
    """


+class BlockedSiteError(WaybackError):
+    """
+    Raised when archives of a website/URL that was excluded from the Wayback
+    Machine are requested via the CDX server API.
+    """
+
+
 class TooManyRequestsError(WaybackError):
    """
    Raised when you make more than 15 requests per