Add sort, use_pagination and closest (#158)
* add sort param support in the CDX API class, see
  https://nla.github.io/outbackcdx/api.html#operation/query. sort takes a
  string input which must be one of the following: default, closest,
  reverse. This commit shall help in closing the issue at
  https://github.com/akamhy/waybackpy/issues/155
* add BlockedSiteError for cases when archiving is blocked by the site's
  robots.txt
* create check_for_blocked_site for handling the BlockedSiteError for sites
  that block the Wayback Machine via their robots.txt policy
* add attrs use_pagination and closest, which can be used to use the
  pagination API and to look up archives close to a timestamp, respectively.
  And to get out of the infinite blank-pages loop while using the CDX server
  API, now check for two successive blank pages rather than two blank pages
  in total
* added CLI support for sort, use-pagination and closest
* added tests
* fix CodeQL warnings, nothing to worry about here
* fix save test for archive_url
parent 3a44a710d3
commit f990b93f8a
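
Taken together, the new surface area looks like this; a minimal sketch based
on the tests added in this commit (the package-level import, user agent and
timestamp values are illustrative placeholders):

from waybackpy import WaybackMachineCDXServerAPI

cdx = WaybackMachineCDXServerAPI(
    url="https://www.google.com",
    user_agent="my-agent/1.0",   # any descriptive user agent string
    closest="201010101010",      # yyyyMMddhhmmss timestamp to anchor the lookup
    sort="closest",              # one of: default, closest, reverse
    use_pagination=False,        # True switches to the CDX pagination API
)
for snapshot in cdx.snapshots():
    print(snapshot.archive_url)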
tests/test_cdx_api.py

@@ -32,7 +32,11 @@ def test_b() -> None:
     url = "https://www.google.com"

     wayback = WaybackMachineCDXServerAPI(
-        url=url, user_agent=user_agent, start_timestamp="202101", end_timestamp="202112"
+        url=url,
+        user_agent=user_agent,
+        start_timestamp="202101",
+        end_timestamp="202112",
+        collapses=["urlkey"],
     )
     # timeframe bound prefix matching enabled along with active urlkey based collapsing

@@ -40,3 +44,49 @@ def test_b() -> None:

     for snapshot in snapshots:
         assert snapshot.timestamp.startswith("2021")
+
+
+def test_c() -> None:
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+    url = "https://www.google.com"
+
+    cdx = WaybackMachineCDXServerAPI(
+        url=url,
+        user_agent=user_agent,
+        closest="201010101010",
+        sort="closest",
+        limit="1",
+    )
+    snapshots = cdx.snapshots()
+    for snapshot in snapshots:
+        archive_url = snapshot.archive_url
+        timestamp = snapshot.timestamp
+        break
+
+    assert str(archive_url).find("google.com")
+    assert "20101010" in timestamp
+
+
+def test_d() -> None:
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+
+    cdx = WaybackMachineCDXServerAPI(
+        url="akamhy.github.io",
+        user_agent=user_agent,
+        match_type="prefix",
+        use_pagination=True,
+        filters=["statuscode:200"],
+    )
+    snapshots = cdx.snapshots()
+
+    count = 0
+    for snapshot in snapshots:
+        count += 1
+        assert str(snapshot.archive_url).find("akamhy.github.io")
+    assert count > 50
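For test_c above, the new keyword arguments translate into CDX query
parameters. A rough sketch of the request the library ends up building via
full_url (parameter order and any extra defaults are illustrative):

from urllib.parse import urlencode

# Hypothetical reconstruction of the query string implied by test_c.
endpoint = "https://web.archive.org/cdx/search/cdx"
payload = {
    "url": "https://www.google.com",
    "closest": "201010101010",
    "sort": "closest",
    "limit": "1",
}
print(f"{endpoint}?{urlencode(payload)}")
# https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fwww.google.com&closest=201010101010&sort=closest&limit=1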
tests/test_save_api.py

@@ -219,4 +219,5 @@ def test_archive_url() -> None:
     save_api.saved_archive = (
         "https://web.archive.org/web/20220124063056/https://example.com/"
     )
+    save_api._archive_url = save_api.saved_archive
     assert save_api.archive_url == save_api.saved_archive
tests/test_url.py

@@ -35,4 +35,11 @@ def test_total_archives() -> None:

 def test_known_urls() -> None:
     wayback = Url("akamhy.github.io")
-    assert len(list(wayback.known_urls())) > 40
+    assert len(list(wayback.known_urls(subdomain=True))) > 40
+
+
+def test_Save() -> None:
+    wayback = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property")
+    wayback.save()
+    archive_url = str(wayback.archive_url)
+    assert archive_url.find("Asymptotic_equipartition_property") != -1
waybackpy/cdx_api.py

@@ -50,6 +50,8 @@ class WaybackMachineCDXServerAPI:
         collapses: Optional[List[str]] = None,
         limit: Optional[str] = None,
         max_tries: int = 3,
+        use_pagination: bool = False,
+        closest: Optional[str] = None,
     ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.user_agent = user_agent

@@ -66,60 +68,58 @@ class WaybackMachineCDXServerAPI:
         check_collapses(self.collapses)
         self.limit = 25000 if limit is None else limit
         self.max_tries = max_tries
+        self.use_pagination = use_pagination
+        self.closest = None if closest is None else str(closest)
         self.last_api_request_url: Optional[str] = None
-        self.use_page = False
         self.endpoint = "https://web.archive.org/cdx/search/cdx"

     def cdx_api_manager(
-        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
+        self, payload: Dict[str, str], headers: Dict[str, str]
     ) -> Generator[str, None, None]:
         """
-        Manages the API calls for the instance, it automatically selects the best
-        parameters by looking at the query of the end-user. For bigger queries
-        automatically use the CDX pagination API and for smaller queries use the
-        normal API.
-
-        CDX Server API is a complex API and to make it easy for the end user to
-        consume it the CDX manager (this method) handles the selection of the
-        API output, whether to use the pagination API or not.
-
-        For doing large/bulk queries, the use of the Pagination API is
-        recommended by the Wayback Machine authors. And it determines if the
-        query would be large or not by using the showNumPages=true parameter;
-        this tells the number of pages of CDX data that the pagination API
-        will return.
-
-        If the number of pages is less than 2 we use the normal non-pagination
-        API as the pagination API is known to lag, and for big queries it should
-        not matter, but for queries where the number of pages is small this
-        method chooses accuracy over the pagination API.
+        This method uses the pagination API of the CDX server if the
+        use_pagination attribute is True, else it uses the standard
+        CDX server response data.
         """
-        # Number of pages that will be returned by the pagination API.
-        # get_total_pages adds the showNumPages=true param to pagination API
-        # requests.
-        # This is a special query that will return a single number indicating
-        # the number of pages.
-        total_pages = get_total_pages(self.url, self.user_agent)
-
-        if use_page is True and total_pages >= 2:
-            blank_pages = 0
+        # When using the pagination API of the CDX server.
+        if self.use_pagination is True:
+
+            total_pages = get_total_pages(self.url, self.user_agent)
+            successive_blank_pages = 0
+
             for i in range(total_pages):
                 payload["page"] = str(i)

                 url = full_url(self.endpoint, params=payload)
                 res = get_response(url, headers=headers)

                 if isinstance(res, Exception):
                     raise res

                 self.last_api_request_url = url
                 text = res.text
-                if len(text) == 0:
-                    blank_pages += 1

-                if blank_pages >= 2:
+                # Reset the counter if the last page was blank
+                # but the current page is not.
+                if successive_blank_pages == 1:
+                    if len(text) != 0:
+                        successive_blank_pages = 0
+
+                # Increase the successive-page counter on encountering
+                # a blank page.
+                if len(text) == 0:
+                    successive_blank_pages += 1
+
+                # If two successive pages are blank
+                # then we don't have any more pages left to
+                # iterate.
+                if successive_blank_pages >= 2:
                     break

                 yield text

+        # When not using the pagination API of the CDX server.
         else:
             payload["showResumeKey"] = "true"
             payload["limit"] = str(self.limit)
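The loop above only gives up after two successive blank pages, so a single
blank page in the middle of a result set no longer ends the iteration early.
A small isolated sketch of that counter rule, driven by a fake list of page
bodies instead of real CDX responses:

# Isolated sketch of the successive-blank-page rule from the diff above.
pages = ["data1", "", "data2", "", ""]  # ends with two successive blanks

successive_blank_pages = 0
for text in pages:
    # A non-blank page right after a blank one resets the counter.
    if successive_blank_pages == 1 and len(text) != 0:
        successive_blank_pages = 0
    if len(text) == 0:
        successive_blank_pages += 1
    # Two successive blank pages mean there is nothing left to fetch.
    if successive_blank_pages >= 2:
        break
    print(repr(text))  # prints 'data1', '', 'data2', '' and then stops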
@@ -166,6 +166,9 @@ class WaybackMachineCDXServerAPI:
     if self.gzip is None:
         payload["gzip"] = "false"

+    if self.closest:
+        payload["closest"] = self.closest
+
     if self.match_type:
         payload["matchType"] = self.match_type

@@ -206,13 +209,7 @@ class WaybackMachineCDXServerAPI:

         self.add_payload(payload)

-        if not self.start_timestamp or self.end_timestamp:
-            self.use_page = True
-
-        if self.collapses != []:
-            self.use_page = False
-
-        entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
+        entries = self.cdx_api_manager(payload, headers)

         for entry in entries:

waybackpy/cdx_utils.py

@@ -13,7 +13,7 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry

-from .exceptions import WaybackError
+from .exceptions import BlockedSiteError, WaybackError
 from .utils import DEFAULT_USER_AGENT

@@ -28,12 +28,38 @@ def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
     headers = {"User-Agent": user_agent}
     request_url = full_url(endpoint, params=payload)
     response = get_response(request_url, headers=headers)
+    check_for_blocked_site(response, url)
     if isinstance(response, requests.Response):
         return int(response.text.strip())
     raise response


+def check_for_blocked_site(
+    response: Union[requests.Response, Exception], url: Optional[str] = None
+) -> None:
+    """
+    Checks whether the URL can be archived by the Wayback Machine or not.
+    The robots.txt policy of the site may prevent the Wayback Machine.
+    """
+    # see https://github.com/akamhy/waybackpy/issues/157
+
+    # The following if block is to make mypy happy.
+    if isinstance(response, Exception):
+        raise response
+
+    if not url:
+        url = "The requested content"
+    if (
+        "org.archive.util.io.RuntimeIOException: "
+        + "org.archive.wayback.exception.AdministrativeAccessControlException: "
+        + "Blocked Site Error"
+        in response.text.strip()
+    ):
+        raise BlockedSiteError(
+            f"{url} is excluded from Wayback Machine by the site's robots.txt policy."
+        )
+
+
 def full_url(endpoint: str, params: Dict[str, Any]) -> str:
     """
     As the function's name already implies that it returns

@@ -76,6 +102,7 @@ def get_response(
     session.mount("https://", HTTPAdapter(max_retries=retries_))
     response = session.get(url, headers=headers)
     session.close()
+    check_for_blocked_site(response)
     return response

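Because get_response now runs check_for_blocked_site on every response, a
robots.txt-excluded site surfaces as BlockedSiteError anywhere the library
makes a request. A caller-side sketch (the URL and user agent are
illustrative); since BlockedSiteError subclasses WaybackError, the narrower
except clause has to come first:

from waybackpy import WaybackMachineCDXServerAPI
from waybackpy.exceptions import BlockedSiteError, WaybackError

cdx = WaybackMachineCDXServerAPI(url="example.com", user_agent="my-agent/1.0")
try:
    for snapshot in cdx.snapshots():
        print(snapshot.archive_url)
except BlockedSiteError as err:
    # The site's robots.txt policy excludes it from the Wayback Machine.
    print(f"Blocked: {err}")
except WaybackError as err:
    print(f"Other Wayback Machine failure: {err}")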
waybackpy/cli.py

@@ -63,6 +63,9 @@ def handle_cdx(data: List[Any]) -> None:
     limit = data[7]
     gzip = data[8]
     match_type = data[9]
+    sort = data[10]
+    use_pagination = data[11]
+    closest = data[12]

     filters = list(cdx_filter)
     collapses = list(collapse)

@@ -73,8 +76,11 @@ def handle_cdx(data: List[Any]) -> None:
         user_agent=user_agent,
         start_timestamp=start_timestamp,
         end_timestamp=end_timestamp,
+        closest=closest,
         filters=filters,
         match_type=match_type,
+        sort=sort,
+        use_pagination=use_pagination,
         gzip=gzip,
         collapses=collapses,
         limit=limit,
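Note that handle_cdx unpacks its argument positionally, so the three new
values must be appended to the data list in main() in exactly the order they
are indexed above. A minimal sketch of that contract:

# Sketch of the positional coupling between main() and handle_cdx():
# indices 0-9 are the pre-existing fields, the commit appends three more.
data = ["..."] * 10 + ["closest", True, "201010101010"]
sort, use_pagination, closest = data[10], data[11], data[12]
assert (sort, use_pagination, closest) == ("closest", True, "201010101010")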
@@ -249,7 +255,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     help="Use with '--known_urls' to save the URLs in file at current directory.",
 )
 @click.option(
-    "-c",
     "--cdx",
     default=False,
     is_flag=True,

@@ -269,6 +274,12 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     "--to",
     help="End timestamp for CDX API in yyyyMMddhhmmss format.",
 )
+@click.option(
+    "-C",
+    "--closest",
+    help="Archives that are closest to the timestamp passed as argument to this "
+    + "parameter.",
+)
 @click.option(
     "-f",
     "--cdx-filter",

@@ -285,6 +296,20 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     + "However, the CDX server can also return results matching a certain prefix, "
     + "a certain host, or all sub-hosts by using the match_type",
 )
+@click.option(
+    "-st",
+    "--sort",
+    help="Choose one from default, closest or reverse. It returns sorted CDX entries "
+    + "in the response.",
+)
+@click.option(
+    "-up",
+    "--use-pagination",
+    "--use_pagination",
+    default=False,
+    is_flag=True,
+    help="Use the pagination API of the CDX server instead of the default one.",
+)
 @click.option(
     "-gz",
     "--gzip",
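A sketch of exercising the new CLI flags without a shell, using click's test
runner (this assumes the pre-existing --url and --cdx options of the CLI;
all values are illustrative):

from click.testing import CliRunner

from waybackpy.cli import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "--url", "https://www.google.com",
        "--cdx",
        "--closest", "201010101010",
        "--sort", "closest",
        "--use-pagination",
    ],
)
print(result.output)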
@@ -326,6 +351,7 @@ def main(  # pylint: disable=no-value-for-parameter
     subdomain: bool,
     file: bool,
     cdx: bool,
+    use_pagination: bool,
     cdx_filter: List[str],
     collapse: List[str],
     cdx_print: List[str],

@@ -337,7 +363,9 @@ def main(  # pylint: disable=no-value-for-parameter
     minute: Optional[int] = None,
     start_timestamp: Optional[str] = None,
     end_timestamp: Optional[str] = None,
+    closest: Optional[str] = None,
     match_type: Optional[str] = None,
+    sort: Optional[str] = None,
     gzip: Optional[str] = None,
     limit: Optional[str] = None,
 ) -> None:

@@ -428,6 +456,9 @@ def main(  # pylint: disable=no-value-for-parameter
         limit,
         gzip,
         match_type,
+        sort,
+        use_pagination,
+        closest,
     ]
     handle_cdx(data)
waybackpy/exceptions.py

@@ -16,6 +16,13 @@ class WaybackError(Exception):
     """


+class BlockedSiteError(WaybackError):
+    """
+    Raised when archives for websites/URLs that have been excluded from the
+    Wayback Machine are requested via the CDX server API.
+    """
+
+
 class TooManyRequestsError(WaybackError):
     """
     Raised when you make more than 15 requests per