create check_for_blocked_site to handle BlockedSiteError for sites that block the Wayback Machine via their robots.txt policy

Akash Mahanty
2022-02-17 21:07:05 +05:30
parent faa97e5877
commit 28f4b579f2


@@ -13,7 +13,7 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
-from .exceptions import WaybackError
+from .exceptions import BlockedSiteError, WaybackError
 from .utils import DEFAULT_USER_AGENT
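
The import change above swaps the bare WaybackError import for one that also pulls in the new BlockedSiteError. The exception class itself is not part of this diff; as a rough sketch, assuming it lives in waybackpy's exceptions module and subclasses WaybackError like the library's other Wayback-related errors, it could look like this:

class WaybackError(Exception):
    """
    Raised when the Wayback Machine API service misbehaves.
    """


class BlockedSiteError(WaybackError):
    """
    Raised when a site's robots.txt policy excludes it from the
    Wayback Machine (see check_for_blocked_site below).
    """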
@@ -28,12 +28,38 @@ def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
     headers = {"User-Agent": user_agent}
     request_url = full_url(endpoint, params=payload)
     response = get_response(request_url, headers=headers)
+    check_for_blocked_site(response, url)
     if isinstance(response, requests.Response):
         return int(response.text.strip())
     raise response
+
+
+def check_for_blocked_site(
+    response: Union[requests.Response, Exception], url: Optional[str] = None
+) -> None:
+    """
+    Check whether the Wayback Machine is allowed to archive the URL.
+
+    A site's robots.txt policy may exclude it from the Wayback Machine;
+    the response body then carries a blocked-site error message instead
+    of the usual payload.
+    """
+    # see https://github.com/akamhy/waybackpy/issues/157
+
+    # the following if block narrows the Union type to make mypy happy.
+    if isinstance(response, Exception):
+        raise response
+
+    if not url:
+        url = "The requested content"
+
+    if (
+        "org.archive.util.io.RuntimeIOException: "
+        + "org.archive.wayback.exception.AdministrativeAccessControlException: "
+        + "Blocked Site Error"
+        in response.text.strip()
+    ):
+        raise BlockedSiteError(
+            f"{url} is excluded from Wayback Machine by the site's robots.txt policy."
+        )
 def full_url(endpoint: str, params: Dict[str, Any]) -> str:
     """
     As the function's name already implies that it returns
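
Because check_for_blocked_site only inspects response.text, it can be exercised without hitting the network. The following is a hypothetical test sketch, not part of this commit: it fakes a requests.Response by setting the private _content attribute (a common testing trick) and assumes the helpers live in a waybackpy.cdx_utils module:

import requests

from waybackpy.cdx_utils import check_for_blocked_site
from waybackpy.exceptions import BlockedSiteError

# Fake a response whose body carries the exact marker text that
# check_for_blocked_site searches for (see issue #157).
fake = requests.Response()
fake.encoding = "utf-8"
fake._content = (
    b"org.archive.util.io.RuntimeIOException: "
    b"org.archive.wayback.exception.AdministrativeAccessControlException: "
    b"Blocked Site Error"
)

try:
    check_for_blocked_site(fake, "https://example.com")
except BlockedSiteError as err:
    print(err)  # https://example.com is excluded from Wayback Machine ...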
@@ -76,6 +102,7 @@ def get_response(
     session.mount("https://", HTTPAdapter(max_retries=retries_))
     response = session.get(url, headers=headers)
     session.close()
+    check_for_blocked_site(response)
     return response
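
Taken together, the hooks in get_total_pages and get_response mean callers now get a descriptive BlockedSiteError instead of, say, int() failing on the error page. A usage sketch, again assuming the waybackpy.cdx_utils module path:

from waybackpy.cdx_utils import get_total_pages
from waybackpy.exceptions import BlockedSiteError

try:
    pages = get_total_pages("https://example.com")
    print(f"The CDX index for this URL spans {pages} pages.")
except BlockedSiteError as err:
    # The site's robots.txt policy excludes it from the Wayback Machine.
    print(err)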