create check_for_blocked_site for handling the BlockedSiteError for sites that are blocking wayback machine by their robots.txt policy

This commit is contained in:
Akash Mahanty
2022-02-17 21:07:05 +05:30
parent faa97e5877
commit 28f4b579f2

View File

@@ -13,7 +13,7 @@ import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
from .exceptions import WaybackError from .exceptions import BlockedSiteError, WaybackError
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
@@ -28,12 +28,38 @@ def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
headers = {"User-Agent": user_agent} headers = {"User-Agent": user_agent}
request_url = full_url(endpoint, params=payload) request_url = full_url(endpoint, params=payload)
response = get_response(request_url, headers=headers) response = get_response(request_url, headers=headers)
check_for_blocked_site(response, url)
if isinstance(response, requests.Response): if isinstance(response, requests.Response):
return int(response.text.strip()) return int(response.text.strip())
raise response raise response
def check_for_blocked_site(
response: Union[requests.Response, Exception], url: Optional[str] = None
) -> None:
"""
Checks that the URL can be archived by wayback machine or not.
robots.txt policy of the site may prevent the wayback machine.
"""
# see https://github.com/akamhy/waybackpy/issues/157
# the following if block is to make mypy happy.
if isinstance(response, Exception):
raise response
if not url:
url = "The requested content"
if (
"org.archive.util.io.RuntimeIOException: "
+ "org.archive.wayback.exception.AdministrativeAccessControlException: "
+ "Blocked Site Error"
in response.text.strip()
):
raise BlockedSiteError(
f"{url} is excluded from Wayback Machine by the site's robots.txt policy."
)
def full_url(endpoint: str, params: Dict[str, Any]) -> str: def full_url(endpoint: str, params: Dict[str, Any]) -> str:
""" """
As the function's name already implies that it returns As the function's name already implies that it returns
@@ -76,6 +102,7 @@ def get_response(
session.mount("https://", HTTPAdapter(max_retries=retries_)) session.mount("https://", HTTPAdapter(max_retries=retries_))
response = session.get(url, headers=headers) response = session.get(url, headers=headers)
session.close() session.close()
check_for_blocked_site(response)
return response return response