Add sort, use_pagination and closest (#158)

* add sort param support in CDX API class

see https://nla.github.io/outbackcdx/api.html#operation/query

sort takes string input which must be one of the following:
- default
- closest
- reverse

This commit shall help in closing issue at https://github.com/akamhy/waybackpy/issues/155

* add BlockedSiteError for cases when archiving is blocked by site's robots.txt

* create check_for_blocked_site for handling the BlockedSiteError for sites that are blocking wayback machine by their robots.txt policy

* add attrs use_pagination and closest, which can be used to enable the pagination API and to look up the archive closest to a timestamp, respectively. Also, to get out of the infinite blank-pages loop while using the CDX server API, stop after two successive blank pages rather than after two blank pages in total.

* added cli support for sort, use-pagination and closest

* added tests

* fix codeql warnings, nothing to worry about here.

* fix save test for archive_url
This commit is contained in:
Akash Mahanty 2022-02-18 00:24:14 +05:30 committed by GitHub
parent 3a44a710d3
commit f990b93f8a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 164 additions and 44 deletions

View File

@ -32,7 +32,11 @@ def test_b() -> None:
url = "https://www.google.com"
wayback = WaybackMachineCDXServerAPI(
url=url, user_agent=user_agent, start_timestamp="202101", end_timestamp="202112"
url=url,
user_agent=user_agent,
start_timestamp="202101",
end_timestamp="202112",
collapses=["urlkey"],
)
# timeframe bound prefix matching enabled along with active urlkey based collapsing
@ -40,3 +44,49 @@ def test_b() -> None:
for snapshot in snapshots:
assert snapshot.timestamp.startswith("2021")
def test_c() -> None:
    """
    Query the CDX server with ``sort="closest"``, ``closest="201010101010"``
    and ``limit="1"`` and check the first snapshot returned.

    The first snapshot must point at google.com and carry a timestamp that
    contains ``20101010``.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    url = "https://www.google.com"
    cdx = WaybackMachineCDXServerAPI(
        url=url,
        user_agent=user_agent,
        closest="201010101010",
        sort="closest",
        limit="1",
    )

    # Only the first entry is inspected. Using next() with a default turns an
    # empty result into a clear assertion failure instead of an unbound-name
    # error further down.
    snapshot = next(iter(cdx.snapshots()), None)
    assert snapshot is not None, "CDX server returned no snapshots"

    # Use `in` rather than str.find(): find() returns -1 (truthy) on a miss,
    # so asserting on its result directly can never detect a wrong URL.
    assert "google.com" in str(snapshot.archive_url)
    assert "20101010" in snapshot.timestamp
def test_d() -> None:
    """
    Iterate over prefix-matched snapshots of akamhy.github.io with
    ``use_pagination=True`` and a ``statuscode:200`` filter.

    Every snapshot's archive URL must mention the host, and more than 50
    snapshots must be yielded in total.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    cdx = WaybackMachineCDXServerAPI(
        url="akamhy.github.io",
        user_agent=user_agent,
        match_type="prefix",
        use_pagination=True,
        filters=["statuscode:200"],
    )

    count = 0
    for snapshot in cdx.snapshots():
        count += 1
        # Use `in` rather than str.find(): find() returns -1 (truthy) on a
        # miss, so asserting on its result directly would always pass.
        assert "akamhy.github.io" in str(snapshot.archive_url)
    assert count > 50

View File

@ -219,4 +219,5 @@ def test_archive_url() -> None:
save_api.saved_archive = (
"https://web.archive.org/web/20220124063056/https://example.com/"
)
save_api._archive_url = save_api.saved_archive
assert save_api.archive_url == save_api.saved_archive

View File

@ -35,4 +35,11 @@ def test_total_archives() -> None:
def test_known_urls() -> None:
wayback = Url("akamhy.github.io")
assert len(list(wayback.known_urls())) > 40
assert len(list(wayback.known_urls(subdomain=True))) > 40
def test_Save() -> None:
    """
    Save a Wikipedia article through ``Url.save()`` and check that the
    resulting archive URL contains the article name.
    """
    page = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property")
    page.save()
    assert "Asymptotic_equipartition_property" in str(page.archive_url)

View File

@ -50,6 +50,8 @@ class WaybackMachineCDXServerAPI:
collapses: Optional[List[str]] = None,
limit: Optional[str] = None,
max_tries: int = 3,
use_pagination: bool = False,
closest: Optional[str] = None,
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent
@ -66,60 +68,58 @@ class WaybackMachineCDXServerAPI:
check_collapses(self.collapses)
self.limit = 25000 if limit is None else limit
self.max_tries = max_tries
self.use_pagination = use_pagination
self.closest = None if closest is None else str(closest)
self.last_api_request_url: Optional[str] = None
self.use_page = False
self.endpoint = "https://web.archive.org/cdx/search/cdx"
def cdx_api_manager(
self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
self, payload: Dict[str, str], headers: Dict[str, str]
) -> Generator[str, None, None]:
"""
Manages the API calls for the instance, it automatically selects the best
parameters by looking as the query of the end-user. For bigger queries
automatically use the CDX pagination API and for smaller queries use the
normal API.
CDX Server API is a complex API and to make it easy for the end user to
consume it the CDX manager(this method) handles the selection of the
API output, whether to use the pagination API or not.
For doing large/bulk queries, the use of the Pagination API is
recommended by the Wayback Machine authors. And it determines if the
query would be large or not by using the showNumPages=true parameter,
this tells the number of pages of CDX DATA that the pagination API
will return.
If the number of page is less than 2 we use the normal non-pagination
API as the pagination API is known to lag and for big queries it should
not matter but for queries where the number of pages are less this
method chooses accuracy over the pagination API.
This method uses the pagination API of the CDX server if
use_pagination attribute is True else uses the standard
CDX server response data.
"""
# number of pages that will returned by the pagination API.
# get_total_pages adds the showNumPages=true param to pagination API
# requests.
# This is a special query that will return a single number indicating
# the number of pages.
total_pages = get_total_pages(self.url, self.user_agent)
if use_page is True and total_pages >= 2:
blank_pages = 0
# When using the pagination API of the CDX server.
if self.use_pagination is True:
total_pages = get_total_pages(self.url, self.user_agent)
successive_blank_pages = 0
for i in range(total_pages):
payload["page"] = str(i)
url = full_url(self.endpoint, params=payload)
res = get_response(url, headers=headers)
if isinstance(res, Exception):
raise res
self.last_api_request_url = url
text = res.text
if len(text) == 0:
blank_pages += 1
if blank_pages >= 2:
# Reset the counter if the last page was blank
# but the current page is not.
if successive_blank_pages == 1:
if len(text) != 0:
successive_blank_pages = 0
# Increase the successive blank page counter on encountering
# a blank page.
if len(text) == 0:
successive_blank_pages += 1
# If two successive pages are blank
# then we don't have any more pages left to
# iterate.
if successive_blank_pages >= 2:
break
yield text
# When not using the pagination API of the CDX server
else:
payload["showResumeKey"] = "true"
payload["limit"] = str(self.limit)
@ -166,6 +166,9 @@ class WaybackMachineCDXServerAPI:
if self.gzip is None:
payload["gzip"] = "false"
if self.closest:
payload["closest"] = self.closest
if self.match_type:
payload["matchType"] = self.match_type
@ -206,13 +209,7 @@ class WaybackMachineCDXServerAPI:
self.add_payload(payload)
if not self.start_timestamp or self.end_timestamp:
self.use_page = True
if self.collapses != []:
self.use_page = False
entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
entries = self.cdx_api_manager(payload, headers)
for entry in entries:

View File

@ -13,7 +13,7 @@ import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from .exceptions import WaybackError
from .exceptions import BlockedSiteError, WaybackError
from .utils import DEFAULT_USER_AGENT
@ -28,12 +28,38 @@ def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
headers = {"User-Agent": user_agent}
request_url = full_url(endpoint, params=payload)
response = get_response(request_url, headers=headers)
check_for_blocked_site(response, url)
if isinstance(response, requests.Response):
return int(response.text.strip())
raise response
def check_for_blocked_site(
    response: Union[requests.Response, Exception], url: Optional[str] = None
) -> None:
    """
    Raise BlockedSiteError if the response body contains the Wayback
    Machine's "Blocked Site Error" marker.

    If ``response`` is itself an Exception it is re-raised unchanged.
    ``url`` is only used in the error message; when it is empty or None
    the message refers to "The requested content" instead.
    """
    # see https://github.com/akamhy/waybackpy/issues/157

    # Re-raise early; this also narrows the Union so mypy accepts the
    # attribute access on response below.
    if isinstance(response, Exception):
        raise response

    blocked_marker = (
        "org.archive.util.io.RuntimeIOException: "
        "org.archive.wayback.exception.AdministrativeAccessControlException: "
        "Blocked Site Error"
    )
    if blocked_marker not in response.text.strip():
        return

    subject = url if url else "The requested content"
    raise BlockedSiteError(
        f"{subject} is excluded from Wayback Machine by the site's robots.txt policy."
    )
def full_url(endpoint: str, params: Dict[str, Any]) -> str:
"""
As the function's name already implies that it returns
@ -76,6 +102,7 @@ def get_response(
session.mount("https://", HTTPAdapter(max_retries=retries_))
response = session.get(url, headers=headers)
session.close()
check_for_blocked_site(response)
return response

View File

@ -63,6 +63,9 @@ def handle_cdx(data: List[Any]) -> None:
limit = data[7]
gzip = data[8]
match_type = data[9]
sort = data[10]
use_pagination = data[11]
closest = data[12]
filters = list(cdx_filter)
collapses = list(collapse)
@ -73,8 +76,11 @@ def handle_cdx(data: List[Any]) -> None:
user_agent=user_agent,
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
closest=closest,
filters=filters,
match_type=match_type,
sort=sort,
use_pagination=use_pagination,
gzip=gzip,
collapses=collapses,
limit=limit,
@ -249,7 +255,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
help="Use with '--known_urls' to save the URLs in file at current directory.",
)
@click.option(
"-c",
"--cdx",
default=False,
is_flag=True,
@ -269,6 +274,12 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
"--to",
help="End timestamp for CDX API in yyyyMMddhhmmss format.",
)
@click.option(
"-C",
"--closest",
help="Archive that are closest the timestamp passed as arguments to this "
+ "parameter.",
)
@click.option(
"-f",
"--cdx-filter",
@ -285,6 +296,20 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
+ "However, the CDX server can also return results matching a certain prefix, "
+ "a certain host, or all sub-hosts by using the match_type",
)
@click.option(
"-st",
"--sort",
help="Choose one from default, closest or reverse. It returns sorted CDX entries "
+ "in the response.",
)
@click.option(
"-up",
"--use-pagination",
"--use_pagination",
default=False,
is_flag=True,
help="Use the pagination API of the CDX server instead of the default one.",
)
@click.option(
"-gz",
"--gzip",
@ -326,6 +351,7 @@ def main( # pylint: disable=no-value-for-parameter
subdomain: bool,
file: bool,
cdx: bool,
use_pagination: bool,
cdx_filter: List[str],
collapse: List[str],
cdx_print: List[str],
@ -337,7 +363,9 @@ def main( # pylint: disable=no-value-for-parameter
minute: Optional[int] = None,
start_timestamp: Optional[str] = None,
end_timestamp: Optional[str] = None,
closest: Optional[str] = None,
match_type: Optional[str] = None,
sort: Optional[str] = None,
gzip: Optional[str] = None,
limit: Optional[str] = None,
) -> None:
@ -428,6 +456,9 @@ def main( # pylint: disable=no-value-for-parameter
limit,
gzip,
match_type,
sort,
use_pagination,
closest,
]
handle_cdx(data)

View File

@ -16,6 +16,13 @@ class WaybackError(Exception):
"""
class BlockedSiteError(WaybackError):
    """
    Raised when archives are requested via the CDX server API for a
    website/URL that has been excluded from the Wayback Machine.
    """
class TooManyRequestsError(WaybackError):
"""
Raised when you make more than 15 requests per