Add sort, use_pagination and closest (#158)
* add sort param support in CDX API class see https://nla.github.io/outbackcdx/api.html#operation/query sort takes string input which must be one of the following: - default - closest - reverse This commit shall help in closing issue at https://github.com/akamhy/waybackpy/issues/155 * add BlockedSiteError for cases when archiving is blocked by site's robots.txt * create check_for_blocked_site for handling the BlockedSiteError for sites that are blocking wayback machine by their robots.txt policy * add attrs use_pagination and closest, which can be used to use the pagination API and look up archives close to a timestamp respectively. And now, to get out of the infinite blank pages loop, just check for two successive blank pages and not two blank pages in total while using the CDX server API. * added cli support for sort, use-pagination and closest * added tests * fix codeql warnings, nothing to worry about here. * fix save test for archive_url
This commit is contained in:
parent
3a44a710d3
commit
f990b93f8a
@ -32,7 +32,11 @@ def test_b() -> None:
|
||||
url = "https://www.google.com"
|
||||
|
||||
wayback = WaybackMachineCDXServerAPI(
|
||||
url=url, user_agent=user_agent, start_timestamp="202101", end_timestamp="202112"
|
||||
url=url,
|
||||
user_agent=user_agent,
|
||||
start_timestamp="202101",
|
||||
end_timestamp="202112",
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
# timeframe bound prefix matching enabled along with active urlkey based collapsing
|
||||
|
||||
@ -40,3 +44,49 @@ def test_b() -> None:
|
||||
|
||||
for snapshot in snapshots:
|
||||
assert snapshot.timestamp.startswith("2021")
|
||||
|
||||
|
||||
def test_c() -> None:
|
||||
user_agent = (
|
||||
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
|
||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||
)
|
||||
url = "https://www.google.com"
|
||||
|
||||
cdx = WaybackMachineCDXServerAPI(
|
||||
url=url,
|
||||
user_agent=user_agent,
|
||||
closest="201010101010",
|
||||
sort="closest",
|
||||
limit="1",
|
||||
)
|
||||
snapshots = cdx.snapshots()
|
||||
for snapshot in snapshots:
|
||||
archive_url = snapshot.archive_url
|
||||
timestamp = snapshot.timestamp
|
||||
break
|
||||
|
||||
assert str(archive_url).find("google.com")
|
||||
assert "20101010" in timestamp
|
||||
|
||||
|
||||
def test_d() -> None:
|
||||
user_agent = (
|
||||
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
|
||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||
)
|
||||
|
||||
cdx = WaybackMachineCDXServerAPI(
|
||||
url="akamhy.github.io",
|
||||
user_agent=user_agent,
|
||||
match_type="prefix",
|
||||
use_pagination=True,
|
||||
filters=["statuscode:200"],
|
||||
)
|
||||
snapshots = cdx.snapshots()
|
||||
|
||||
count = 0
|
||||
for snapshot in snapshots:
|
||||
count += 1
|
||||
assert str(snapshot.archive_url).find("akamhy.github.io")
|
||||
assert count > 50
|
||||
|
@ -219,4 +219,5 @@ def test_archive_url() -> None:
|
||||
save_api.saved_archive = (
|
||||
"https://web.archive.org/web/20220124063056/https://example.com/"
|
||||
)
|
||||
save_api._archive_url = save_api.saved_archive
|
||||
assert save_api.archive_url == save_api.saved_archive
|
||||
|
@ -35,4 +35,11 @@ def test_total_archives() -> None:
|
||||
|
||||
def test_known_urls() -> None:
|
||||
wayback = Url("akamhy.github.io")
|
||||
assert len(list(wayback.known_urls())) > 40
|
||||
assert len(list(wayback.known_urls(subdomain=True))) > 40
|
||||
|
||||
|
||||
def test_Save() -> None:
|
||||
wayback = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property")
|
||||
wayback.save()
|
||||
archive_url = str(wayback.archive_url)
|
||||
assert archive_url.find("Asymptotic_equipartition_property") != -1
|
||||
|
@ -50,6 +50,8 @@ class WaybackMachineCDXServerAPI:
|
||||
collapses: Optional[List[str]] = None,
|
||||
limit: Optional[str] = None,
|
||||
max_tries: int = 3,
|
||||
use_pagination: bool = False,
|
||||
closest: Optional[str] = None,
|
||||
) -> None:
|
||||
self.url = str(url).strip().replace(" ", "%20")
|
||||
self.user_agent = user_agent
|
||||
@ -66,60 +68,58 @@ class WaybackMachineCDXServerAPI:
|
||||
check_collapses(self.collapses)
|
||||
self.limit = 25000 if limit is None else limit
|
||||
self.max_tries = max_tries
|
||||
self.use_pagination = use_pagination
|
||||
self.closest = None if closest is None else str(closest)
|
||||
self.last_api_request_url: Optional[str] = None
|
||||
self.use_page = False
|
||||
self.endpoint = "https://web.archive.org/cdx/search/cdx"
|
||||
|
||||
def cdx_api_manager(
|
||||
self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
|
||||
self, payload: Dict[str, str], headers: Dict[str, str]
|
||||
) -> Generator[str, None, None]:
|
||||
"""
|
||||
Manages the API calls for the instance, it automatically selects the best
|
||||
parameters by looking as the query of the end-user. For bigger queries
|
||||
automatically use the CDX pagination API and for smaller queries use the
|
||||
normal API.
|
||||
|
||||
CDX Server API is a complex API and to make it easy for the end user to
|
||||
consume it the CDX manager(this method) handles the selection of the
|
||||
API output, whether to use the pagination API or not.
|
||||
|
||||
For doing large/bulk queries, the use of the Pagination API is
|
||||
recommended by the Wayback Machine authors. And it determines if the
|
||||
query would be large or not by using the showNumPages=true parameter,
|
||||
this tells the number of pages of CDX DATA that the pagination API
|
||||
will return.
|
||||
|
||||
If the number of page is less than 2 we use the normal non-pagination
|
||||
API as the pagination API is known to lag and for big queries it should
|
||||
not matter but for queries where the number of pages are less this
|
||||
method chooses accuracy over the pagination API.
|
||||
This method uses the pagination API of the CDX server if
|
||||
use_pagination attribute is True else uses the standard
|
||||
CDX server response data.
|
||||
"""
|
||||
# number of pages that will returned by the pagination API.
|
||||
# get_total_pages adds the showNumPages=true param to pagination API
|
||||
# requests.
|
||||
# This is a special query that will return a single number indicating
|
||||
# the number of pages.
|
||||
total_pages = get_total_pages(self.url, self.user_agent)
|
||||
|
||||
if use_page is True and total_pages >= 2:
|
||||
blank_pages = 0
|
||||
# When using the pagination API of the CDX server.
|
||||
if self.use_pagination is True:
|
||||
|
||||
total_pages = get_total_pages(self.url, self.user_agent)
|
||||
successive_blank_pages = 0
|
||||
|
||||
for i in range(total_pages):
|
||||
payload["page"] = str(i)
|
||||
|
||||
url = full_url(self.endpoint, params=payload)
|
||||
res = get_response(url, headers=headers)
|
||||
|
||||
if isinstance(res, Exception):
|
||||
raise res
|
||||
|
||||
self.last_api_request_url = url
|
||||
text = res.text
|
||||
if len(text) == 0:
|
||||
blank_pages += 1
|
||||
|
||||
if blank_pages >= 2:
|
||||
# Reset the counter if the last page was blank
|
||||
# but the current page is not.
|
||||
if successive_blank_pages == 1:
|
||||
if len(text) != 0:
|
||||
successive_blank_pages = 0
|
||||
|
||||
# Increase the succesive page counter on encountering
|
||||
# blank page.
|
||||
if len(text) == 0:
|
||||
successive_blank_pages += 1
|
||||
|
||||
# If two succesive pages are blank
|
||||
# then we don't have any more pages left to
|
||||
# iterate.
|
||||
if successive_blank_pages >= 2:
|
||||
break
|
||||
|
||||
yield text
|
||||
|
||||
# When not using the pagination API of the CDX server
|
||||
else:
|
||||
payload["showResumeKey"] = "true"
|
||||
payload["limit"] = str(self.limit)
|
||||
@ -166,6 +166,9 @@ class WaybackMachineCDXServerAPI:
|
||||
if self.gzip is None:
|
||||
payload["gzip"] = "false"
|
||||
|
||||
if self.closest:
|
||||
payload["closest"] = self.closest
|
||||
|
||||
if self.match_type:
|
||||
payload["matchType"] = self.match_type
|
||||
|
||||
@ -206,13 +209,7 @@ class WaybackMachineCDXServerAPI:
|
||||
|
||||
self.add_payload(payload)
|
||||
|
||||
if not self.start_timestamp or self.end_timestamp:
|
||||
self.use_page = True
|
||||
|
||||
if self.collapses != []:
|
||||
self.use_page = False
|
||||
|
||||
entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
|
||||
entries = self.cdx_api_manager(payload, headers)
|
||||
|
||||
for entry in entries:
|
||||
|
||||
|
@ -13,7 +13,7 @@ import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from .exceptions import WaybackError
|
||||
from .exceptions import BlockedSiteError, WaybackError
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
|
||||
|
||||
@ -28,12 +28,38 @@ def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
|
||||
headers = {"User-Agent": user_agent}
|
||||
request_url = full_url(endpoint, params=payload)
|
||||
response = get_response(request_url, headers=headers)
|
||||
|
||||
check_for_blocked_site(response, url)
|
||||
if isinstance(response, requests.Response):
|
||||
return int(response.text.strip())
|
||||
raise response
|
||||
|
||||
|
||||
def check_for_blocked_site(
|
||||
response: Union[requests.Response, Exception], url: Optional[str] = None
|
||||
) -> None:
|
||||
"""
|
||||
Checks that the URL can be archived by wayback machine or not.
|
||||
robots.txt policy of the site may prevent the wayback machine.
|
||||
"""
|
||||
# see https://github.com/akamhy/waybackpy/issues/157
|
||||
|
||||
# the following if block is to make mypy happy.
|
||||
if isinstance(response, Exception):
|
||||
raise response
|
||||
|
||||
if not url:
|
||||
url = "The requested content"
|
||||
if (
|
||||
"org.archive.util.io.RuntimeIOException: "
|
||||
+ "org.archive.wayback.exception.AdministrativeAccessControlException: "
|
||||
+ "Blocked Site Error"
|
||||
in response.text.strip()
|
||||
):
|
||||
raise BlockedSiteError(
|
||||
f"{url} is excluded from Wayback Machine by the site's robots.txt policy."
|
||||
)
|
||||
|
||||
|
||||
def full_url(endpoint: str, params: Dict[str, Any]) -> str:
|
||||
"""
|
||||
As the function's name already implies that it returns
|
||||
@ -76,6 +102,7 @@ def get_response(
|
||||
session.mount("https://", HTTPAdapter(max_retries=retries_))
|
||||
response = session.get(url, headers=headers)
|
||||
session.close()
|
||||
check_for_blocked_site(response)
|
||||
return response
|
||||
|
||||
|
||||
|
@ -63,6 +63,9 @@ def handle_cdx(data: List[Any]) -> None:
|
||||
limit = data[7]
|
||||
gzip = data[8]
|
||||
match_type = data[9]
|
||||
sort = data[10]
|
||||
use_pagination = data[11]
|
||||
closest = data[12]
|
||||
|
||||
filters = list(cdx_filter)
|
||||
collapses = list(collapse)
|
||||
@ -73,8 +76,11 @@ def handle_cdx(data: List[Any]) -> None:
|
||||
user_agent=user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
closest=closest,
|
||||
filters=filters,
|
||||
match_type=match_type,
|
||||
sort=sort,
|
||||
use_pagination=use_pagination,
|
||||
gzip=gzip,
|
||||
collapses=collapses,
|
||||
limit=limit,
|
||||
@ -249,7 +255,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
|
||||
help="Use with '--known_urls' to save the URLs in file at current directory.",
|
||||
)
|
||||
@click.option(
|
||||
"-c",
|
||||
"--cdx",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
@ -269,6 +274,12 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
|
||||
"--to",
|
||||
help="End timestamp for CDX API in yyyyMMddhhmmss format.",
|
||||
)
|
||||
@click.option(
|
||||
"-C",
|
||||
"--closest",
|
||||
help="Archive that are closest the timestamp passed as arguments to this "
|
||||
+ "parameter.",
|
||||
)
|
||||
@click.option(
|
||||
"-f",
|
||||
"--cdx-filter",
|
||||
@ -285,6 +296,20 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
|
||||
+ "However, the CDX server can also return results matching a certain prefix, "
|
||||
+ "a certain host, or all sub-hosts by using the match_type",
|
||||
)
|
||||
@click.option(
|
||||
"-st",
|
||||
"--sort",
|
||||
help="Choose one from default, closest or reverse. It returns sorted CDX entries "
|
||||
+ "in the response.",
|
||||
)
|
||||
@click.option(
|
||||
"-up",
|
||||
"--use-pagination",
|
||||
"--use_pagination",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Use the pagination API of the CDX server instead of the default one.",
|
||||
)
|
||||
@click.option(
|
||||
"-gz",
|
||||
"--gzip",
|
||||
@ -326,6 +351,7 @@ def main( # pylint: disable=no-value-for-parameter
|
||||
subdomain: bool,
|
||||
file: bool,
|
||||
cdx: bool,
|
||||
use_pagination: bool,
|
||||
cdx_filter: List[str],
|
||||
collapse: List[str],
|
||||
cdx_print: List[str],
|
||||
@ -337,7 +363,9 @@ def main( # pylint: disable=no-value-for-parameter
|
||||
minute: Optional[int] = None,
|
||||
start_timestamp: Optional[str] = None,
|
||||
end_timestamp: Optional[str] = None,
|
||||
closest: Optional[str] = None,
|
||||
match_type: Optional[str] = None,
|
||||
sort: Optional[str] = None,
|
||||
gzip: Optional[str] = None,
|
||||
limit: Optional[str] = None,
|
||||
) -> None:
|
||||
@ -428,6 +456,9 @@ def main( # pylint: disable=no-value-for-parameter
|
||||
limit,
|
||||
gzip,
|
||||
match_type,
|
||||
sort,
|
||||
use_pagination,
|
||||
closest,
|
||||
]
|
||||
handle_cdx(data)
|
||||
|
||||
|
@ -16,6 +16,13 @@ class WaybackError(Exception):
|
||||
"""
|
||||
|
||||
|
||||
class BlockedSiteError(WaybackError):
|
||||
"""
|
||||
Raised when the archives for website/URLs that was excluded from Wayback
|
||||
Machine are requested via the CDX server API.
|
||||
"""
|
||||
|
||||
|
||||
class TooManyRequestsError(WaybackError):
|
||||
"""
|
||||
Raised when you make more than 15 requests per
|
||||
|
Loading…
Reference in New Issue
Block a user