Add sort, use_pagination and closest (#158)
* add sort param support in the CDX API class, see
  https://nla.github.io/outbackcdx/api.html#operation/query. sort takes a
  string input which must be one of the following: default, closest,
  reverse. This commit shall help in closing the issue at
  https://github.com/akamhy/waybackpy/issues/155
* add BlockedSiteError for cases when archiving is blocked by the site's
  robots.txt
* create check_for_blocked_site for handling the BlockedSiteError for sites
  that block the Wayback Machine via their robots.txt policy
* add attrs use_pagination and closest, which can be used to use the
  pagination API and to look up archives close to a timestamp, respectively.
  And to get out of the infinite blank-pages loop while using the CDX server
  API, now check for two successive blank pages rather than two blank pages
  in total
* added CLI support for sort, use-pagination and closest
* added tests
* fix CodeQL warnings, nothing to worry about here
* fix save test for archive_url
parent 3a44a710d3
commit f990b93f8a
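
Taken together, the new surface area looks like this; a minimal sketch based
on the tests added in this commit (the package-level import, user agent and
timestamp values are illustrative placeholders):

from waybackpy import WaybackMachineCDXServerAPI

cdx = WaybackMachineCDXServerAPI(
    url="https://www.google.com",
    user_agent="my-agent/1.0",   # any descriptive user agent string
    closest="201010101010",      # yyyyMMddhhmmss timestamp to anchor the lookup
    sort="closest",              # one of: default, closest, reverse
    use_pagination=False,        # True switches to the CDX pagination API
)
for snapshot in cdx.snapshots():
    print(snapshot.archive_url)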
tests/test_cdx_api.py

@@ -32,7 +32,11 @@ def test_b() -> None:
     url = "https://www.google.com"

     wayback = WaybackMachineCDXServerAPI(
-        url=url, user_agent=user_agent, start_timestamp="202101", end_timestamp="202112"
+        url=url,
+        user_agent=user_agent,
+        start_timestamp="202101",
+        end_timestamp="202112",
+        collapses=["urlkey"],
     )
     # timeframe bound prefix matching enabled along with active urlkey based collapsing

@@ -40,3 +44,49 @@ def test_b() -> None:

     for snapshot in snapshots:
         assert snapshot.timestamp.startswith("2021")
+
+
+def test_c() -> None:
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+    url = "https://www.google.com"
+
+    cdx = WaybackMachineCDXServerAPI(
+        url=url,
+        user_agent=user_agent,
+        closest="201010101010",
+        sort="closest",
+        limit="1",
+    )
+    snapshots = cdx.snapshots()
+    for snapshot in snapshots:
+        archive_url = snapshot.archive_url
+        timestamp = snapshot.timestamp
+        break
+
+    assert str(archive_url).find("google.com")
+    assert "20101010" in timestamp
+
+
+def test_d() -> None:
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+
+    cdx = WaybackMachineCDXServerAPI(
+        url="akamhy.github.io",
+        user_agent=user_agent,
+        match_type="prefix",
+        use_pagination=True,
+        filters=["statuscode:200"],
+    )
+    snapshots = cdx.snapshots()
+
+    count = 0
+    for snapshot in snapshots:
+        count += 1
+        assert str(snapshot.archive_url).find("akamhy.github.io")
+    assert count > 50
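For test_c above, the new keyword arguments translate into CDX query
parameters. A rough sketch of the request the library ends up building via
full_url (parameter order and any extra defaults are illustrative):

from urllib.parse import urlencode

# Hypothetical reconstruction of the query string implied by test_c.
endpoint = "https://web.archive.org/cdx/search/cdx"
payload = {
    "url": "https://www.google.com",
    "closest": "201010101010",
    "sort": "closest",
    "limit": "1",
}
print(f"{endpoint}?{urlencode(payload)}")
# https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fwww.google.com&closest=201010101010&sort=closest&limit=1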
tests/test_save_api.py

@@ -219,4 +219,5 @@ def test_archive_url() -> None:
     save_api.saved_archive = (
         "https://web.archive.org/web/20220124063056/https://example.com/"
     )
+    save_api._archive_url = save_api.saved_archive
     assert save_api.archive_url == save_api.saved_archive
tests/test_url.py

@@ -35,4 +35,11 @@ def test_total_archives() -> None:

 def test_known_urls() -> None:
     wayback = Url("akamhy.github.io")
-    assert len(list(wayback.known_urls())) > 40
+    assert len(list(wayback.known_urls(subdomain=True))) > 40
+
+
+def test_Save() -> None:
+    wayback = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property")
+    wayback.save()
+    archive_url = str(wayback.archive_url)
+    assert archive_url.find("Asymptotic_equipartition_property") != -1
waybackpy/cdx_api.py

@@ -50,6 +50,8 @@ class WaybackMachineCDXServerAPI:
         collapses: Optional[List[str]] = None,
         limit: Optional[str] = None,
         max_tries: int = 3,
+        use_pagination: bool = False,
+        closest: Optional[str] = None,
     ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.user_agent = user_agent

@@ -66,60 +68,58 @@ class WaybackMachineCDXServerAPI:
         check_collapses(self.collapses)
         self.limit = 25000 if limit is None else limit
         self.max_tries = max_tries
+        self.use_pagination = use_pagination
+        self.closest = None if closest is None else str(closest)
         self.last_api_request_url: Optional[str] = None
-        self.use_page = False
         self.endpoint = "https://web.archive.org/cdx/search/cdx"

     def cdx_api_manager(
-        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
+        self, payload: Dict[str, str], headers: Dict[str, str]
     ) -> Generator[str, None, None]:
         """
-        Manages the API calls for the instance, it automatically selects the best
-        parameters by looking at the query of the end-user. For bigger queries
-        automatically use the CDX pagination API and for smaller queries use the
-        normal API.
-
-        CDX Server API is a complex API and to make it easy for the end user to
-        consume it the CDX manager (this method) handles the selection of the
-        API output, whether to use the pagination API or not.
-
-        For doing large/bulk queries, the use of the Pagination API is
-        recommended by the Wayback Machine authors. And it determines if the
-        query would be large or not by using the showNumPages=true parameter;
-        this tells the number of pages of CDX data that the pagination API
-        will return.
-
-        If the number of pages is less than 2 we use the normal non-pagination
-        API as the pagination API is known to lag, and for big queries it should
-        not matter, but for queries where the number of pages is small this
-        method chooses accuracy over the pagination API.
+        This method uses the pagination API of the CDX server if the
+        use_pagination attribute is True, else it uses the standard
+        CDX server response data.
         """
-        # Number of pages that will be returned by the pagination API.
-        # get_total_pages adds the showNumPages=true param to pagination API
-        # requests.
-        # This is a special query that will return a single number indicating
-        # the number of pages.
-        total_pages = get_total_pages(self.url, self.user_agent)
-
-        if use_page is True and total_pages >= 2:
-            blank_pages = 0
+        # When using the pagination API of the CDX server.
+        if self.use_pagination is True:
+
+            total_pages = get_total_pages(self.url, self.user_agent)
+            successive_blank_pages = 0
+
             for i in range(total_pages):
                 payload["page"] = str(i)

                 url = full_url(self.endpoint, params=payload)
                 res = get_response(url, headers=headers)

                 if isinstance(res, Exception):
                     raise res

                 self.last_api_request_url = url
                 text = res.text
-                if len(text) == 0:
-                    blank_pages += 1

-                if blank_pages >= 2:
+                # Reset the counter if the last page was blank
+                # but the current page is not.
+                if successive_blank_pages == 1:
+                    if len(text) != 0:
+                        successive_blank_pages = 0
+
+                # Increase the successive-page counter on encountering
+                # a blank page.
+                if len(text) == 0:
+                    successive_blank_pages += 1
+
+                # If two successive pages are blank
+                # then we don't have any more pages left to
+                # iterate.
+                if successive_blank_pages >= 2:
                     break

                 yield text

+        # When not using the pagination API of the CDX server.
         else:
             payload["showResumeKey"] = "true"
             payload["limit"] = str(self.limit)
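The loop above only gives up after two successive blank pages, so a single
blank page in the middle of a result set no longer ends the iteration early.
A small isolated sketch of that counter rule, driven by a fake list of page
bodies instead of real CDX responses:

# Isolated sketch of the successive-blank-page rule from the diff above.
pages = ["data1", "", "data2", "", ""]  # ends with two successive blanks

successive_blank_pages = 0
for text in pages:
    # A non-blank page right after a blank one resets the counter.
    if successive_blank_pages == 1 and len(text) != 0:
        successive_blank_pages = 0
    if len(text) == 0:
        successive_blank_pages += 1
    # Two successive blank pages mean there is nothing left to fetch.
    if successive_blank_pages >= 2:
        break
    print(repr(text))  # prints 'data1', '', 'data2', '' and then stops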
@@ -166,6 +166,9 @@ class WaybackMachineCDXServerAPI:
     if self.gzip is None:
         payload["gzip"] = "false"

+    if self.closest:
+        payload["closest"] = self.closest
+
     if self.match_type:
         payload["matchType"] = self.match_type

@@ -206,13 +209,7 @@ class WaybackMachineCDXServerAPI:

         self.add_payload(payload)

-        if not self.start_timestamp or self.end_timestamp:
-            self.use_page = True
-
-        if self.collapses != []:
-            self.use_page = False
-
-        entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
+        entries = self.cdx_api_manager(payload, headers)

         for entry in entries:

waybackpy/cdx_utils.py

@@ -13,7 +13,7 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry

-from .exceptions import WaybackError
+from .exceptions import BlockedSiteError, WaybackError
 from .utils import DEFAULT_USER_AGENT

@@ -28,12 +28,38 @@ def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
     headers = {"User-Agent": user_agent}
     request_url = full_url(endpoint, params=payload)
     response = get_response(request_url, headers=headers)
+    check_for_blocked_site(response, url)
     if isinstance(response, requests.Response):
         return int(response.text.strip())
     raise response


+def check_for_blocked_site(
+    response: Union[requests.Response, Exception], url: Optional[str] = None
+) -> None:
+    """
+    Checks whether the URL can be archived by the Wayback Machine or not.
+    The robots.txt policy of the site may prevent the Wayback Machine.
+    """
+    # see https://github.com/akamhy/waybackpy/issues/157
+
+    # The following if block is to make mypy happy.
+    if isinstance(response, Exception):
+        raise response
+
+    if not url:
+        url = "The requested content"
+    if (
+        "org.archive.util.io.RuntimeIOException: "
+        + "org.archive.wayback.exception.AdministrativeAccessControlException: "
+        + "Blocked Site Error"
+        in response.text.strip()
+    ):
+        raise BlockedSiteError(
+            f"{url} is excluded from Wayback Machine by the site's robots.txt policy."
+        )
+
+
 def full_url(endpoint: str, params: Dict[str, Any]) -> str:
     """
     As the function's name already implies that it returns

@@ -76,6 +102,7 @@ def get_response(
     session.mount("https://", HTTPAdapter(max_retries=retries_))
     response = session.get(url, headers=headers)
     session.close()
+    check_for_blocked_site(response)
     return response

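Because get_response now runs check_for_blocked_site on every response, a
robots.txt-excluded site surfaces as BlockedSiteError anywhere the library
makes a request. A caller-side sketch (the URL and user agent are
illustrative); since BlockedSiteError subclasses WaybackError, the narrower
except clause has to come first:

from waybackpy import WaybackMachineCDXServerAPI
from waybackpy.exceptions import BlockedSiteError, WaybackError

cdx = WaybackMachineCDXServerAPI(url="example.com", user_agent="my-agent/1.0")
try:
    for snapshot in cdx.snapshots():
        print(snapshot.archive_url)
except BlockedSiteError as err:
    # The site's robots.txt policy excludes it from the Wayback Machine.
    print(f"Blocked: {err}")
except WaybackError as err:
    print(f"Other Wayback Machine failure: {err}")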
waybackpy/cli.py

@@ -63,6 +63,9 @@ def handle_cdx(data: List[Any]) -> None:
     limit = data[7]
     gzip = data[8]
     match_type = data[9]
+    sort = data[10]
+    use_pagination = data[11]
+    closest = data[12]

     filters = list(cdx_filter)
     collapses = list(collapse)

@@ -73,8 +76,11 @@ def handle_cdx(data: List[Any]) -> None:
         user_agent=user_agent,
         start_timestamp=start_timestamp,
         end_timestamp=end_timestamp,
+        closest=closest,
         filters=filters,
         match_type=match_type,
+        sort=sort,
+        use_pagination=use_pagination,
         gzip=gzip,
         collapses=collapses,
         limit=limit,
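Note that handle_cdx unpacks its argument positionally, so the three new
values must be appended to the data list in main() in exactly the order they
are indexed above. A minimal sketch of that contract:

# Sketch of the positional coupling between main() and handle_cdx():
# indices 0-9 are the pre-existing fields, the commit appends three more.
data = ["..."] * 10 + ["closest", True, "201010101010"]
sort, use_pagination, closest = data[10], data[11], data[12]
assert (sort, use_pagination, closest) == ("closest", True, "201010101010")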
@@ -249,7 +255,6 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     help="Use with '--known_urls' to save the URLs in file at current directory.",
 )
 @click.option(
-    "-c",
     "--cdx",
     default=False,
     is_flag=True,

@@ -269,6 +274,12 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     "--to",
     help="End timestamp for CDX API in yyyyMMddhhmmss format.",
 )
+@click.option(
+    "-C",
+    "--closest",
+    help="Archives that are closest to the timestamp passed as argument to this "
+    + "parameter.",
+)
 @click.option(
     "-f",
     "--cdx-filter",

@@ -285,6 +296,20 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
     + "However, the CDX server can also return results matching a certain prefix, "
     + "a certain host, or all sub-hosts by using the match_type",
 )
+@click.option(
+    "-st",
+    "--sort",
+    help="Choose one from default, closest or reverse. It returns sorted CDX entries "
+    + "in the response.",
+)
+@click.option(
+    "-up",
+    "--use-pagination",
+    "--use_pagination",
+    default=False,
+    is_flag=True,
+    help="Use the pagination API of the CDX server instead of the default one.",
+)
 @click.option(
     "-gz",
     "--gzip",
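A sketch of exercising the new CLI flags without a shell, using click's test
runner (this assumes the pre-existing --url and --cdx options of the CLI;
all values are illustrative):

from click.testing import CliRunner

from waybackpy.cli import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "--url", "https://www.google.com",
        "--cdx",
        "--closest", "201010101010",
        "--sort", "closest",
        "--use-pagination",
    ],
)
print(result.output)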
@@ -326,6 +351,7 @@ def main(  # pylint: disable=no-value-for-parameter
     subdomain: bool,
     file: bool,
     cdx: bool,
+    use_pagination: bool,
     cdx_filter: List[str],
     collapse: List[str],
     cdx_print: List[str],

@@ -337,7 +363,9 @@ def main(  # pylint: disable=no-value-for-parameter
     minute: Optional[int] = None,
     start_timestamp: Optional[str] = None,
     end_timestamp: Optional[str] = None,
+    closest: Optional[str] = None,
     match_type: Optional[str] = None,
+    sort: Optional[str] = None,
     gzip: Optional[str] = None,
     limit: Optional[str] = None,
 ) -> None:

@@ -428,6 +456,9 @@ def main(  # pylint: disable=no-value-for-parameter
         limit,
         gzip,
         match_type,
+        sort,
+        use_pagination,
+        closest,
     ]
     handle_cdx(data)
waybackpy/exceptions.py

@@ -16,6 +16,13 @@ class WaybackError(Exception):
     """


+class BlockedSiteError(WaybackError):
+    """
+    Raised when archives for websites/URLs that have been excluded from the
+    Wayback Machine are requested via the CDX server API.
+    """
+
+
 class TooManyRequestsError(WaybackError):
     """
     Raised when you make more than 15 requests per