Add sort, use_pagination and closest (#158)

* add sort param support in CDX API class

see https://nla.github.io/outbackcdx/api.html#operation/query

sort takes a string input, which must be one of the following:
- default
- closest
- reverse

This commit shall help in closing issue at https://github.com/akamhy/waybackpy/issues/155

* add BlockedSiteError for cases when archiving is blocked by site's robots.txt

* create check_for_blocked_site for handling the BlockedSiteError for sites that are blocking wayback machine by their robots.txt policy

* add attrs use_pagination and closest, which can be used to use the pagination API and to look up the archive closest to a timestamp, respectively. And now, to get out of the infinite blank pages loop, just check for two successive blank pages rather than two blank pages in total while using the CDX server API.

* added cli support for sort, use-pagination and closest

* added tests

* fix codeql warnings, nothing to worry about here.

* fix save test for archive_url
This commit is contained in:
Akash Mahanty
2022-02-18 00:24:14 +05:30
committed by GitHub
parent 3a44a710d3
commit f990b93f8a
7 changed files with 164 additions and 44 deletions

View File

@ -32,7 +32,11 @@ def test_b() -> None:
url = "https://www.google.com"
wayback = WaybackMachineCDXServerAPI(
url=url, user_agent=user_agent, start_timestamp="202101", end_timestamp="202112"
url=url,
user_agent=user_agent,
start_timestamp="202101",
end_timestamp="202112",
collapses=["urlkey"],
)
# timeframe bound prefix matching enabled along with active urlkey based collapsing
@ -40,3 +44,49 @@ def test_b() -> None:
for snapshot in snapshots:
assert snapshot.timestamp.startswith("2021")
def test_c() -> None:
    """Query the CDX API with ``sort="closest"`` and check the first snapshot.

    The request asks for a single result (``limit="1"``) closest to the
    timestamp ``201010101010``; the first snapshot returned must point at
    google.com and carry a timestamp containing ``20101010``.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    url = "https://www.google.com"
    cdx = WaybackMachineCDXServerAPI(
        url=url,
        user_agent=user_agent,
        closest="201010101010",
        sort="closest",
        limit="1",
    )
    # Only the first snapshot matters. Defaulting to None turns an empty
    # result into a readable assertion failure instead of an unbound-name error.
    first_snapshot = next(iter(cdx.snapshots()), None)
    assert first_snapshot is not None, "CDX API returned no snapshots"

    # Use a membership test: ``str.find`` returns -1 (truthy) on a miss, so
    # asserting on its return value can never detect a missing substring.
    assert "google.com" in str(first_snapshot.archive_url)
    assert "20101010" in first_snapshot.timestamp
def test_d() -> None:
    """Iterate prefix-matched snapshots with ``use_pagination=True``.

    Every snapshot's archive URL must mention ``akamhy.github.io`` and more
    than 50 snapshots must be yielded in total.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    cdx = WaybackMachineCDXServerAPI(
        url="akamhy.github.io",
        user_agent=user_agent,
        match_type="prefix",
        use_pagination=True,
        filters=["statuscode:200"],
    )
    count = 0
    for snapshot in cdx.snapshots():
        count += 1
        # Membership test instead of ``str.find``: find() yields -1 (truthy)
        # when the substring is absent, which made the old assertion vacuous.
        assert "akamhy.github.io" in str(snapshot.archive_url)
    assert count > 50

View File

@ -219,4 +219,5 @@ def test_archive_url() -> None:
save_api.saved_archive = (
"https://web.archive.org/web/20220124063056/https://example.com/"
)
save_api._archive_url = save_api.saved_archive
assert save_api.archive_url == save_api.saved_archive

View File

@ -35,4 +35,11 @@ def test_total_archives() -> None:
def test_known_urls() -> None:
    """Expect more than 40 known URLs, both without and with ``subdomain=True``."""
    target = Url("akamhy.github.io")

    default_urls = list(target.known_urls())
    assert len(default_urls) > 40

    subdomain_urls = list(target.known_urls(subdomain=True))
    assert len(subdomain_urls) > 40
def test_Save() -> None:
    """Save a page through ``Url`` and check the archive URL names that page."""
    page = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property")
    page.save()
    saved_location = str(page.archive_url)
    assert "Asymptotic_equipartition_property" in saved_location