waybackpy/tests/test_cdx.py

94 lines
2.5 KiB
Python
Raw Normal View History

2021-01-09 21:53:53 +01:00
import pytest
from waybackpy.cdx import Cdx
from waybackpy.exceptions import WaybackError
def test_all_cdx():
url = "akamhy.github.io"
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, \
like Gecko) Chrome/45.0.2454.85 Safari/537.36"
cdx = Cdx(
url=url,
user_agent=user_agent,
start_timestamp=2017,
end_timestamp=2020,
filters=[
"statuscode:200",
"mimetype:text/html",
"timestamp:20201002182319",
"original:https://akamhy.github.io/",
],
gzip=False,
collapses=["timestamp:10", "digest"],
limit=50,
match_type="prefix",
)
snapshots = cdx.snapshots()
for snapshot in snapshots:
ans = snapshot.archive_url
assert "https://web.archive.org/web/20201002182319/https://akamhy.github.io/" == ans
url = "akahfjgjkmhy.gihthub.ip"
cdx = Cdx(
url=url,
user_agent=user_agent,
start_timestamp=None,
end_timestamp=None,
filters=[],
match_type=None,
gzip=True,
collapses=[],
limit=10,
)
snapshots = cdx.snapshots()
print(snapshots)
i = 0
for _ in snapshots:
i += 1
assert i == 0
url = "https://github.com/akamhy/waybackpy/*"
cdx = Cdx(url=url, user_agent=user_agent, limit=50)
snapshots = cdx.snapshots()
for snapshot in snapshots:
print(snapshot.archive_url)
url = "https://github.com/akamhy/waybackpy"
with pytest.raises(WaybackError):
cdx = Cdx(url=url, user_agent=user_agent, limit=50, filters=["ghddhfhj"])
snapshots = cdx.snapshots()
with pytest.raises(WaybackError):
cdx = Cdx(url=url, user_agent=user_agent, collapses=["timestamp", "ghdd:hfhj"])
snapshots = cdx.snapshots()
url = "https://github.com"
cdx = Cdx(url=url, user_agent=user_agent, limit=50)
snapshots = cdx.snapshots()
c = 0
for snapshot in snapshots:
c += 1
if c > 100:
break
url = "https://github.com/*"
cdx = Cdx(url=url, user_agent=user_agent, collapses=["timestamp"])
snapshots = cdx.snapshots()
c = 0
for snapshot in snapshots:
c += 1
if c > 30529: # deafult limit is 10k
2021-01-09 21:53:53 +01:00
break
url = "https://github.com/*"
cdx = Cdx(url=url, user_agent=user_agent)
c = 0
snapshots = cdx.snapshots()
for snapshot in snapshots:
c += 1
if c > 100529:
2021-01-09 21:53:53 +01:00
break