waybackpy/tests/test_cdx_utils.py

104 lines
2.7 KiB
Python
Raw Normal View History

from typing import Any, Dict, List
2022-01-24 18:35:47 +01:00
import pytest
2022-01-24 18:35:47 +01:00
from waybackpy.cdx_utils import (
check_collapses,
check_filters,
2022-01-24 18:35:47 +01:00
check_match_type,
full_url,
get_response,
get_total_pages,
2022-01-24 18:35:47 +01:00
)
from waybackpy.exceptions import WaybackError
2022-01-24 18:35:47 +01:00
def test_get_total_pages() -> None:
    """Check that ``get_total_pages`` reports at least 56 pages for twitter.com.

    NOTE(review): presumably this queries the live CDX server, so the result
    depends on network access and on the archive's current size -- confirm.
    """
    url = "twitter.com"
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
    )

    # Only a lower bound is asserted; the exact page count is not pinned.
    assert get_total_pages(url=url, user_agent=user_agent) >= 56
def test_full_url() -> None:
    """Check how ``full_url`` combines an endpoint with query parameters.

    Covers an empty parameter dict, an endpoint with and without a trailing
    ``?``, several parameters, a non-string value and a value with a space.
    """
    endpoint = "https://web.archive.org/cdx/search/cdx"

    # No parameters: the endpoint is expected back unchanged.
    params: Dict[str, Any] = {}
    assert endpoint == full_url(endpoint, params)

    # One parameter: same result whether or not the endpoint ends in "?".
    params = {"a": "1"}
    assert full_url(endpoint, params) == "https://web.archive.org/cdx/search/cdx?a=1"
    assert (
        full_url(endpoint + "?", params) == "https://web.archive.org/cdx/search/cdx?a=1"
    )

    # A second, integer-valued parameter is expected to be joined with "&".
    params["b"] = 2
    assert (
        full_url(endpoint + "?", params)
        == "https://web.archive.org/cdx/search/cdx?a=1&b=2"
    )

    # A space inside a value is expected to come out as "%20".
    params["c"] = "foo bar"
    assert (
        full_url(endpoint + "?", params)
        == "https://web.archive.org/cdx/search/cdx?a=1&b=2&c=foo%20bar"
    )
def test_get_response() -> None:
    """Check that ``get_response`` yields a 200 response for https://github.com.

    NOTE(review): presumably this performs a real HTTP request, so it needs
    network access -- confirm.
    """
    url = "https://github.com"
    user_agent = (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
    )
    headers = {"User-Agent": user_agent}

    response = get_response(url, headers=headers)

    # Two separate assertions so a failure shows whether an exception object
    # came back or the status code was wrong.
    assert not isinstance(response, Exception)
    assert response.status_code == 200
2022-01-24 18:35:47 +01:00
def test_check_filters() -> None:
    """Check which filter arguments ``check_filters`` accepts and rejects.

    An empty list and a list of ``field:value`` strings must pass without
    raising; a non-list argument and a malformed entry must raise
    ``WaybackError``.
    """
    # Accepted inputs: the calls simply must not raise.
    filters: List[str] = []
    check_filters(filters)

    filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"]
    check_filters(filters)

    # Rejected inputs: each gets its own ``raises`` block, because nothing
    # after the raising call inside a block would be executed.
    with pytest.raises(WaybackError):
        check_filters("not-list")  # type: ignore[arg-type]

    with pytest.raises(WaybackError):
        check_filters(["invalid"])
def test_check_collapses() -> None:
    """Check which collapse arguments ``check_collapses`` accepts and rejects.

    An empty list, ``["timestamp:10"]`` and ``["urlkey"]`` must pass without
    raising; a bare string and an unrecognised entry must raise
    ``WaybackError``.
    """
    # Accepted inputs: the calls simply must not raise.
    collapses: List[str] = []
    check_collapses(collapses)

    collapses = ["timestamp:10"]
    check_collapses(collapses)

    collapses = ["urlkey"]
    check_collapses(collapses)

    # Rejected: a plain string where a list is expected.
    collapses = "urlkey"  # type: ignore[assignment]
    with pytest.raises(WaybackError):
        check_collapses(collapses)

    # Rejected: a list entry that is not a valid collapse expression.
    collapses = ["also illegal collapse"]
    with pytest.raises(WaybackError):
        check_collapses(collapses)
def test_check_match_type() -> None:
    """Check which match type / URL pairs ``check_match_type`` accepts.

    A missing match type and ``"exact"`` must give a truthy result; a URL
    containing ``*`` combined with a match type, and an unknown match type,
    must raise ``WaybackError``.
    """
    # Accepted: no match type at all.
    assert check_match_type(None, "url")

    # Accepted: a known match type with a plain URL.
    match_type = "exact"
    url = "test_url"
    assert check_match_type(match_type, url)

    # Rejected: a URL containing "*" together with an explicit match type.
    url = "has * in it"
    with pytest.raises(WaybackError):
        check_match_type("domain", url)

    # Rejected: a match type the function does not recognise.
    with pytest.raises(WaybackError):
        check_match_type("not a valid type", "url")