Typing (#128)
* fix: CI yml name * add: mypy configuraion * add: type annotation to waybackpy modules * add: type annotation to test modules * fix: mypy command * add: types-requests to dev deps * fix: disable max-line-length * fix: move pytest.ini into setup.cfg * add: urllib3 to deps * fix: Retry (ref: https://github.com/python/typeshed/issues/6893) * fix: f-string * fix: shorten long lines * add: staticmethod decorator to no-self-use methods * fix: str(headers)->headers_str * fix: error message * fix: revert "str(headers)->headers_str" and ignore assignment CaseInsensitiveDict with str * fix: mypy error
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
from typing import Dict, Generator, List, Optional, cast
|
||||
|
||||
from .cdx_snapshot import CDXSnapshot
|
||||
from .cdx_utils import (
|
||||
check_collapses,
|
||||
@@ -11,43 +13,48 @@ from .exceptions import WaybackError
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
|
||||
|
||||
class WaybackMachineCDXServerAPI:
|
||||
class WaybackMachineCDXServerAPI(object):
|
||||
"""
|
||||
Class that interfaces the CDX server API of the Wayback Machine.
|
||||
"""
|
||||
|
||||
# start_timestamp: from, can not use from as it's a keyword
|
||||
# end_timestamp: to, not using to as can not use from
|
||||
def __init__(
|
||||
self,
|
||||
url,
|
||||
user_agent=DEFAULT_USER_AGENT,
|
||||
start_timestamp=None, # from, can not use from as it's a keyword
|
||||
end_timestamp=None, # to, not using to as can not use from
|
||||
filters=[],
|
||||
match_type=None,
|
||||
gzip=None,
|
||||
collapses=[],
|
||||
limit=None,
|
||||
max_tries=3,
|
||||
):
|
||||
url: str,
|
||||
user_agent: str = DEFAULT_USER_AGENT,
|
||||
start_timestamp: Optional[str] = None,
|
||||
end_timestamp: Optional[str] = None,
|
||||
filters: List[str] = [],
|
||||
match_type: Optional[str] = None,
|
||||
gzip: Optional[str] = None,
|
||||
collapses: List[str] = [],
|
||||
limit: Optional[str] = None,
|
||||
max_tries: int = 3,
|
||||
) -> None:
|
||||
self.url = str(url).strip().replace(" ", "%20")
|
||||
self.user_agent = user_agent
|
||||
self.start_timestamp = str(start_timestamp) if start_timestamp else None
|
||||
self.end_timestamp = str(end_timestamp) if end_timestamp else None
|
||||
self.start_timestamp = (
|
||||
str(start_timestamp) if start_timestamp is not None else None
|
||||
)
|
||||
self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
|
||||
self.filters = filters
|
||||
check_filters(self.filters)
|
||||
self.match_type = str(match_type).strip() if match_type else None
|
||||
self.match_type = str(match_type).strip() if match_type is not None else None
|
||||
check_match_type(self.match_type, self.url)
|
||||
self.gzip = gzip if gzip else True
|
||||
self.gzip = gzip
|
||||
self.collapses = collapses
|
||||
check_collapses(self.collapses)
|
||||
self.limit = limit if limit else 5000
|
||||
self.limit = limit if limit is not None else 5000
|
||||
self.max_tries = max_tries
|
||||
self.last_api_request_url = None
|
||||
self.last_api_request_url: Optional[str] = None
|
||||
self.use_page = False
|
||||
self.endpoint = "https://web.archive.org/cdx/search/cdx"
|
||||
|
||||
def cdx_api_manager(self, payload, headers, use_page=False):
|
||||
|
||||
def cdx_api_manager(
|
||||
self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
|
||||
) -> Generator[str, None, None]:
|
||||
total_pages = get_total_pages(self.url, self.user_agent)
|
||||
# If we only have two or less pages of archives then we care for more accuracy
|
||||
# pagination API is lagged sometimes
|
||||
@@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI:
|
||||
|
||||
url = full_url(self.endpoint, params=payload)
|
||||
res = get_response(url, headers=headers)
|
||||
if isinstance(res, Exception):
|
||||
raise res
|
||||
|
||||
self.last_api_request_url = url
|
||||
text = res.text
|
||||
@@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI:
|
||||
|
||||
yield text
|
||||
else:
|
||||
|
||||
payload["showResumeKey"] = "true"
|
||||
payload["limit"] = str(self.limit)
|
||||
resumeKey = None
|
||||
|
||||
more = True
|
||||
while more:
|
||||
|
||||
if resumeKey:
|
||||
payload["resumeKey"] = resumeKey
|
||||
|
||||
url = full_url(self.endpoint, params=payload)
|
||||
res = get_response(url, headers=headers)
|
||||
if isinstance(res, Exception):
|
||||
raise res
|
||||
|
||||
self.last_api_request_url = url
|
||||
|
||||
@@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI:
|
||||
|
||||
yield text
|
||||
|
||||
def add_payload(self, payload):
|
||||
def add_payload(self, payload: Dict[str, str]) -> None:
|
||||
if self.start_timestamp:
|
||||
payload["from"] = self.start_timestamp
|
||||
|
||||
if self.end_timestamp:
|
||||
payload["to"] = self.end_timestamp
|
||||
|
||||
if self.gzip is not True:
|
||||
if self.gzip is None:
|
||||
payload["gzip"] = "false"
|
||||
|
||||
if self.match_type:
|
||||
@@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI:
|
||||
# Don't need to return anything as it's dictionary.
|
||||
payload["url"] = self.url
|
||||
|
||||
def snapshots(self):
|
||||
payload = {}
|
||||
def snapshots(self) -> Generator[CDXSnapshot, None, None]:
|
||||
payload: Dict[str, str] = {}
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
|
||||
self.add_payload(payload)
|
||||
@@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI:
|
||||
if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
|
||||
continue
|
||||
|
||||
properties = {
|
||||
properties: Dict[str, Optional[str]] = {
|
||||
"urlkey": None,
|
||||
"timestamp": None,
|
||||
"original": None,
|
||||
@@ -169,15 +177,9 @@ class WaybackMachineCDXServerAPI:
|
||||
|
||||
if prop_values_len != properties_len:
|
||||
raise WaybackError(
|
||||
"Snapshot returned by Cdx API has {prop_values_len} properties".format(
|
||||
prop_values_len=prop_values_len
|
||||
)
|
||||
+ " instead of expected {properties_len} ".format(
|
||||
properties_len=properties_len
|
||||
)
|
||||
+ "properties.\nProblematic Snapshot : {snapshot}".format(
|
||||
snapshot=snapshot
|
||||
)
|
||||
f"Snapshot returned by Cdx API has {prop_values_len} "
|
||||
f"properties instead of expected {properties_len} properties.\n"
|
||||
f"Problematic Snapshot: {snapshot}"
|
||||
)
|
||||
|
||||
(
|
||||
@@ -190,4 +192,4 @@ class WaybackMachineCDXServerAPI:
|
||||
properties["length"],
|
||||
) = prop_values
|
||||
|
||||
yield CDXSnapshot(properties)
|
||||
yield CDXSnapshot(cast(Dict[str, str], properties))
|
||||
|
Reference in New Issue
Block a user