* fix: CI yml name

* add: mypy configuraion

* add: type annotation to waybackpy modules

* add: type annotation to test modules

* fix: mypy command

* add: types-requests to dev deps

* fix: disable max-line-length

* fix: move pytest.ini into setup.cfg

* add: urllib3 to deps

* fix: Retry (ref: https://github.com/python/typeshed/issues/6893)

* fix: f-string

* fix: shorten long lines

* add: staticmethod decorator to no-self-use methods

* fix: str(headers)->headers_str

* fix: error message

* fix: revert "str(headers)->headers_str" and ignore assignment CaseInsensitiveDict with str

* fix: mypy error
This commit is contained in:
eggplants
2022-02-05 03:23:36 +09:00
committed by GitHub
parent 320ef30371
commit d8cabdfdb5
22 changed files with 537 additions and 364 deletions

View File

@@ -1,3 +1,5 @@
from typing import Dict, Generator, List, Optional, cast
from .cdx_snapshot import CDXSnapshot
from .cdx_utils import (
check_collapses,
@@ -11,43 +13,48 @@ from .exceptions import WaybackError
from .utils import DEFAULT_USER_AGENT
class WaybackMachineCDXServerAPI:
class WaybackMachineCDXServerAPI(object):
"""
Class that interfaces the CDX server API of the Wayback Machine.
"""
# start_timestamp: from, can not use from as it's a keyword
# end_timestamp: to, not using to as can not use from
def __init__(
self,
url,
user_agent=DEFAULT_USER_AGENT,
start_timestamp=None, # from, can not use from as it's a keyword
end_timestamp=None, # to, not using to as can not use from
filters=[],
match_type=None,
gzip=None,
collapses=[],
limit=None,
max_tries=3,
):
url: str,
user_agent: str = DEFAULT_USER_AGENT,
start_timestamp: Optional[str] = None,
end_timestamp: Optional[str] = None,
filters: List[str] = [],
match_type: Optional[str] = None,
gzip: Optional[str] = None,
collapses: List[str] = [],
limit: Optional[str] = None,
max_tries: int = 3,
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent
self.start_timestamp = str(start_timestamp) if start_timestamp else None
self.end_timestamp = str(end_timestamp) if end_timestamp else None
self.start_timestamp = (
str(start_timestamp) if start_timestamp is not None else None
)
self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
self.filters = filters
check_filters(self.filters)
self.match_type = str(match_type).strip() if match_type else None
self.match_type = str(match_type).strip() if match_type is not None else None
check_match_type(self.match_type, self.url)
self.gzip = gzip if gzip else True
self.gzip = gzip
self.collapses = collapses
check_collapses(self.collapses)
self.limit = limit if limit else 5000
self.limit = limit if limit is not None else 5000
self.max_tries = max_tries
self.last_api_request_url = None
self.last_api_request_url: Optional[str] = None
self.use_page = False
self.endpoint = "https://web.archive.org/cdx/search/cdx"
def cdx_api_manager(self, payload, headers, use_page=False):
def cdx_api_manager(
self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
) -> Generator[str, None, None]:
total_pages = get_total_pages(self.url, self.user_agent)
# If we only have two or less pages of archives then we care for more accuracy
# pagination API is lagged sometimes
@@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI:
url = full_url(self.endpoint, params=payload)
res = get_response(url, headers=headers)
if isinstance(res, Exception):
raise res
self.last_api_request_url = url
text = res.text
@@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI:
yield text
else:
payload["showResumeKey"] = "true"
payload["limit"] = str(self.limit)
resumeKey = None
more = True
while more:
if resumeKey:
payload["resumeKey"] = resumeKey
url = full_url(self.endpoint, params=payload)
res = get_response(url, headers=headers)
if isinstance(res, Exception):
raise res
self.last_api_request_url = url
@@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI:
yield text
def add_payload(self, payload):
def add_payload(self, payload: Dict[str, str]) -> None:
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip is not True:
if self.gzip is None:
payload["gzip"] = "false"
if self.match_type:
@@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI:
# Don't need to return anything as it's dictionary.
payload["url"] = self.url
def snapshots(self):
payload = {}
def snapshots(self) -> Generator[CDXSnapshot, None, None]:
payload: Dict[str, str] = {}
headers = {"User-Agent": self.user_agent}
self.add_payload(payload)
@@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI:
if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
continue
properties = {
properties: Dict[str, Optional[str]] = {
"urlkey": None,
"timestamp": None,
"original": None,
@@ -169,15 +177,9 @@ class WaybackMachineCDXServerAPI:
if prop_values_len != properties_len:
raise WaybackError(
"Snapshot returned by Cdx API has {prop_values_len} properties".format(
prop_values_len=prop_values_len
)
+ " instead of expected {properties_len} ".format(
properties_len=properties_len
)
+ "properties.\nProblematic Snapshot : {snapshot}".format(
snapshot=snapshot
)
f"Snapshot returned by Cdx API has {prop_values_len} "
f"properties instead of expected {properties_len} properties.\n"
f"Problematic Snapshot: {snapshot}"
)
(
@@ -190,4 +192,4 @@ class WaybackMachineCDXServerAPI:
properties["length"],
) = prop_values
yield CDXSnapshot(properties)
yield CDXSnapshot(cast(Dict[str, str], properties))