Files
waybackpy/waybackpy/cdx_api.py
2022-02-08 04:17:41 +09:00

264 lines
9.6 KiB
Python

"""
This module interfaces the Wayback Machine's CDX server API.
The module has WaybackMachineCDXServerAPI which should be used by the users of
this module to consume the CDX server API.
WaybackMachineCDXServerAPI has a snapshots method that yields the snapshots,
and the snapshots are yielded as instances of the CDXSnapshot class.
"""
from typing import Dict, Generator, List, Optional, cast
from .cdx_snapshot import CDXSnapshot
from .cdx_utils import (
check_collapses,
check_filters,
check_match_type,
full_url,
get_response,
get_total_pages,
)
from .exceptions import WaybackError
from .utils import DEFAULT_USER_AGENT
class WaybackMachineCDXServerAPI:
    """
    Class that interfaces the CDX server API of the Wayback Machine.

    snapshots() returns a generator that can be iterated upon by the end-user.
    The generator yields the snapshots/entries as instances of CDXSnapshot, so
    every field of an entry is reachable as an attribute via a dot ".".

    Constructor arguments:
        url: URL to query. It is converted to str, stripped, and every space
            is replaced with "%20".
        user_agent: value sent in the User-Agent header.
        start_timestamp: sent as the "from" query parameter. It is not named
            "from" because that is a Python keyword.
        end_timestamp: sent as the "to" query parameter. Named to mirror
            start_timestamp.
        filters: each item is added to the query under a "filter<index>" key.
            Validated with check_filters.
        match_type: sent as the "matchType" query parameter. Validated with
            check_match_type.
        gzip: when None, "gzip=false" is added to the query. Any other value
            leaves the gzip parameter out of the query entirely.
        collapses: each item is added to the query under a "collapse<index>"
            key. Validated with check_collapses. A non-empty list also turns
            the pagination API off in snapshots().
        limit: sent as the "limit" query parameter on non-paginated requests.
            Defaults to 5000.
        max_tries: stored on the instance unchanged; no method of this class
            reads it.
    """

    def __init__(
        self,
        url: str,
        user_agent: str = DEFAULT_USER_AGENT,
        start_timestamp: Optional[str] = None,
        end_timestamp: Optional[str] = None,
        filters: Optional[List[str]] = None,
        match_type: Optional[str] = None,
        gzip: Optional[str] = None,
        collapses: Optional[List[str]] = None,
        limit: Optional[str] = None,
        max_tries: int = 3,
    ) -> None:
        self.url = str(url).strip().replace(" ", "%20")
        self.user_agent = user_agent
        self.start_timestamp = None if start_timestamp is None else str(start_timestamp)
        self.end_timestamp = None if end_timestamp is None else str(end_timestamp)

        # Each check_* helper runs right after its attribute is assigned, so
        # validation errors surface in the same order as the arguments.
        self.filters = [] if filters is None else filters
        check_filters(self.filters)
        self.match_type = None if match_type is None else str(match_type).strip()
        check_match_type(self.match_type, self.url)
        self.gzip = gzip
        self.collapses = [] if collapses is None else collapses
        check_collapses(self.collapses)

        self.limit = 5000 if limit is None else limit
        self.max_tries = max_tries
        # URL of the most recent request that returned a response.
        self.last_api_request_url: Optional[str] = None
        self.use_page = False
        self.endpoint = "https://web.archive.org/cdx/search/cdx"

    def _fetch_text(self, payload: Dict[str, str], headers: Dict[str, str]) -> str:
        """
        Request the endpoint with the given payload and return the body text.

        get_response may hand back an Exception instance instead of a
        response; in that case the exception is raised here. On success the
        requested URL is remembered in last_api_request_url.
        """
        url = full_url(self.endpoint, params=payload)
        res = get_response(url, headers=headers)
        if isinstance(res, Exception):
            raise res
        self.last_api_request_url = url
        return res.text

    def cdx_api_manager(
        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
    ) -> Generator[str, None, None]:
        """
        Manages the API calls for the instance, it automatically selects the
        best parameters by looking at the query of the end-user. For bigger
        queries it uses the CDX pagination API and for smaller queries it uses
        the normal API.

        CDX Server API is a complex API and to make it easy for the end user
        to consume it the CDX manager (this method) handles the selection of
        the API output, whether to use the pagination API or not.

        For doing large/bulk queries, the use of the pagination API is
        recommended by the Wayback Machine authors. Whether a query is large
        is determined through get_total_pages, which asks the pagination API
        (showNumPages=true) how many pages of CDX data it would return.

        If the number of pages is less than 2 the normal non-pagination API is
        used, as the pagination API is known to lag. For big queries that
        should not matter, but for queries with few pages this method chooses
        accuracy over the pagination API.

        The payload dictionary passed in is mutated: "page" is set on the
        pagination path; "showResumeKey", "limit" and "resumeKey" are set on
        the normal path.

        Each yielded string is the text of one response. On the normal path
        the text is stripped and the trailing resume key is removed.
        """
        # The page count is only consulted when pagination was requested, so
        # skip the extra showNumPages request otherwise.
        total_pages = (
            get_total_pages(self.url, self.user_agent) if use_page is True else 0
        )

        if total_pages >= 2:
            # Count of empty pages seen so far (cumulative, not consecutive);
            # the second empty page ends the iteration without being yielded.
            blank_pages = 0
            for page in range(total_pages):
                payload["page"] = str(page)
                text = self._fetch_text(payload, headers)
                if len(text) == 0:
                    blank_pages += 1
                    if blank_pages >= 2:
                        break
                yield text
        else:
            payload["showResumeKey"] = "true"
            payload["limit"] = str(self.limit)
            resume_key = None
            more = True
            while more:
                if resume_key:
                    payload["resumeKey"] = resume_key
                text = self._fetch_text(payload, headers).strip()
                lines = text.splitlines()
                more = False
                # A response that carries a resume key ends with an empty line
                # followed by the key itself.
                if len(lines) >= 3 and len(lines[-2]) == 0:
                    resume_key = lines[-1].strip()
                    # text is stripped, so the key is exactly its suffix: cut
                    # it off the end rather than replacing the first match,
                    # which could hit an identical substring earlier on.
                    text = text[: -len(resume_key)].strip()
                    more = True
                yield text

    def add_payload(self, payload: Dict[str, str]) -> None:
        """
        Fills the given payload dictionary, in place, with the query
        parameters derived from the attributes of the instance.

        Nothing is returned because the caller's dictionary is mutated.
        """
        if self.start_timestamp:
            payload["from"] = self.start_timestamp
        if self.end_timestamp:
            payload["to"] = self.end_timestamp
        # Only the "gzip was not specified" case adds a parameter; any other
        # value of self.gzip leaves the key out of the payload.
        if self.gzip is None:
            payload["gzip"] = "false"
        if self.match_type:
            payload["matchType"] = self.match_type
        # The index suffix keeps several filters/collapses apart as dict keys.
        # NOTE(review): presumably full_url maps them back to plain
        # "filter"/"collapse" parameters -- confirm in cdx_utils.
        if self.filters:
            for index, _filter in enumerate(self.filters):
                payload["filter" + str(index)] = _filter
        if self.collapses:
            for index, collapse in enumerate(self.collapses):
                payload["collapse" + str(index)] = collapse
        payload["url"] = self.url

    def snapshots(self) -> Generator[CDXSnapshot, None, None]:
        """
        This function yields the CDX data lines as snapshots.

        As it is a generator it is exhaustible. The reasons that this is a
        generator and not a list are:

        a) CDX server API can return millions of entries for a query and a
           list is not suitable for such cases.

        b) Preventing memory usage issues, as told before this method may
           yield millions of records for some queries and your system may not
           have enough memory for such a big list. Also remember this if
           outputting to Jupyter Notebooks.

        The objects yielded by this method are instances of the CDXSnapshot
        class, you can access the attributes of the entries as the attributes
        of the instance itself.

        Raises WaybackError when a line of at least 46 characters does not
        split into exactly 7 space-separated values.
        """
        payload: Dict[str, str] = {}
        headers = {"User-Agent": self.user_agent}
        self.add_payload(payload)

        # NOTE(review): `not` binds tighter than `or`, so this condition reads
        # `(not start_timestamp) or end_timestamp`. If the intent was "no
        # timestamp bound given" it should be
        # `not (start_timestamp or end_timestamp)` -- confirm before changing.
        if not self.start_timestamp or self.end_timestamp:
            self.use_page = True
        if self.collapses != []:
            self.use_page = False

        entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
        for entry in entries:
            if len(entry) <= 1 or entry.isspace():
                continue

            # Each line is a snapshot aka entry of the CDX server API.
            # Splitting by lines is fine memory-wise because only a single
            # page is split at a time, never all the entries at once.
            for snapshot in entry.split("\n"):
                # 14 + 32 == 46 (timestamp + digest). Lines shorter than a
                # standard wayback timestamp plus a standard digest can not be
                # valid entries and are ignored.
                if len(snapshot) < 46:
                    continue

                properties: Dict[str, Optional[str]] = {
                    "urlkey": None,
                    "timestamp": None,
                    "original": None,
                    "mimetype": None,
                    "statuscode": None,
                    "digest": None,
                    "length": None,
                }

                property_value = snapshot.split(" ")
                total_property_values = len(property_value)
                warranted_total_property_values = len(properties)

                if total_property_values != warranted_total_property_values:
                    raise WaybackError(
                        f"Snapshot returned by CDX API has {total_property_values} prop"
                        f"erties instead of expected {warranted_total_property_values} "
                        f"properties.\nProblematic Snapshot: {snapshot}"
                    )

                # The values arrive in the same order as the keys above.
                (
                    properties["urlkey"],
                    properties["timestamp"],
                    properties["original"],
                    properties["mimetype"],
                    properties["statuscode"],
                    properties["digest"],
                    properties["length"],
                ) = property_value

                yield CDXSnapshot(cast(Dict[str, str], properties))