2021-01-10 03:05:48 +05:30

233 lines
7.7 KiB
Python

from .snapshot import CdxSnapshot
from .exceptions import WaybackError
from .utils import (
_get_total_pages,
_get_response,
default_user_agent,
_check_filters,
_check_collapses,
_check_match_type,
)
# TODO : Threading support for pagination API. It's designed for Threading.
class Cdx:
def __init__(
self,
url,
user_agent=default_user_agent,
start_timestamp=None,
end_timestamp=None,
filters=[],
match_type=None,
gzip=True,
collapses=[],
limit=10000,
):
self.url = str(url).strip()
self.user_agent = str(user_agent)
self.start_timestamp = str(start_timestamp) if start_timestamp else None
self.end_timestamp = str(end_timestamp) if end_timestamp else None
self.filters = filters
_check_filters(self.filters)
self.match_type = str(match_type).strip() if match_type else None
_check_match_type(self.match_type, self.url)
self.gzip = gzip
self.collapses = collapses
_check_collapses(self.collapses)
self.limit = limit
self.last_api_request_url = None
self.use_page = False
def cdx_api_manager(self, payload, headers, use_page=False):
"""
We have two options to get the snapshots, we use this
method to make a selection between pagination API and
the normal one with Resumption Key, sequential querying
of CDX data. For very large querying (for example domain query),
it may be useful to perform queries in parallel and also estimate
the total size of the query.
read more about the pagination API at:
https://web.archive.org/web/20201228063237/https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api
if use_page is false if will use the normal sequential query API,
else use the pagination API.
two mutually exclusive cases possible:
1) pagination API is selected
a) get the total number of pages to read, using _get_total_pages()
b) then we use a for loop to get all the pages and yield the response text
2) normal sequential query API is selected.
a) get use showResumeKey=true to ask the API to add a query resumption key
at the bottom of response
b) check if the page has more than 3 lines, if not return the text
c) if it has atleast three lines, we check the second last line for zero length.
d) if the second last line has length zero than we assume that the last line contains
the resumption key, we set the resumeKey and remove the resumeKey from text
e) if the second line has non zero length we return the text as there will no resumption key
f) if we find the resumption key we set the "more" variable status to True which is always set
to False on each iteration. If more is not True the iteration stops and function returns.
"""
endpoint = "https://web.archive.org/cdx/search/cdx"
if use_page == True:
total_pages = _get_total_pages(self.url, self.user_agent)
for i in range(total_pages):
payload["page"] = str(i)
url, res = _get_response(
endpoint, params=payload, headers=headers, return_full_url=True
)
self.last_api_request_url = url
yield res.text
else:
payload["showResumeKey"] = "true"
payload["limit"] = str(self.limit)
resumeKey = None
more = True
while more:
if resumeKey:
payload["resumeKey"] = resumeKey
url, res = _get_response(
endpoint, params=payload, headers=headers, return_full_url=True
)
self.last_api_request_url = url
text = res.text.strip()
lines = text.splitlines()
more = False
if len(lines) >= 3:
second_last_line = lines[-2]
if len(second_last_line) == 0:
resumeKey = lines[-1].strip()
text = text.replace(resumeKey, "", 1).strip()
more = True
yield text
def snapshots(self):
"""
This function yeilds snapshots encapsulated
in CdxSnapshot for more usability.
All the get request values are set if the conditions match
And we use logic that if someone's only inputs don't have any
of [start_timestamp, end_timestamp] and don't use any collapses
then we use the pagination API as it returns archives starting
from the first archive and the recent most archive will be on
the last page.
"""
payload = {}
headers = {"User-Agent": self.user_agent}
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip != True:
payload["gzip"] = "false"
if self.match_type:
payload["matchType"] = self.match_type
if self.filters and len(self.filters) > 0:
for i, f in enumerate(self.filters):
payload["filter" + str(i)] = f
if self.collapses and len(self.collapses) > 0:
for i, f in enumerate(self.collapses):
payload["collapse" + str(i)] = f
payload["url"] = self.url
if not self.start_timestamp or self.end_timestamp:
self.use_page = True
if self.collapses != []:
self.use_page = False
texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)
for text in texts:
if text.isspace() or len(text) <= 1 or not text:
continue
snapshot_list = text.split("\n")
for snapshot in snapshot_list:
if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
continue
properties = {
"urlkey": None,
"timestamp": None,
"original": None,
"mimetype": None,
"statuscode": None,
"digest": None,
"length": None,
}
prop_values = snapshot.split(" ")
# Making sure that we get the same number of
# property values as the number of properties
prop_values_len = len(prop_values)
properties_len = len(properties)
if prop_values_len != properties_len:
raise WaybackError(
"Snapshot returned by Cdx API has %s properties instead of expected %s properties.\nInvolved Snapshot : %s"
% (prop_values_len, properties_len, snapshot)
)
(
properties["urlkey"],
properties["timestamp"],
properties["original"],
properties["mimetype"],
properties["statuscode"],
properties["digest"],
properties["length"],
) = prop_values
yield CdxSnapshot(
properties["urlkey"],
properties["timestamp"],
properties["original"],
properties["mimetype"],
properties["statuscode"],
properties["digest"],
properties["length"],
)