now passing dict to cdxsnapshot

This commit is contained in:
Akash Mahanty
2021-01-10 10:40:32 +05:30
parent 04cda4558e
commit a6470b1036
5 changed files with 59 additions and 71 deletions

View File

@@ -7,6 +7,7 @@ from .utils import (
_check_filters,
_check_collapses,
_check_match_type,
_add_payload,
)
# TODO : Threading support for pagination API. It's designed for Threading.
@@ -147,27 +148,7 @@ class Cdx:
payload = {}
headers = {"User-Agent": self.user_agent}
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip != True:
payload["gzip"] = "false"
if self.match_type:
payload["matchType"] = self.match_type
if self.filters and len(self.filters) > 0:
for i, f in enumerate(self.filters):
payload["filter" + str(i)] = f
if self.collapses and len(self.collapses) > 0:
for i, f in enumerate(self.collapses):
payload["collapse" + str(i)] = f
payload["url"] = self.url
_add_payload(self, payload)
if not self.start_timestamp or self.end_timestamp:
self.use_page = True
@@ -221,12 +202,4 @@ class Cdx:
properties["length"],
) = prop_values
yield CdxSnapshot(
properties["urlkey"],
properties["timestamp"],
properties["original"],
properties["mimetype"],
properties["statuscode"],
properties["digest"],
properties["length"],
)
yield CdxSnapshot(properties)

View File

@@ -9,18 +9,18 @@ class CdxSnapshot:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
def __init__(self, properties):
    """
    Build a snapshot from one parsed CDX record.

    properties : mapping that must provide the keys "urlkey", "timestamp",
    "original", "mimetype", "statuscode", "digest" and "length".
    A missing key raises KeyError.

    "timestamp" has to be a string in the 14 digit "%Y%m%d%H%M%S" form,
    otherwise datetime.strptime raises ValueError. "timestamp" and
    "original" are concatenated into the archive URL, so both must be str.
    """
    self.urlkey = properties["urlkey"]
    self.timestamp = properties["timestamp"]
    # Parsed copy of the raw timestamp string, kept alongside the original.
    self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
    self.original = properties["original"]
    self.mimetype = properties["mimetype"]
    self.statuscode = properties["statuscode"]
    self.digest = properties["digest"]
    self.length = properties["length"]
    self.archive_url = (
        "https://web.archive.org/web/" + self.timestamp + "/" + self.original
    )
def __str__(self):
    """Return the archive URL stored on this snapshot."""
    return self.archive_url

View File

@@ -11,6 +11,30 @@ quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _add_payload(self, payload):
    """
    Copy the query options held on ``self`` into ``payload`` in place.

    self : object exposing the attributes ``start_timestamp``,
    ``end_timestamp``, ``gzip``, ``match_type``, ``filters``,
    ``collapses`` and ``url``.
    payload : dict that receives the request parameters.

    Falsy options are skipped; "url" is always written.
    Returns None, the result is the mutated ``payload``.
    """
    if self.start_timestamp:
        payload["from"] = self.start_timestamp

    if self.end_timestamp:
        payload["to"] = self.end_timestamp

    # The key is only written to switch gzip off; when gzip equals True it
    # is omitted. Equality (not identity) is kept so that 1 still counts
    # as enabled, exactly as before.
    if self.gzip != True:  # noqa: E712
        payload["gzip"] = "false"

    if self.match_type:
        payload["matchType"] = self.match_type

    # Every filter/collapse gets its own numbered key ("filter0", "filter1",
    # ...). ``or ()`` covers None and empty containers without calling len().
    for index, cdx_filter in enumerate(self.filters or ()):
        payload["filter" + str(index)] = cdx_filter

    for index, collapse in enumerate(self.collapses or ()):
        payload["collapse" + str(index)] = collapse

    payload["url"] = self.url
def _ts(timestamp, data):
"""
Get timestamp of last fetched archive.
@@ -96,18 +120,12 @@ def _check_filters(filters):
key = match.group(1)
val = match.group(2)
except Exception:
e = "Filter '%s' not following the cdx filter syntax." % f
raise WaybackError(e)
def _cleaned_url(url):
    """
    Return ``url`` as a str with leading/trailing whitespace (including
    end-of-line characters) removed and every remaining space
    replaced with "%20".
    """
    return str(url).strip().replace(" ", "%20")
@@ -258,7 +276,6 @@ def _get_response(
)
s.mount("https://", HTTPAdapter(max_retries=retries))
url = _full_url(endpoint, params)
print(url)
try:
if not return_full_url:
return s.get(url, headers=headers)