not passing dict to cdxsnapshot

This commit is contained in:
Akash Mahanty 2021-01-10 10:40:32 +05:30
parent 04cda4558e
commit a6470b1036
5 changed files with 59 additions and 71 deletions

View File

@ -11,7 +11,6 @@
<a href="https://github.com/akamhy/waybackpy/actions?query=workflow%3ACI"><img alt="Build Status" src="https://github.com/akamhy/waybackpy/workflows/CI/badge.svg"></a>
<a href="https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=akamhy/waybackpy&amp;utm_campaign=Badge_Grade"><img alt="Codacy Badge" src="https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65"></a>
<a href="https://codecov.io/gh/akamhy/waybackpy"><img alt="codecov" src="https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg"></a>
<a href="https://codeclimate.com/github/akamhy/waybackpy/maintainability"><img alt="Maintainability" src="https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability"></a>
<a href="https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md"><img alt="Contributions Welcome" src="https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square"></a>
<a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a>
<a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub lastest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a>
@ -100,4 +99,3 @@ Released under the MIT License. See
-----------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -5,28 +5,28 @@ from waybackpy.snapshot import CdxSnapshot, datetime
def test_CdxSnapshot():
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
prop_values = sample_input.split(" ")
properties = {}
(
urlkey,
timestamp,
original,
mimetype,
statuscode,
digest,
length,
) = sample_input.split(" ")
properties["urlkey"],
properties["timestamp"],
properties["original"],
properties["mimetype"],
properties["statuscode"],
properties["digest"],
properties["length"],
) = prop_values
snapshot = CdxSnapshot(
urlkey, timestamp, original, mimetype, statuscode, digest, length
)
snapshot = CdxSnapshot(properties)
assert urlkey == snapshot.urlkey
assert timestamp == snapshot.timestamp
assert original == snapshot.original
assert mimetype == snapshot.mimetype
assert statuscode == snapshot.statuscode
assert digest == snapshot.digest
assert length == snapshot.length
assert datetime.strptime(timestamp, "%Y%m%d%H%M%S") == snapshot.datetime_timestamp
archive_url = "https://web.archive.org/web/" + timestamp + "/" + original
assert properties["urlkey"] == snapshot.urlkey
assert properties["timestamp"] == snapshot.timestamp
assert properties["original"] == snapshot.original
assert properties["mimetype"] == snapshot.mimetype
assert properties["statuscode"] == snapshot.statuscode
assert properties["digest"] == snapshot.digest
assert properties["length"] == snapshot.length
assert datetime.strptime(properties["timestamp"], "%Y%m%d%H%M%S") == snapshot.datetime_timestamp
archive_url = "https://web.archive.org/web/" + properties["timestamp"] + "/" + properties["original"]
assert archive_url == snapshot.archive_url
assert archive_url == str(snapshot)

View File

@ -7,6 +7,7 @@ from .utils import (
_check_filters,
_check_collapses,
_check_match_type,
_add_payload,
)
# TODO : Threading support for pagination API. It's designed for Threading.
@ -147,27 +148,7 @@ class Cdx:
payload = {}
headers = {"User-Agent": self.user_agent}
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip != True:
payload["gzip"] = "false"
if self.match_type:
payload["matchType"] = self.match_type
if self.filters and len(self.filters) > 0:
for i, f in enumerate(self.filters):
payload["filter" + str(i)] = f
if self.collapses and len(self.collapses) > 0:
for i, f in enumerate(self.collapses):
payload["collapse" + str(i)] = f
payload["url"] = self.url
_add_payload(self, payload)
if not self.start_timestamp or self.end_timestamp:
self.use_page = True
@ -221,12 +202,4 @@ class Cdx:
properties["length"],
) = prop_values
yield CdxSnapshot(
properties["urlkey"],
properties["timestamp"],
properties["original"],
properties["mimetype"],
properties["statuscode"],
properties["digest"],
properties["length"],
)
yield CdxSnapshot(properties)

View File

@ -9,18 +9,18 @@ class CdxSnapshot:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
def __init__(
self, urlkey, timestamp, original, mimetype, statuscode, digest, length
):
self.urlkey = urlkey
self.timestamp = timestamp
self.datetime_timestamp = datetime.strptime(timestamp, "%Y%m%d%H%M%S")
self.original = original
self.mimetype = mimetype
self.statuscode = statuscode
self.digest = digest
self.length = length
self.archive_url = "https://web.archive.org/web/" + timestamp + "/" + original
def __init__(self, properties):
self.urlkey = properties["urlkey"]
self.timestamp = properties["timestamp"]
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
self.original = properties["original"]
self.mimetype = properties["mimetype"]
self.statuscode = properties["statuscode"]
self.digest = properties["digest"]
self.length = properties["length"]
self.archive_url = (
"https://web.archive.org/web/" + self.timestamp + "/" + self.original
)
def __str__(self):
return self.archive_url

View File

@ -11,6 +11,30 @@ quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _add_payload(self, payload):
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip != True:
payload["gzip"] = "false"
if self.match_type:
payload["matchType"] = self.match_type
if self.filters and len(self.filters) > 0:
for i, f in enumerate(self.filters):
payload["filter" + str(i)] = f
if self.collapses and len(self.collapses) > 0:
for i, f in enumerate(self.collapses):
payload["collapse" + str(i)] = f
payload["url"] = self.url
def _ts(timestamp, data):
"""
Get timestamp of last fetched archive.
@ -96,18 +120,12 @@ def _check_filters(filters):
key = match.group(1)
val = match.group(2)
except Exception:
e = "Filter '%s' not following the cdx filter syntax." % f
raise WaybackError(e)
def _cleaned_url(url):
print(1)
"""
Remove EOL
replace " " with "_"
"""
return str(url).strip().replace(" ", "%20")
@ -258,7 +276,6 @@ def _get_response(
)
s.mount("https://", HTTPAdapter(max_retries=retries))
url = _full_url(endpoint, params)
print(url)
try:
if not return_full_url:
return s.get(url, headers=headers)