now passing dict to CdxSnapshot
This commit is contained in:
parent
04cda4558e
commit
a6470b1036
@ -11,7 +11,6 @@
|
|||||||
<a href="https://github.com/akamhy/waybackpy/actions?query=workflow%3ACI"><img alt="Build Status" src="https://github.com/akamhy/waybackpy/workflows/CI/badge.svg"></a>
|
<a href="https://github.com/akamhy/waybackpy/actions?query=workflow%3ACI"><img alt="Build Status" src="https://github.com/akamhy/waybackpy/workflows/CI/badge.svg"></a>
|
||||||
<a href="https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade"><img alt="Codacy Badge" src="https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65"></a>
|
<a href="https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade"><img alt="Codacy Badge" src="https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65"></a>
|
||||||
<a href="https://codecov.io/gh/akamhy/waybackpy"><img alt="codecov" src="https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg"></a>
|
<a href="https://codecov.io/gh/akamhy/waybackpy"><img alt="codecov" src="https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg"></a>
|
||||||
<a href="https://codeclimate.com/github/akamhy/waybackpy/maintainability"><img alt="Maintainability" src="https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability"></a>
|
|
||||||
<a href="https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md"><img alt="Contributions Welcome" src="https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square"></a>
|
<a href="https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md"><img alt="Contributions Welcome" src="https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square"></a>
|
||||||
<a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a>
|
<a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a>
|
||||||
<a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub latest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a>
|
<a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub latest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a>
|
||||||
@ -100,4 +99,3 @@ Released under the MIT License. See
|
|||||||
|
|
||||||
|
|
||||||
-----------------------------------------------------------------------------------------------------------------------------------------------
|
-----------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -5,28 +5,28 @@ from waybackpy.snapshot import CdxSnapshot, datetime
|
|||||||
|
|
||||||
def test_CdxSnapshot():
|
def test_CdxSnapshot():
|
||||||
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
|
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
|
||||||
|
prop_values = sample_input.split(" ")
|
||||||
|
properties = {}
|
||||||
(
|
(
|
||||||
urlkey,
|
properties["urlkey"],
|
||||||
timestamp,
|
properties["timestamp"],
|
||||||
original,
|
properties["original"],
|
||||||
mimetype,
|
properties["mimetype"],
|
||||||
statuscode,
|
properties["statuscode"],
|
||||||
digest,
|
properties["digest"],
|
||||||
length,
|
properties["length"],
|
||||||
) = sample_input.split(" ")
|
) = prop_values
|
||||||
|
|
||||||
snapshot = CdxSnapshot(
|
snapshot = CdxSnapshot(properties)
|
||||||
urlkey, timestamp, original, mimetype, statuscode, digest, length
|
|
||||||
)
|
|
||||||
|
|
||||||
assert urlkey == snapshot.urlkey
|
assert properties["urlkey"] == snapshot.urlkey
|
||||||
assert timestamp == snapshot.timestamp
|
assert properties["timestamp"] == snapshot.timestamp
|
||||||
assert original == snapshot.original
|
assert properties["original"] == snapshot.original
|
||||||
assert mimetype == snapshot.mimetype
|
assert properties["mimetype"] == snapshot.mimetype
|
||||||
assert statuscode == snapshot.statuscode
|
assert properties["statuscode"] == snapshot.statuscode
|
||||||
assert digest == snapshot.digest
|
assert properties["digest"] == snapshot.digest
|
||||||
assert length == snapshot.length
|
assert properties["length"] == snapshot.length
|
||||||
assert datetime.strptime(timestamp, "%Y%m%d%H%M%S") == snapshot.datetime_timestamp
|
assert datetime.strptime(properties["timestamp"], "%Y%m%d%H%M%S") == snapshot.datetime_timestamp
|
||||||
archive_url = "https://web.archive.org/web/" + timestamp + "/" + original
|
archive_url = "https://web.archive.org/web/" + properties["timestamp"] + "/" + properties["original"]
|
||||||
assert archive_url == snapshot.archive_url
|
assert archive_url == snapshot.archive_url
|
||||||
assert archive_url == str(snapshot)
|
assert archive_url == str(snapshot)
|
||||||
|
@ -7,6 +7,7 @@ from .utils import (
|
|||||||
_check_filters,
|
_check_filters,
|
||||||
_check_collapses,
|
_check_collapses,
|
||||||
_check_match_type,
|
_check_match_type,
|
||||||
|
_add_payload,
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO : Threading support for pagination API. It's designed for Threading.
|
# TODO : Threading support for pagination API. It's designed for Threading.
|
||||||
@ -147,27 +148,7 @@ class Cdx:
|
|||||||
payload = {}
|
payload = {}
|
||||||
headers = {"User-Agent": self.user_agent}
|
headers = {"User-Agent": self.user_agent}
|
||||||
|
|
||||||
if self.start_timestamp:
|
_add_payload(self, payload)
|
||||||
payload["from"] = self.start_timestamp
|
|
||||||
|
|
||||||
if self.end_timestamp:
|
|
||||||
payload["to"] = self.end_timestamp
|
|
||||||
|
|
||||||
if self.gzip != True:
|
|
||||||
payload["gzip"] = "false"
|
|
||||||
|
|
||||||
if self.match_type:
|
|
||||||
payload["matchType"] = self.match_type
|
|
||||||
|
|
||||||
if self.filters and len(self.filters) > 0:
|
|
||||||
for i, f in enumerate(self.filters):
|
|
||||||
payload["filter" + str(i)] = f
|
|
||||||
|
|
||||||
if self.collapses and len(self.collapses) > 0:
|
|
||||||
for i, f in enumerate(self.collapses):
|
|
||||||
payload["collapse" + str(i)] = f
|
|
||||||
|
|
||||||
payload["url"] = self.url
|
|
||||||
|
|
||||||
if not self.start_timestamp or self.end_timestamp:
|
if not self.start_timestamp or self.end_timestamp:
|
||||||
self.use_page = True
|
self.use_page = True
|
||||||
@ -221,12 +202,4 @@ class Cdx:
|
|||||||
properties["length"],
|
properties["length"],
|
||||||
) = prop_values
|
) = prop_values
|
||||||
|
|
||||||
yield CdxSnapshot(
|
yield CdxSnapshot(properties)
|
||||||
properties["urlkey"],
|
|
||||||
properties["timestamp"],
|
|
||||||
properties["original"],
|
|
||||||
properties["mimetype"],
|
|
||||||
properties["statuscode"],
|
|
||||||
properties["digest"],
|
|
||||||
properties["length"],
|
|
||||||
)
|
|
||||||
|
@ -9,18 +9,18 @@ class CdxSnapshot:
|
|||||||
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
|
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, properties):
|
||||||
self, urlkey, timestamp, original, mimetype, statuscode, digest, length
|
self.urlkey = properties["urlkey"]
|
||||||
):
|
self.timestamp = properties["timestamp"]
|
||||||
self.urlkey = urlkey
|
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
|
||||||
self.timestamp = timestamp
|
self.original = properties["original"]
|
||||||
self.datetime_timestamp = datetime.strptime(timestamp, "%Y%m%d%H%M%S")
|
self.mimetype = properties["mimetype"]
|
||||||
self.original = original
|
self.statuscode = properties["statuscode"]
|
||||||
self.mimetype = mimetype
|
self.digest = properties["digest"]
|
||||||
self.statuscode = statuscode
|
self.length = properties["length"]
|
||||||
self.digest = digest
|
self.archive_url = (
|
||||||
self.length = length
|
"https://web.archive.org/web/" + self.timestamp + "/" + self.original
|
||||||
self.archive_url = "https://web.archive.org/web/" + timestamp + "/" + original
|
)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.archive_url
|
return self.archive_url
|
||||||
|
@ -11,6 +11,30 @@ quote = requests.utils.quote
|
|||||||
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
||||||
|
|
||||||
|
|
||||||
|
def _add_payload(self, payload):
    """
    Fill ``payload`` in place with the request parameters set on ``self``.

    ``self`` is the calling object (the ``Cdx`` class passes itself in); it
    must expose the attributes ``start_timestamp``, ``end_timestamp``,
    ``gzip``, ``match_type``, ``filters``, ``collapses`` and ``url``.

    Keys written to ``payload``:
      * ``from`` / ``to``   - only when the matching timestamp is truthy
      * ``gzip``            - the string "false", only when ``gzip != True``
      * ``matchType``       - only when ``match_type`` is truthy
      * ``filter0..N``      - one numbered key per entry of ``filters``
      * ``collapse0..N``    - one numbered key per entry of ``collapses``
      * ``url``             - always

    Returns None; the dict passed in is mutated.
    """
    if self.start_timestamp:
        payload["from"] = self.start_timestamp

    if self.end_timestamp:
        payload["to"] = self.end_timestamp

    # Kept as an equality test rather than a truthiness test: every value
    # that is not equal to True (None, False, any string, ...) adds
    # gzip=false.
    # NOTE(review): callers may pass non-bool values here - confirm before
    # simplifying this to `not self.gzip`.
    if self.gzip != True:  # noqa: E712
        payload["gzip"] = "false"

    if self.match_type:
        payload["matchType"] = self.match_type

    # Truthiness already covers None and empty containers, so no extra
    # len() check is needed (and iterables without __len__ work too).
    if self.filters:
        for index, cdx_filter in enumerate(self.filters):
            payload["filter" + str(index)] = cdx_filter

    if self.collapses:
        for index, collapse in enumerate(self.collapses):
            payload["collapse" + str(index)] = collapse

    payload["url"] = self.url
|
||||||
|
|
||||||
|
|
||||||
def _ts(timestamp, data):
|
def _ts(timestamp, data):
|
||||||
"""
|
"""
|
||||||
Get timestamp of last fetched archive.
|
Get timestamp of last fetched archive.
|
||||||
@ -96,18 +120,12 @@ def _check_filters(filters):
|
|||||||
key = match.group(1)
|
key = match.group(1)
|
||||||
val = match.group(2)
|
val = match.group(2)
|
||||||
|
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
e = "Filter '%s' not following the cdx filter syntax." % f
|
e = "Filter '%s' not following the cdx filter syntax." % f
|
||||||
raise WaybackError(e)
|
raise WaybackError(e)
|
||||||
|
|
||||||
|
|
||||||
def _cleaned_url(url):
|
def _cleaned_url(url):
|
||||||
print(1)
|
|
||||||
"""
|
|
||||||
Remove EOL
|
|
||||||
replace " " with "_"
|
|
||||||
"""
|
|
||||||
return str(url).strip().replace(" ", "%20")
|
return str(url).strip().replace(" ", "%20")
|
||||||
|
|
||||||
|
|
||||||
@ -258,7 +276,6 @@ def _get_response(
|
|||||||
)
|
)
|
||||||
s.mount("https://", HTTPAdapter(max_retries=retries))
|
s.mount("https://", HTTPAdapter(max_retries=retries))
|
||||||
url = _full_url(endpoint, params)
|
url = _full_url(endpoint, params)
|
||||||
print(url)
|
|
||||||
try:
|
try:
|
||||||
if not return_full_url:
|
if not return_full_url:
|
||||||
return s.get(url, headers=headers)
|
return s.get(url, headers=headers)
|
||||||
|
Loading…
Reference in New Issue
Block a user