now passing dict to CdxSnapshot

This commit is contained in:
Akash Mahanty 2021-01-10 10:40:32 +05:30
parent 04cda4558e
commit a6470b1036
5 changed files with 59 additions and 71 deletions

View File

@ -11,7 +11,6 @@
<a href="https://github.com/akamhy/waybackpy/actions?query=workflow%3ACI"><img alt="Build Status" src="https://github.com/akamhy/waybackpy/workflows/CI/badge.svg"></a> <a href="https://github.com/akamhy/waybackpy/actions?query=workflow%3ACI"><img alt="Build Status" src="https://github.com/akamhy/waybackpy/workflows/CI/badge.svg"></a>
<a href="https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=akamhy/waybackpy&amp;utm_campaign=Badge_Grade"><img alt="Codacy Badge" src="https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65"></a> <a href="https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=akamhy/waybackpy&amp;utm_campaign=Badge_Grade"><img alt="Codacy Badge" src="https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65"></a>
<a href="https://codecov.io/gh/akamhy/waybackpy"><img alt="codecov" src="https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg"></a> <a href="https://codecov.io/gh/akamhy/waybackpy"><img alt="codecov" src="https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg"></a>
<a href="https://codeclimate.com/github/akamhy/waybackpy/maintainability"><img alt="Maintainability" src="https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability"></a>
<a href="https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md"><img alt="Contributions Welcome" src="https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square"></a> <a href="https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md"><img alt="Contributions Welcome" src="https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square"></a>
<a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a> <a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a>
<a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub latest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a> <a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub latest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a>
@ -100,4 +99,3 @@ Released under the MIT License. See
----------------------------------------------------------------------------------------------------------------------------------------------- -----------------------------------------------------------------------------------------------------------------------------------------------

View File

@ -5,28 +5,28 @@ from waybackpy.snapshot import CdxSnapshot, datetime
def test_CdxSnapshot(): def test_CdxSnapshot():
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415" sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
prop_values = sample_input.split(" ")
properties = {}
( (
urlkey, properties["urlkey"],
timestamp, properties["timestamp"],
original, properties["original"],
mimetype, properties["mimetype"],
statuscode, properties["statuscode"],
digest, properties["digest"],
length, properties["length"],
) = sample_input.split(" ") ) = prop_values
snapshot = CdxSnapshot( snapshot = CdxSnapshot(properties)
urlkey, timestamp, original, mimetype, statuscode, digest, length
)
assert urlkey == snapshot.urlkey assert properties["urlkey"] == snapshot.urlkey
assert timestamp == snapshot.timestamp assert properties["timestamp"] == snapshot.timestamp
assert original == snapshot.original assert properties["original"] == snapshot.original
assert mimetype == snapshot.mimetype assert properties["mimetype"] == snapshot.mimetype
assert statuscode == snapshot.statuscode assert properties["statuscode"] == snapshot.statuscode
assert digest == snapshot.digest assert properties["digest"] == snapshot.digest
assert length == snapshot.length assert properties["length"] == snapshot.length
assert datetime.strptime(timestamp, "%Y%m%d%H%M%S") == snapshot.datetime_timestamp assert datetime.strptime(properties["timestamp"], "%Y%m%d%H%M%S") == snapshot.datetime_timestamp
archive_url = "https://web.archive.org/web/" + timestamp + "/" + original archive_url = "https://web.archive.org/web/" + properties["timestamp"] + "/" + properties["original"]
assert archive_url == snapshot.archive_url assert archive_url == snapshot.archive_url
assert archive_url == str(snapshot) assert archive_url == str(snapshot)

View File

@ -7,6 +7,7 @@ from .utils import (
_check_filters, _check_filters,
_check_collapses, _check_collapses,
_check_match_type, _check_match_type,
_add_payload,
) )
# TODO : Threading support for pagination API. It's designed for Threading. # TODO : Threading support for pagination API. It's designed for Threading.
@ -147,27 +148,7 @@ class Cdx:
payload = {} payload = {}
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
if self.start_timestamp: _add_payload(self, payload)
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip != True:
payload["gzip"] = "false"
if self.match_type:
payload["matchType"] = self.match_type
if self.filters and len(self.filters) > 0:
for i, f in enumerate(self.filters):
payload["filter" + str(i)] = f
if self.collapses and len(self.collapses) > 0:
for i, f in enumerate(self.collapses):
payload["collapse" + str(i)] = f
payload["url"] = self.url
if not self.start_timestamp or self.end_timestamp: if not self.start_timestamp or self.end_timestamp:
self.use_page = True self.use_page = True
@ -221,12 +202,4 @@ class Cdx:
properties["length"], properties["length"],
) = prop_values ) = prop_values
yield CdxSnapshot( yield CdxSnapshot(properties)
properties["urlkey"],
properties["timestamp"],
properties["original"],
properties["mimetype"],
properties["statuscode"],
properties["digest"],
properties["length"],
)

View File

@ -9,18 +9,18 @@ class CdxSnapshot:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415 org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
""" """
def __init__( def __init__(self, properties):
self, urlkey, timestamp, original, mimetype, statuscode, digest, length self.urlkey = properties["urlkey"]
): self.timestamp = properties["timestamp"]
self.urlkey = urlkey self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
self.timestamp = timestamp self.original = properties["original"]
self.datetime_timestamp = datetime.strptime(timestamp, "%Y%m%d%H%M%S") self.mimetype = properties["mimetype"]
self.original = original self.statuscode = properties["statuscode"]
self.mimetype = mimetype self.digest = properties["digest"]
self.statuscode = statuscode self.length = properties["length"]
self.digest = digest self.archive_url = (
self.length = length "https://web.archive.org/web/" + self.timestamp + "/" + self.original
self.archive_url = "https://web.archive.org/web/" + timestamp + "/" + original )
def __str__(self): def __str__(self):
return self.archive_url return self.archive_url

View File

@ -11,6 +11,30 @@ quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy" default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _add_payload(self, payload):
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip != True:
payload["gzip"] = "false"
if self.match_type:
payload["matchType"] = self.match_type
if self.filters and len(self.filters) > 0:
for i, f in enumerate(self.filters):
payload["filter" + str(i)] = f
if self.collapses and len(self.collapses) > 0:
for i, f in enumerate(self.collapses):
payload["collapse" + str(i)] = f
payload["url"] = self.url
def _ts(timestamp, data): def _ts(timestamp, data):
""" """
Get timestamp of last fetched archive. Get timestamp of last fetched archive.
@ -96,18 +120,12 @@ def _check_filters(filters):
key = match.group(1) key = match.group(1)
val = match.group(2) val = match.group(2)
except Exception: except Exception:
e = "Filter '%s' not following the cdx filter syntax." % f e = "Filter '%s' not following the cdx filter syntax." % f
raise WaybackError(e) raise WaybackError(e)
def _cleaned_url(url): def _cleaned_url(url):
print(1)
"""
Remove EOL
replace " " with "_"
"""
return str(url).strip().replace(" ", "%20") return str(url).strip().replace(" ", "%20")
@ -258,7 +276,6 @@ def _get_response(
) )
s.mount("https://", HTTPAdapter(max_retries=retries)) s.mount("https://", HTTPAdapter(max_retries=retries))
url = _full_url(endpoint, params) url = _full_url(endpoint, params)
print(url)
try: try:
if not return_full_url: if not return_full_url:
return s.get(url, headers=headers) return s.get(url, headers=headers)