diff --git a/README.md b/README.md
index d1377d8..0fae689 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,6 @@
-
@@ -100,4 +99,3 @@ Released under the MIT License. See
-----------------------------------------------------------------------------------------------------------------------------------------------
-
diff --git a/tests/test_snapshot.py b/tests/test_snapshot.py
index 8d02a49..5f09af1 100644
--- a/tests/test_snapshot.py
+++ b/tests/test_snapshot.py
@@ -5,28 +5,28 @@ from waybackpy.snapshot import CdxSnapshot, datetime
def test_CdxSnapshot():
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
+ prop_values = sample_input.split(" ")
+ properties = {}
(
- urlkey,
- timestamp,
- original,
- mimetype,
- statuscode,
- digest,
- length,
- ) = sample_input.split(" ")
+ properties["urlkey"],
+ properties["timestamp"],
+ properties["original"],
+ properties["mimetype"],
+ properties["statuscode"],
+ properties["digest"],
+ properties["length"],
+ ) = prop_values
- snapshot = CdxSnapshot(
- urlkey, timestamp, original, mimetype, statuscode, digest, length
- )
+ snapshot = CdxSnapshot(properties)
- assert urlkey == snapshot.urlkey
- assert timestamp == snapshot.timestamp
- assert original == snapshot.original
- assert mimetype == snapshot.mimetype
- assert statuscode == snapshot.statuscode
- assert digest == snapshot.digest
- assert length == snapshot.length
- assert datetime.strptime(timestamp, "%Y%m%d%H%M%S") == snapshot.datetime_timestamp
- archive_url = "https://web.archive.org/web/" + timestamp + "/" + original
+ assert properties["urlkey"] == snapshot.urlkey
+ assert properties["timestamp"] == snapshot.timestamp
+ assert properties["original"] == snapshot.original
+ assert properties["mimetype"] == snapshot.mimetype
+ assert properties["statuscode"] == snapshot.statuscode
+ assert properties["digest"] == snapshot.digest
+ assert properties["length"] == snapshot.length
+ assert datetime.strptime(properties["timestamp"], "%Y%m%d%H%M%S") == snapshot.datetime_timestamp
+ archive_url = "https://web.archive.org/web/" + properties["timestamp"] + "/" + properties["original"]
assert archive_url == snapshot.archive_url
assert archive_url == str(snapshot)
diff --git a/waybackpy/cdx.py b/waybackpy/cdx.py
index ae50f7b..69c25b5 100644
--- a/waybackpy/cdx.py
+++ b/waybackpy/cdx.py
@@ -7,6 +7,7 @@ from .utils import (
_check_filters,
_check_collapses,
_check_match_type,
+ _add_payload,
)
# TODO : Threading support for pagination API. It's designed for Threading.
@@ -147,27 +148,7 @@ class Cdx:
payload = {}
headers = {"User-Agent": self.user_agent}
- if self.start_timestamp:
- payload["from"] = self.start_timestamp
-
- if self.end_timestamp:
- payload["to"] = self.end_timestamp
-
- if self.gzip != True:
- payload["gzip"] = "false"
-
- if self.match_type:
- payload["matchType"] = self.match_type
-
- if self.filters and len(self.filters) > 0:
- for i, f in enumerate(self.filters):
- payload["filter" + str(i)] = f
-
- if self.collapses and len(self.collapses) > 0:
- for i, f in enumerate(self.collapses):
- payload["collapse" + str(i)] = f
-
- payload["url"] = self.url
+ _add_payload(self, payload)
if not self.start_timestamp or self.end_timestamp:
self.use_page = True
@@ -221,12 +202,4 @@ class Cdx:
properties["length"],
) = prop_values
- yield CdxSnapshot(
- properties["urlkey"],
- properties["timestamp"],
- properties["original"],
- properties["mimetype"],
- properties["statuscode"],
- properties["digest"],
- properties["length"],
- )
+ yield CdxSnapshot(properties)
diff --git a/waybackpy/snapshot.py b/waybackpy/snapshot.py
index d6a2c1e..7fa2653 100644
--- a/waybackpy/snapshot.py
+++ b/waybackpy/snapshot.py
@@ -9,18 +9,18 @@ class CdxSnapshot:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
- def __init__(
- self, urlkey, timestamp, original, mimetype, statuscode, digest, length
- ):
- self.urlkey = urlkey
- self.timestamp = timestamp
- self.datetime_timestamp = datetime.strptime(timestamp, "%Y%m%d%H%M%S")
- self.original = original
- self.mimetype = mimetype
- self.statuscode = statuscode
- self.digest = digest
- self.length = length
- self.archive_url = "https://web.archive.org/web/" + timestamp + "/" + original
+ def __init__(self, properties):
+ self.urlkey = properties["urlkey"]
+ self.timestamp = properties["timestamp"]
+ self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
+ self.original = properties["original"]
+ self.mimetype = properties["mimetype"]
+ self.statuscode = properties["statuscode"]
+ self.digest = properties["digest"]
+ self.length = properties["length"]
+ self.archive_url = (
+ "https://web.archive.org/web/" + self.timestamp + "/" + self.original
+ )
def __str__(self):
return self.archive_url
diff --git a/waybackpy/utils.py b/waybackpy/utils.py
index 41656e7..02f3f46 100644
--- a/waybackpy/utils.py
+++ b/waybackpy/utils.py
@@ -11,6 +11,30 @@ quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
+def _add_payload(self, payload):
+ if self.start_timestamp:
+ payload["from"] = self.start_timestamp
+
+ if self.end_timestamp:
+ payload["to"] = self.end_timestamp
+
+ if self.gzip != True:
+ payload["gzip"] = "false"
+
+ if self.match_type:
+ payload["matchType"] = self.match_type
+
+ if self.filters and len(self.filters) > 0:
+ for i, f in enumerate(self.filters):
+ payload["filter" + str(i)] = f
+
+ if self.collapses and len(self.collapses) > 0:
+ for i, f in enumerate(self.collapses):
+ payload["collapse" + str(i)] = f
+
+ payload["url"] = self.url
+
+
def _ts(timestamp, data):
"""
Get timestamp of last fetched archive.
@@ -96,18 +120,12 @@ def _check_filters(filters):
key = match.group(1)
val = match.group(2)
-
except Exception:
e = "Filter '%s' not following the cdx filter syntax." % f
raise WaybackError(e)
def _cleaned_url(url):
- print(1)
- """
- Remove EOL
- replace " " with "_"
- """
return str(url).strip().replace(" ", "%20")
@@ -258,7 +276,6 @@ def _get_response(
)
s.mount("https://", HTTPAdapter(max_retries=retries))
url = _full_url(endpoint, params)
- print(url)
try:
if not return_full_url:
return s.get(url, headers=headers)