From a6470b10365d8bbd71a458f20b69c4d43b499f9f Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Sun, 10 Jan 2021 10:40:32 +0530 Subject: [PATCH] not passing dict to cdxsnapshot --- README.md | 2 -- tests/test_snapshot.py | 40 ++++++++++++++++++++-------------------- waybackpy/cdx.py | 33 +++------------------------------ waybackpy/snapshot.py | 24 ++++++++++++------------ waybackpy/utils.py | 31 ++++++++++++++++++++++++------- 5 files changed, 59 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index d1377d8..0fae689 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ Build Status Codacy Badge codecov -Maintainability Contributions Welcome Downloads GitHub lastest commit @@ -100,4 +99,3 @@ Released under the MIT License. See ----------------------------------------------------------------------------------------------------------------------------------------------- - diff --git a/tests/test_snapshot.py b/tests/test_snapshot.py index 8d02a49..5f09af1 100644 --- a/tests/test_snapshot.py +++ b/tests/test_snapshot.py @@ -5,28 +5,28 @@ from waybackpy.snapshot import CdxSnapshot, datetime def test_CdxSnapshot(): sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415" + prop_values = sample_input.split(" ") + properties = {} ( - urlkey, - timestamp, - original, - mimetype, - statuscode, - digest, - length, - ) = sample_input.split(" ") + properties["urlkey"], + properties["timestamp"], + properties["original"], + properties["mimetype"], + properties["statuscode"], + properties["digest"], + properties["length"], + ) = prop_values - snapshot = CdxSnapshot( - urlkey, timestamp, original, mimetype, statuscode, digest, length - ) + snapshot = CdxSnapshot(properties) - assert urlkey == snapshot.urlkey - assert timestamp == snapshot.timestamp - assert original == snapshot.original - assert mimetype == snapshot.mimetype - assert statuscode == snapshot.statuscode - assert digest == snapshot.digest - assert length == snapshot.length - assert datetime.strptime(timestamp, "%Y%m%d%H%M%S") == snapshot.datetime_timestamp - archive_url = "https://web.archive.org/web/" + timestamp + "/" + original + assert properties["urlkey"] == snapshot.urlkey + assert properties["timestamp"] == snapshot.timestamp + assert properties["original"] == snapshot.original + assert properties["mimetype"] == snapshot.mimetype + assert properties["statuscode"] == snapshot.statuscode + assert properties["digest"] == snapshot.digest + assert properties["length"] == snapshot.length + assert datetime.strptime(properties["timestamp"], "%Y%m%d%H%M%S") == snapshot.datetime_timestamp + archive_url = "https://web.archive.org/web/" + properties["timestamp"] + "/" + properties["original"] assert archive_url == snapshot.archive_url assert archive_url == str(snapshot) diff --git a/waybackpy/cdx.py b/waybackpy/cdx.py index ae50f7b..69c25b5 100644 --- a/waybackpy/cdx.py +++ b/waybackpy/cdx.py @@ -7,6 +7,7 @@ from .utils import ( _check_filters, _check_collapses, _check_match_type, + _add_payload, ) # TODO : Threading support for pagination API. It's designed for Threading. @@ -147,27 +148,7 @@ class Cdx: payload = {} headers = {"User-Agent": self.user_agent} - if self.start_timestamp: - payload["from"] = self.start_timestamp - - if self.end_timestamp: - payload["to"] = self.end_timestamp - - if self.gzip != True: - payload["gzip"] = "false" - - if self.match_type: - payload["matchType"] = self.match_type - - if self.filters and len(self.filters) > 0: - for i, f in enumerate(self.filters): - payload["filter" + str(i)] = f - - if self.collapses and len(self.collapses) > 0: - for i, f in enumerate(self.collapses): - payload["collapse" + str(i)] = f - - payload["url"] = self.url + _add_payload(self, payload) if not self.start_timestamp or self.end_timestamp: self.use_page = True @@ -221,12 +202,4 @@ class Cdx: properties["length"], ) = prop_values - yield CdxSnapshot( - properties["urlkey"], - properties["timestamp"], - properties["original"], - properties["mimetype"], - properties["statuscode"], - properties["digest"], - properties["length"], - ) + yield CdxSnapshot(properties) diff --git a/waybackpy/snapshot.py b/waybackpy/snapshot.py index d6a2c1e..7fa2653 100644 --- a/waybackpy/snapshot.py +++ b/waybackpy/snapshot.py @@ -9,18 +9,18 @@ class CdxSnapshot: org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415 """ - def __init__( - self, urlkey, timestamp, original, mimetype, statuscode, digest, length - ): - self.urlkey = urlkey - self.timestamp = timestamp - self.datetime_timestamp = datetime.strptime(timestamp, "%Y%m%d%H%M%S") - self.original = original - self.mimetype = mimetype - self.statuscode = statuscode - self.digest = digest - self.length = length - self.archive_url = "https://web.archive.org/web/" + timestamp + "/" + original + def __init__(self, properties): + self.urlkey = properties["urlkey"] + self.timestamp = properties["timestamp"] + self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S") + self.original = properties["original"] + self.mimetype = properties["mimetype"] + self.statuscode = properties["statuscode"] + self.digest = properties["digest"] + self.length = properties["length"] + self.archive_url = ( + "https://web.archive.org/web/" + self.timestamp + "/" + self.original + ) def __str__(self): return self.archive_url diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 41656e7..02f3f46 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -11,6 +11,30 @@ quote = requests.utils.quote default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy" +def _add_payload(self, payload): + if self.start_timestamp: + payload["from"] = self.start_timestamp + + if self.end_timestamp: + payload["to"] = self.end_timestamp + + if self.gzip != True: + payload["gzip"] = "false" + + if self.match_type: + payload["matchType"] = self.match_type + + if self.filters and len(self.filters) > 0: + for i, f in enumerate(self.filters): + payload["filter" + str(i)] = f + + if self.collapses and len(self.collapses) > 0: + for i, f in enumerate(self.collapses): + payload["collapse" + str(i)] = f + + payload["url"] = self.url + + def _ts(timestamp, data): """ Get timestamp of last fetched archive. @@ -96,18 +120,12 @@ def _check_filters(filters): key = match.group(1) val = match.group(2) - except Exception: e = "Filter '%s' not following the cdx filter syntax." % f raise WaybackError(e) def _cleaned_url(url): - print(1) - """ - Remove EOL - replace " " with "_" - """ return str(url).strip().replace(" ", "%20") @@ -258,7 +276,6 @@ def _get_response( ) s.mount("https://", HTTPAdapter(max_retries=retries)) url = _full_url(endpoint, params) - print(url) try: if not return_full_url: return s.get(url, headers=headers)