From cf18090f9064f312db301997ee3d35c89abb92af Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 09:52:20 +0530 Subject: [PATCH 01/10] fix typo --- waybackpy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py index 4052148..5939946 100644 --- a/waybackpy/__init__.py +++ b/waybackpy/__init__.py @@ -1,4 +1,4 @@ -"""Module initializer and provider of static infomation.""" +"""Module initializer and provider of static information.""" __title__ = "waybackpy" __description__ = ( From ec341fa8b3a38088017dfefd11736fe6d1b8a148 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 11:20:10 +0530 Subject: [PATCH 02/10] refactor code in cli module --- waybackpy/cli.py | 129 ++++++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/waybackpy/cli.py b/waybackpy/cli.py index e9c45a3..6cb6d0f 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -7,7 +7,7 @@ import random import re import string from json import dumps -from typing import Generator, List, Optional +from typing import Any, Generator, List, Optional import click import requests @@ -43,6 +43,70 @@ def echo_availability_api( click.echo(dumps(availability_api_instance.json)) +def handle_cdx(data: List[Any]) -> None: + """ + Handles the CDX CLI options and output. + """ + url = data[0] + user_agent = data[1] + start_timestamp = data[2] + end_timestamp = data[3] + cdx_filter = data[4] + collapse = data[5] + cdx_print = data[6] + limit = data[7] + gzip = data[8] + match_type = data[9] + + filters = list(cdx_filter) + collapses = list(collapse) + cdx_print = list(cdx_print) + + cdx_api = WaybackMachineCDXServerAPI( + url, + user_agent=user_agent, + start_timestamp=start_timestamp, + end_timestamp=end_timestamp, + filters=filters, + match_type=match_type, + gzip=gzip, + collapses=collapses, + limit=limit, + ) + + snapshots = cdx_api.snapshots() + + for snapshot in snapshots: + if len(cdx_print) == 0: + click.echo(snapshot) + else: + output_string = [] + if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]): + output_string.append(snapshot.urlkey) + if any( + val in cdx_print for val in ["timestamp", "time-stamp", "time_stamp"] + ): + output_string.append(snapshot.timestamp) + if "original" in cdx_print: + output_string.append(snapshot.original) + if any(val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]): + output_string.append(snapshot.mimetype) + if any( + val in cdx_print for val in ["statuscode", "status-code", "status_code"] + ): + output_string.append(snapshot.statuscode) + if "digest" in cdx_print: + output_string.append(snapshot.digest) + if "length" in cdx_print: + output_string.append(snapshot.length) + if any( + val in cdx_print for val in ["archiveurl", "archive-url", "archive_url"] + ): + output_string.append(snapshot.archive_url) + + click.echo(" ".join(output_string)) + + def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: """ Save output of CDX API on file. @@ -347,58 +411,19 @@ def main( # pylint: disable=no-value-for-parameter click.echo(url_) elif cdx: - filters = list(cdx_filter) - collapses = list(collapse) - cdx_print = list(cdx_print) - - cdx_api = WaybackMachineCDXServerAPI( + data = [ url, - user_agent=user_agent, - start_timestamp=start_timestamp, - end_timestamp=end_timestamp, - filters=filters, - match_type=match_type, - gzip=gzip, - collapses=collapses, - limit=limit, - ) - - snapshots = cdx_api.snapshots() - - for snapshot in snapshots: - if len(cdx_print) == 0: - click.echo(snapshot) - else: - output_string = [] - if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]): - output_string.append(snapshot.urlkey) - if any( - val in cdx_print - for val in ["timestamp", "time-stamp", "time_stamp"] - ): - output_string.append(snapshot.timestamp) - if "original" in cdx_print: - output_string.append(snapshot.original) - if any( - val in cdx_print for val in ["mimetype", "mime-type", "mime_type"] - ): - output_string.append(snapshot.mimetype) - if any( - val in cdx_print - for val in ["statuscode", "status-code", "status_code"] - ): - output_string.append(snapshot.statuscode) - if "digest" in cdx_print: - output_string.append(snapshot.digest) - if "length" in cdx_print: - output_string.append(snapshot.length) - if any( - val in cdx_print - for val in ["archiveurl", "archive-url", "archive_url"] - ): - output_string.append(snapshot.archive_url) - - click.echo(" ".join(output_string)) + user_agent, + start_timestamp, + end_timestamp, + cdx_filter, + collapse, + cdx_print, + limit, + gzip, + match_type, + ] + handle_cdx(data) else: click.echo( From 6d233f24fc262373241e2ac636ac7d610373a663 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 11:20:59 +0530 Subject: [PATCH 03/10] apply isort --- tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 921ff67..5a8b033 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,8 +1,8 @@ import requests from click.testing import CliRunner -from waybackpy.cli import main from waybackpy import __version__ +from waybackpy.cli import main def test_oldest() -> None: From 25eb709ade310487ed03c4f58564c66aaa98bd64 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 14:32:15 +0530 Subject: [PATCH 04/10] improve doc strings and comments and remove useless exceptions. --- waybackpy/availability_api.py | 45 ++++++++++++++++++----------------- waybackpy/cdx_api.py | 1 - waybackpy/cdx_snapshot.py | 2 +- waybackpy/cdx_utils.py | 4 ++-- waybackpy/cli.py | 22 ++++++++++------- waybackpy/exceptions.py | 17 +++---------- waybackpy/wrapper.py | 1 + 7 files changed, 44 insertions(+), 48 deletions(-) diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 324dae4..63e4892 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -1,19 +1,19 @@ """ This module interfaces the Wayback Machine's availability API. -The interface could be useful for looking up archives and finding archives +The interface is useful for looking up archives and finding archives that are close to a specific date and time. -It has a class called WaybackMachineAvailabilityAPI, and the class has -methods such as: +It has a class WaybackMachineAvailabilityAPI, and the class has +methods like: -near() for looking up archives close to a specific date and time. +near() for retrieving archives close to a specific date and time. oldest() for retrieving the first archive URL of the webpage. -newest() for retrieving the latest archive of an URL. +newest() for retrieving the latest archive of the webpage. -The Wayback Machine Availability response should be a valid JSON and +The Wayback Machine Availability API response must be a valid JSON and if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised. If the Availability API returned valid JSON but archive URL could not be found @@ -39,7 +39,7 @@ ResponseJSON = Dict[str, Any] class WaybackMachineAvailabilityAPI: """ - Class that interfaces the availability API of the Wayback Machine. + Class that interfaces the Wayback Machine's availability API. """ def __init__( @@ -61,7 +61,7 @@ class WaybackMachineAvailabilityAPI: @staticmethod def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str: """ - Converts Unix time to wayback Machine timestamp and the Wayback Machine + Converts Unix time to Wayback Machine timestamp, Wayback Machine timestamp format is yyyyMMddhhmmss. """ return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") @@ -76,10 +76,10 @@ class WaybackMachineAvailabilityAPI: """ String representation of the class. If atleast one API call was successfully made then return the archive URL - as a string. Else returns "". + as a string. Else returns "" (empty string literal). """ - # String should not return anything other than a string object - # So, if a string repr is asked for before making any API requests + # __str__ can not return anything other than a string object + # So, if a string repr is asked even before making a API request # just return "" if not self.json: return "" @@ -147,7 +147,7 @@ class WaybackMachineAvailabilityAPI: self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" ) - raise ValueError("Could not get timestamp from result") + raise ValueError("Timestamp not found in the Availability API's JSON response.") @property def archive_url(self) -> str: @@ -159,8 +159,8 @@ class WaybackMachineAvailabilityAPI: archive_url = "" data = self.json - # If the user didn't invoke oldest, newest or near but tries to access the - # archive_url attribute then assume they are fine with any archive + # If the user didn't invoke oldest, newest or near but tries to access + # archive_url attribute then assume they that are fine with any archive # and invoke the oldest method. if not data: self.oldest() @@ -172,10 +172,10 @@ class WaybackMachineAvailabilityAPI: not data or not data["archived_snapshots"] ): self.setup_json() # It makes a new API call - data = self.json # json() updated the value of JSON attribute + data = self.json # setup_json() updates value of json attribute - # If we exhausted the max_tries, then we give up and - # raise exception. + # If exhausted max_tries, then give up and + # raise ArchiveNotInAvailabilityAPIResponse. if not data or not data["archived_snapshots"]: raise ArchiveNotInAvailabilityAPIResponse( @@ -198,7 +198,7 @@ class WaybackMachineAvailabilityAPI: def wayback_timestamp(**kwargs: int) -> str: """ Prepends zero before the year, month, day, hour and minute so that they - are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. + are conformable with the YYYYMMDDhhmmss Wayback Machine timestamp format. """ return "".join( str(kwargs[key]).zfill(2) @@ -218,7 +218,7 @@ class WaybackMachineAvailabilityAPI: Passes the current UNIX time to near() for retrieving the newest archive from the availability API. - We assume that wayback machine can not archive the future of a webpage. + Remember UNIX time is UTC and Wayback Machine is also UTC based. """ return self.near(unix_timestamp=int(time.time())) @@ -232,7 +232,8 @@ class WaybackMachineAvailabilityAPI: unix_timestamp: Optional[int] = None, ) -> "WaybackMachineAvailabilityAPI": """ - The main method for the Class, oldest() and newest() are dependent on it. + The most important method of this Class, oldest() and newest() are + dependent on it. It generates the timestamp based on the input either by calling the unix_timestamp_to_wayback_timestamp or wayback_timestamp method with @@ -240,8 +241,8 @@ class WaybackMachineAvailabilityAPI: Adds the timestamp to the payload dictionary. - And finally invoking the json method to make the API call then returns - the instance. + And finally invokes the setup_json method to make the API call then + finally returns the instance. """ if unix_timestamp: timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index ce46ae4..fb7587a 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -173,7 +173,6 @@ class WaybackMachineCDXServerAPI: for i, collapse in enumerate(self.collapses): payload["collapse" + str(i)] = collapse - # Don't need to return anything as it's dictionary. payload["url"] = self.url def snapshots(self) -> Generator[CDXSnapshot, None, None]: diff --git a/waybackpy/cdx_snapshot.py b/waybackpy/cdx_snapshot.py index d7a4a16..20218c1 100644 --- a/waybackpy/cdx_snapshot.py +++ b/waybackpy/cdx_snapshot.py @@ -1,5 +1,5 @@ """ -Module that contains the CDXSnapshot class, CDX records are casted +Module that contains the CDXSnapshot class, CDX records/lines are casted to CDXSnapshot objects for easier access. The CDX index format is plain text data. Each line ('record') indicates a diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py index 8826b21..583dd26 100644 --- a/waybackpy/cdx_utils.py +++ b/waybackpy/cdx_utils.py @@ -2,7 +2,7 @@ Utility functions required for accessing the CDX server API. These are here in this module so that we don’t make any module too -big. +long. """ import re @@ -63,7 +63,7 @@ def get_response( backoff_factor: float = 0.5, ) -> Union[requests.Response, Exception]: """ - Make get request to the CDX server and return the response. + Makes get request to the CDX server and returns the response. """ session = requests.Session() diff --git a/waybackpy/cli.py b/waybackpy/cli.py index 6cb6d0f..db7c774 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -1,5 +1,5 @@ """ -Module that makes waybackpy a CLI tool. +Module responsible for enabling waybackpy to function as a CLI tool. """ import os @@ -18,24 +18,30 @@ from .cdx_api import WaybackMachineCDXServerAPI from .save_api import WaybackMachineSaveAPI from .utils import DEFAULT_USER_AGENT from .wrapper import Url +from .exceptions import ArchiveNotInAvailabilityAPIResponse def echo_availability_api( availability_api_instance: WaybackMachineAvailabilityAPI, json: bool ) -> None: """ - Output availability API depending functions. - Near, oldest and newest output by this method. + Output for method that use the availability API. + Near, oldest and newest output via this function. """ - if not availability_api_instance.archive_url: - archive_url = ( + try: + if availability_api_instance.archive_url: + archive_url = availability_api_instance.archive_url + except ArchiveNotInAvailabilityAPIResponse as error: + message = ( "NO ARCHIVE FOUND - The requested URL is probably " + "not yet archived or if the URL was recently archived then it is " + "not yet available via the Wayback Machine's availability API " + "because of database lag and should be available after some time." ) - else: - archive_url = availability_api_instance.archive_url + + click.echo(message + "\nJSON response:\n" + str(error), err=True) + return + click.echo("Archive URL:") click.echo(archive_url) if json: @@ -45,7 +51,7 @@ def echo_availability_api( def handle_cdx(data: List[Any]) -> None: """ - Handles the CDX CLI options and output. + Handles the CDX CLI options and output format. """ url = data[0] user_agent = data[1] diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 02ee953..3e8d347 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -12,20 +12,7 @@ class WaybackError(Exception): 1) Wayback Machine API Service is unreachable/down. 2) You passed illegal arguments. - All other exceptions are inherited from this class. - """ - - -class RedirectSaveError(WaybackError): - """ - Raised when the original URL is redirected and the - redirect URL is archived but not the original URL. - """ - - -class URLError(Exception): - """ - Raised when malformed URLs are passed as arguments. + All other exceptions are inherited from this main exception. """ @@ -33,6 +20,8 @@ class TooManyRequestsError(WaybackError): """ Raised when you make more than 15 requests per minute and the Wayback Machine returns 429. + + See https://github.com/akamhy/waybackpy/issues/131 """ diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 21b6782..30ff24b 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -113,6 +113,7 @@ class Url: """Set the attributes for total backwards compatibility.""" self.archive_url = self.wayback_machine_availability_api.archive_url self.json = self.wayback_machine_availability_api.json + self.JSON = self.json # for backwards compatibility, do not remove it. self.timestamp = self.wayback_machine_availability_api.timestamp() def total_archives( From 16f94db144c455975fd2209f85e04b563b8de7bc Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 14:33:16 +0530 Subject: [PATCH 05/10] incr version to v3.0.3 --- waybackpy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py index 5939946..bf80948 100644 --- a/waybackpy/__init__.py +++ b/waybackpy/__init__.py @@ -6,7 +6,7 @@ __description__ = ( "Archive pages and retrieve archived pages easily." ) __url__ = "https://akamhy.github.io/waybackpy/" -__version__ = "3.0.2" +__version__ = "3.0.3" __download_url__ = f"https://github.com/akamhy/waybackpy/archive/{__version__}.tar.gz" __author__ = "Akash Mahanty" __author_email__ = "akamhy@yahoo.com" From edaa1d5d54fe539764570fad18d41c0c8c8c7a7a Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 15:40:38 +0530 Subject: [PATCH 06/10] update value to the new limit. --- waybackpy/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/waybackpy/cli.py b/waybackpy/cli.py index db7c774..a742ca0 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -301,7 +301,7 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: "-l", "--limit", help="Number of maximum record that CDX API is asked to return per API call, " - + "default value is 500 records.", + + "default value is 25000 records.", ) @click.option( "-cp", From 89016d433c28bf5d1450f97e5867247cce77e9c7 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 15:47:38 +0530 Subject: [PATCH 07/10] added trove Typing :: Typed and Development Status :: 5 - Production/Stable --- setup.cfg | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 866c15e..0678e33 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,9 +24,11 @@ keywords = CDX API savepagenow classifiers = - Development Status :: 4 - Beta + Development Status :: 5 - Production/Stable Intended Audience :: Developers + Intended Audience :: End Users/Desktop Natural Language :: English + Typing :: Typed License :: OSI Approved :: MIT License Programming Language :: Python Programming Language :: Python :: 3 From 5954fcc646aa3ef4dbf7932e3eaad07ccb279ff4 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 15:51:11 +0530 Subject: [PATCH 08/10] format with black --- waybackpy/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 30ff24b..93e317a 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -113,7 +113,7 @@ class Url: """Set the attributes for total backwards compatibility.""" self.archive_url = self.wayback_machine_availability_api.archive_url self.json = self.wayback_machine_availability_api.json - self.JSON = self.json # for backwards compatibility, do not remove it. + self.JSON = self.json # for backwards compatibility, do not remove it. self.timestamp = self.wayback_machine_availability_api.timestamp() def total_archives( From 87fb5ecd58a6bf22bc32e1a0a9697176ff9f5d65 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 16:12:30 +0530 Subject: [PATCH 09/10] remove latest version funcs from utils, they were unused. --- tests/test_utils.py | 13 +++---------- waybackpy/utils.py | 39 --------------------------------------- 2 files changed, 3 insertions(+), 49 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c6467f6..aa651e3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,7 @@ +import pytest + from waybackpy import __version__ -from waybackpy.utils import ( - DEFAULT_USER_AGENT, - latest_version_github, - latest_version_pypi, -) +from waybackpy.utils import DEFAULT_USER_AGENT def test_default_user_agent() -> None: @@ -11,8 +9,3 @@ def test_default_user_agent() -> None: DEFAULT_USER_AGENT == f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" ) - - -def test_latest_version() -> None: - package_name = "waybackpy" - assert latest_version_github(package_name) == latest_version_pypi(package_name) diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 1c3462e..65f7dec 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -9,42 +9,3 @@ from . import __version__ DEFAULT_USER_AGENT: str = ( f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" ) - - -def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str: - """Latest waybackpy version on PyPi.""" - request_url = "https://pypi.org/pypi/" + package_name + "/json" - headers = {"User-Agent": user_agent} - response = requests.get(request_url, headers=headers) - data = response.json() - if ( - data is not None - and "info" in data - and data["info"] is not None - and "version" in data["info"] - and data["info"]["version"] is not None - ): - return str(data["info"]["version"]) - - raise ValueError("Could not get latest pypi version") - - -def latest_version_github( - package_name: str, user_agent: str = DEFAULT_USER_AGENT -) -> str: - """Latest waybackpy version on GitHub.""" - request_url = ( - "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1" - ) - headers = {"User-Agent": user_agent} - response = requests.get(request_url, headers=headers) - data = response.json() - if ( - data is not None - and len(data) > 0 - and data[0] is not None - and "tag_name" in data[0] - ): - return str(data[0]["tag_name"]) - - raise ValueError("Could not get latest github version") From cd5c3c61a5b9e0a61962d3cc747cc89ea8bdff5a Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Wed, 9 Feb 2022 16:18:25 +0530 Subject: [PATCH 10/10] fix imports with isort --- tests/test_utils.py | 2 -- waybackpy/cli.py | 2 +- waybackpy/utils.py | 2 -- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index aa651e3..98ee1bd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,3 @@ -import pytest - from waybackpy import __version__ from waybackpy.utils import DEFAULT_USER_AGENT diff --git a/waybackpy/cli.py b/waybackpy/cli.py index a742ca0..f8eb424 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -15,10 +15,10 @@ import requests from . import __version__ from .availability_api import WaybackMachineAvailabilityAPI from .cdx_api import WaybackMachineCDXServerAPI +from .exceptions import ArchiveNotInAvailabilityAPIResponse from .save_api import WaybackMachineSaveAPI from .utils import DEFAULT_USER_AGENT from .wrapper import Url -from .exceptions import ArchiveNotInAvailabilityAPIResponse def echo_availability_api( diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 65f7dec..3890a8f 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -2,8 +2,6 @@ Utility functions and shared variables like DEFAULT_USER_AGENT are here. """ -import requests - from . import __version__ DEFAULT_USER_AGENT: str = (