diff --git a/setup.cfg b/setup.cfg index 655a0f9..e885a01 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,9 +24,11 @@ keywords = CDX API savepagenow classifiers = - Development Status :: 4 - Beta + Development Status :: 5 - Production/Stable Intended Audience :: Developers + Intended Audience :: End Users/Desktop Natural Language :: English + Typing :: Typed License :: OSI Approved :: MIT License Programming Language :: Python Programming Language :: Python :: 3 diff --git a/tests/test_utils.py b/tests/test_utils.py index c6467f6..98ee1bd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,5 @@ from waybackpy import __version__ -from waybackpy.utils import ( - DEFAULT_USER_AGENT, - latest_version_github, - latest_version_pypi, -) +from waybackpy.utils import DEFAULT_USER_AGENT def test_default_user_agent() -> None: @@ -11,8 +7,3 @@ def test_default_user_agent() -> None: DEFAULT_USER_AGENT == f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" ) - - -def test_latest_version() -> None: - package_name = "waybackpy" - assert latest_version_github(package_name) == latest_version_pypi(package_name) diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py index a2f8843..98cba3d 100644 --- a/waybackpy/__init__.py +++ b/waybackpy/__init__.py @@ -1,6 +1,6 @@ -"""Module initializer and provider of static infomation.""" +"""Module initializer and provider of static information.""" -__version__ = "3.0.2" +__version__ = "3.0.3" from .availability_api import WaybackMachineAvailabilityAPI from .cdx_api import WaybackMachineCDXServerAPI diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 324dae4..63e4892 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -1,19 +1,19 @@ """ This module interfaces the Wayback Machine's availability API. 
-The interface could be useful for looking up archives and finding archives +The interface is useful for looking up archives and finding archives that are close to a specific date and time. -It has a class called WaybackMachineAvailabilityAPI, and the class has -methods such as: +It has a class WaybackMachineAvailabilityAPI, and the class has +methods like: -near() for looking up archives close to a specific date and time. +near() for retrieving archives close to a specific date and time. oldest() for retrieving the first archive URL of the webpage. -newest() for retrieving the latest archive of an URL. +newest() for retrieving the latest archive of the webpage. -The Wayback Machine Availability response should be a valid JSON and +The Wayback Machine Availability API response must be a valid JSON and if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised. If the Availability API returned valid JSON but archive URL could not be found @@ -39,7 +39,7 @@ ResponseJSON = Dict[str, Any] class WaybackMachineAvailabilityAPI: """ - Class that interfaces the availability API of the Wayback Machine. + Class that interfaces the Wayback Machine's availability API. """ def __init__( @@ -61,7 +61,7 @@ class WaybackMachineAvailabilityAPI: @staticmethod def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str: """ - Converts Unix time to wayback Machine timestamp and the Wayback Machine + Converts Unix time to Wayback Machine timestamp, Wayback Machine timestamp format is yyyyMMddhhmmss. """ return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") @@ -76,10 +76,10 @@ class WaybackMachineAvailabilityAPI: """ String representation of the class. If atleast one API call was successfully made then return the archive URL - as a string. Else returns "". + as a string. Else returns "" (empty string literal). 
""" - # String should not return anything other than a string object - # So, if a string repr is asked for before making any API requests + # __str__ can not return anything other than a string object + # So, if a string repr is asked even before making an API request # just return "" if not self.json: return "" @@ -147,7 +147,7 @@ class WaybackMachineAvailabilityAPI: self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" ) - raise ValueError("Could not get timestamp from result") + raise ValueError("Timestamp not found in the Availability API's JSON response.") @property def archive_url(self) -> str: @@ -159,8 +159,8 @@ class WaybackMachineAvailabilityAPI: archive_url = "" data = self.json - # If the user didn't invoke oldest, newest or near but tries to access the - # archive_url attribute then assume they are fine with any archive + # If the user didn't invoke oldest, newest or near but tries to access + # archive_url attribute then assume that they are fine with any archive # and invoke the oldest method. if not data: self.oldest() @@ -172,10 +172,10 @@ class WaybackMachineAvailabilityAPI: not data or not data["archived_snapshots"] ): self.setup_json() # It makes a new API call - data = self.json # json() updated the value of JSON attribute + data = self.json # setup_json() updates value of json attribute - # If we exhausted the max_tries, then we give up and - # raise exception. + # If exhausted max_tries, then give up and + # raise ArchiveNotInAvailabilityAPIResponse. if not data or not data["archived_snapshots"]: raise ArchiveNotInAvailabilityAPIResponse( @@ -198,7 +198,7 @@ class WaybackMachineAvailabilityAPI: def wayback_timestamp(**kwargs: int) -> str: """ Prepends zero before the year, month, day, hour and minute so that they - are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. + are conformable with the YYYYMMDDhhmmss Wayback Machine timestamp format. 
""" return "".join( str(kwargs[key]).zfill(2) @@ -218,7 +218,7 @@ class WaybackMachineAvailabilityAPI: Passes the current UNIX time to near() for retrieving the newest archive from the availability API. - We assume that wayback machine can not archive the future of a webpage. + Remember UNIX time is UTC and Wayback Machine is also UTC based. """ return self.near(unix_timestamp=int(time.time())) @@ -232,7 +232,8 @@ class WaybackMachineAvailabilityAPI: unix_timestamp: Optional[int] = None, ) -> "WaybackMachineAvailabilityAPI": """ - The main method for the Class, oldest() and newest() are dependent on it. + The most important method of this Class, oldest() and newest() are + dependent on it. It generates the timestamp based on the input either by calling the unix_timestamp_to_wayback_timestamp or wayback_timestamp method with @@ -240,8 +241,8 @@ class WaybackMachineAvailabilityAPI: Adds the timestamp to the payload dictionary. - And finally invoking the json method to make the API call then returns - the instance. + And finally invokes the setup_json method to make the API call then + finally returns the instance. """ if unix_timestamp: timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index ce46ae4..fb7587a 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -173,7 +173,6 @@ class WaybackMachineCDXServerAPI: for i, collapse in enumerate(self.collapses): payload["collapse" + str(i)] = collapse - # Don't need to return anything as it's dictionary. 
payload["url"] = self.url def snapshots(self) -> Generator[CDXSnapshot, None, None]: diff --git a/waybackpy/cdx_snapshot.py b/waybackpy/cdx_snapshot.py index d7a4a16..20218c1 100644 --- a/waybackpy/cdx_snapshot.py +++ b/waybackpy/cdx_snapshot.py @@ -1,5 +1,5 @@ """ -Module that contains the CDXSnapshot class, CDX records are casted +Module that contains the CDXSnapshot class, CDX records/lines are casted to CDXSnapshot objects for easier access. The CDX index format is plain text data. Each line ('record') indicates a diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py index 8826b21..583dd26 100644 --- a/waybackpy/cdx_utils.py +++ b/waybackpy/cdx_utils.py @@ -2,7 +2,7 @@ Utility functions required for accessing the CDX server API. These are here in this module so that we don’t make any module too -big. +long. """ import re @@ -63,7 +63,7 @@ def get_response( backoff_factor: float = 0.5, ) -> Union[requests.Response, Exception]: """ - Make get request to the CDX server and return the response. + Makes get request to the CDX server and returns the response. """ session = requests.Session() diff --git a/waybackpy/cli.py b/waybackpy/cli.py index e9c45a3..f8eb424 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -1,5 +1,5 @@ """ -Module that makes waybackpy a CLI tool. +Module responsible for enabling waybackpy to function as a CLI tool. """ import os @@ -7,7 +7,7 @@ import random import re import string from json import dumps -from typing import Generator, List, Optional +from typing import Any, Generator, List, Optional import click import requests @@ -15,6 +15,7 @@ import requests from . 
import __version__ from .availability_api import WaybackMachineAvailabilityAPI from .cdx_api import WaybackMachineCDXServerAPI +from .exceptions import ArchiveNotInAvailabilityAPIResponse from .save_api import WaybackMachineSaveAPI from .utils import DEFAULT_USER_AGENT from .wrapper import Url @@ -24,18 +25,23 @@ def echo_availability_api( availability_api_instance: WaybackMachineAvailabilityAPI, json: bool ) -> None: """ - Output availability API depending functions. - Near, oldest and newest output by this method. + Output for methods that use the availability API. + Near, oldest and newest output via this function. """ - if not availability_api_instance.archive_url: - archive_url = ( + try: + if availability_api_instance.archive_url: + archive_url = availability_api_instance.archive_url + except ArchiveNotInAvailabilityAPIResponse as error: + message = ( "NO ARCHIVE FOUND - The requested URL is probably " + "not yet archived or if the URL was recently archived then it is " + "not yet available via the Wayback Machine's availability API " + "because of database lag and should be available after some time." ) - else: - archive_url = availability_api_instance.archive_url + + click.echo(message + "\nJSON response:\n" + str(error), err=True) + return + click.echo("Archive URL:") click.echo(archive_url) if json: @@ -43,6 +49,70 @@ def echo_availability_api( click.echo(dumps(availability_api_instance.json)) +def handle_cdx(data: List[Any]) -> None: + """ + Handles the CDX CLI options and output format. 
+ """ + url = data[0] + user_agent = data[1] + start_timestamp = data[2] + end_timestamp = data[3] + cdx_filter = data[4] + collapse = data[5] + cdx_print = data[6] + limit = data[7] + gzip = data[8] + match_type = data[9] + + filters = list(cdx_filter) + collapses = list(collapse) + cdx_print = list(cdx_print) + + cdx_api = WaybackMachineCDXServerAPI( + url, + user_agent=user_agent, + start_timestamp=start_timestamp, + end_timestamp=end_timestamp, + filters=filters, + match_type=match_type, + gzip=gzip, + collapses=collapses, + limit=limit, + ) + + snapshots = cdx_api.snapshots() + + for snapshot in snapshots: + if len(cdx_print) == 0: + click.echo(snapshot) + else: + output_string = [] + if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]): + output_string.append(snapshot.urlkey) + if any( + val in cdx_print for val in ["timestamp", "time-stamp", "time_stamp"] + ): + output_string.append(snapshot.timestamp) + if "original" in cdx_print: + output_string.append(snapshot.original) + if any(val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]): + output_string.append(snapshot.mimetype) + if any( + val in cdx_print for val in ["statuscode", "status-code", "status_code"] + ): + output_string.append(snapshot.statuscode) + if "digest" in cdx_print: + output_string.append(snapshot.digest) + if "length" in cdx_print: + output_string.append(snapshot.length) + if any( + val in cdx_print for val in ["archiveurl", "archive-url", "archive_url"] + ): + output_string.append(snapshot.archive_url) + + click.echo(" ".join(output_string)) + + def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: """ Save output of CDX API on file. 
@@ -231,7 +301,7 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: "-l", "--limit", help="Number of maximum record that CDX API is asked to return per API call, " - + "default value is 500 records.", + + "default value is 25000 records.", ) @click.option( "-cp", @@ -347,58 +417,19 @@ def main( # pylint: disable=no-value-for-parameter click.echo(url_) elif cdx: - filters = list(cdx_filter) - collapses = list(collapse) - cdx_print = list(cdx_print) - - cdx_api = WaybackMachineCDXServerAPI( + data = [ url, - user_agent=user_agent, - start_timestamp=start_timestamp, - end_timestamp=end_timestamp, - filters=filters, - match_type=match_type, - gzip=gzip, - collapses=collapses, - limit=limit, - ) - - snapshots = cdx_api.snapshots() - - for snapshot in snapshots: - if len(cdx_print) == 0: - click.echo(snapshot) - else: - output_string = [] - if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]): - output_string.append(snapshot.urlkey) - if any( - val in cdx_print - for val in ["timestamp", "time-stamp", "time_stamp"] - ): - output_string.append(snapshot.timestamp) - if "original" in cdx_print: - output_string.append(snapshot.original) - if any( - val in cdx_print for val in ["mimetype", "mime-type", "mime_type"] - ): - output_string.append(snapshot.mimetype) - if any( - val in cdx_print - for val in ["statuscode", "status-code", "status_code"] - ): - output_string.append(snapshot.statuscode) - if "digest" in cdx_print: - output_string.append(snapshot.digest) - if "length" in cdx_print: - output_string.append(snapshot.length) - if any( - val in cdx_print - for val in ["archiveurl", "archive-url", "archive_url"] - ): - output_string.append(snapshot.archive_url) - - click.echo(" ".join(output_string)) + user_agent, + start_timestamp, + end_timestamp, + cdx_filter, + collapse, + cdx_print, + limit, + gzip, + match_type, + ] + handle_cdx(data) else: click.echo( diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 
02ee953..3e8d347 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -12,20 +12,7 @@ class WaybackError(Exception): 1) Wayback Machine API Service is unreachable/down. 2) You passed illegal arguments. - All other exceptions are inherited from this class. - """ - - -class RedirectSaveError(WaybackError): - """ - Raised when the original URL is redirected and the - redirect URL is archived but not the original URL. - """ - - -class URLError(Exception): - """ - Raised when malformed URLs are passed as arguments. + All other exceptions are inherited from this main exception. """ @@ -33,6 +20,8 @@ class TooManyRequestsError(WaybackError): """ Raised when you make more than 15 requests per minute and the Wayback Machine returns 429. + + See https://github.com/akamhy/waybackpy/issues/131 """ diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 1c3462e..3890a8f 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -2,49 +2,8 @@ Utility functions and shared variables like DEFAULT_USER_AGENT are here. """ -import requests - from . 
import __version__ DEFAULT_USER_AGENT: str = ( f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" ) - - -def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str: - """Latest waybackpy version on PyPi.""" - request_url = "https://pypi.org/pypi/" + package_name + "/json" - headers = {"User-Agent": user_agent} - response = requests.get(request_url, headers=headers) - data = response.json() - if ( - data is not None - and "info" in data - and data["info"] is not None - and "version" in data["info"] - and data["info"]["version"] is not None - ): - return str(data["info"]["version"]) - - raise ValueError("Could not get latest pypi version") - - -def latest_version_github( - package_name: str, user_agent: str = DEFAULT_USER_AGENT -) -> str: - """Latest waybackpy version on GitHub.""" - request_url = ( - "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1" - ) - headers = {"User-Agent": user_agent} - response = requests.get(request_url, headers=headers) - data = response.json() - if ( - data is not None - and len(data) > 0 - and data[0] is not None - and "tag_name" in data[0] - ): - return str(data[0]["tag_name"]) - - raise ValueError("Could not get latest github version") diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 21b6782..93e317a 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -113,6 +113,7 @@ class Url: """Set the attributes for total backwards compatibility.""" self.archive_url = self.wayback_machine_availability_api.archive_url self.json = self.wayback_machine_availability_api.json + self.JSON = self.json # for backwards compatibility, do not remove it. self.timestamp = self.wayback_machine_availability_api.timestamp() def total_archives(