diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 324dae4..63e4892 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -1,19 +1,19 @@ """ This module interfaces the Wayback Machine's availability API. -The interface could be useful for looking up archives and finding archives +The interface is useful for looking up archives and finding archives that are close to a specific date and time. -It has a class called WaybackMachineAvailabilityAPI, and the class has -methods such as: +It has a class WaybackMachineAvailabilityAPI, and the class has +methods like: -near() for looking up archives close to a specific date and time. +near() for retrieving archives close to a specific date and time. oldest() for retrieving the first archive URL of the webpage. -newest() for retrieving the latest archive of an URL. +newest() for retrieving the latest archive of the webpage. -The Wayback Machine Availability response should be a valid JSON and +The Wayback Machine Availability API response must be a valid JSON and if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised. If the Availability API returned valid JSON but archive URL could not be found @@ -39,7 +39,7 @@ ResponseJSON = Dict[str, Any] class WaybackMachineAvailabilityAPI: """ - Class that interfaces the availability API of the Wayback Machine. + Class that interfaces the Wayback Machine's availability API. """ def __init__( @@ -61,7 +61,7 @@ class WaybackMachineAvailabilityAPI: @staticmethod def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str: """ - Converts Unix time to wayback Machine timestamp and the Wayback Machine + Converts Unix time to Wayback Machine timestamp, Wayback Machine timestamp format is yyyyMMddhhmmss. """ return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") @@ -76,10 +76,10 @@ class WaybackMachineAvailabilityAPI: """ String representation of the class. 
If atleast one API call was successfully made then return the archive URL - as a string. Else returns "". + as a string. Else returns "" (empty string literal). """ - # String should not return anything other than a string object - # So, if a string repr is asked for before making any API requests + # __str__ cannot return anything other than a string object + # So, if a string repr is asked for even before making an API request # just return "" if not self.json: return "" @@ -147,7 +147,7 @@ class WaybackMachineAvailabilityAPI: self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" ) - raise ValueError("Could not get timestamp from result") + raise ValueError("Timestamp not found in the Availability API's JSON response.") @property def archive_url(self) -> str: @@ -159,8 +159,8 @@ class WaybackMachineAvailabilityAPI: archive_url = "" data = self.json - # If the user didn't invoke oldest, newest or near but tries to access the - # archive_url attribute then assume they are fine with any archive + # If the user didn't invoke oldest, newest or near but tries to access + # the archive_url attribute then assume that they are fine with any archive # and invoke the oldest method. if not data: self.oldest() @@ -172,10 +172,10 @@ class WaybackMachineAvailabilityAPI: not data or not data["archived_snapshots"] ): self.setup_json() # It makes a new API call - data = self.json # json() updated the value of JSON attribute + data = self.json # setup_json() updates value of json attribute - # If we exhausted the max_tries, then we give up and - # raise exception. + # If max_tries is exhausted, then give up and + # raise ArchiveNotInAvailabilityAPIResponse.
if not data or not data["archived_snapshots"]: raise ArchiveNotInAvailabilityAPIResponse( @@ -198,7 +198,7 @@ class WaybackMachineAvailabilityAPI: def wayback_timestamp(**kwargs: int) -> str: """ Prepends zero before the year, month, day, hour and minute so that they - are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. + are conformable with the YYYYMMDDhhmmss Wayback Machine timestamp format. """ return "".join( str(kwargs[key]).zfill(2) @@ -218,7 +218,7 @@ class WaybackMachineAvailabilityAPI: Passes the current UNIX time to near() for retrieving the newest archive from the availability API. - We assume that wayback machine can not archive the future of a webpage. + Remember that UNIX time is UTC based and the Wayback Machine is also UTC based. """ return self.near(unix_timestamp=int(time.time())) @@ -232,7 +232,8 @@ class WaybackMachineAvailabilityAPI: unix_timestamp: Optional[int] = None, ) -> "WaybackMachineAvailabilityAPI": """ - The main method for the Class, oldest() and newest() are dependent on it. + The most important method of this class; oldest() and newest() are + dependent on it. It generates the timestamp based on the input either by calling the unix_timestamp_to_wayback_timestamp or wayback_timestamp method with @@ -240,8 +241,8 @@ class WaybackMachineAvailabilityAPI: Adds the timestamp to the payload dictionary. - And finally invoking the json method to make the API call then returns - the instance. + Finally, it invokes the setup_json method to make the API call and then + returns the instance. """ if unix_timestamp: timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index ce46ae4..fb7587a 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -173,7 +173,6 @@ class WaybackMachineCDXServerAPI: for i, collapse in enumerate(self.collapses): payload["collapse" + str(i)] = collapse - # Don't need to return anything as it's dictionary.
payload["url"] = self.url def snapshots(self) -> Generator[CDXSnapshot, None, None]: diff --git a/waybackpy/cdx_snapshot.py b/waybackpy/cdx_snapshot.py index d7a4a16..20218c1 100644 --- a/waybackpy/cdx_snapshot.py +++ b/waybackpy/cdx_snapshot.py @@ -1,5 +1,5 @@ """ -Module that contains the CDXSnapshot class, CDX records are casted +Module that contains the CDXSnapshot class, CDX records/lines are cast to CDXSnapshot objects for easier access. The CDX index format is plain text data. Each line ('record') indicates a diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py index 8826b21..583dd26 100644 --- a/waybackpy/cdx_utils.py +++ b/waybackpy/cdx_utils.py @@ -2,7 +2,7 @@ Utility functions required for accessing the CDX server API. These are here in this module so that we don’t make any module too -big. +long. """ import re @@ -63,7 +63,7 @@ def get_response( backoff_factor: float = 0.5, ) -> Union[requests.Response, Exception]: """ - Make get request to the CDX server and return the response. + Makes a GET request to the CDX server and returns the response. """ session = requests.Session() diff --git a/waybackpy/cli.py b/waybackpy/cli.py index 6cb6d0f..db7c774 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -1,5 +1,5 @@ """ -Module that makes waybackpy a CLI tool. +Module responsible for enabling waybackpy to function as a CLI tool. """ import os @@ -18,24 +18,30 @@ from .cdx_api import WaybackMachineCDXServerAPI from .save_api import WaybackMachineSaveAPI from .utils import DEFAULT_USER_AGENT from .wrapper import Url +from .exceptions import ArchiveNotInAvailabilityAPIResponse def echo_availability_api( availability_api_instance: WaybackMachineAvailabilityAPI, json: bool ) -> None: """ - Output availability API depending functions. - Near, oldest and newest output by this method. + Output for methods that use the availability API. + Near, oldest and newest output via this function.
""" - if not availability_api_instance.archive_url: - archive_url = ( + try: + if availability_api_instance.archive_url: + archive_url = availability_api_instance.archive_url + except ArchiveNotInAvailabilityAPIResponse as error: + message = ( "NO ARCHIVE FOUND - The requested URL is probably " + "not yet archived or if the URL was recently archived then it is " + "not yet available via the Wayback Machine's availability API " + "because of database lag and should be available after some time." ) - else: - archive_url = availability_api_instance.archive_url + + click.echo(message + "\nJSON response:\n" + str(error), err=True) + return + click.echo("Archive URL:") click.echo(archive_url) if json: @@ -45,7 +51,7 @@ def echo_availability_api( def handle_cdx(data: List[Any]) -> None: """ - Handles the CDX CLI options and output. + Handles the CDX CLI options and output format. """ url = data[0] user_agent = data[1] diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 02ee953..3e8d347 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -12,20 +12,7 @@ class WaybackError(Exception): 1) Wayback Machine API Service is unreachable/down. 2) You passed illegal arguments. - All other exceptions are inherited from this class. - """ - - -class RedirectSaveError(WaybackError): - """ - Raised when the original URL is redirected and the - redirect URL is archived but not the original URL. - """ - - -class URLError(Exception): - """ - Raised when malformed URLs are passed as arguments. + All other exceptions are inherited from this main exception. """ @@ -33,6 +20,8 @@ class TooManyRequestsError(WaybackError): """ Raised when you make more than 15 requests per minute and the Wayback Machine returns 429. 
+ + See https://github.com/akamhy/waybackpy/issues/131 """ diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 21b6782..30ff24b 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -113,6 +113,7 @@ class Url: """Set the attributes for total backwards compatibility.""" self.archive_url = self.wayback_machine_availability_api.archive_url self.json = self.wayback_machine_availability_api.json + self.JSON = self.json # for backwards compatibility, do not remove it. self.timestamp = self.wayback_machine_availability_api.timestamp() def total_archives(