Merge branch 'master' into fix_metadata

This commit is contained in:
eggplants
2022-02-09 20:42:26 +09:00
committed by GitHub
11 changed files with 127 additions and 154 deletions

View File

@@ -24,9 +24,11 @@ keywords =
CDX API
savepagenow
classifiers =
Development Status :: 4 - Beta
Development Status :: 5 - Production/Stable
Intended Audience :: Developers
Intended Audience :: End Users/Desktop
Natural Language :: English
Typing :: Typed
License :: OSI Approved :: MIT License
Programming Language :: Python
Programming Language :: Python :: 3

View File

@@ -1,9 +1,5 @@
from waybackpy import __version__
from waybackpy.utils import (
DEFAULT_USER_AGENT,
latest_version_github,
latest_version_pypi,
)
from waybackpy.utils import DEFAULT_USER_AGENT
def test_default_user_agent() -> None:
@@ -11,8 +7,3 @@ def test_default_user_agent() -> None:
DEFAULT_USER_AGENT
== f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
)
def test_latest_version() -> None:
    """The newest GitHub release tag must equal the version published on PyPI."""
    name = "waybackpy"
    # GitHub is queried first, then PyPI, matching the comparison order.
    github_version = latest_version_github(name)
    pypi_version = latest_version_pypi(name)
    assert github_version == pypi_version

View File

@@ -1,6 +1,6 @@
"""Module initializer and provider of static infomation."""
"""Module initializer and provider of static information."""
__version__ = "3.0.2"
__version__ = "3.0.3"
from .availability_api import WaybackMachineAvailabilityAPI
from .cdx_api import WaybackMachineCDXServerAPI

View File

@@ -1,19 +1,19 @@
"""
This module interfaces the Wayback Machine's availability API.
The interface could be useful for looking up archives and finding archives
The interface is useful for looking up archives and finding archives
that are close to a specific date and time.
It has a class called WaybackMachineAvailabilityAPI, and the class has
methods such as:
It has a class WaybackMachineAvailabilityAPI, and the class has
methods like:
near() for looking up archives close to a specific date and time.
near() for retrieving archives close to a specific date and time.
oldest() for retrieving the first archive URL of the webpage.
newest() for retrieving the latest archive of an URL.
newest() for retrieving the latest archive of the webpage.
The Wayback Machine Availability response should be a valid JSON and
The Wayback Machine Availability API response must be a valid JSON and
if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised.
If the Availability API returned valid JSON but archive URL could not be found
@@ -39,7 +39,7 @@ ResponseJSON = Dict[str, Any]
class WaybackMachineAvailabilityAPI:
"""
Class that interfaces the availability API of the Wayback Machine.
Class that interfaces the Wayback Machine's availability API.
"""
def __init__(
@@ -61,7 +61,7 @@ class WaybackMachineAvailabilityAPI:
@staticmethod
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
    """
    Converts Unix time to a Wayback Machine timestamp.

    The Unix time is interpreted as UTC and rendered in the Wayback Machine
    timestamp format yyyyMMddhhmmss (14 digits).

    unix_timestamp is passed through int() first, so any value int()
    accepts (e.g. a numeric string) is tolerated.
    """
    # Imported locally so this method does not depend on a file-level
    # import of ``timezone``.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12. An aware
    # UTC datetime yields exactly the same formatted string.
    moment = datetime.fromtimestamp(int(unix_timestamp), tz=timezone.utc)
    return moment.strftime("%Y%m%d%H%M%S")
@@ -76,10 +76,10 @@ class WaybackMachineAvailabilityAPI:
"""
String representation of the class. If at least one API
call was successfully made then return the archive URL
as a string. Else returns "".
as a string. Else returns "" (empty string literal).
"""
# String should not return anything other than a string object
# So, if a string repr is asked for before making any API requests
# __str__ can not return anything other than a string object
# So, if a string repr is asked even before making an API request
# just return ""
if not self.json:
return ""
@@ -147,7 +147,7 @@ class WaybackMachineAvailabilityAPI:
self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
raise ValueError("Could not get timestamp from result")
raise ValueError("Timestamp not found in the Availability API's JSON response.")
@property
def archive_url(self) -> str:
@@ -159,8 +159,8 @@ class WaybackMachineAvailabilityAPI:
archive_url = ""
data = self.json
# If the user didn't invoke oldest, newest or near but tries to access the
# archive_url attribute then assume they are fine with any archive
# If the user didn't invoke oldest, newest or near but tries to access
# archive_url attribute then assume that they are fine with any archive
# and invoke the oldest method.
if not data:
self.oldest()
@@ -172,10 +172,10 @@ class WaybackMachineAvailabilityAPI:
not data or not data["archived_snapshots"]
):
self.setup_json() # It makes a new API call
data = self.json # json() updated the value of JSON attribute
data = self.json # setup_json() updates value of json attribute
# If we exhausted the max_tries, then we give up and
# raise exception.
# If exhausted max_tries, then give up and
# raise ArchiveNotInAvailabilityAPIResponse.
if not data or not data["archived_snapshots"]:
raise ArchiveNotInAvailabilityAPIResponse(
@@ -198,7 +198,7 @@ class WaybackMachineAvailabilityAPI:
def wayback_timestamp(**kwargs: int) -> str:
"""
Prepends zero before the year, month, day, hour and minute so that they
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
are conformable with the YYYYMMDDhhmmss Wayback Machine timestamp format.
"""
return "".join(
str(kwargs[key]).zfill(2)
@@ -218,7 +218,7 @@ class WaybackMachineAvailabilityAPI:
Passes the current UNIX time to near() for retrieving the newest archive
from the availability API.
We assume that wayback machine can not archive the future of a webpage.
Remember UNIX time is UTC and Wayback Machine is also UTC based.
"""
return self.near(unix_timestamp=int(time.time()))
@@ -232,7 +232,8 @@ class WaybackMachineAvailabilityAPI:
unix_timestamp: Optional[int] = None,
) -> "WaybackMachineAvailabilityAPI":
"""
The main method for the Class, oldest() and newest() are dependent on it.
The most important method of this class; oldest() and newest() are
dependent on it.
It generates the timestamp based on the input either by calling the
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
@@ -240,8 +241,8 @@ class WaybackMachineAvailabilityAPI:
Adds the timestamp to the payload dictionary.
And finally invoking the json method to make the API call then returns
the instance.
And finally invokes the setup_json method to make the API call, then
returns the instance.
"""
if unix_timestamp:
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)

View File

@@ -173,7 +173,6 @@ class WaybackMachineCDXServerAPI:
for i, collapse in enumerate(self.collapses):
payload["collapse" + str(i)] = collapse
# Don't need to return anything as it's dictionary.
payload["url"] = self.url
def snapshots(self) -> Generator[CDXSnapshot, None, None]:

View File

@@ -1,5 +1,5 @@
"""
Module that contains the CDXSnapshot class, CDX records are casted
Module that contains the CDXSnapshot class, CDX records/lines are cast
to CDXSnapshot objects for easier access.
The CDX index format is plain text data. Each line ('record') indicates a

View File

@@ -2,7 +2,7 @@
Utility functions required for accessing the CDX server API.
These are here in this module so that we don't make any module too
big.
long.
"""
import re
@@ -63,7 +63,7 @@ def get_response(
backoff_factor: float = 0.5,
) -> Union[requests.Response, Exception]:
"""
Make get request to the CDX server and return the response.
Makes get request to the CDX server and returns the response.
"""
session = requests.Session()

View File

@@ -1,5 +1,5 @@
"""
Module that makes waybackpy a CLI tool.
Module responsible for enabling waybackpy to function as a CLI tool.
"""
import os
@@ -7,7 +7,7 @@ import random
import re
import string
from json import dumps
from typing import Generator, List, Optional
from typing import Any, Generator, List, Optional
import click
import requests
@@ -15,6 +15,7 @@ import requests
from . import __version__
from .availability_api import WaybackMachineAvailabilityAPI
from .cdx_api import WaybackMachineCDXServerAPI
from .exceptions import ArchiveNotInAvailabilityAPIResponse
from .save_api import WaybackMachineSaveAPI
from .utils import DEFAULT_USER_AGENT
from .wrapper import Url
@@ -24,18 +25,23 @@ def echo_availability_api(
availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
) -> None:
"""
Output availability API depending functions.
Near, oldest and newest output by this method.
Output for methods that use the availability API.
Near, oldest and newest output via this function.
"""
if not availability_api_instance.archive_url:
archive_url = (
try:
if availability_api_instance.archive_url:
archive_url = availability_api_instance.archive_url
except ArchiveNotInAvailabilityAPIResponse as error:
message = (
"NO ARCHIVE FOUND - The requested URL is probably "
+ "not yet archived or if the URL was recently archived then it is "
+ "not yet available via the Wayback Machine's availability API "
+ "because of database lag and should be available after some time."
)
else:
archive_url = availability_api_instance.archive_url
click.echo(message + "\nJSON response:\n" + str(error), err=True)
return
click.echo("Archive URL:")
click.echo(archive_url)
if json:
@@ -43,6 +49,70 @@ def echo_availability_api(
click.echo(dumps(availability_api_instance.json))
def handle_cdx(data: List[Any]) -> None:
    """
    Handles the CDX CLI options and output format.

    ``data`` is a positional list; by index it holds url, user_agent,
    start_timestamp, end_timestamp, cdx_filter, collapse, cdx_print, limit,
    gzip and match_type.

    Every snapshot yielded by the CDX server API is echoed. When no
    cdx_print fields were requested the snapshot itself is echoed; otherwise
    only the requested fields are echoed, space separated and always in the
    fixed order urlkey, timestamp, original, mimetype, statuscode, digest,
    length, archive_url (regardless of the order they were requested in).
    """
    url = data[0]
    user_agent = data[1]
    start_timestamp = data[2]
    end_timestamp = data[3]
    cdx_filter = data[4]
    collapse = data[5]
    cdx_print = data[6]
    limit = data[7]
    gzip = data[8]
    match_type = data[9]

    filters = list(cdx_filter)
    collapses = list(collapse)
    cdx_print = list(cdx_print)

    cdx_api = WaybackMachineCDXServerAPI(
        url,
        user_agent=user_agent,
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        filters=filters,
        match_type=match_type,
        gzip=gzip,
        collapses=collapses,
        limit=limit,
    )

    # Snapshot attribute -> spellings accepted on the command line.
    # The tuple order is the output order.
    field_aliases = (
        ("urlkey", ("urlkey", "url-key", "url_key")),
        ("timestamp", ("timestamp", "time-stamp", "time_stamp")),
        ("original", ("original",)),
        ("mimetype", ("mimetype", "mime-type", "mime_type")),
        ("statuscode", ("statuscode", "status-code", "status_code")),
        ("digest", ("digest",)),
        ("length", ("length",)),
        ("archive_url", ("archiveurl", "archive-url", "archive_url")),
    )

    # The requested fields are the same for every snapshot, so resolve them
    # once instead of re-testing all aliases on each iteration.
    selected_attributes = [
        attribute
        for attribute, aliases in field_aliases
        if any(alias in cdx_print for alias in aliases)
    ]

    for snapshot in cdx_api.snapshots():
        if not cdx_print:
            click.echo(snapshot)
            continue
        # An unrecognised cdx_print value selects nothing and echoes an
        # empty line per snapshot, as before.
        click.echo(
            " ".join(getattr(snapshot, attribute) for attribute in selected_attributes)
        )
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
"""
Save output of CDX API on file.
@@ -231,7 +301,7 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
"-l",
"--limit",
help="Number of maximum record that CDX API is asked to return per API call, "
+ "default value is 500 records.",
+ "default value is 25000 records.",
)
@click.option(
"-cp",
@@ -347,58 +417,19 @@ def main( # pylint: disable=no-value-for-parameter
click.echo(url_)
elif cdx:
filters = list(cdx_filter)
collapses = list(collapse)
cdx_print = list(cdx_print)
cdx_api = WaybackMachineCDXServerAPI(
data = [
url,
user_agent=user_agent,
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
filters=filters,
match_type=match_type,
gzip=gzip,
collapses=collapses,
limit=limit,
)
snapshots = cdx_api.snapshots()
for snapshot in snapshots:
if len(cdx_print) == 0:
click.echo(snapshot)
else:
output_string = []
if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]):
output_string.append(snapshot.urlkey)
if any(
val in cdx_print
for val in ["timestamp", "time-stamp", "time_stamp"]
):
output_string.append(snapshot.timestamp)
if "original" in cdx_print:
output_string.append(snapshot.original)
if any(
val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]
):
output_string.append(snapshot.mimetype)
if any(
val in cdx_print
for val in ["statuscode", "status-code", "status_code"]
):
output_string.append(snapshot.statuscode)
if "digest" in cdx_print:
output_string.append(snapshot.digest)
if "length" in cdx_print:
output_string.append(snapshot.length)
if any(
val in cdx_print
for val in ["archiveurl", "archive-url", "archive_url"]
):
output_string.append(snapshot.archive_url)
click.echo(" ".join(output_string))
user_agent,
start_timestamp,
end_timestamp,
cdx_filter,
collapse,
cdx_print,
limit,
gzip,
match_type,
]
handle_cdx(data)
else:
click.echo(

View File

@@ -12,20 +12,7 @@ class WaybackError(Exception):
1) Wayback Machine API Service is unreachable/down.
2) You passed illegal arguments.
All other exceptions are inherited from this class.
"""
class RedirectSaveError(WaybackError):
"""
Raised when the original URL is redirected and the
redirect URL is archived but not the original URL.
"""
class URLError(Exception):
"""
Raised when malformed URLs are passed as arguments.
All other exceptions are inherited from this main exception.
"""
@@ -33,6 +20,8 @@ class TooManyRequestsError(WaybackError):
"""
Raised when you make more than 15 requests per
minute and the Wayback Machine returns 429.
See https://github.com/akamhy/waybackpy/issues/131
"""

View File

@@ -2,49 +2,8 @@
Utility functions and shared variables like DEFAULT_USER_AGENT are here.
"""
import requests
from . import __version__
DEFAULT_USER_AGENT: str = (
f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
)
def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
    """
    Latest waybackpy version on PyPi.

    Requests ``https://pypi.org/pypi/<package_name>/json`` with the given
    User-Agent and returns the ``info.version`` value of the JSON response
    as a string.

    Raises ValueError when the response JSON does not carry a version.
    """
    request_url = "https://pypi.org/pypi/" + package_name + "/json"
    headers = {"User-Agent": user_agent}
    # Without a timeout requests can wait indefinitely on a stalled server.
    response = requests.get(request_url, headers=headers, timeout=30)
    data = response.json()

    # Guard on the container types so an unexpected payload shape ends in
    # the ValueError below rather than a TypeError from indexing.
    info = data.get("info") if isinstance(data, dict) else None
    version = info.get("version") if isinstance(info, dict) else None
    if version is not None:
        return str(version)

    raise ValueError("Could not get latest pypi version")
def latest_version_github(
    package_name: str, user_agent: str = DEFAULT_USER_AGENT
) -> str:
    """
    Latest waybackpy version on GitHub.

    Requests the most recent release (``per_page=1``) of the
    ``akamhy/<package_name>`` repository from the GitHub API and returns its
    ``tag_name`` as a string.

    Raises ValueError when the response is not a non-empty list whose first
    entry has a ``tag_name``.
    """
    request_url = (
        "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
    )
    headers = {"User-Agent": user_agent}
    # Without a timeout requests can wait indefinitely on a stalled server.
    response = requests.get(request_url, headers=headers, timeout=30)
    data = response.json()

    # The success payload is a list of releases. A JSON object (presumably
    # what the API sends for errors such as rate limiting - verify against
    # the GitHub docs) used to pass the length check and then fail with
    # KeyError on data[0]; it now ends in the ValueError below.
    if (
        isinstance(data, list)
        and data
        and isinstance(data[0], dict)
        and "tag_name" in data[0]
    ):
        return str(data[0]["tag_name"])

    raise ValueError("Could not get latest github version")

View File

@@ -113,6 +113,7 @@ class Url:
"""Set the attributes for total backwards compatibility."""
self.archive_url = self.wayback_machine_availability_api.archive_url
self.json = self.wayback_machine_availability_api.json
self.JSON = self.json # for backwards compatibility, do not remove it.
self.timestamp = self.wayback_machine_availability_api.timestamp()
def total_archives(