Merge branch 'master' into fix_metadata

This commit is contained in:
eggplants
2022-02-09 20:42:26 +09:00
committed by GitHub
11 changed files with 127 additions and 154 deletions

View File

@@ -24,9 +24,11 @@ keywords =
CDX API CDX API
savepagenow savepagenow
classifiers = classifiers =
Development Status :: 4 - Beta Development Status :: 5 - Production/Stable
Intended Audience :: Developers Intended Audience :: Developers
Intended Audience :: End Users/Desktop
Natural Language :: English Natural Language :: English
Typing :: Typed
License :: OSI Approved :: MIT License License :: OSI Approved :: MIT License
Programming Language :: Python Programming Language :: Python
Programming Language :: Python :: 3 Programming Language :: Python :: 3

View File

@@ -1,9 +1,5 @@
from waybackpy import __version__ from waybackpy import __version__
from waybackpy.utils import ( from waybackpy.utils import DEFAULT_USER_AGENT
DEFAULT_USER_AGENT,
latest_version_github,
latest_version_pypi,
)
def test_default_user_agent() -> None: def test_default_user_agent() -> None:
@@ -11,8 +7,3 @@ def test_default_user_agent() -> None:
DEFAULT_USER_AGENT DEFAULT_USER_AGENT
== f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" == f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
) )
def test_latest_version() -> None:
    """Check that GitHub and PyPI report the same latest waybackpy version."""
    # GitHub is queried first, then PyPI, the same order as the comparison.
    github_version = latest_version_github("waybackpy")
    pypi_version = latest_version_pypi("waybackpy")
    assert github_version == pypi_version

View File

@@ -1,6 +1,6 @@
"""Module initializer and provider of static infomation.""" """Module initializer and provider of static information."""
__version__ = "3.0.2" __version__ = "3.0.3"
from .availability_api import WaybackMachineAvailabilityAPI from .availability_api import WaybackMachineAvailabilityAPI
from .cdx_api import WaybackMachineCDXServerAPI from .cdx_api import WaybackMachineCDXServerAPI

View File

@@ -1,19 +1,19 @@
""" """
This module interfaces the Wayback Machine's availability API. This module interfaces the Wayback Machine's availability API.
The interface could be useful for looking up archives and finding archives The interface is useful for looking up archives and finding archives
that are close to a specific date and time. that are close to a specific date and time.
It has a class called WaybackMachineAvailabilityAPI, and the class has It has a class WaybackMachineAvailabilityAPI, and the class has
methods such as: methods like:
near() for looking up archives close to a specific date and time. near() for retrieving archives close to a specific date and time.
oldest() for retrieving the first archive URL of the webpage. oldest() for retrieving the first archive URL of the webpage.
newest() for retrieving the latest archive of an URL. newest() for retrieving the latest archive of the webpage.
The Wayback Machine Availability response should be a valid JSON and The Wayback Machine Availability API response must be a valid JSON and
if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised. if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised.
If the Availability API returned valid JSON but archive URL could not be found If the Availability API returned valid JSON but archive URL could not be found
@@ -39,7 +39,7 @@ ResponseJSON = Dict[str, Any]
class WaybackMachineAvailabilityAPI: class WaybackMachineAvailabilityAPI:
""" """
Class that interfaces the availability API of the Wayback Machine. Class that interfaces the Wayback Machine's availability API.
""" """
def __init__( def __init__(
@@ -61,7 +61,7 @@ class WaybackMachineAvailabilityAPI:
@staticmethod @staticmethod
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str: def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
""" """
Converts Unix time to wayback Machine timestamp and the Wayback Machine Converts Unix time to Wayback Machine timestamp, Wayback Machine
timestamp format is yyyyMMddhhmmss. timestamp format is yyyyMMddhhmmss.
""" """
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
@@ -76,10 +76,10 @@ class WaybackMachineAvailabilityAPI:
""" """
String representation of the class. If atleast one API String representation of the class. If atleast one API
call was successfully made then return the archive URL call was successfully made then return the archive URL
as a string. Else returns "". as a string. Else returns "" (empty string literal).
""" """
# String should not return anything other than a string object # __str__ can not return anything other than a string object
# So, if a string repr is asked for before making any API requests # So, if a string repr is asked even before making a API request
# just return "" # just return ""
if not self.json: if not self.json:
return "" return ""
@@ -147,7 +147,7 @@ class WaybackMachineAvailabilityAPI:
self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
) )
raise ValueError("Could not get timestamp from result") raise ValueError("Timestamp not found in the Availability API's JSON response.")
@property @property
def archive_url(self) -> str: def archive_url(self) -> str:
@@ -159,8 +159,8 @@ class WaybackMachineAvailabilityAPI:
archive_url = "" archive_url = ""
data = self.json data = self.json
# If the user didn't invoke oldest, newest or near but tries to access the # If the user didn't invoke oldest, newest or near but tries to access
# archive_url attribute then assume they are fine with any archive # archive_url attribute then assume they that are fine with any archive
# and invoke the oldest method. # and invoke the oldest method.
if not data: if not data:
self.oldest() self.oldest()
@@ -172,10 +172,10 @@ class WaybackMachineAvailabilityAPI:
not data or not data["archived_snapshots"] not data or not data["archived_snapshots"]
): ):
self.setup_json() # It makes a new API call self.setup_json() # It makes a new API call
data = self.json # json() updated the value of JSON attribute data = self.json # setup_json() updates value of json attribute
# If we exhausted the max_tries, then we give up and # If exhausted max_tries, then give up and
# raise exception. # raise ArchiveNotInAvailabilityAPIResponse.
if not data or not data["archived_snapshots"]: if not data or not data["archived_snapshots"]:
raise ArchiveNotInAvailabilityAPIResponse( raise ArchiveNotInAvailabilityAPIResponse(
@@ -198,7 +198,7 @@ class WaybackMachineAvailabilityAPI:
def wayback_timestamp(**kwargs: int) -> str: def wayback_timestamp(**kwargs: int) -> str:
""" """
Prepends zero before the year, month, day, hour and minute so that they Prepends zero before the year, month, day, hour and minute so that they
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. are conformable with the YYYYMMDDhhmmss Wayback Machine timestamp format.
""" """
return "".join( return "".join(
str(kwargs[key]).zfill(2) str(kwargs[key]).zfill(2)
@@ -218,7 +218,7 @@ class WaybackMachineAvailabilityAPI:
Passes the current UNIX time to near() for retrieving the newest archive Passes the current UNIX time to near() for retrieving the newest archive
from the availability API. from the availability API.
We assume that wayback machine can not archive the future of a webpage. Remember UNIX time is UTC and Wayback Machine is also UTC based.
""" """
return self.near(unix_timestamp=int(time.time())) return self.near(unix_timestamp=int(time.time()))
@@ -232,7 +232,8 @@ class WaybackMachineAvailabilityAPI:
unix_timestamp: Optional[int] = None, unix_timestamp: Optional[int] = None,
) -> "WaybackMachineAvailabilityAPI": ) -> "WaybackMachineAvailabilityAPI":
""" """
The main method for the Class, oldest() and newest() are dependent on it. The most important method of this Class, oldest() and newest() are
dependent on it.
It generates the timestamp based on the input either by calling the It generates the timestamp based on the input either by calling the
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
@@ -240,8 +241,8 @@ class WaybackMachineAvailabilityAPI:
Adds the timestamp to the payload dictionary. Adds the timestamp to the payload dictionary.
And finally invoking the json method to make the API call then returns And finally invokes the setup_json method to make the API call then
the instance. finally returns the instance.
""" """
if unix_timestamp: if unix_timestamp:
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)

View File

@@ -173,7 +173,6 @@ class WaybackMachineCDXServerAPI:
for i, collapse in enumerate(self.collapses): for i, collapse in enumerate(self.collapses):
payload["collapse" + str(i)] = collapse payload["collapse" + str(i)] = collapse
# Don't need to return anything as it's dictionary.
payload["url"] = self.url payload["url"] = self.url
def snapshots(self) -> Generator[CDXSnapshot, None, None]: def snapshots(self) -> Generator[CDXSnapshot, None, None]:

View File

@@ -1,5 +1,5 @@
""" """
Module that contains the CDXSnapshot class, CDX records are casted Module that contains the CDXSnapshot class, CDX records/lines are casted
to CDXSnapshot objects for easier access. to CDXSnapshot objects for easier access.
The CDX index format is plain text data. Each line ('record') indicates a The CDX index format is plain text data. Each line ('record') indicates a

View File

@@ -2,7 +2,7 @@
Utility functions required for accessing the CDX server API. Utility functions required for accessing the CDX server API.
These are here in this module so that we dont make any module too These are here in this module so that we dont make any module too
big. long.
""" """
import re import re
@@ -63,7 +63,7 @@ def get_response(
backoff_factor: float = 0.5, backoff_factor: float = 0.5,
) -> Union[requests.Response, Exception]: ) -> Union[requests.Response, Exception]:
""" """
Make get request to the CDX server and return the response. Makes get request to the CDX server and returns the response.
""" """
session = requests.Session() session = requests.Session()

View File

@@ -1,5 +1,5 @@
""" """
Module that makes waybackpy a CLI tool. Module responsible for enabling waybackpy to function as a CLI tool.
""" """
import os import os
@@ -7,7 +7,7 @@ import random
import re import re
import string import string
from json import dumps from json import dumps
from typing import Generator, List, Optional from typing import Any, Generator, List, Optional
import click import click
import requests import requests
@@ -15,6 +15,7 @@ import requests
from . import __version__ from . import __version__
from .availability_api import WaybackMachineAvailabilityAPI from .availability_api import WaybackMachineAvailabilityAPI
from .cdx_api import WaybackMachineCDXServerAPI from .cdx_api import WaybackMachineCDXServerAPI
from .exceptions import ArchiveNotInAvailabilityAPIResponse
from .save_api import WaybackMachineSaveAPI from .save_api import WaybackMachineSaveAPI
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
from .wrapper import Url from .wrapper import Url
@@ -24,18 +25,23 @@ def echo_availability_api(
availability_api_instance: WaybackMachineAvailabilityAPI, json: bool availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
) -> None: ) -> None:
""" """
Output availability API depending functions. Output for method that use the availability API.
Near, oldest and newest output by this method. Near, oldest and newest output via this function.
""" """
if not availability_api_instance.archive_url: try:
archive_url = ( if availability_api_instance.archive_url:
archive_url = availability_api_instance.archive_url
except ArchiveNotInAvailabilityAPIResponse as error:
message = (
"NO ARCHIVE FOUND - The requested URL is probably " "NO ARCHIVE FOUND - The requested URL is probably "
+ "not yet archived or if the URL was recently archived then it is " + "not yet archived or if the URL was recently archived then it is "
+ "not yet available via the Wayback Machine's availability API " + "not yet available via the Wayback Machine's availability API "
+ "because of database lag and should be available after some time." + "because of database lag and should be available after some time."
) )
else:
archive_url = availability_api_instance.archive_url click.echo(message + "\nJSON response:\n" + str(error), err=True)
return
click.echo("Archive URL:") click.echo("Archive URL:")
click.echo(archive_url) click.echo(archive_url)
if json: if json:
@@ -43,6 +49,70 @@ def echo_availability_api(
click.echo(dumps(availability_api_instance.json)) click.echo(dumps(availability_api_instance.json))
def handle_cdx(data: List[Any]) -> None:
    """
    Query the CDX server API with the CLI options in ``data`` and echo
    every snapshot it yields.

    ``data`` is read positionally, in this order: url, user_agent,
    start_timestamp, end_timestamp, cdx_filter, collapse, cdx_print,
    limit, gzip, match_type.

    When no print fields were requested the snapshot itself is echoed;
    otherwise only the requested fields are echoed, separated by a space.
    """
    url, user_agent = data[0], data[1]
    start_timestamp, end_timestamp = data[2], data[3]
    cdx_filter, collapse, requested = data[4], data[5], data[6]
    limit, gzip, match_type = data[7], data[8], data[9]

    filters = list(cdx_filter)
    collapses = list(collapse)
    requested = list(requested)

    cdx_api = WaybackMachineCDXServerAPI(
        url,
        user_agent=user_agent,
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        filters=filters,
        match_type=match_type,
        gzip=gzip,
        collapses=collapses,
        limit=limit,
    )

    # Accepted spellings of each printable field, paired with the snapshot
    # attribute they select. The order here is the order of the output.
    field_aliases = (
        (("urlkey", "url-key", "url_key"), "urlkey"),
        (("timestamp", "time-stamp", "time_stamp"), "timestamp"),
        (("original",), "original"),
        (("mimetype", "mime-type", "mime_type"), "mimetype"),
        (("statuscode", "status-code", "status_code"), "statuscode"),
        (("digest",), "digest"),
        (("length",), "length"),
        (("archiveurl", "archive-url", "archive_url"), "archive_url"),
    )
    # The requested fields do not change between snapshots, so resolve once.
    selected = [
        attribute
        for aliases, attribute in field_aliases
        if any(alias in requested for alias in aliases)
    ]

    for snapshot in cdx_api.snapshots():
        if not requested:
            click.echo(snapshot)
            continue
        parts = [getattr(snapshot, attribute) for attribute in selected]
        click.echo(" ".join(parts))
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
""" """
Save output of CDX API on file. Save output of CDX API on file.
@@ -231,7 +301,7 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
"-l", "-l",
"--limit", "--limit",
help="Number of maximum record that CDX API is asked to return per API call, " help="Number of maximum record that CDX API is asked to return per API call, "
+ "default value is 500 records.", + "default value is 25000 records.",
) )
@click.option( @click.option(
"-cp", "-cp",
@@ -347,58 +417,19 @@ def main( # pylint: disable=no-value-for-parameter
click.echo(url_) click.echo(url_)
elif cdx: elif cdx:
filters = list(cdx_filter) data = [
collapses = list(collapse)
cdx_print = list(cdx_print)
cdx_api = WaybackMachineCDXServerAPI(
url, url,
user_agent=user_agent, user_agent,
start_timestamp=start_timestamp, start_timestamp,
end_timestamp=end_timestamp, end_timestamp,
filters=filters, cdx_filter,
match_type=match_type, collapse,
gzip=gzip, cdx_print,
collapses=collapses, limit,
limit=limit, gzip,
) match_type,
]
snapshots = cdx_api.snapshots() handle_cdx(data)
for snapshot in snapshots:
if len(cdx_print) == 0:
click.echo(snapshot)
else:
output_string = []
if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]):
output_string.append(snapshot.urlkey)
if any(
val in cdx_print
for val in ["timestamp", "time-stamp", "time_stamp"]
):
output_string.append(snapshot.timestamp)
if "original" in cdx_print:
output_string.append(snapshot.original)
if any(
val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]
):
output_string.append(snapshot.mimetype)
if any(
val in cdx_print
for val in ["statuscode", "status-code", "status_code"]
):
output_string.append(snapshot.statuscode)
if "digest" in cdx_print:
output_string.append(snapshot.digest)
if "length" in cdx_print:
output_string.append(snapshot.length)
if any(
val in cdx_print
for val in ["archiveurl", "archive-url", "archive_url"]
):
output_string.append(snapshot.archive_url)
click.echo(" ".join(output_string))
else: else:
click.echo( click.echo(

View File

@@ -12,20 +12,7 @@ class WaybackError(Exception):
1) Wayback Machine API Service is unreachable/down. 1) Wayback Machine API Service is unreachable/down.
2) You passed illegal arguments. 2) You passed illegal arguments.
All other exceptions are inherited from this class. All other exceptions are inherited from this main exception.
"""
class RedirectSaveError(WaybackError):
"""
Raised when the original URL is redirected and the
redirect URL is archived but not the original URL.
"""
class URLError(Exception):
"""
Raised when malformed URLs are passed as arguments.
""" """
@@ -33,6 +20,8 @@ class TooManyRequestsError(WaybackError):
""" """
Raised when you make more than 15 requests per Raised when you make more than 15 requests per
minute and the Wayback Machine returns 429. minute and the Wayback Machine returns 429.
See https://github.com/akamhy/waybackpy/issues/131
""" """

View File

@@ -2,49 +2,8 @@
Utility functions and shared variables like DEFAULT_USER_AGENT are here. Utility functions and shared variables like DEFAULT_USER_AGENT are here.
""" """
import requests
from . import __version__ from . import __version__
DEFAULT_USER_AGENT: str = ( DEFAULT_USER_AGENT: str = (
f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
) )
def latest_version_pypi(
    package_name: str,
    user_agent: str = DEFAULT_USER_AGENT,
    timeout: float = 10.0,
) -> str:
    """
    Return the latest version of ``package_name`` published on PyPI.

    Fetches ``https://pypi.org/pypi/<package_name>/json`` and reads the
    ``info.version`` field of the JSON response.

    ``user_agent`` is sent as the User-Agent header and ``timeout`` is the
    number of seconds passed to ``requests.get``.

    Raises ValueError if the response JSON carries no usable
    ``info.version`` value.
    """
    request_url = "https://pypi.org/pypi/" + package_name + "/json"
    headers = {"User-Agent": user_agent}
    # Without a timeout requests may wait forever on an unresponsive server.
    response = requests.get(request_url, headers=headers, timeout=timeout)
    data = response.json()

    # Walk the expected {"info": {"version": ...}} shape defensively so any
    # unexpected payload ends in the ValueError below, not a TypeError.
    info = data.get("info") if isinstance(data, dict) else None
    version = info.get("version") if isinstance(info, dict) else None
    if version is not None:
        return str(version)

    raise ValueError("Could not get latest pypi version")
def latest_version_github(
    package_name: str,
    user_agent: str = DEFAULT_USER_AGENT,
    timeout: float = 10.0,
) -> str:
    """
    Return the tag name of the latest GitHub release of ``package_name``.

    Fetches the first page (one entry) of the releases of the
    ``akamhy/<package_name>`` repository from the GitHub API and reads the
    ``tag_name`` of the first release.

    ``user_agent`` is sent as the User-Agent header and ``timeout`` is the
    number of seconds passed to ``requests.get``.

    Raises ValueError if the response is not a non-empty list whose first
    element has a ``tag_name`` key.
    """
    request_url = (
        "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
    )
    headers = {"User-Agent": user_agent}
    # Without a timeout requests may wait forever on an unresponsive server.
    response = requests.get(request_url, headers=headers, timeout=timeout)
    data = response.json()

    # A JSON object instead of a list (e.g. an API error payload) must not be
    # indexed with data[0]: that raised KeyError instead of the ValueError.
    if (
        isinstance(data, list)
        and data
        and isinstance(data[0], dict)
        and "tag_name" in data[0]
    ):
        return str(data[0]["tag_name"])

    raise ValueError("Could not get latest github version")

View File

@@ -113,6 +113,7 @@ class Url:
"""Set the attributes for total backwards compatibility.""" """Set the attributes for total backwards compatibility."""
self.archive_url = self.wayback_machine_availability_api.archive_url self.archive_url = self.wayback_machine_availability_api.archive_url
self.json = self.wayback_machine_availability_api.json self.json = self.wayback_machine_availability_api.json
self.JSON = self.json # for backwards compatibility, do not remove it.
self.timestamp = self.wayback_machine_availability_api.timestamp() self.timestamp = self.wayback_machine_availability_api.timestamp()
def total_archives( def total_archives(