added docstrings, added some static type hints and also lint. (#141)

* added docstrings, added some static type hints and also lint.
* added docstrings and changed some internal variable names for more clarity.
* make flake8 happy
* add descriptive docstrings and type hints in waybackpy/cdx_snapshot.py
* remove useless code and add docstrings and also lint using pylint.
* remove unwarranted test
* added docstrings, lint using pylint and add a raise on 509 status code
* added docstrings and lint with pylint
* lint
* add docstrings and lint
* add docstrings and lint
parent 004ff26196
commit 97f8b96411
@@ -53,10 +53,6 @@ def test_get_response() -> None:
response = get_response(url, headers=headers)
assert not isinstance(response, Exception) and response.status_code == 200

url = "http/wwhfhfvhvjhmom"
with pytest.raises(WaybackError):
get_response(url, headers=headers)


def test_check_filters() -> None:
filters: List[str] = []

@@ -1,9 +1,32 @@
"""
This module interfaces the Wayback Machine's availability API.

The interface could be useful for looking up archives and finding archives
that are close to a specific date and time.

It has a class called WaybackMachineAvailabilityAPI, and the class has
methods such as:

near() for looking up archives close to a specific date and time.

oldest() for retrieving the first archive URL of the webpage.

newest() for retrieving the latest archive of an URL.

The Wayback Machine Availability response should be a valid JSON and
if it is not then an exception, InvalidJSONInAvailabilityAPIResponse, is raised.

If the Availability API returned valid JSON but the archive URL could not be found
in it, then ArchiveNotInAvailabilityAPIResponse is raised.
"""

import json
import time
from datetime import datetime
from typing import Any, Dict, Optional

import requests
from requests.models import Response

from .exceptions import (
ArchiveNotInAvailabilityAPIResponse,
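A minimal usage sketch of the class the module docstring above describes. The method and attribute names (near, oldest, newest, archive_url) come from this diff; the target URL and user agent are placeholders, not values from the commit.

# A minimal sketch, assuming waybackpy exposes WaybackMachineAvailabilityAPI
# as its public availability API class.
from waybackpy import WaybackMachineAvailabilityAPI

availability_api = WaybackMachineAvailabilityAPI(
    "https://example.com", user_agent="my-tool/1.0 (contact@example.com)"
)

availability_api.oldest()              # first known archive of the URL
print(availability_api.archive_url)

availability_api.newest()              # latest known archive of the URL
print(availability_api.archive_url)

availability_api.near(year=2020, month=6, day=1)
print(str(availability_api))           # str() also returns the archive URL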
@@ -22,38 +45,43 @@ class WaybackMachineAvailabilityAPI(object):
def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
) -> None:

self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent
self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
self.payload = {"url": self.url}
self.endpoint = "https://archive.org/wayback/available"
self.max_tries = max_tries
self.tries = 0
self.last_api_call_unix_time = int(time.time())
self.api_call_time_gap = 5
self.payload: Dict[str, str] = {"url": self.url}
self.endpoint: str = "https://archive.org/wayback/available"
self.max_tries: int = max_tries
self.tries: int = 0
self.last_api_call_unix_time: int = int(time.time())
self.api_call_time_gap: int = 5
self.JSON: Optional[ResponseJSON] = None

@staticmethod
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
"""
Converts Unix time to wayback Machine timestamp.
Converts Unix time to a Wayback Machine timestamp; the Wayback Machine
timestamp format is yyyyMMddhhmmss.
"""

return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")

def __repr__(self) -> str:
"""
Same as string representation, just return the archive URL as a string.
"""

return str(self)

def __str__(self) -> str:
"""
String representation of the class. If atleast one API call was successfully
made then return the archive URL as a string. Else returns None.
String representation of the class. If at least one API
call was successfully made then return the archive URL
as a string. Else returns "".
"""

# String must not return anything other than a string object
# So, if some asks for string repr before making the API requests
# String should not return anything other than a string object
# So, if a string repr is asked for before making any API requests
# just return ""
if not self.JSON:
return ""
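A small worked example of the conversion that unix_timestamp_to_wayback_timestamp performs above: a Unix timestamp becomes the 14-digit yyyyMMddhhmmss Wayback Machine timestamp. The input value is illustrative.

from datetime import datetime

unix_timestamp = 1577836800  # placeholder: 2020-01-01 00:00:00 UTC
wayback_timestamp = datetime.utcfromtimestamp(unix_timestamp).strftime("%Y%m%d%H%M%S")
print(wayback_timestamp)  # -> "20200101000000"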
@@ -62,26 +90,36 @@ class WaybackMachineAvailabilityAPI(object):

def json(self) -> Optional[ResponseJSON]:
"""
Makes the API call to the availability API can set the JSON response
to the JSON attribute of the instance and also returns the JSON attribute.
Makes the API call to the availability API and sets the JSON response
to the JSON attribute of the instance and also returns the JSON
attribute.

time_diff and sleep_time make sure that you are not making too many
requests in a short interval of time; making too many requests is bad
as the Wayback Machine may reject them above a certain threshold.

The end-user can change the api_call_time_gap attribute of the instance
to increase or decrease the default time gap between two successive API
calls, but it is not recommended to reduce it.
"""

time_diff = int(time.time()) - self.last_api_call_unix_time
sleep_time = self.api_call_time_gap - time_diff

if sleep_time > 0:
time.sleep(sleep_time)

self.response = requests.get(
self.response: Response = requests.get(
self.endpoint, params=self.payload, headers=self.headers
)
self.last_api_call_unix_time = int(time.time())
self.tries += 1
try:
self.JSON = self.response.json()
except json.decoder.JSONDecodeError:
except json.decoder.JSONDecodeError as json_decode_error:
raise InvalidJSONInAvailabilityAPIResponse(
f"Response data:\n{self.response.text}"
)
) from json_decode_error

return self.JSON
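A short sketch of the rate limiting that json() applies before each request, using the attribute names introduced in this diff (last_api_call_unix_time, api_call_time_gap). The "2 seconds ago" value is made up for illustration.

import time

last_api_call_unix_time = int(time.time()) - 2  # pretend the last call was 2 s ago
api_call_time_gap = 5                           # default gap set in __init__

time_diff = int(time.time()) - last_api_call_unix_time
sleep_time = api_call_time_gap - time_diff      # about 3 s still to wait here
if sleep_time > 0:
    time.sleep(sleep_time)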
@@ -91,15 +129,17 @@ class WaybackMachineAvailabilityAPI(object):
If the JSON attribute of the instance is None it implies that either
the last API call failed or one was never made.

If not JSON or if JSON but no timestamp in the JSON response then returns
the maximum value for datetime object that is possible.
If not JSON, or if JSON but no timestamp in the JSON response, then
returns the maximum possible value for a datetime object.

If you get an URL as a response form the availability API it is guaranteed
that you can get the datetime object from the timestamp.
If you get an URL as a response from the availability API it is
guaranteed that you can get the datetime object from the timestamp.
"""

if self.JSON is None or "archived_snapshots" not in self.JSON:
return datetime.max
elif (

if (
self.JSON is not None
and "archived_snapshots" in self.JSON
and self.JSON["archived_snapshots"] is not None
@@ -110,21 +150,23 @@ class WaybackMachineAvailabilityAPI(object):
return datetime.strptime(
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
else:

raise ValueError("Could not get timestamp from result")

@property
def archive_url(self) -> str:
"""
Reads the the JSON response data and tries to get the timestamp and returns
the timestamp if found else returns None.
Reads the JSON response data and returns the archive URL
if found, and if not found raises
ArchiveNotInAvailabilityAPIResponse.
"""

archive_url = ""
data = self.JSON

# If the user didn't used oldest, newest or near but tries to access the
# archive_url attribute then, we assume they are fine with any archive
# and invoke the oldest archive function.
# If the user didn't invoke oldest, newest or near but tries to access the
# archive_url attribute then assume they are fine with any archive
# and invoke the oldest method.
if not data:
self.oldest()

@@ -137,7 +179,7 @@ class WaybackMachineAvailabilityAPI(object):
self.json()  # It makes a new API call
data = self.JSON  # json() updated the value of JSON attribute

# Even if after we exhausted teh max_tries, then we give up and
# If we exhausted the max_tries, then we give up and
# raise exception.

if not data or not data["archived_snapshots"]:
@@ -160,6 +202,7 @@ class WaybackMachineAvailabilityAPI(object):
Prepends zero before the year, month, day, hour and minute so that they
conform to the YYYYMMDDhhmmss Wayback Machine timestamp format.
"""

return "".join(
str(kwargs[key]).zfill(2)
for key in ["year", "month", "day", "hour", "minute"]
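A worked example of the zero padding done by the join above: each component is left-padded to two digits so the pieces concatenate into the Wayback Machine timestamp format. The component values are illustrative.

components = {"year": 2022, "month": 1, "day": 5, "hour": 3, "minute": 7}

timestamp = "".join(
    str(components[key]).zfill(2)
    for key in ["year", "month", "day", "hour", "minute"]
)
print(timestamp)  # -> "202201050307"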
@@ -167,18 +210,21 @@ class WaybackMachineAvailabilityAPI(object):

def oldest(self) -> "WaybackMachineAvailabilityAPI":
"""
Passing the year 1994 should return the oldest archive because
wayback machine was started in May, 1996 and there should be no archive
before the year 1994.
Passes the date 1994-01-01 to near(), which should return the oldest archive
because the Wayback Machine was started in May 1996 and it is assumed that
there would be no archive older than January 1, 1994.
"""
return self.near(year=1994)

return self.near(year=1994, month=1, day=1)

def newest(self) -> "WaybackMachineAvailabilityAPI":
"""
Passing the current UNIX time should be sufficient to get the newest
archive considering the API request-response time delay and also the
database lags on Wayback machine.
Passes the current UNIX time to near() for retrieving the newest archive
from the availability API.

We assume that the Wayback Machine can not archive the future of a webpage.
"""

return self.near(unix_timestamp=int(time.time()))

def near(
@@ -191,16 +237,18 @@ class WaybackMachineAvailabilityAPI(object):
unix_timestamp: Optional[int] = None,
) -> "WaybackMachineAvailabilityAPI":
"""
The main method for this Class, oldest and newest methods are dependent on this
method.
The main method of the class; oldest() and newest() are dependent on it.

It generates the timestamp based on the input either by calling the
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
appropriate arguments for their respective parameters.

Adds the timestamp to the payload dictionary.

And finally invokes the json() method to make the API call and then returns
the instance.
"""

if unix_timestamp:
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
else:
@@ -1,3 +1,14 @@
"""
This module interfaces the Wayback Machine's CDX server API.

The module has WaybackMachineCDXServerAPI which should be used by the users of
this module to consume the CDX server API.

WaybackMachineCDXServerAPI has a snapshots() method that yields the snapshots, and
the snapshots are yielded as instances of the CDXSnapshot class.
"""


from typing import Dict, Generator, List, Optional, cast

from .cdx_snapshot import CDXSnapshot
@@ -16,6 +27,11 @@ from .utils import DEFAULT_USER_AGENT
class WaybackMachineCDXServerAPI(object):
"""
Class that interfaces the CDX server API of the Wayback Machine.

snapshots() returns a generator that can be iterated upon by the end-user;
the generator returns the snapshots/entries as instances of CDXSnapshot to
make the usage easy, as the attributes are accessible via a dot ".".
"""

# start_timestamp: from, can not use from as it's a keyword
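A minimal sketch of iterating the snapshots() generator described in the class docstring above. The attribute names (timestamp, statuscode, archive_url) follow this diff; the import path, URL and user agent are assumptions or placeholders.

# Sketch, assuming WaybackMachineCDXServerAPI is importable from the package root.
from waybackpy import WaybackMachineCDXServerAPI

cdx_api = WaybackMachineCDXServerAPI(
    "https://example.com", user_agent="my-tool/1.0 (contact@example.com)"
)

for snapshot in cdx_api.snapshots():
    # Each yielded item is a CDXSnapshot; its fields are plain attributes.
    print(snapshot.timestamp, snapshot.statuscode, snapshot.archive_url)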
@@ -53,9 +69,35 @@ class WaybackMachineCDXServerAPI(object):
def cdx_api_manager(
self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
) -> Generator[str, None, None]:
"""
Manages the API calls for the instance, it automatically selects the best
parameters by looking as the query of the end-user. For bigger queries
automatically use the CDX pagination API and for smaller queries use the
normal API.
The CDX server API is a complex API, and to make it easy for the end user to
consume it, the CDX manager (this method) handles the selection of the
API output, whether to use the pagination API or not.

For doing large/bulk queries, the use of the pagination API is
recommended by the Wayback Machine authors. It determines whether the
query would be large or not by using the showNumPages=true parameter;
this tells the number of pages of CDX data that the pagination API
will return.

If the number of pages is less than 2 we use the normal non-pagination
API, as the pagination API is known to lag; for big queries it should
not matter, but for queries where the number of pages is small this
method chooses accuracy over the pagination API.
"""

# number of pages that will be returned by the pagination API.
# get_total_pages adds the showNumPages=true param to pagination API
# requests.
# This is a special query that will return a single number indicating
# the number of pages.
total_pages = get_total_pages(self.url, self.user_agent)
# If we only have two or less pages of archives then we care for more accuracy
# pagination API is lagged sometimes

if use_page is True and total_pages >= 2:
blank_pages = 0
for i in range(total_pages):
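A sketch of the decision the docstring above describes: ask the CDX server how many pages a query would produce (showNumPages=true) and only use the pagination API when there is more than one page. The endpoint and parameter names mirror this diff; the query URL is a placeholder.

import requests

params = {"url": "example.com", "showNumPages": "true"}
response = requests.get("https://web.archive.org/cdx/search/cdx", params=params)
total_pages = int(response.text.strip())

use_pagination_api = total_pages >= 2  # small queries stay on the plain API
print(total_pages, use_pagination_api)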
@@ -78,11 +120,11 @@ class WaybackMachineCDXServerAPI(object):
else:
payload["showResumeKey"] = "true"
payload["limit"] = str(self.limit)
resumeKey = None
resume_key = None
more = True
while more:
if resumeKey:
payload["resumeKey"] = resumeKey
if resume_key:
payload["resumeKey"] = resume_key

url = full_url(self.endpoint, params=payload)
res = get_response(url, headers=headers)
@@ -102,13 +144,16 @@ class WaybackMachineCDXServerAPI(object):

if len(second_last_line) == 0:

resumeKey = lines[-1].strip()
text = text.replace(resumeKey, "", 1).strip()
resume_key = lines[-1].strip()
text = text.replace(resume_key, "", 1).strip()
more = True

yield text

def add_payload(self, payload: Dict[str, str]) -> None:
"""
Adds the payload to the payload dictionary.
"""
if self.start_timestamp:
payload["from"] = self.start_timestamp
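A sketch of the resume-key loop used by the non-pagination branch above: request a limited batch, and if the server appends a resume key after a blank line, strip it from the text and send it back as the next request's resumeKey. This is an illustration of the pattern in the diff, not the library's exact code; the query values are placeholders.

import requests

endpoint = "https://web.archive.org/cdx/search/cdx"
payload = {"url": "example.com", "limit": "500", "showResumeKey": "true"}

resume_key = None
more = True
while more:
    if resume_key:
        payload["resumeKey"] = resume_key
    text = requests.get(endpoint, params=payload).text.strip()
    lines = text.splitlines()
    more = False
    # A blank second-to-last line means the last line is the resume key.
    if len(lines) >= 2 and lines[-2].strip() == "":
        resume_key = lines[-1].strip()
        text = text.replace(resume_key, "", 1).strip()
        more = True
    # process `text` (one batch of CDX entries) here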
@@ -122,17 +167,35 @@ class WaybackMachineCDXServerAPI(object):
payload["matchType"] = self.match_type

if self.filters and len(self.filters) > 0:
for i, f in enumerate(self.filters):
payload["filter" + str(i)] = f
for i, _filter in enumerate(self.filters):
payload["filter" + str(i)] = _filter

if self.collapses and len(self.collapses) > 0:
for i, f in enumerate(self.collapses):
payload["collapse" + str(i)] = f
for i, collapse in enumerate(self.collapses):
payload["collapse" + str(i)] = collapse

# Don't need to return anything as it's a dictionary.
payload["url"] = self.url

def snapshots(self) -> Generator[CDXSnapshot, None, None]:
"""
This function yields the CDX data lines as snapshots.

As it is a generator it is exhaustible; the reasons that this is
a generator and not a list are:

a) The CDX server API can return millions of entries for a query and a list
is not suitable for such cases.

b) Preventing memory usage issues: as told before, this method may yield
millions of records for some queries and your system may not have enough
memory for such a big list. Also remember this if outputting to Jupyter
Notebooks.

The objects yielded by this method are instances of the CDXSnapshot class;
you can access the attributes of the entries as attributes of the instance
itself.
"""
payload: Dict[str, str] = {}
headers = {"User-Agent": self.user_agent}

@@ -144,18 +207,25 @@ class WaybackMachineCDXServerAPI(object):
if self.collapses != []:
self.use_page = False

texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)
entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)

for text in texts:
for entry in entries:

if text.isspace() or len(text) <= 1 or not text:
if entry.isspace() or len(entry) <= 1 or not entry:
continue

snapshot_list = text.split("\n")
# each line is a snapshot aka entry of the CDX server API.
# We are able to split the page by lines because it only
# splits the lines on a single page and not all the entries
# at once, thus there should be no issues of too much memory usage.
snapshot_list = entry.split("\n")

for snapshot in snapshot_list:

if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
# 14 + 32 == 46 ( timestamp + digest ), ignore the invalid entries.
# they are invalid if their length is smaller than the sum of the lengths
# of a standard wayback_timestamp and the standard digest of an entry.
if len(snapshot) < 46:
continue

properties: Dict[str, Optional[str]] = {
@@ -168,16 +238,16 @@ class WaybackMachineCDXServerAPI(object):
"length": None,
}

prop_values = snapshot.split(" ")
property_value = snapshot.split(" ")

prop_values_len = len(prop_values)
properties_len = len(properties)
total_property_values = len(property_value)
warranted_total_property_values = len(properties)

if prop_values_len != properties_len:
if total_property_values != warranted_total_property_values:
raise WaybackError(
f"Snapshot returned by Cdx API has {prop_values_len} "
f"properties instead of expected {properties_len} properties.\n"
f"Problematic Snapshot: {snapshot}"
f"Snapshot returned by CDX API has {total_property_values} prop"
f"erties instead of expected {warranted_total_property_values} "
f"properties.\nProblematic Snapshot: {snapshot}"
)

(
@@ -188,6 +258,6 @@ class WaybackMachineCDXServerAPI(object):
properties["statuscode"],
properties["digest"],
properties["length"],
) = prop_values
) = property_value

yield CDXSnapshot(cast(Dict[str, str], properties))
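A sketch of the per-line validation shown above: a CDX entry is a space separated line with exactly seven fields, and anything shorter than a timestamp plus digest (14 + 32 = 46 characters) is skipped. The sample line below is illustrative, not taken from the commit.

sample = (
    "com,example)/ 20200101000000 https://example.com/ "
    "text/html 200 ABCDEFGHIJKLMNOPQRSTUVWXYZ234567 1234"
)

if len(sample) >= 46:
    fields = sample.split(" ")
    assert len(fields) == 7, f"expected 7 properties, got {len(fields)}"
    urlkey, timestamp, original, mimetype, statuscode, digest, length = fields
    print(timestamp, statuscode, original)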
@@ -1,30 +1,83 @@
"""
Module that contains the CDXSnapshot class; CDX records are cast
to CDXSnapshot objects for easier access.

The CDX index format is plain text data. Each line ('record') indicates a
crawled document. And these lines are cast to CDXSnapshot.
"""


from datetime import datetime
from typing import Dict


class CDXSnapshot(object):
"""
Class for the CDX snapshot lines returned by the CDX API,
Class for the CDX snapshot lines ('records') returned by the CDX API,
Each valid line of the CDX API is cast to a CDXSnapshot object
by the CDX API interface.
by the CDX API interface; just use "." to access any attribute of the
CDX server API snapshot.

This provides the end-user the ease of using the data as attributes
of the CDXSnapshot.

The string representation of the class is identical to the line returned
by the CDX server API.

Besides all the attributes of the CDX server API this class also provides
an archive_url attribute, the archive URL of the snapshot.

Attributes of this class, what they represent and what they are useful for:

urlkey: The document captured, expressed as a SURT.
SURT stands for Sort-friendly URI Reordering Transform, and is a
transformation applied to URIs which makes their left-to-right
representation better match the natural hierarchy of domain names.
A URI <scheme://domain.tld/path?query> has SURT
form <scheme://(tld,domain,)/path?query>.

timestamp: The timestamp of the archive, format is yyyyMMddhhmmss and type
is string.

datetime_timestamp: The timestamp as a datetime object.

original: The original URL of the archive. If archive_url is
https://web.archive.org/web/20220113130051/https://google.com then the
original URL is https://google.com

mimetype: The document's file type, e.g. text/html

statuscode: HTTP response code for the document at the time of its crawling

digest: Base32-encoded SHA-1 checksum of the document for discriminating
it from others

length: Document's volume of bytes in the WARC file

archive_url: The archive URL of the snapshot; this is not returned by the
CDX server API but created by this class on init.
"""

def __init__(self, properties: Dict[str, str]) -> None:
self.urlkey = properties["urlkey"]
self.timestamp = properties["timestamp"]
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
self.original = properties["original"]
self.mimetype = properties["mimetype"]
self.statuscode = properties["statuscode"]
self.digest = properties["digest"]
self.length = properties["length"]
self.archive_url = (
self.urlkey: str = properties["urlkey"]
self.timestamp: str = properties["timestamp"]
self.datetime_timestamp: datetime = datetime.strptime(
self.timestamp, "%Y%m%d%H%M%S"
)
self.original: str = properties["original"]
self.mimetype: str = properties["mimetype"]
self.statuscode: str = properties["statuscode"]
self.digest: str = properties["digest"]
self.length: str = properties["length"]
self.archive_url: str = (
f"https://web.archive.org/web/{self.timestamp}/{self.original}"
)

def __str__(self) -> str:
"""
The string representation is same as the line returned by the
CDX server API for the snapshot.
"""
return (
f"{self.urlkey} {self.timestamp} {self.original} "
f"{self.mimetype} {self.statuscode} {self.digest} {self.length}"
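A sketch of constructing a CDXSnapshot from a parsed CDX line, using the property names listed in the class docstring above. The import path follows the commit message (waybackpy/cdx_snapshot.py); the field values are illustrative.

from waybackpy.cdx_snapshot import CDXSnapshot

properties = {
    "urlkey": "com,example)/",
    "timestamp": "20220113130051",
    "original": "https://example.com/",
    "mimetype": "text/html",
    "statuscode": "200",
    "digest": "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567",
    "length": "1234",
}

snapshot = CDXSnapshot(properties)
print(snapshot.archive_url)          # built on init from timestamp + original
print(snapshot.datetime_timestamp)   # the timestamp parsed into a datetime
print(str(snapshot))                 # same layout as the original CDX line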
@@ -1,3 +1,10 @@
"""
Utility functions required for accessing the CDX server API.

These are here in this module so that we don't make any module too
big.
"""

import re
from typing import Any, Dict, List, Optional, Union
from urllib.parse import quote
@@ -11,28 +18,44 @@ from .utils import DEFAULT_USER_AGENT


def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
"""
Adding showNumPages=true to the request URL makes the CDX server
return an integer which is the number of CDX pages available
for us to query using the pagination API.
"""

endpoint = "https://web.archive.org/cdx/search/cdx?"
payload = {"showNumPages": "true", "url": str(url)}
headers = {"User-Agent": user_agent}
request_url = full_url(endpoint, params=payload)
response = get_response(request_url, headers=headers)

if isinstance(response, requests.Response):
return int(response.text.strip())
else:
raise response


def full_url(endpoint: str, params: Dict[str, Any]) -> str:
"""
As the function's name implies, it returns the full URL. Why do we need
a function for generating the full URL? The CDX server supports multiple
arguments for parameters such as filter and collapse, and this function
adds them without overwriting earlier added arguments.
"""

if not params:
return endpoint
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
_full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")

for key, val in params.items():
key = "filter" if key.startswith("filter") else key
key = "collapse" if key.startswith("collapse") else key
amp = "" if full_url.endswith("?") else "&"
amp = "" if _full_url.endswith("?") else "&"
val = quote(str(val), safe="")
full_url += f"{amp}{key}={val}"
return full_url
_full_url += f"{amp}{key}={val}"

return _full_url
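A sketch of why full_url exists: filter0, filter1, and so on are all sent as repeated "filter" parameters instead of overwriting one another. It mirrors the function above; the module path in the import is an assumption (these helpers appear to live in a CDX utilities module), and the parameter values are placeholders.

# Assumption: the helpers above are importable as waybackpy.cdx_utils.
from waybackpy.cdx_utils import full_url

payload = {
    "url": "example.com",
    "filter0": "statuscode:200",
    "filter1": "mimetype:text/html",
}

url = full_url("https://web.archive.org/cdx/search/cdx?", params=payload)
print(url)
# -> https://web.archive.org/cdx/search/cdx?url=example.com
#    &filter=statuscode%3A200&filter=mimetype%3Atext%2Fhtml  (one line in practice)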
def get_response(
@@ -40,29 +63,31 @@ def get_response(
headers: Optional[Dict[str, str]] = None,
retries: int = 5,
backoff_factor: float = 0.5,
# no_raise_on_redirects=False,
) -> Union[requests.Response, Exception]:
"""
Makes a GET request to the CDX server and returns the response.
"""

session = requests.Session()

retries_ = Retry(
total=retries,
backoff_factor=backoff_factor,
status_forcelist=[500, 502, 503, 504],
)
session.mount("https://", HTTPAdapter(max_retries=retries_))

try:
session.mount("https://", HTTPAdapter(max_retries=retries_))
response = session.get(url, headers=headers)
session.close()
return response
except Exception as e:
reason = str(e)
exc_message = f"Error while retrieving {url}.\n{reason}"
exc = WaybackError(exc_message)
exc.__cause__ = e
raise exc


def check_filters(filters: List[str]) -> None:
"""
Check that the filter arguments passed by the end-user are valid.
If not valid then raise WaybackError.
"""

if not isinstance(filters, list):
raise WaybackError("filters must be a list.")

@@ -81,9 +106,15 @@ def check_filters(filters: List[str]) -> None:


def check_collapses(collapses: List[str]) -> bool:
"""
Check that the collapse arguments passed by the end-user are valid.
If not valid then raise WaybackError.
"""

if not isinstance(collapses, list):
raise WaybackError("collapses must be a list.")
elif len(collapses) == 0:

if len(collapses) == 0:
return True

for collapse in collapses:
@@ -103,18 +134,26 @@ def check_collapses(collapses: List[str]) -> bool:


def check_match_type(match_type: Optional[str], url: str) -> bool:
"""
Check that the match_type argument passed by the end-user is valid.
If not valid then raise WaybackError.
"""

legal_match_type = ["exact", "prefix", "host", "domain"]

if not match_type:
return True
elif "*" in url:

if "*" in url:
raise WaybackError(
"Can not use wildcard in the URL along with the match_type arguments."
)
elif match_type not in legal_match_type:

if match_type not in legal_match_type:
exc_message = (
f"{match_type} is not an allowed match type.\n"
"Use one from 'exact', 'prefix', 'host' or 'domain'"
)
raise WaybackError(exc_message)
else:

return True
@@ -1,3 +1,7 @@
"""
Module that makes waybackpy a CLI tool.
"""

import json as JSON
import os
import random
@@ -19,7 +23,10 @@ from .wrapper import Url
def echo_availability_api(
availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
) -> None:
click.echo("Archive URL:")
"""
Output for the availability API dependent subcommands.
near, oldest and newest output via this method.
"""
if not availability_api_instance.archive_url:
archive_url = (
"NO ARCHIVE FOUND - The requested URL is probably "
@@ -29,6 +36,7 @@ def echo_availability_api(
)
else:
archive_url = availability_api_instance.archive_url
click.echo("Archive URL:")
click.echo(archive_url)
if json:
click.echo("JSON response:")
@@ -36,6 +44,10 @@ def echo_availability_api(


def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
"""
Saves the output of the CDX API to a file.
Mainly here because of backwards compatibility.
"""
domain = None
sys_random = random.SystemRandom()
uid = "".join(
@@ -51,8 +63,8 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
domain = "domain-unknown" if match is None else match.group(1)
file_name = f"{domain}-urls-{uid}.txt"
file_path = os.path.join(os.getcwd(), file_name)
with open(file_path, "a") as f:
f.write(f"{url}\n")
with open(file_path, "a") as file:
file.write(f"{url}\n")

click.echo(url)

@@ -269,6 +281,7 @@ def main( # pylint: disable=no-value-for-parameter
"""
if version:
click.echo(f"waybackpy version {__version__}")

elif show_license:
click.echo(
requests.get(
@@ -277,6 +290,7 @@ def main( # pylint: disable=no-value-for-parameter
)
elif url is None:
click.echo("No URL detected. Please provide an URL.", err=True)

elif (
not version
and not oldest
@@ -291,14 +305,17 @@ def main( # pylint: disable=no-value-for-parameter
"Use --help flag for help using waybackpy.",
err=True,
)

elif oldest:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
availability_api.oldest()
echo_availability_api(availability_api, json)

elif newest:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
availability_api.newest()
echo_availability_api(availability_api, json)

elif near:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
near_args = {}
@@ -309,6 +326,7 @@ def main( # pylint: disable=no-value-for-parameter
near_args[key] = arg
availability_api.near(**near_args)
echo_availability_api(availability_api, json)

elif save:
save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
save_api.save()
@@ -319,15 +337,17 @@ def main( # pylint: disable=no-value-for-parameter
if headers:
click.echo("Save API headers:")
click.echo(save_api.headers)

elif known_urls:
wayback = Url(url, user_agent)
url_gen = wayback.known_urls(subdomain=subdomain)

if file:
return save_urls_on_file(url_gen)
else:

for url in url_gen:
click.echo(url)

elif cdx:
filters = list(cdx_filter)
collapses = list(collapse)
@@ -1,3 +1,10 @@
"""
This module interfaces the Wayback Machine's SavePageNow (SPN) API.

The module has the WaybackMachineSaveAPI class which should be used by the users
of this module to use the SavePageNow API.
"""

import re
import time
from datetime import datetime
@@ -8,7 +15,7 @@ from requests.adapters import HTTPAdapter
from requests.structures import CaseInsensitiveDict
from urllib3.util.retry import Retry

from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError
from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, WaybackError
from .utils import DEFAULT_USER_AGENT


@@ -47,7 +54,7 @@ class WaybackMachineSaveAPI(object):

if self._archive_url:
return self._archive_url
else:

return self.save()

def get_save_request_headers(self) -> None:
@@ -66,6 +73,7 @@ class WaybackMachineSaveAPI(object):
to be very unreliable thus if it fails first check opening
the response URL yourself in the browser.
"""

session = requests.Session()
retries = Retry(
total=self.total_save_retries,
@@ -79,11 +87,24 @@ class WaybackMachineSaveAPI(object):
self.status_code = self.response.status_code
self.response_url = self.response.url
session.close()

if self.status_code == 429:
# why wait 5 minutes and 429?
# see https://github.com/akamhy/waybackpy/issues/97
raise TooManyRequestsError(
"Seem to be refused to request by the server. "
"Save Page Now receives up to 15 URLs per minutes. "
"Wait a moment and run again."
f"Can not save '{self.url}'. "
f"Save request refused by the server. "
f"Save Page Now limits saving 15 URLs per minutes. "
f"Try waiting for 5 minutes and then try again."
)

# why 509?
# see https://github.com/akamhy/waybackpy/pull/99
# also https://t.co/xww4YJ0Iwc
if self.status_code == 509:
raise WaybackError(
f"Can not save '{self.url}'. You have probably reached the "
f"limit of active sessions."
)

def archive_url_parser(self) -> Optional[str]:
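A minimal sketch of the save flow whose error handling is shown above: save() retries the SavePageNow request, returns the archive URL, and raises TooManyRequestsError on HTTP 429 and WaybackError on HTTP 509. The import of WaybackMachineSaveAPI from the package root is an assumption; the URL and user agent are placeholders.

from waybackpy import WaybackMachineSaveAPI
from waybackpy.exceptions import TooManyRequestsError, WaybackError

save_api = WaybackMachineSaveAPI(
    "https://example.com", user_agent="my-tool/1.0 (contact@example.com)"
)

try:
    archive_url = save_api.save()
    print(archive_url)
except TooManyRequestsError:
    # SPN accepts roughly 15 URLs per minute; wait a few minutes and retry.
    print("rate limited, try again later")
except WaybackError as error:
    print(f"save failed: {error}")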
@@ -146,13 +167,17 @@ class WaybackMachineSaveAPI(object):
the Wayback Machine to serve cached archive if last archive was captured
before last 45 minutes.
"""
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
m = re.search(regex, str(self._archive_url))
if m is None or len(m.groups()) != 1:
raise ValueError("Could not get timestamp")
string_timestamp = m.group(1)
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")

regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
match = re.search(regex, str(self._archive_url))

if match is None or len(match.groups()) != 1:
raise ValueError(
f"Can not parse timestamp from archive URL, '{self._archive_url}'."
)

string_timestamp = match.group(1)
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
timestamp_unixtime = time.mktime(timestamp.timetuple())
instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple())
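A worked example of the timestamp parsing above: pull the 14-digit timestamp out of an archive URL with the same regex and turn it into a datetime. The archive URL is a placeholder.

import re
from datetime import datetime

archive_url = "https://web.archive.org/web/20220113130051/https://example.com/"
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"

match = re.search(regex, archive_url)
if match is None or len(match.groups()) != 1:
    raise ValueError(f"Can not parse timestamp from archive URL, '{archive_url}'.")

timestamp = datetime.strptime(match.group(1), "%Y%m%d%H%M%S")
print(timestamp)  # -> 2022-01-13 13:00:51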
@@ -1,3 +1,7 @@
"""
Utility functions and shared variables like DEFAULT_USER_AGENT are here.
"""

import requests

from . import __version__
@@ -8,6 +12,7 @@ DEFAULT_USER_AGENT: str = (


def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
"""Latest waybackpy version on PyPi."""
request_url = "https://pypi.org/pypi/" + package_name + "/json"
headers = {"User-Agent": user_agent}
response = requests.get(request_url, headers=headers)
@@ -20,13 +25,14 @@ def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT)
and data["info"]["version"] is not None
):
return str(data["info"]["version"])
else:

raise ValueError("Could not get latest pypi version")


def latest_version_github(
package_name: str, user_agent: str = DEFAULT_USER_AGENT
) -> str:
"""Latest waybackpy version on GitHub."""
request_url = (
"https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
)
@@ -40,5 +46,5 @@ def latest_version_github(
and "tag_name" in data[0]
):
return str(data[0]["tag_name"])
else:

raise ValueError("Could not get latest github version")
@@ -1,3 +1,9 @@
"""
This module exists because backwards compatibility matters.
Don't touch this or add any new functionality here and don't use
the Url class.
"""

from datetime import datetime, timedelta
from typing import Generator, Optional

@@ -49,12 +55,14 @@ class Url(object):

if not isinstance(self.timestamp, datetime):
raise TypeError("timestamp must be a datetime")
elif self.timestamp == datetime.max:

if self.timestamp == datetime.max:
return td_max.days
else:

return (datetime.utcnow() - self.timestamp).days

def save(self) -> "Url":
"""Save the URL on wayback machine."""
self.wayback_machine_save_api = WaybackMachineSaveAPI(
self.url, user_agent=self.user_agent
)
@@ -72,7 +80,7 @@ class Url(object):
minute: Optional[int] = None,
unix_timestamp: Optional[int] = None,
) -> "Url":

"""Returns the archive of the URL close to a date and time."""
self.wayback_machine_availability_api.near(
year=year,
month=month,
@@ -85,16 +93,19 @@ class Url(object):
return self

def oldest(self) -> "Url":
"""Returns the oldest archive of the URL."""
self.wayback_machine_availability_api.oldest()
self.set_availability_api_attrs()
return self

def newest(self) -> "Url":
"""Returns the newest archive of the URL."""
self.wayback_machine_availability_api.newest()
self.set_availability_api_attrs()
return self

def set_availability_api_attrs(self) -> None:
"""Set the attributes for total backwards compatibility."""
self.archive_url = self.wayback_machine_availability_api.archive_url
self.JSON = self.wayback_machine_availability_api.JSON
self.timestamp = self.wayback_machine_availability_api.timestamp()
@@ -102,6 +113,10 @@ class Url(object):
def total_archives(
self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
) -> int:
"""
Returns an integer which indicates total number of archives for an URL.
Useless in my opinion, only here because of backwards compatibility.
"""
cdx = WaybackMachineCDXServerAPI(
self.url,
user_agent=self.user_agent,
@@ -122,6 +137,7 @@ class Url(object):
end_timestamp: Optional[str] = None,
match_type: str = "prefix",
) -> Generator[str, None, None]:
"""Yields known URLs for any URL."""
if subdomain:
match_type = "domain"
if host:
@@ -137,4 +153,4 @@ class Url(object):
)

for snapshot in cdx.snapshots():
yield (snapshot.original)
yield snapshot.original
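A sketch of the backwards compatible wrapper shown above. The Url class simply delegates to the availability, save and CDX APIs, so new code should prefer those classes directly; the import path and constructor arguments follow this diff, while the URL and user agent are placeholders.

from waybackpy.wrapper import Url

wayback = Url("https://example.com", "my-tool/1.0 (contact@example.com)")

wayback.oldest()
print(wayback.archive_url, wayback.timestamp)

for known_url in wayback.known_urls(subdomain=False):
    print(known_url)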