add: type annotation to waybackpy modules
waybackpy/availability_api.py

@@ -1,6 +1,7 @@
 import json
 import time
 from datetime import datetime
+from typing import Any, Dict, Optional
 
 import requests
 
@@ -10,37 +11,41 @@ from .exceptions import (
 )
 from .utils import DEFAULT_USER_AGENT
 
+ResponseJSON = Dict[str, Any]
+
 
-class WaybackMachineAvailabilityAPI:
+class WaybackMachineAvailabilityAPI(object):
     """
     Class that interfaces the availability API of the Wayback Machine.
     """
 
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
+    def __init__(
+        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
+    ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.user_agent = user_agent
-        self.headers = {"User-Agent": self.user_agent}
+        self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
         self.payload = {"url": "{url}".format(url=self.url)}
         self.endpoint = "https://archive.org/wayback/available"
         self.max_tries = max_tries
         self.tries = 0
         self.last_api_call_unix_time = int(time.time())
         self.api_call_time_gap = 5
-        self.JSON = None
+        self.JSON: Optional[ResponseJSON] = None
 
-    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
+    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp: int) -> str:
         """
         Converts Unix time to a Wayback Machine timestamp.
         """
         return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         """
         Same as the string representation, just return the archive URL as a string.
         """
         return str(self)
 
-    def __str__(self):
+    def __str__(self) -> str:
         """
         String representation of the class. If at least one API call was successfully
         made then return the archive URL as a string. Else returns None.
@@ -54,7 +59,7 @@ class WaybackMachineAvailabilityAPI:
 
         return self.archive_url
 
-    def json(self):
+    def json(self) -> Optional[ResponseJSON]:
         """
         Makes the API call to the availability API and sets the JSON response
         to the JSON attribute of the instance and also returns the JSON attribute.
@@ -79,7 +84,7 @@ class WaybackMachineAvailabilityAPI:
 
         return self.JSON
 
-    def timestamp(self):
+    def timestamp(self) -> datetime:
         """
         Converts the timestamp from the JSON response to a datetime object.
         If the JSON attribute of the instance is None it implies that either
@@ -91,19 +96,29 @@ class WaybackMachineAvailabilityAPI:
         If you get a URL as a response from the availability API it is guaranteed
         that you can get the datetime object from the timestamp.
         """
-        if not self.JSON or not self.JSON["archived_snapshots"]:
+        if self.JSON is None or "archived_snapshots" not in self.JSON:
             return datetime.max
-        return datetime.strptime(
-            self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
-        )
+        elif (
+            self.JSON is not None
+            and "archived_snapshots" in self.JSON
+            and self.JSON["archived_snapshots"] is not None
+            and "closest" in self.JSON["archived_snapshots"]
+            and self.JSON["archived_snapshots"]["closest"] is not None
+            and "timestamp" in self.JSON["archived_snapshots"]["closest"]
+        ):
+            return datetime.strptime(
+                self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
+            )
+        else:
+            raise ValueError("Could not get timestamp from result")
 
     @property
-    def archive_url(self):
+    def archive_url(self) -> str:
         """
         Reads the JSON response data and tries to get the archive URL, and
         returns the archive URL if found.
         """
         archive_url = ""
         data = self.JSON
 
         # If the user didn't use oldest, newest or near but tries to access the
@@ -138,7 +153,7 @@ class WaybackMachineAvailabilityAPI:
         )
         return archive_url
 
-    def wayback_timestamp(self, **kwargs):
+    def wayback_timestamp(self, **kwargs: int) -> str:
         """
         Prepends zero before the year, month, day, hour and minute so that they
         are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
@@ -148,7 +163,7 @@ class WaybackMachineAvailabilityAPI:
             for key in ["year", "month", "day", "hour", "minute"]
         )
 
-    def oldest(self):
+    def oldest(self) -> "WaybackMachineAvailabilityAPI":
         """
         Passing the year 1994 should return the oldest archive because
         wayback machine was started in May, 1996 and there should be no archive
@@ -156,7 +171,7 @@ class WaybackMachineAvailabilityAPI:
         """
         return self.near(year=1994)
 
-    def newest(self):
+    def newest(self) -> "WaybackMachineAvailabilityAPI":
        """
         Passing the current UNIX time should be sufficient to get the newest
         archive considering the API request-response time delay and also the
@@ -166,13 +181,13 @@ class WaybackMachineAvailabilityAPI:
 
     def near(
         self,
-        year=None,
-        month=None,
-        day=None,
-        hour=None,
-        minute=None,
-        unix_timestamp=None,
-    ):
+        year: Optional[int] = None,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        minute: Optional[int] = None,
+        unix_timestamp: Optional[int] = None,
+    ) -> "WaybackMachineAvailabilityAPI":
         """
         The main method for this Class, oldest and newest methods are dependent on this
         method.
@@ -188,11 +203,11 @@ class WaybackMachineAvailabilityAPI:
         else:
             now = datetime.utcnow().timetuple()
             timestamp = self.wayback_timestamp(
-                year=year if year else now.tm_year,
-                month=month if month else now.tm_mon,
-                day=day if day else now.tm_mday,
-                hour=hour if hour else now.tm_hour,
-                minute=minute if minute else now.tm_min,
+                year=now.tm_year if year is None else year,
+                month=now.tm_mon if month is None else month,
+                day=now.tm_mday if day is None else day,
+                hour=now.tm_hour if hour is None else hour,
+                minute=now.tm_min if minute is None else minute,
             )
 
         self.payload["timestamp"] = timestamp
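For context, a minimal usage sketch of the annotated availability API. The class, methods, and return types are taken from the hunks above; the URL and user agent are placeholders:

from waybackpy.availability_api import WaybackMachineAvailabilityAPI

api = WaybackMachineAvailabilityAPI("https://example.com", user_agent="my-agent/1.0")
newest = api.newest()       # near(), oldest() and newest() now return the instance
print(newest.archive_url)   # archive_url is a property annotated to return str
print(newest.timestamp())   # timestamp() -> datetime; datetime.max means no snapshot

Annotating near() with the quoted forward reference "WaybackMachineAvailabilityAPI" is what lets chained calls like api.newest().archive_url type-check.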
waybackpy/cdx_api.py

@@ -1,3 +1,5 @@
+from typing import Dict, Generator, List, Optional, cast
+
 from .cdx_snapshot import CDXSnapshot
 from .cdx_utils import (
     check_collapses,
@@ -11,43 +13,48 @@ from .exceptions import WaybackError
 from .utils import DEFAULT_USER_AGENT
 
 
-class WaybackMachineCDXServerAPI:
+class WaybackMachineCDXServerAPI(object):
     """
     Class that interfaces the CDX server API of the Wayback Machine.
     """
 
+    # start_timestamp: from, can not use from as it's a keyword
+    # end_timestamp: to, not using to as can not use from
     def __init__(
         self,
-        url,
-        user_agent=DEFAULT_USER_AGENT,
-        start_timestamp=None,  # from, can not use from as it's a keyword
-        end_timestamp=None,  # to, not using to as can not use from
-        filters=[],
-        match_type=None,
-        gzip=None,
-        collapses=[],
-        limit=None,
-        max_tries=3,
-    ):
+        url: str,
+        user_agent: str = DEFAULT_USER_AGENT,
+        start_timestamp: Optional[str] = None,
+        end_timestamp: Optional[str] = None,
+        filters: List[str] = [],
+        match_type: Optional[str] = None,
+        gzip: Optional[str] = None,
+        collapses: List[str] = [],
+        limit: Optional[str] = None,
+        max_tries: int = 3,
+    ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.user_agent = user_agent
-        self.start_timestamp = str(start_timestamp) if start_timestamp else None
-        self.end_timestamp = str(end_timestamp) if end_timestamp else None
+        self.start_timestamp = (
+            str(start_timestamp) if start_timestamp is not None else None
+        )
+        self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
         self.filters = filters
         check_filters(self.filters)
-        self.match_type = str(match_type).strip() if match_type else None
+        self.match_type = str(match_type).strip() if match_type is not None else None
         check_match_type(self.match_type, self.url)
-        self.gzip = gzip if gzip else True
+        self.gzip = gzip
         self.collapses = collapses
         check_collapses(self.collapses)
-        self.limit = limit if limit else 5000
+        self.limit = limit if limit is not None else 5000
         self.max_tries = max_tries
-        self.last_api_request_url = None
+        self.last_api_request_url: Optional[str] = None
         self.use_page = False
         self.endpoint = "https://web.archive.org/cdx/search/cdx"
 
-    def cdx_api_manager(self, payload, headers, use_page=False):
+    def cdx_api_manager(
+        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
+    ) -> Generator[str, None, None]:
         total_pages = get_total_pages(self.url, self.user_agent)
         # If we only have two or fewer pages of archives then we care for more accuracy
         # pagination API is lagged sometimes
@@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI:
 
             url = full_url(self.endpoint, params=payload)
             res = get_response(url, headers=headers)
+            if isinstance(res, Exception):
+                raise res
 
             self.last_api_request_url = url
             text = res.text
@@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI:
 
                 yield text
         else:
-
             payload["showResumeKey"] = "true"
             payload["limit"] = str(self.limit)
             resumeKey = None
-
             more = True
             while more:
-
                 if resumeKey:
                     payload["resumeKey"] = resumeKey
 
                 url = full_url(self.endpoint, params=payload)
                 res = get_response(url, headers=headers)
+                if isinstance(res, Exception):
+                    raise res
 
                 self.last_api_request_url = url
 
@@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI:
 
             yield text
 
-    def add_payload(self, payload):
+    def add_payload(self, payload: Dict[str, str]) -> None:
         if self.start_timestamp:
             payload["from"] = self.start_timestamp
 
         if self.end_timestamp:
             payload["to"] = self.end_timestamp
 
-        if self.gzip is not True:
+        if self.gzip is None:
             payload["gzip"] = "false"
 
         if self.match_type:
@@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI:
         # Don't need to return anything as it's dictionary.
         payload["url"] = self.url
 
-    def snapshots(self):
-        payload = {}
+    def snapshots(self) -> Generator[CDXSnapshot, None, None]:
+        payload: Dict[str, str] = {}
         headers = {"User-Agent": self.user_agent}
 
         self.add_payload(payload)
@@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI:
             if len(snapshot) < 46:  # 14 + 32 (timestamp+digest)
                 continue
 
-            properties = {
+            properties: Dict[str, Optional[str]] = {
                 "urlkey": None,
                 "timestamp": None,
                 "original": None,
@@ -190,4 +198,4 @@ class WaybackMachineCDXServerAPI:
                 properties["length"],
             ) = prop_values
 
-            yield CDXSnapshot(properties)
+            yield CDXSnapshot(cast(Dict[str, str], properties))
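A sketch of how the annotated CDX server API is driven; the parameter values are placeholders, and snapshots() is the generator annotated above:

from waybackpy.cdx_api import WaybackMachineCDXServerAPI

cdx = WaybackMachineCDXServerAPI(
    "https://example.com",
    start_timestamp="20200101",   # sent as the CDX "from" parameter
    end_timestamp="20211231",     # sent as the CDX "to" parameter
    filters=["statuscode:200"],
)
for snapshot in cdx.snapshots():  # Generator[CDXSnapshot, None, None]
    print(snapshot.archive_url)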
waybackpy/cdx_snapshot.py

@@ -1,7 +1,8 @@
 from datetime import datetime
+from typing import Dict
 
 
-class CDXSnapshot:
+class CDXSnapshot(object):
     """
     Class for the CDX snapshot lines returned by the CDX API,
     Each valid line of the CDX API is cast to a CDXSnapshot object
@@ -10,7 +11,7 @@ class CDXSnapshot:
     of the CDXSnapshot.
     """
 
-    def __init__(self, properties):
+    def __init__(self, properties: Dict[str, str]) -> None:
         self.urlkey = properties["urlkey"]
         self.timestamp = properties["timestamp"]
         self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
@@ -23,7 +24,7 @@ class CDXSnapshot:
             "https://web.archive.org/web/" + self.timestamp + "/" + self.original
         )
 
-    def __str__(self):
+    def __str__(self) -> str:
         return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
             urlkey=self.urlkey,
             timestamp=self.timestamp,
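CDXSnapshot now declares that it is built from a Dict[str, str]; a hypothetical seven-field record shows the shape it expects:

from waybackpy.cdx_snapshot import CDXSnapshot

properties = {
    "urlkey": "com,example)/",
    "timestamp": "20210102030405",
    "original": "https://example.com/",
    "mimetype": "text/html",
    "statuscode": "200",
    "digest": "A1B2C3D4E5F6G7H8I9J0K1L2M3N4O5P6",
    "length": "1043",
}
snapshot = CDXSnapshot(properties)
print(snapshot.datetime_timestamp)  # parsed with "%Y%m%d%H%M%S"
print(snapshot)                     # the seven fields joined as one CDX line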
waybackpy/cdx_utils.py

@@ -1,23 +1,30 @@
 import re
+from typing import Any, Dict, List, Optional, Union
+from urllib.parse import quote
 
 import requests
 from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+
+# from urllib3.util.retry import Retry
+from requests.packages.urllib3.util.retry import Retry
 
 from .exceptions import WaybackError
 from .utils import DEFAULT_USER_AGENT
 
 
-def get_total_pages(url, user_agent=DEFAULT_USER_AGENT):
+def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
     endpoint = "https://web.archive.org/cdx/search/cdx?"
     payload = {"showNumPages": "true", "url": str(url)}
     headers = {"User-Agent": user_agent}
     request_url = full_url(endpoint, params=payload)
     response = get_response(request_url, headers=headers)
-    return int(response.text.strip())
+    if isinstance(response, requests.Response):
+        return int(response.text.strip())
+    else:
+        raise response
 
 
-def full_url(endpoint, params):
+def full_url(endpoint: str, params: Dict[str, Any]) -> str:
     if not params:
         return endpoint
     full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
@@ -26,27 +33,25 @@ def full_url(endpoint, params):
         key = "collapse" if key.startswith("collapse") else key
         amp = "" if full_url.endswith("?") else "&"
         full_url = (
-            full_url
-            + amp
-            + "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
+            full_url + amp + "{key}={val}".format(key=key, val=quote(str(val), safe=""))
         )
     return full_url
 
 
 def get_response(
-    url,
-    headers=None,
-    retries=5,
-    backoff_factor=0.5,
-    no_raise_on_redirects=False,
-):
+    url: str,
+    headers: Optional[Dict[str, str]] = None,
+    retries: int = 5,
+    backoff_factor: float = 0.5,
+    # no_raise_on_redirects=False,
+) -> Union[requests.Response, Exception]:
     session = requests.Session()
-    retries = Retry(
+    retries_ = Retry(
         total=retries,
         backoff_factor=backoff_factor,
        status_forcelist=[500, 502, 503, 504],
    )
-    session.mount("https://", HTTPAdapter(max_retries=retries))
+    session.mount("https://", HTTPAdapter(max_retries=retries_))
 
     try:
         response = session.get(url, headers=headers)
@@ -62,23 +67,18 @@ def get_response(
     raise exc
 
 
-def check_filters(filters):
+def check_filters(filters: List[str]) -> None:
     if not isinstance(filters, list):
         raise WaybackError("filters must be a list.")
 
     # [!]field:regex
     for _filter in filters:
-        try:
-
-            match = re.search(
-                r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
-                _filter,
-            )
-
-            match.group(1)
-            match.group(2)
-
-        except Exception:
+        match = re.search(
+            r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
+            _filter,
+        )
+
+        if match is None or len(match.groups()) != 2:
 
             exc_message = (
                 "Filter '{_filter}' is not following the cdx filter syntax.".format(
@@ -88,43 +88,38 @@ def check_filters(filters):
             raise WaybackError(exc_message)
 
 
-def check_collapses(collapses):
+def check_collapses(collapses: List[str]) -> bool:
     if not isinstance(collapses, list):
         raise WaybackError("collapses must be a list.")
-
-    if len(collapses) == 0:
-        return
+    elif len(collapses) == 0:
+        return True
 
     for collapse in collapses:
-        try:
-            match = re.search(
-                r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
-                collapse,
-            )
-            match.group(1)
-            if 2 == len(match.groups()):
-                match.group(2)
-        except Exception:
+        match = re.search(
+            r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
+            collapse,
+        )
+        if match is None or len(match.groups()) != 2:
             exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
                 collapse=collapse
            )
            raise WaybackError(exc_message)
+    else:
+        return True
 
 
-def check_match_type(match_type, url):
+def check_match_type(match_type: Optional[str], url: str) -> bool:
+    legal_match_type = ["exact", "prefix", "host", "domain"]
     if not match_type:
-        return
-
-    if "*" in url:
+        return True
+    elif "*" in url:
         raise WaybackError(
             "Can not use wildcard in the URL along with the match_type arguments."
         )
-
-    legal_match_type = ["exact", "prefix", "host", "domain"]
-
-    if match_type not in legal_match_type:
+    elif match_type not in legal_match_type:
         exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
             match_type=match_type
         )
         raise WaybackError(exc_message)
+    else:
+        return True
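A sketch of the helpers above; the params dict is a placeholder, and the collapse0/collapse1 key trick mirrors the key.startswith("collapse") folding in full_url:

from waybackpy.cdx_utils import check_filters, check_match_type, full_url

url = full_url(
    "https://web.archive.org/cdx/search/cdx",
    params={"url": "example.com", "collapse0": "urlkey", "collapse1": "timestamp:4"},
)
print(url)  # both collapse0 and collapse1 are emitted as "collapse", values quoted

check_filters(["statuscode:200", "!mimetype:text/html"])  # WaybackError on bad syntax
check_match_type("prefix", "example.com/path")            # True when the type is legal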
waybackpy/cli.py

@@ -3,6 +3,7 @@ import os
 import random
 import re
 import string
+from typing import Generator, List, Optional
 
 import click
 import requests
@@ -163,34 +164,34 @@ from .wrapper import Url
     + "will be printed.",
 )
 def main(
-    url,
-    user_agent,
-    version,
-    license,
-    newest,
-    oldest,
-    json,
-    near,
-    year,
-    month,
-    day,
-    hour,
-    minute,
-    save,
-    headers,
-    known_urls,
-    subdomain,
-    file,
-    cdx,
-    start_timestamp,
-    end_timestamp,
-    filter,
-    match_type,
-    gzip,
-    collapse,
-    limit,
-    cdx_print,
-):
+    url: Optional[str],
+    user_agent: str,
+    version: bool,
+    license: bool,
+    newest: bool,
+    oldest: bool,
+    json: bool,
+    near: bool,
+    year: Optional[int],
+    month: Optional[int],
+    day: Optional[int],
+    hour: Optional[int],
+    minute: Optional[int],
+    save: bool,
+    headers: bool,
+    known_urls: bool,
+    subdomain: bool,
+    file: bool,
+    cdx: bool,
+    start_timestamp: Optional[str],
+    end_timestamp: Optional[str],
+    filter: List[str],
+    match_type: Optional[str],
+    gzip: Optional[str],
+    collapse: List[str],
+    limit: Optional[str],
+    cdx_print: List[str],
+) -> None:
     """\b
      _                    _
     | |                  | |
@@ -244,7 +245,9 @@ def main(
         )
         return
 
-    def echo_availability_api(availability_api_instance):
+    def echo_availability_api(
+        availability_api_instance: WaybackMachineAvailabilityAPI,
+    ) -> None:
         click.echo("Archive URL:")
         if not availability_api_instance.archive_url:
             archive_url = (
@@ -295,13 +298,14 @@ def main(
         click.echo(save_api.headers)
         return
 
-    def save_urls_on_file(url_gen):
+    def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
         domain = None
         sys_random = random.SystemRandom()
         uid = "".join(
             sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
         )
         url_count = 0
+        file_name = None
 
         for url in url_gen:
             url_count += 1
@@ -310,7 +314,7 @@ def main(
 
                 domain = "domain-unknown"
 
-            if match:
+            if match is not None:
                 domain = match.group(1)
 
             file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
@@ -323,7 +327,7 @@ def main(
 
             click.echo(url)
 
-    if url_count > 0:
+    if url_count > 0 or file_name is not None:
         click.echo(
             "\n\n'{file_name}' saved in current working directory".format(
                 file_name=file_name
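The CLI entry point can be exercised in-process with click's CliRunner; the flag names below are assumptions inferred from main()'s parameters, not shown in this diff:

from click.testing import CliRunner
from waybackpy.cli import main

runner = CliRunner()
result = runner.invoke(main, ["--url", "https://example.com", "--oldest"])  # flags assumed
print(result.output)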
waybackpy/exceptions.py

@@ -14,6 +14,8 @@ class WaybackError(Exception):
     All other exceptions are inherited from this class.
     """
 
+    pass
+
 
 class RedirectSaveError(WaybackError):
     """
@@ -21,32 +23,44 @@ class RedirectSaveError(WaybackError):
     redirect URL is archived but not the original URL.
     """
 
+    pass
+
 
 class URLError(Exception):
     """
     Raised when malformed URLs are passed as arguments.
     """
 
+    pass
+
 
 class MaximumRetriesExceeded(WaybackError):
     """
     MaximumRetriesExceeded
     """
 
+    pass
+
 
 class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
     """
     MaximumSaveRetriesExceeded
     """
 
+    pass
+
 
 class ArchiveNotInAvailabilityAPIResponse(WaybackError):
     """
     Could not parse the archive in the JSON response of the availability API.
     """
 
+    pass
+
 
 class InvalidJSONInAvailabilityAPIResponse(WaybackError):
     """
     availability api returned invalid JSON
     """
 
+    pass
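The hierarchy above means callers can catch narrowly or broadly; a small sketch:

from waybackpy.exceptions import MaximumSaveRetriesExceeded, WaybackError

try:
    ...  # e.g. a save or CDX call
except MaximumSaveRetriesExceeded as exc:
    # subclass of MaximumRetriesExceeded, which subclasses WaybackError
    print("gave up saving:", exc)
except WaybackError as exc:
    print("some other Wayback Machine error:", exc)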
waybackpy/save_api.py

@@ -1,38 +1,42 @@
 import re
 import time
 from datetime import datetime
+from typing import Dict, Optional
 
 import requests
 from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+
+# from urllib3.util.retry import Retry
+from requests.packages.urllib3.util.retry import Retry
 
 from .exceptions import MaximumSaveRetriesExceeded
 from .utils import DEFAULT_USER_AGENT
 
 
-class WaybackMachineSaveAPI:
+class WaybackMachineSaveAPI(object):
     """
     WaybackMachineSaveAPI class provides an interface for saving URLs on the
     Wayback Machine.
     """
 
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
+    def __init__(
+        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
+    ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.request_url = "https://web.archive.org/save/" + self.url
         self.user_agent = user_agent
-        self.request_headers = {"User-Agent": self.user_agent}
+        self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
         if max_tries < 1:
             raise ValueError("max_tries should be positive")
         self.max_tries = max_tries
         self.total_save_retries = 5
         self.backoff_factor = 0.5
         self.status_forcelist = [500, 502, 503, 504]
-        self._archive_url = None
+        self._archive_url: Optional[str] = None
         self.instance_birth_time = datetime.utcnow()
 
     @property
-    def archive_url(self):
+    def archive_url(self) -> str:
         """
         Returns the archive URL if it is already cached by _archive_url,
         else invokes the save method to save the archive which returns the
@@ -44,7 +48,7 @@ class WaybackMachineSaveAPI:
         else:
             return self.save()
 
-    def get_save_request_headers(self):
+    def get_save_request_headers(self) -> None:
         """
         Creates a session and tries 'retries' number of times to
         retrieve the archive.
@@ -61,21 +65,21 @@ class WaybackMachineSaveAPI:
         the response URL yourself in the browser.
         """
         session = requests.Session()
-        retries = Retry(
+        retries_ = Retry(
             total=self.total_save_retries,
             backoff_factor=self.backoff_factor,
             status_forcelist=self.status_forcelist,
         )
-        session.mount("https://", HTTPAdapter(max_retries=retries))
+        session.mount("https://", HTTPAdapter(max_retries=retries_))
         self.response = session.get(self.request_url, headers=self.request_headers)
-        self.headers = (
-            self.response.headers
-        )  # <class 'requests.structures.CaseInsensitiveDict'>
+        # requests.response.headers is requests.structures.CaseInsensitiveDict
+        self.headers = self.response.headers
+        self.headers_str = str(self.headers)
         self.status_code = self.response.status_code
         self.response_url = self.response.url
         session.close()
 
-    def archive_url_parser(self):
+    def archive_url_parser(self) -> Optional[str]:
         """
         Three regexen (like oxen?) are used to search for the
         archive URL in the headers and finally look in the response URL
@@ -83,18 +87,18 @@ class WaybackMachineSaveAPI:
         """
 
         regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
-        match = re.search(regex1, str(self.headers))
+        match = re.search(regex1, self.headers_str)
         if match:
             return "https://web.archive.org" + match.group(1)
 
         regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
-        match = re.search(regex2, str(self.headers))
-        if match:
+        match = re.search(regex2, self.headers_str)
+        if match is not None and len(match.groups()) == 1:
             return "https://" + match.group(1)
 
         regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
-        match = re.search(regex3, str(self.headers))
-        if match:
+        match = re.search(regex3, self.headers_str)
+        if match is not None and len(match.groups()) == 1:
             return "https" + match.group(1)
 
         if self.response_url:
@@ -105,7 +109,9 @@ class WaybackMachineSaveAPI:
         if match:
             return "https://" + match.group(0)
 
-    def sleep(self, tries):
+        return None
+
+    def sleep(self, tries: int) -> None:
         """
         Ensure that we wait some time before successive retries so that we
         don't waste the retries before the page is even captured by the Wayback
@@ -120,7 +126,7 @@ class WaybackMachineSaveAPI:
             sleep_seconds = 10
         time.sleep(sleep_seconds)
 
-    def timestamp(self):
+    def timestamp(self) -> datetime:
         """
         Read the timestamp off the archive URL and convert the Wayback Machine
         timestamp to a datetime object.
@@ -133,9 +139,10 @@ class WaybackMachineSaveAPI:
         didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
         a cached archive if the last archive was captured before the last 45 minutes.
         """
-        m = re.search(
-            r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
-        )
+        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
+        m = re.search(regex, str(self._archive_url))
+        if m is None or len(m.groups()) != 1:
+            raise ValueError("Could not find get timestamp")
         string_timestamp = m.group(1)
         timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
 
@@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
 
         return timestamp
 
-    def save(self):
+    def save(self) -> str:
         """
         Calls the SavePageNow API of the Wayback Machine with required parameters
         and headers to save the URL.
@@ -169,7 +176,7 @@ class WaybackMachineSaveAPI:
             self.get_save_request_headers()
             self.saved_archive = self.archive_url_parser()
 
-            if self.saved_archive is not None:
+            if isinstance(self.saved_archive, str):
                 self._archive_url = self.saved_archive
                 self.timestamp()
                 return self.saved_archive
@@ -179,5 +186,5 @@ class WaybackMachineSaveAPI:
         raise MaximumSaveRetriesExceeded(
             "Tried %s times but failed to save and retrieve the" % str(tries)
             + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
-            % (self.url, self.response_url, str(self.headers)),
+            % (self.url, self.response_url, self.headers_str),
         )
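A minimal sketch of the annotated save API (the URL is a placeholder):

from waybackpy.save_api import WaybackMachineSaveAPI

save_api = WaybackMachineSaveAPI("https://example.com", max_tries=8)
print(save_api.archive_url)  # property: cached _archive_url, or save() -> str
print(save_api.timestamp())  # datetime parsed from the 14-digit Wayback timestamp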
waybackpy/utils.py

@@ -2,22 +2,43 @@ import requests
 
 from . import __version__
 
-DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
+DEFAULT_USER_AGENT: str = (
+    "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
+)
 
 
-def latest_version_pypi(package_name, user_agent=DEFAULT_USER_AGENT):
+def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
     request_url = "https://pypi.org/pypi/" + package_name + "/json"
     headers = {"User-Agent": user_agent}
     response = requests.get(request_url, headers=headers)
     data = response.json()
-    return data["info"]["version"]
+    if (
+        data is not None
+        and "info" in data
+        and data["info"] is not None
+        and "version" in data["info"]
+        and data["info"]["version"] is not None
+    ):
+        return str(data["info"]["version"])
+    else:
+        raise ValueError("Could not get latest pypi version")
 
 
-def latest_version_github(package_name, user_agent=DEFAULT_USER_AGENT):
+def latest_version_github(
+    package_name: str, user_agent: str = DEFAULT_USER_AGENT
+) -> str:
     request_url = (
         "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
     )
     headers = {"User-Agent": user_agent}
     response = requests.get(request_url, headers=headers)
     data = response.json()
-    return data[0]["tag_name"]
+    if (
+        data is not None
+        and len(data) > 0
+        and data[0] is not None
+        and "tag_name" in data[0]
+    ):
+        return str(data[0]["tag_name"])
+    else:
+        raise ValueError("Could not get latest github version")
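Both helpers now promise a str and raise ValueError instead of returning whatever the JSON happened to contain; for example:

from waybackpy.utils import DEFAULT_USER_AGENT, latest_version_pypi

print(DEFAULT_USER_AGENT)                # "waybackpy <version> - https://github.com/akamhy/waybackpy"
print(latest_version_pypi("waybackpy"))  # str, or ValueError if the JSON is malformed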
waybackpy/wrapper.py

@@ -1,4 +1,5 @@
 from datetime import datetime, timedelta
+from typing import Generator, Optional
 
 from .availability_api import WaybackMachineAvailabilityAPI
 from .cdx_api import WaybackMachineCDXServerAPI
@@ -19,35 +20,37 @@ the older interface code.
 """
 
 
-class Url:
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
+class Url(object):
+    def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
         self.url = url
         self.user_agent = str(user_agent)
-        self.archive_url = None
-        self.timestamp = None
+        self.archive_url: Optional[str] = None
+        self.timestamp: Optional[datetime] = None
         self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
             self.url, user_agent=self.user_agent
         )
 
-    def __str__(self):
+    def __str__(self) -> str:
         if not self.archive_url:
             self.newest()
-        return self.archive_url
+        return str(self.archive_url)
 
-    def __len__(self):
+    def __len__(self) -> int:
         td_max = timedelta(
             days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
         )
 
-        if not self.timestamp:
+        if not isinstance(self.timestamp, datetime):
            self.oldest()
 
-        if self.timestamp == datetime.max:
+        if not isinstance(self.timestamp, datetime):
+            raise TypeError("timestamp must be a datetime")
+        elif self.timestamp == datetime.max:
             return td_max.days
 
         else:
             return (datetime.utcnow() - self.timestamp).days
 
-    def save(self):
+    def save(self) -> "Url":
         self.wayback_machine_save_api = WaybackMachineSaveAPI(
             self.url, user_agent=self.user_agent
         )
@@ -58,13 +61,13 @@ class Url:
 
     def near(
         self,
-        year=None,
-        month=None,
-        day=None,
-        hour=None,
-        minute=None,
-        unix_timestamp=None,
-    ):
+        year: Optional[int] = None,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        minute: Optional[int] = None,
+        unix_timestamp: Optional[int] = None,
+    ) -> "Url":
 
         self.wayback_machine_availability_api.near(
             year=year,
@@ -77,22 +80,24 @@ class Url:
         self.set_availability_api_attrs()
         return self
 
-    def oldest(self):
+    def oldest(self) -> "Url":
         self.wayback_machine_availability_api.oldest()
         self.set_availability_api_attrs()
         return self
 
-    def newest(self):
+    def newest(self) -> "Url":
         self.wayback_machine_availability_api.newest()
         self.set_availability_api_attrs()
         return self
 
-    def set_availability_api_attrs(self):
+    def set_availability_api_attrs(self) -> None:
         self.archive_url = self.wayback_machine_availability_api.archive_url
         self.JSON = self.wayback_machine_availability_api.JSON
         self.timestamp = self.wayback_machine_availability_api.timestamp()
 
-    def total_archives(self, start_timestamp=None, end_timestamp=None):
+    def total_archives(
+        self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
+    ) -> int:
         cdx = WaybackMachineCDXServerAPI(
             self.url,
             user_agent=self.user_agent,
@@ -107,12 +112,12 @@ class Url:
 
     def known_urls(
         self,
-        subdomain=False,
-        host=False,
-        start_timestamp=None,
-        end_timestamp=None,
-        match_type="prefix",
-    ):
+        subdomain: bool = False,
+        host: bool = False,
+        start_timestamp: Optional[str] = None,
+        end_timestamp: Optional[str] = None,
+        match_type: str = "prefix",
+    ) -> Generator[str, None, None]:
         if subdomain:
             match_type = "domain"
         if host:
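Finally, a sketch of the legacy Url wrapper with its new annotations (the URL is a placeholder):

from waybackpy.wrapper import Url

url = Url("https://example.com")
print(url.newest())  # __str__ now always returns str(self.archive_url)
print(len(url))      # days since the oldest archive; __len__ guards the timestamp type
for archived_url in url.known_urls(match_type="prefix"):
    print(archived_url)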