add: type annotation to waybackpy modules

Author: eggplants
Date: 2022-02-04 04:25:01 +09:00
parent c274c474b2
commit 38088fa0d8
9 changed files with 275 additions and 205 deletions
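Not part of the commit itself: a minimal usage sketch of the class annotated in the diff below, assuming the package-level export `from waybackpy import WaybackMachineSaveAPI`; the URL and user-agent strings are placeholder values.

# Minimal usage sketch (not part of this commit).
# Placeholder URL and user agent; the package-level export is assumed.
from waybackpy import WaybackMachineSaveAPI

save_api = WaybackMachineSaveAPI(
    "https://example.com",
    user_agent="my-user-agent/1.0",
    max_tries=8,
)
archive: str = save_api.save()  # annotated in this commit to return str
print(archive)                  # an https://web.archive.org/web/<14 digits>/... URL
print(save_api.timestamp())     # datetime parsed from the archive URL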


@@ -1,38 +1,42 @@
 import re
 import time
 from datetime import datetime
+from typing import Dict, Optional
 import requests
 from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+# from urllib3.util.retry import Retry
+from requests.packages.urllib3.util.retry import Retry
 from .exceptions import MaximumSaveRetriesExceeded
 from .utils import DEFAULT_USER_AGENT
-class WaybackMachineSaveAPI:
+class WaybackMachineSaveAPI(object):
     """
     WaybackMachineSaveAPI class provides an interface for saving URLs on the
     Wayback Machine.
     """
-    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
+    def __init__(
+        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
+    ) -> None:
         self.url = str(url).strip().replace(" ", "%20")
         self.request_url = "https://web.archive.org/save/" + self.url
         self.user_agent = user_agent
-        self.request_headers = {"User-Agent": self.user_agent}
+        self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
         if max_tries < 1:
             raise ValueError("max_tries should be positive")
         self.max_tries = max_tries
         self.total_save_retries = 5
         self.backoff_factor = 0.5
         self.status_forcelist = [500, 502, 503, 504]
-        self._archive_url = None
+        self._archive_url: Optional[str] = None
         self.instance_birth_time = datetime.utcnow()
     @property
-    def archive_url(self):
+    def archive_url(self) -> str:
         """
         Returns the archive URL is already cached by _archive_url
         else invoke the save method to save the archive which returns the
@@ -44,7 +48,7 @@ class WaybackMachineSaveAPI:
         else:
             return self.save()
-    def get_save_request_headers(self):
+    def get_save_request_headers(self) -> None:
         """
         Creates a session and tries 'retries' number of times to
         retrieve the archive.
@@ -61,21 +65,21 @@ class WaybackMachineSaveAPI:
         the response URL yourself in the browser.
         """
         session = requests.Session()
-        retries = Retry(
+        retries_ = Retry(
             total=self.total_save_retries,
             backoff_factor=self.backoff_factor,
             status_forcelist=self.status_forcelist,
         )
-        session.mount("https://", HTTPAdapter(max_retries=retries))
+        session.mount("https://", HTTPAdapter(max_retries=retries_))
         self.response = session.get(self.request_url, headers=self.request_headers)
-        self.headers = (
-            self.response.headers
-        )  # <class 'requests.structures.CaseInsensitiveDict'>
+        # requests.response.headers is requests.structures.CaseInsensitiveDict
+        self.headers = self.response.headers
+        self.headers_str = str(self.headers)
         self.status_code = self.response.status_code
         self.response_url = self.response.url
         session.close()
-    def archive_url_parser(self):
+    def archive_url_parser(self) -> Optional[str]:
         """
         Three regexen (like oxen?) are used to search for the
         archive URL in the headers and finally look in the response URL
@@ -83,18 +87,18 @@ class WaybackMachineSaveAPI:
         """
         regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
-        match = re.search(regex1, str(self.headers))
+        match = re.search(regex1, self.headers_str)
         if match:
             return "https://web.archive.org" + match.group(1)
         regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
-        match = re.search(regex2, str(self.headers))
-        if match:
+        match = re.search(regex2, self.headers_str)
+        if match is not None and len(match.groups()) == 1:
             return "https://" + match.group(1)
         regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
-        match = re.search(regex3, str(self.headers))
-        if match:
+        match = re.search(regex3, self.headers_str)
+        if match is not None and len(match.groups()) == 1:
             return "https" + match.group(1)
         if self.response_url:
@@ -105,7 +109,9 @@ class WaybackMachineSaveAPI:
             if match:
                 return "https://" + match.group(0)
-    def sleep(self, tries):
+        return None
+    def sleep(self, tries: int) -> None:
         """
         Ensure that the we wait some time before succesive retries so that we
         don't waste the retries before the page is even captured by the Wayback
@@ -120,7 +126,7 @@ class WaybackMachineSaveAPI:
             sleep_seconds = 10
         time.sleep(sleep_seconds)
-    def timestamp(self):
+    def timestamp(self) -> datetime:
         """
         Read the timestamp off the archive URL and convert the Wayback Machine
         timestamp to datetime object.
@@ -133,9 +139,10 @@ class WaybackMachineSaveAPI:
         didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
         cached archive if last archive was captured before last 45 minutes.
         """
-        m = re.search(
-            r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
-        )
+        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
+        m = re.search(regex, str(self._archive_url))
+        if m is None or len(m.groups()) != 1:
+            raise ValueError("Could not find get timestamp")
         string_timestamp = m.group(1)
         timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
@@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
         return timestamp
-    def save(self):
+    def save(self) -> str:
         """
         Calls the SavePageNow API of the Wayback Machine with required parameters
         and headers to save the URL.
@@ -169,7 +176,7 @@ class WaybackMachineSaveAPI:
                 self.get_save_request_headers()
                 self.saved_archive = self.archive_url_parser()
-                if self.saved_archive is not None:
+                if isinstance(self.saved_archive, str):
                     self._archive_url = self.saved_archive
                     self.timestamp()
                     return self.saved_archive
@@ -179,5 +186,5 @@ class WaybackMachineSaveAPI:
                 raise MaximumSaveRetriesExceeded(
                     "Tried %s times but failed to save and retrieve the" % str(tries)
                     + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
-                    % (self.url, self.response_url, str(self.headers)),
+                    % (self.url, self.response_url, self.headers_str),
                 )
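
For reference, a standalone sketch of the timestamp parsing that the new `m is None` guard in this diff protects; the archive URL below is an invented example, not taken from the commit.

# Standalone sketch of the timestamp-parsing step shown in the diff above.
# The archive URL is an invented example.
import re
from datetime import datetime

archive_url = "https://web.archive.org/web/20220203192409/https://example.com/"
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"

m = re.search(regex, archive_url)
if m is None or len(m.groups()) != 1:
    raise ValueError("Could not find timestamp in the archive URL")

# The 14-digit Wayback Machine timestamp is YYYYMMDDhhmmss (UTC).
print(datetime.strptime(m.group(1), "%Y%m%d%H%M%S"))  # 2022-02-03 19:24:09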