* fix: CI yml name

* add: mypy configuration

* add: type annotation to waybackpy modules

* add: type annotation to test modules

* fix: mypy command

* add: types-requests to dev deps

* fix: disable max-line-length

* fix: move pytest.ini into setup.cfg

* add: urllib3 to deps

* fix: Retry (ref: https://github.com/python/typeshed/issues/6893)

* fix: f-string

* fix: shorten long lines

* add: staticmethod decorator to no-self-use methods

* fix: str(headers)->headers_str

* fix: error message

* fix: revert "str(headers)->headers_str" and ignore assignment CaseInsensitiveDict with str

* fix: mypy error
This commit is contained in:
eggplants
2022-02-05 03:23:36 +09:00
committed by GitHub
parent 320ef30371
commit d8cabdfdb5
22 changed files with 537 additions and 364 deletions

View File

@@ -1,38 +1,41 @@
import re
import time
from datetime import datetime
from typing import Dict, Optional
import requests
from requests.adapters import HTTPAdapter
from requests.structures import CaseInsensitiveDict
from urllib3.util.retry import Retry
from .exceptions import MaximumSaveRetriesExceeded
from .utils import DEFAULT_USER_AGENT
class WaybackMachineSaveAPI:
class WaybackMachineSaveAPI(object):
"""
WaybackMachineSaveAPI class provides an interface for saving URLs on the
Wayback Machine.
"""
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.request_url = "https://web.archive.org/save/" + self.url
self.user_agent = user_agent
self.request_headers = {"User-Agent": self.user_agent}
self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
if max_tries < 1:
raise ValueError("max_tries should be positive")
self.max_tries = max_tries
self.total_save_retries = 5
self.backoff_factor = 0.5
self.status_forcelist = [500, 502, 503, 504]
self._archive_url = None
self._archive_url: Optional[str] = None
self.instance_birth_time = datetime.utcnow()
@property
def archive_url(self):
def archive_url(self) -> str:
"""
Returns the archive URL if it is already cached by _archive_url
else invoke the save method to save the archive which returns the
@@ -44,7 +47,7 @@ class WaybackMachineSaveAPI:
else:
return self.save()
def get_save_request_headers(self):
def get_save_request_headers(self) -> None:
"""
Creates a session and tries 'retries' number of times to
retrieve the archive.
@@ -68,14 +71,13 @@ class WaybackMachineSaveAPI:
)
session.mount("https://", HTTPAdapter(max_retries=retries))
self.response = session.get(self.request_url, headers=self.request_headers)
self.headers = (
self.response.headers
) # <class 'requests.structures.CaseInsensitiveDict'>
# requests.response.headers is requests.structures.CaseInsensitiveDict
self.headers: CaseInsensitiveDict[str] = self.response.headers
self.status_code = self.response.status_code
self.response_url = self.response.url
session.close()
def archive_url_parser(self):
def archive_url_parser(self) -> Optional[str]:
"""
Three regexen (like oxen?) are used to search for the
archive URL in the headers and finally look in the response URL
@@ -89,12 +91,12 @@ class WaybackMachineSaveAPI:
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
match = re.search(regex2, str(self.headers))
if match:
if match is not None and len(match.groups()) == 1:
return "https://" + match.group(1)
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
match = re.search(regex3, str(self.headers))
if match:
if match is not None and len(match.groups()) == 1:
return "https" + match.group(1)
if self.response_url:
@@ -105,7 +107,10 @@ class WaybackMachineSaveAPI:
if match:
return "https://" + match.group(0)
def sleep(self, tries):
return None
@staticmethod
def sleep(tries: int) -> None:
"""
Ensure that we wait some time before successive retries so that we
don't waste the retries before the page is even captured by the Wayback
@@ -120,7 +125,7 @@ class WaybackMachineSaveAPI:
sleep_seconds = 10
time.sleep(sleep_seconds)
def timestamp(self):
def timestamp(self) -> datetime:
"""
Read the timestamp off the archive URL and convert the Wayback Machine
timestamp to datetime object.
@@ -128,14 +133,16 @@ class WaybackMachineSaveAPI:
Also check the time on the archive URL and compare it to the instance birth
time.
If time on the archive is older than the instance creation time set the cached_save
to True else set it to False. The flag can be used to check if the Wayback Machine
didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
cached archive if last archive was captured before last 45 minutes.
If time on the archive is older than the instance creation time set the
cached_save to True else set it to False. The flag can be used to check
if the Wayback Machine didn't serve a Cached URL. It is quite common for
the Wayback Machine to serve cached archive if last archive was captured
before last 45 minutes.
"""
m = re.search(
r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
)
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
m = re.search(regex, str(self._archive_url))
if m is None or len(m.groups()) != 1:
raise ValueError("Could not get timestamp")
string_timestamp = m.group(1)
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
@@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
return timestamp
def save(self):
def save(self) -> str:
"""
Calls the SavePageNow API of the Wayback Machine with required parameters
and headers to save the URL.
@@ -162,14 +169,14 @@ class WaybackMachineSaveAPI:
tries = 0
while True:
if not self.saved_archive:
if self.saved_archive is None:
if tries >= 1:
self.sleep(tries)
self.get_save_request_headers()
self.saved_archive = self.archive_url_parser()
if self.saved_archive is not None:
if isinstance(self.saved_archive, str):
self._archive_url = self.saved_archive
self.timestamp()
return self.saved_archive
@@ -177,7 +184,8 @@ class WaybackMachineSaveAPI:
tries += 1
if tries >= self.max_tries:
raise MaximumSaveRetriesExceeded(
"Tried %s times but failed to save and retrieve the" % str(tries)
+ " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
% (self.url, self.response_url, str(self.headers)),
f"Tried {tries} times but failed to save "
f"and retrieve the archive for {self.url}.\n"
f"Response URL:\n{self.response_url}\n"
f"Response Header:\n{self.headers}"
)