132 lines
4.1 KiB
Python
132 lines
4.1 KiB
Python
import re
|
|
import time
|
|
import requests
|
|
|
|
from datetime import datetime
|
|
from urllib3.util.retry import Retry
|
|
from requests.adapters import HTTPAdapter
|
|
|
|
from .utils import DEFAULT_USER_AGENT
|
|
from .exceptions import MaximumSaveRetriesExceeded
|
|
|
|
|
|
class WaybackMachineSaveAPI:
    """
    WaybackMachineSaveAPI class provides an interface for saving URLs on the
    Wayback Machine.

    Typical use is to build an instance for a URL and read ``archive_url``;
    the first read issues the save request(s), later reads return the stored
    value.

    Attributes set by the constructor:
        url: the target URL, stripped, with spaces replaced by ``%20``.
        request_url: the save endpoint with ``url`` appended.
        request_headers: headers sent with every save request.
        max_tries: maximum number of save attempts made by ``save()``.
        total_save_retries, backoff_factor, status_forcelist: values handed
            to ``Retry`` for the transport-level retry policy.
        instance_birth_time: naive ``datetime.utcnow()`` taken at creation.
        response, headers, status_code, response_url: state of the most
            recent save request; ``None`` until a request has been made.

    Attribute set later:
        cached_save: set by ``timestamp()``; True when the archive timestamp
            is older than ``instance_birth_time``.
    """

    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
        """
        Store the request parameters; no network traffic happens here.

        url: URL to archive (converted with ``str()``).
        user_agent: value for the ``User-Agent`` request header.
        max_tries: maximum number of save attempts before ``save()`` raises.
        """
        # Spaces are percent-encoded because the URL is appended verbatim to
        # the save endpoint below.
        self.url = str(url).strip().replace(" ", "%20")
        self.request_url = "https://web.archive.org/save/" + self.url
        self.user_agent = user_agent
        self.request_headers = {"User-Agent": self.user_agent}
        self.max_tries = max_tries
        self.total_save_retries = 5
        self.backoff_factor = 0.5
        self.status_forcelist = [500, 502, 503, 504]
        self._archive_url = None
        self.instance_birth_time = datetime.utcnow()

        # Pre-declare the response state so that the parser and the error
        # message in save() never hit a missing attribute, even when no
        # request has been made yet.
        self.response = None
        self.headers = None
        self.status_code = None
        self.response_url = None

    @property
    def archive_url(self):
        """
        Return the archive URL, calling ``save()`` on first access.

        Once ``save()`` has succeeded the stored value is returned without
        another request.
        """
        if self._archive_url:
            return self._archive_url
        return self.save()

    def get_save_request_headers(self):
        """
        Send one GET request to the save endpoint and record the outcome.

        The request goes through a session whose ``https://`` adapter retries
        according to ``total_save_retries``, ``backoff_factor`` and
        ``status_forcelist``. The response object, its headers, its status
        code and its final URL are stored on the instance.
        """
        retry_policy = Retry(
            total=self.total_save_retries,
            backoff_factor=self.backoff_factor,
            status_forcelist=self.status_forcelist,
        )

        # The context manager closes the session (and its connection pool)
        # once the response has been received, instead of leaking one
        # session per attempt.
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retry_policy))
            self.response = session.get(
                self.request_url, headers=self.request_headers
            )

        self.headers = self.response.headers
        self.status_code = self.response.status_code
        self.response_url = self.response.url

    def archive_url_parser(self):
        """
        Extract the archive URL from the last response.

        Three patterns are tried, in order, against ``str(self.headers)``;
        if none matches, the response URL is inspected. Returns the archive
        URL as a string, or None when nothing matched.

        Side effect: ``self.response_url`` is replaced by its stripped form
        when it is non-empty.
        """
        # NOTE(review): the patterns run on the string form of the headers
        # object. The first and third expect "Name: value" text - confirm
        # that the string form of the headers actually looks like that.
        header_text = str(self.headers)

        regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
        match = re.search(regex1, header_text)
        if match:
            return "https://web.archive.org" + match.group(1)

        regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
        match = re.search(regex2, header_text)
        if match:
            return "https://" + match.group(1)

        regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
        match = re.search(regex3, header_text)
        if match:
            return "https://" + match.group(1)

        # Last resort: the URL the request ended up at.
        if self.response_url:
            self.response_url = self.response_url.strip()
            if "web.archive.org/web" in self.response_url:
                regex = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$"
                match = re.search(regex, self.response_url)
                if match:
                    return "https://" + match.group(0)

        return None

    def sleep(self, tries):
        """
        Pause between save attempts.

        Sleeps 10 seconds when ``tries`` is a multiple of 3, otherwise 5.
        """
        sleep_seconds = 10 if tries % 3 == 0 else 5
        time.sleep(sleep_seconds)

    def timestamp(self):
        """
        Parse the 14-digit timestamp out of the stored archive URL.

        Also sets ``self.cached_save``: True when the archive timestamp is
        older than ``instance_birth_time`` (compared at whole-second
        resolution), False otherwise.

        Returns the timestamp as a naive ``datetime``.
        Raises ValueError when the archive URL is missing or does not contain
        a 14-digit timestamp followed by an http(s) URL.
        """
        m = re.search(
            r"https?://web\.archive\.org/web/([0-9]{14})/http",
            str(self._archive_url),
        )
        if m is None:
            raise ValueError(
                "Can not parse timestamp from archive URL, '%s'."
                % self._archive_url
            )

        timestamp = datetime.strptime(m.group(1), "%Y%m%d%H%M%S")

        # Compare the naive datetimes directly. Going through time.mktime()
        # would reinterpret both values as local time, which can skew the
        # result around DST changes. The URL timestamp has second resolution,
        # so microseconds are dropped from the birth time to keep an archive
        # made within the same second from counting as "older".
        birth_time = self.instance_birth_time.replace(microsecond=0)
        self.cached_save = timestamp < birth_time

        return timestamp

    def save(self):
        """
        Request a save until an archive URL can be parsed from the response.

        Up to ``max_tries`` requests are made, sleeping via ``sleep()``
        before every attempt after the first. On success the archive URL is
        stored, ``timestamp()`` is run (which sets ``cached_save``) and the
        URL is returned.

        Raises MaximumSaveRetriesExceeded when no attempt yielded an archive
        URL.
        """
        tries = 0

        while tries < self.max_tries:
            tries += 1

            if tries > 1:
                self.sleep(tries)

            self.get_save_request_headers()
            saved_archive = self.archive_url_parser()

            if saved_archive:
                self._archive_url = saved_archive
                self.timestamp()
                return saved_archive

        # Every permitted attempt was used (or none were permitted).
        raise MaximumSaveRetriesExceeded(
            "Tried %s times but failed to save and return the archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
            % (str(tries), self.url, self.response_url, str(self.headers)),
        )
|