waybackpy/save_api.py : Added doc strings and also lint with black.

This commit is contained in:
Akash Mahanty 2022-01-22 00:41:10 +05:30
parent db551abbf6
commit 405e9a2a79

View File

@ -31,6 +31,11 @@ class WaybackMachineSaveAPI:
@property @property
def archive_url(self): def archive_url(self):
"""
Returns the archive URL if it is already cached in _archive_url;
otherwise invokes the save method, which saves the archive and returns
the archive URL, and returns that method's return value.
"""
if self._archive_url: if self._archive_url:
return self._archive_url return self._archive_url
@ -38,7 +43,21 @@ class WaybackMachineSaveAPI:
return self.save() return self.save()
def get_save_request_headers(self): def get_save_request_headers(self):
"""
Creates a session and tries 'retries' number of times to
retrieve the archive.
If successful in getting the response, sets the headers, status_code
and response_url attributes.
The archive is usually in the headers but it can also be the response URL
as the Wayback Machine redirects to the archive after a successful capture
of the webpage.
Wayback Machine's save API is known
to be very unreliable; thus, if it fails, first try opening
the response URL yourself in the browser.
"""
session = requests.Session() session = requests.Session()
retries = Retry( retries = Retry(
total=self.total_save_retries, total=self.total_save_retries,
@ -52,6 +71,11 @@ class WaybackMachineSaveAPI:
self.response_url = self.response.url self.response_url = self.response.url
def archive_url_parser(self): def archive_url_parser(self):
"""
Three regexen (like oxen?) are used to search for the
archive URL in the headers; finally, the response URL is
searched for the archive URL.
"""
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)" regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
match = re.search(regex1, str(self.headers)) match = re.search(regex1, str(self.headers))
@ -77,6 +101,14 @@ class WaybackMachineSaveAPI:
return "https://" + match.group(0) return "https://" + match.group(0)
def sleep(self, tries): def sleep(self, tries):
"""
Ensures that we wait some time between successive retries so that we
don't waste the retries before the page is even captured by the Wayback
Machine crawlers. It also ensures that we are not putting too much load
on the Wayback Machine's save API.
If tries is a multiple of 3, sleep 10 seconds; otherwise sleep 5 seconds.
"""
sleep_seconds = 5 sleep_seconds = 5
if tries % 3 == 0: if tries % 3 == 0:
@ -84,6 +116,18 @@ class WaybackMachineSaveAPI:
time.sleep(sleep_seconds) time.sleep(sleep_seconds)
def timestamp(self): def timestamp(self):
"""
Read the timestamp off the archive URL and convert the Wayback Machine
timestamp to a datetime object.
Also compare the time on the archive URL with the instance birth
time.
If the time on the archive is older than the instance creation time, set
cached_save to True; otherwise set it to False. The flag can be used to check
whether the Wayback Machine served a cached archive. It is quite common for
the Wayback Machine to serve a cached archive if the last archive was captured
within the last 45 minutes.
"""
m = re.search( m = re.search(
r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
) )
@ -101,6 +145,13 @@ class WaybackMachineSaveAPI:
return timestamp return timestamp
def save(self): def save(self):
"""
Calls the SavePageNow API of the Wayback Machine with required parameters
and headers to save the URL.
Raises MaximumSaveRetriesExceeded if the maximum retries are exhausted and
we were still unable to retrieve the archive from the Wayback Machine.
"""
saved_archive = None saved_archive = None
tries = 0 tries = 0
@ -111,8 +162,9 @@ class WaybackMachineSaveAPI:
if tries >= self.max_tries: if tries >= self.max_tries:
raise MaximumSaveRetriesExceeded( raise MaximumSaveRetriesExceeded(
"Tried %s times but failed to save and return the archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n" "Tried %s times but failed to save and retrieve the" % str(tries)
% (str(tries), self.url, self.response_url, str(self.headers)), + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
% (self.url, self.response_url, str(self.headers)),
) )
if not saved_archive: if not saved_archive: