waybackpy/save_api.py : Added doc strings and also lint with black.
This commit is contained in:
parent
db551abbf6
commit
405e9a2a79
@ -31,6 +31,11 @@ class WaybackMachineSaveAPI:
|
||||
|
||||
@property
|
||||
def archive_url(self):
|
||||
"""
|
||||
Returns the archive URL if it is already cached by _archive_url
|
||||
else invoke the save method to save the archive which returns the
|
||||
archive; thus we return the method's return value.
|
||||
"""
|
||||
|
||||
if self._archive_url:
|
||||
return self._archive_url
|
||||
@ -38,7 +43,21 @@ class WaybackMachineSaveAPI:
|
||||
return self.save()
|
||||
|
||||
def get_save_request_headers(self):
|
||||
"""
|
||||
Creates a session and tries 'retries' number of times to
|
||||
retrieve the archive.
|
||||
|
||||
If successful in getting the response, sets the headers, status_code
|
||||
and response_url attributes.
|
||||
|
||||
The archive is usually in the headers but it can also be the response URL
|
||||
as the Wayback Machine redirects to the archive after a successful capture
|
||||
of the webpage.
|
||||
|
||||
Wayback Machine's save API is known
|
||||
to be very unreliable; thus, if it fails, first check opening
|
||||
the response URL yourself in the browser.
|
||||
"""
|
||||
session = requests.Session()
|
||||
retries = Retry(
|
||||
total=self.total_save_retries,
|
||||
@ -52,6 +71,11 @@ class WaybackMachineSaveAPI:
|
||||
self.response_url = self.response.url
|
||||
|
||||
def archive_url_parser(self):
|
||||
"""
|
||||
Three regexen (like oxen?) are used to search for the
|
||||
archive URL in the headers and finally look in the response URL
|
||||
for the archive URL.
|
||||
"""
|
||||
|
||||
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
|
||||
match = re.search(regex1, str(self.headers))
|
||||
@ -77,6 +101,14 @@ class WaybackMachineSaveAPI:
|
||||
return "https://" + match.group(0)
|
||||
|
||||
def sleep(self, tries):
|
||||
"""
|
||||
Ensure that we wait some time before successive retries so that we
|
||||
don't waste the retries before the page is even captured by the Wayback
|
||||
Machine crawlers; this also ensures that we are not putting too much load on
|
||||
the Wayback Machine's save API.
|
||||
|
||||
If tries is a multiple of 3, sleep 10 seconds; otherwise sleep 5 seconds.
|
||||
"""
|
||||
|
||||
sleep_seconds = 5
|
||||
if tries % 3 == 0:
|
||||
@ -84,6 +116,18 @@ class WaybackMachineSaveAPI:
|
||||
time.sleep(sleep_seconds)
|
||||
|
||||
def timestamp(self):
|
||||
"""
|
||||
Read the timestamp off the archive URL and convert the Wayback Machine
|
||||
timestamp to datetime object.
|
||||
|
||||
Also check the time on the archive URL and compare it to the instance birth
|
||||
time.
|
||||
|
||||
If time on the archive is older than the instance creation time set the cached_save
|
||||
to True else set it to False. The flag can be used to check if the Wayback Machine
|
||||
didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
|
||||
cached archive if the last archive was captured within the last 45 minutes.
|
||||
"""
|
||||
m = re.search(
|
||||
r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
|
||||
)
|
||||
@ -101,6 +145,13 @@ class WaybackMachineSaveAPI:
|
||||
return timestamp
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Calls the SavePageNow API of the Wayback Machine with required parameters
|
||||
and headers to save the URL.
|
||||
|
||||
Raises MaximumSaveRetriesExceeded if maximum retries are exhausted but still
|
||||
we were unable to retrieve the archive from the Wayback Machine.
|
||||
"""
|
||||
|
||||
saved_archive = None
|
||||
tries = 0
|
||||
@ -111,8 +162,9 @@ class WaybackMachineSaveAPI:
|
||||
|
||||
if tries >= self.max_tries:
|
||||
raise MaximumSaveRetriesExceeded(
|
||||
"Tried %s times but failed to save and return the archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
|
||||
% (str(tries), self.url, self.response_url, str(self.headers)),
|
||||
"Tried %s times but failed to save and retrieve the" % str(tries)
|
||||
+ " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
|
||||
% (self.url, self.response_url, str(self.headers)),
|
||||
)
|
||||
|
||||
if not saved_archive:
|
||||
|
Loading…
Reference in New Issue
Block a user