waybackpy/save_api.py : Added doc strings and also lint with black.
This commit is contained in:
parent
db551abbf6
commit
405e9a2a79
@ -31,6 +31,11 @@ class WaybackMachineSaveAPI:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def archive_url(self):
|
def archive_url(self):
|
||||||
|
"""
|
||||||
|
Returns the archive URL if already cached by _archive_url
|
||||||
|
else invoke the save method to save the archive which returns the
|
||||||
|
archive, thus we return the method's return value.
|
||||||
|
"""
|
||||||
|
|
||||||
if self._archive_url:
|
if self._archive_url:
|
||||||
return self._archive_url
|
return self._archive_url
|
||||||
@ -38,7 +43,21 @@ class WaybackMachineSaveAPI:
|
|||||||
return self.save()
|
return self.save()
|
||||||
|
|
||||||
def get_save_request_headers(self):
|
def get_save_request_headers(self):
|
||||||
|
"""
|
||||||
|
Creates a session and tries 'retries' number of times to
|
||||||
|
retrieve the archive.
|
||||||
|
|
||||||
|
If successful in getting the response, sets the headers, status_code
|
||||||
|
and response_url attributes.
|
||||||
|
|
||||||
|
The archive is usually in the headers but it can also be the response URL
|
||||||
|
as the Wayback Machine redirects to the archive after a successful capture
|
||||||
|
of the webpage.
|
||||||
|
|
||||||
|
Wayback Machine's save API is known
|
||||||
|
to be very unreliable; thus, if it fails, first try opening
|
||||||
|
the response URL yourself in the browser.
|
||||||
|
"""
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
retries = Retry(
|
retries = Retry(
|
||||||
total=self.total_save_retries,
|
total=self.total_save_retries,
|
||||||
@ -52,6 +71,11 @@ class WaybackMachineSaveAPI:
|
|||||||
self.response_url = self.response.url
|
self.response_url = self.response.url
|
||||||
|
|
||||||
def archive_url_parser(self):
|
def archive_url_parser(self):
|
||||||
|
"""
|
||||||
|
Three regexen (like oxen?) are used to search for the
|
||||||
|
archive URL in the headers and finally look in the response URL
|
||||||
|
for the archive URL.
|
||||||
|
"""
|
||||||
|
|
||||||
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
|
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
|
||||||
match = re.search(regex1, str(self.headers))
|
match = re.search(regex1, str(self.headers))
|
||||||
@ -77,6 +101,14 @@ class WaybackMachineSaveAPI:
|
|||||||
return "https://" + match.group(0)
|
return "https://" + match.group(0)
|
||||||
|
|
||||||
def sleep(self, tries):
|
def sleep(self, tries):
|
||||||
|
"""
|
||||||
|
Ensure that we wait some time before successive retries so that we
|
||||||
|
don't waste the retries before the page is even captured by the Wayback
|
||||||
|
Machine crawlers. This also ensures that we are not putting too much load on
|
||||||
|
the Wayback Machine's save API.
|
||||||
|
|
||||||
|
If tries is a multiple of 3, sleep 10 seconds; otherwise sleep 5 seconds.
|
||||||
|
"""
|
||||||
|
|
||||||
sleep_seconds = 5
|
sleep_seconds = 5
|
||||||
if tries % 3 == 0:
|
if tries % 3 == 0:
|
||||||
@ -84,6 +116,18 @@ class WaybackMachineSaveAPI:
|
|||||||
time.sleep(sleep_seconds)
|
time.sleep(sleep_seconds)
|
||||||
|
|
||||||
def timestamp(self):
|
def timestamp(self):
|
||||||
|
"""
|
||||||
|
Read the timestamp off the archive URL and convert the Wayback Machine
|
||||||
|
timestamp to datetime object.
|
||||||
|
|
||||||
|
Also check the time on the archive URL and compare it to the instance birth
|
||||||
|
time.
|
||||||
|
|
||||||
|
If time on the archive is older than the instance creation time set the cached_save
|
||||||
|
to True else set it to False. The flag can be used to check if the Wayback Machine
|
||||||
|
didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
|
||||||
|
a cached archive if the last archive was captured within the last 45 minutes.
|
||||||
|
"""
|
||||||
m = re.search(
|
m = re.search(
|
||||||
r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
|
r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
|
||||||
)
|
)
|
||||||
@ -101,6 +145,13 @@ class WaybackMachineSaveAPI:
|
|||||||
return timestamp
|
return timestamp
|
||||||
|
|
||||||
def save(self):
|
def save(self):
|
||||||
|
"""
|
||||||
|
Calls the SavePageNow API of the Wayback Machine with required parameters
|
||||||
|
and headers to save the URL.
|
||||||
|
|
||||||
|
Raises MaximumSaveRetriesExceeded if maximum retries are exhausted but still
|
||||||
|
we were unable to retrieve the archive from the Wayback Machine.
|
||||||
|
"""
|
||||||
|
|
||||||
saved_archive = None
|
saved_archive = None
|
||||||
tries = 0
|
tries = 0
|
||||||
@ -111,8 +162,9 @@ class WaybackMachineSaveAPI:
|
|||||||
|
|
||||||
if tries >= self.max_tries:
|
if tries >= self.max_tries:
|
||||||
raise MaximumSaveRetriesExceeded(
|
raise MaximumSaveRetriesExceeded(
|
||||||
"Tried %s times but failed to save and return the archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
|
"Tried %s times but failed to save and retrieve the" % str(tries)
|
||||||
% (str(tries), self.url, self.response_url, str(self.headers)),
|
+ " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
|
||||||
|
% (self.url, self.response_url, str(self.headers)),
|
||||||
)
|
)
|
||||||
|
|
||||||
if not saved_archive:
|
if not saved_archive:
|
||||||
|
Loading…
Reference in New Issue
Block a user