From 405e9a2a79c9db5afee8eb8b13135464775cc3ac Mon Sep 17 00:00:00 2001
From: Akash Mahanty
Date: Sat, 22 Jan 2022 00:41:10 +0530
Subject: [PATCH] waybackpy/save_api.py : Added docstrings and linted with
 black.

---
 waybackpy/save_api.py | 56 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/waybackpy/save_api.py b/waybackpy/save_api.py
index fd43400..bf18171 100644
--- a/waybackpy/save_api.py
+++ b/waybackpy/save_api.py
@@ -31,6 +31,11 @@ class WaybackMachineSaveAPI:
 
     @property
     def archive_url(self):
+        """
+        Return the cached archive URL from _archive_url if it is already set,
+        else invoke the save method, which captures the archive and returns
+        its URL.
+        """
 
         if self._archive_url:
             return self._archive_url
@@ -38,7 +43,21 @@ class WaybackMachineSaveAPI:
             return self.save()
 
     def get_save_request_headers(self):
+        """
+        Creates a session and tries 'retries' number of times to
+        retrieve the archive.
+
+        If a response is received, sets the headers, status_code
+        and response_url attributes.
+
+        The archive is usually in the headers, but it can also be the
+        response URL, as the Wayback Machine redirects to the archive
+        after a successful capture of the webpage.
+
+        The Wayback Machine's save API is known to be very unreliable,
+        so if it fails, first check the response URL in your browser.
+        """
 
         session = requests.Session()
         retries = Retry(
             total=self.total_save_retries,
@@ -52,6 +71,11 @@ class WaybackMachineSaveAPI:
         self.response_url = self.response.url
 
     def archive_url_parser(self):
+        """
+        Three regular expressions are used to look for the archive URL,
+        first in the response headers and, as a last resort, in the
+        response URL itself.
+        """
 
         regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
         match = re.search(regex1, str(self.headers))
@@ -77,6 +101,14 @@ class WaybackMachineSaveAPI:
         return "https://" + match.group(0)
 
     def sleep(self, tries):
+        """
+        Ensure that we wait some time between successive retries so that we
+        don't waste retries before the page is even captured by the Wayback
+        Machine crawlers; this also ensures that we don't put too much load
+        on the Wayback Machine's save API.
+
+        If tries is a multiple of 3, sleep 10 seconds, else sleep 5 seconds.
+        """
 
         sleep_seconds = 5
         if tries % 3 == 0:
@@ -84,6 +116,18 @@ class WaybackMachineSaveAPI:
         time.sleep(sleep_seconds)
 
     def timestamp(self):
+        """
+        Read the timestamp off the archive URL and convert the Wayback
+        Machine timestamp to a datetime object.
+
+        Also compare the time on the archive URL to the instance birth
+        time.
+
+        If the time on the archive is older than the instance creation time,
+        set cached_save to True, else set it to False. The flag can be used
+        to check whether the Wayback Machine served a cached archive, which
+        it commonly does when the last capture is under 45 minutes old.
+        """
         m = re.search(
             r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
         )
@@ -101,6 +145,13 @@ class WaybackMachineSaveAPI:
         return timestamp
 
     def save(self):
+        """
+        Calls the SavePageNow API of the Wayback Machine with the required
+        parameters and headers to save the URL.
+
+        Raises MaximumSaveRetriesExceeded if the maximum retries are exhausted
+        and we still could not retrieve the archive from the Wayback Machine.
+        """
 
         saved_archive = None
         tries = 0
@@ -111,8 +162,9 @@ class WaybackMachineSaveAPI:
 
             if tries >= self.max_tries:
                 raise MaximumSaveRetriesExceeded(
-                    "Tried %s times but failed to save and return the archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
-                    % (str(tries), self.url, self.response_url, str(self.headers)),
+                    "Tried %s times but failed to save and retrieve the" % str(tries)
+                    + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
+                    % (self.url, self.response_url, str(self.headers)),
                 )
 
             if not saved_archive: