Add docstrings, lint with pylint, and raise an error on a 509 status code

This commit is contained in:
Akash Mahanty
2022-02-07 18:17:35 +05:30
parent 30adee5f00
commit f782a1343c

View File

@@ -1,3 +1,10 @@
"""
This module interfaces with the Wayback Machine's SavePageNow (SPN) API.
It provides the WaybackMachineSaveAPI class, which users of this module
should use to access the SavePageNow API.
"""
import re import re
import time import time
from datetime import datetime from datetime import datetime
@@ -8,7 +15,7 @@ from requests.adapters import HTTPAdapter
from requests.structures import CaseInsensitiveDict from requests.structures import CaseInsensitiveDict
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, WaybackError
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
@@ -47,8 +54,8 @@ class WaybackMachineSaveAPI(object):
if self._archive_url: if self._archive_url:
return self._archive_url return self._archive_url
else:
return self.save() return self.save()
def get_save_request_headers(self) -> None: def get_save_request_headers(self) -> None:
""" """
@@ -66,6 +73,7 @@ class WaybackMachineSaveAPI(object):
to be very unreliable thus if it fails first check opening to be very unreliable thus if it fails first check opening
the response URL yourself in the browser. the response URL yourself in the browser.
""" """
session = requests.Session() session = requests.Session()
retries = Retry( retries = Retry(
total=self.total_save_retries, total=self.total_save_retries,
@@ -79,11 +87,24 @@ class WaybackMachineSaveAPI(object):
self.status_code = self.response.status_code self.status_code = self.response.status_code
self.response_url = self.response.url self.response_url = self.response.url
session.close() session.close()
if self.status_code == 429: if self.status_code == 429:
# Why advise waiting 5 minutes on a 429 response?
# see https://github.com/akamhy/waybackpy/issues/97
raise TooManyRequestsError( raise TooManyRequestsError(
"Seem to be refused to request by the server. " f"Can not save '{self.url}'. "
"Save Page Now receives up to 15 URLs per minutes. " f"Save request refused by the server. "
"Wait a moment and run again." f"Save Page Now limits saving 15 URLs per minutes. "
f"Try waiting for 5 minutes and then try again."
)
# why 509?
# see https://github.com/akamhy/waybackpy/pull/99
# also https://t.co/xww4YJ0Iwc
if self.status_code == 509:
raise WaybackError(
f"Can not save '{self.url}'. You have probably reached the "
f"limit of active sessions."
) )
def archive_url_parser(self) -> Optional[str]: def archive_url_parser(self) -> Optional[str]:
@@ -146,13 +167,17 @@ class WaybackMachineSaveAPI(object):
the Wayback Machine to serve cached archive if last archive was captured the Wayback Machine to serve cached archive if last archive was captured
before last 45 minutes. before last 45 minutes.
""" """
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
m = re.search(regex, str(self._archive_url))
if m is None or len(m.groups()) != 1:
raise ValueError("Could not get timestamp")
string_timestamp = m.group(1)
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
match = re.search(regex, str(self._archive_url))
if match is None or len(match.groups()) != 1:
raise ValueError(
f"Can not parse timestamp from archive URL, '{self._archive_url}'."
)
string_timestamp = match.group(1)
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
timestamp_unixtime = time.mktime(timestamp.timetuple()) timestamp_unixtime = time.mktime(timestamp.timetuple())
instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple()) instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple())