From d549d314218415d826c930b2eddbd71bc0d72607 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Sat, 16 Jan 2021 10:47:43 +0530 Subject: [PATCH] improve save method, now we know that a 302 error indicates that the Wayback Machine is still archiving the URL and hasn't yet archived it. We construct an artificial archive with the current UTC time and check for HTTP status code 20* or 30*. If we verify the archival, we return the artificial archive. The artificial archive will automatically point to the new archive or, in the best case, will be the new archive after some time. --- waybackpy/utils.py | 58 ++++++++++++++++++++++++++++++++++++++++++-- waybackpy/wrapper.py | 17 +++++++++++-- 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/waybackpy/utils.py b/waybackpy/utils.py index fa217d9..d94e594 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -1,4 +1,5 @@ import re +import time import requests from .exceptions import WaybackError, URLError from datetime import datetime @@ -189,7 +190,7 @@ def _get_total_pages(url, user_agent): return int((_get_response(total_pages_url, headers=headers).text).strip()) -def _archive_url_parser(header, url, latest_version=__version__): +def _archive_url_parser(header, url, latest_version=__version__, instance=None): """ The wayback machine's save API doesn't return JSON response, we are required @@ -211,10 +212,40 @@ def _archive_url_parser(header, url, latest_version=__version__): If we found the archive URL we return it. + Return format: + + web.archive.org/web/<TIMESTAMP>/<URL> + And if we couldn't find it, we raise WaybackError with an error message. 
""" + if "save redirected" in header: + time.sleep(60) # makeup for archive time + + now = datetime.utcnow().timetuple() + timestamp = _wayback_timestamp( + year=now.tm_year, + month=now.tm_mon, + day=now.tm_mday, + hour=now.tm_hour, + minute=now.tm_min, + ) + + return_str = "web.archive.org/web/{timestamp}/{url}".format( + timestamp=timestamp, url=url + ) + url = "https://" + return_str + + headers = {"User-Agent": instance.user_agent} + + res = _get_response(url, headers=headers) + + if res.status_code < 400: + return "web.archive.org/web/{timestamp}/{url}".format( + timestamp=timestamp, url=url + ) + # Regex1 m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header)) if m: @@ -232,6 +263,24 @@ def _archive_url_parser(header, url, latest_version=__version__): if m: return m.group(1) + if instance: + newest_archive = None + try: + newest_archive = instance.newest() + except Exception as e: + pass # We don't care as this is a save request + + if newest_archive: + minutes_old = ( + datetime.utcnow() - newest_archive.timestamp + ).total_seconds() / 60.0 + + if minutes_old <= 30: + archive_url = newest_archive.archive_url + m = re.search(r"web\.archive\.org/web/[0-9]{14}/.*", archive_url) + if m: + return m.group(0) + if __version__ == latest_version: exc_message = ( "No archive URL found in the API response. " @@ -287,6 +336,7 @@ def _get_response( return_full_url=False, retries=5, backoff_factor=0.5, + no_raise_on_redirects=False, ): """ This function is used make get request. 
@@ -326,8 +376,12 @@ def _get_response( return s.get(url, headers=headers) return (url, s.get(url, headers=headers)) except Exception as e: + reason = str(e) + if no_raise_on_redirects: + if "Exceeded 30 redirects" in reason: + return exc_message = "Error while retrieving {url}.\n{reason}".format( - url=url, reason=str(e) + url=url, reason=reason ) exc = WaybackError(exc_message) exc.__cause__ = e diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 22f42ea..c8f7e60 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -139,13 +139,26 @@ class Url: """ request_url = "https://web.archive.org/save/" + _cleaned_url(self.url) headers = {"User-Agent": self.user_agent} + response = _get_response( - request_url, params=None, headers=headers, backoff_factor=2 + request_url, + params=None, + headers=headers, + backoff_factor=2, + no_raise_on_redirects=True, ) + if not self.latest_version: self.latest_version = _latest_version("waybackpy", headers=headers) + if response: + res_headers = response.headers + else: + res_headers = "save redirected" self._archive_url = "https://" + _archive_url_parser( - response.headers, self.url, self.latest_version + res_headers, + self.url, + latest_version=self.latest_version, + instance=self, ) self.timestamp = datetime.utcnow() return self