From d549d314218415d826c930b2eddbd71bc0d72607 Mon Sep 17 00:00:00 2001
From: Akash Mahanty <akamhy@yahoo.com>
Date: Sat, 16 Jan 2021 10:47:43 +0530
Subject: [PATCH] improve save method, now we know that 302 errors indicate
 that wayback machine is archiving the URL and hasn't yet archived it. We
 construct an artificial archive with the current UTC time and check for HTTP
 status code 20* or 30*. If we verify the archival, we return the artificial
 archive. The artificial archive will automatically point to the new archive
 or in the best case will be the new archive after some time.

---
 waybackpy/utils.py   | 58 ++++++++++++++++++++++++++++++++++++++++++--
 waybackpy/wrapper.py | 17 +++++++++++--
 2 files changed, 71 insertions(+), 4 deletions(-)
diff --git a/waybackpy/utils.py b/waybackpy/utils.py
index fa217d9..d94e594 100644
--- a/waybackpy/utils.py
+++ b/waybackpy/utils.py
@@ -1,4 +1,5 @@
 import re
+import time
 import requests
 from .exceptions import WaybackError, URLError
 from datetime import datetime
@@ -189,7 +190,7 @@ def _get_total_pages(url, user_agent):
     return int((_get_response(total_pages_url, headers=headers).text).strip())
 
 
-def _archive_url_parser(header, url, latest_version=__version__):
+def _archive_url_parser(header, url, latest_version=__version__, instance=None):
     """
     The wayback machine's save API doesn't
     return JSON response, we are required
@@ -211,10 +212,40 @@ def _archive_url_parser(header, url, latest_version=__version__):
 
     If we found the archive URL we return it.
 
+    Return format:
+
+    web.archive.org/web/<TIMESTAMP>/<URL>
+
     And if we couldn't find it, we raise
     WaybackError with an error message.
     """
 
+    if "save redirected" in header:
+        time.sleep(60)  # makeup for archive time
+
+        now = datetime.utcnow().timetuple()
+        timestamp = _wayback_timestamp(
+            year=now.tm_year,
+            month=now.tm_mon,
+            day=now.tm_mday,
+            hour=now.tm_hour,
+            minute=now.tm_min,
+        )
+
+        return_str = "web.archive.org/web/{timestamp}/{url}".format(
+            timestamp=timestamp, url=url
+        )
+        url = "https://" + return_str
+
+        headers = {"User-Agent": instance.user_agent}
+
+        res = _get_response(url, headers=headers)
+
+        if res.status_code < 400:
+            return "web.archive.org/web/{timestamp}/{url}".format(
+                timestamp=timestamp, url=url
+            )
+
     # Regex1
     m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
     if m:
@@ -232,6 +263,24 @@ def _archive_url_parser(header, url, latest_version=__version__):
     if m:
         return m.group(1)
 
+    if instance:
+        newest_archive = None
+        try:
+            newest_archive = instance.newest()
+        except Exception as e:
+            pass  # We don't care as this is a save request
+
+        if newest_archive:
+            minutes_old = (
+                datetime.utcnow() - newest_archive.timestamp
+            ).total_seconds() / 60.0
+
+            if minutes_old <= 30:
+                archive_url = newest_archive.archive_url
+                m = re.search(r"web\.archive\.org/web/[0-9]{14}/.*", archive_url)
+                if m:
+                    return m.group(0)
+
     if __version__ == latest_version:
         exc_message = (
             "No archive URL found in the API response. "
@@ -287,6 +336,7 @@ def _get_response(
     return_full_url=False,
     retries=5,
     backoff_factor=0.5,
+    no_raise_on_redirects=False,
 ):
     """
     This function is used make get request.
@@ -326,8 +376,12 @@ def _get_response(
             return s.get(url, headers=headers)
         return (url, s.get(url, headers=headers))
     except Exception as e:
+        reason = str(e)
+        if no_raise_on_redirects:
+            if "Exceeded 30 redirects" in reason:
+                return
         exc_message = "Error while retrieving {url}.\n{reason}".format(
-            url=url, reason=str(e)
+            url=url, reason=reason
         )
         exc = WaybackError(exc_message)
         exc.__cause__ = e
diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index 22f42ea..c8f7e60 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -139,13 +139,26 @@ class Url:
         """
         request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
         headers = {"User-Agent": self.user_agent}
+
         response = _get_response(
-            request_url, params=None, headers=headers, backoff_factor=2
+            request_url,
+            params=None,
+            headers=headers,
+            backoff_factor=2,
+            no_raise_on_redirects=True,
         )
+
         if not self.latest_version:
             self.latest_version = _latest_version("waybackpy", headers=headers)
+        if response:
+            res_headers = response.headers
+        else:
+            res_headers = "save redirected"
         self._archive_url = "https://" + _archive_url_parser(
-            response.headers, self.url, self.latest_version
+            res_headers,
+            self.url,
+            latest_version=self.latest_version,
+            instance=self,
         )
         self.timestamp = datetime.utcnow()
         return self