From 3fc212294c0aea229a21e70020000035c768c23f Mon Sep 17 00:00:00 2001
From: Akash <64683866+akamhy@users.noreply.github.com>
Date: Mon, 20 Jul 2020 10:01:19 +0530
Subject: [PATCH] 2 regex for parsing the archive url

---
 waybackpy/wrapper.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index e0fa8ab..17051f1 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -74,9 +74,18 @@ class Url():
         header = response.headers
 
         def archive_url_parser(header):
+            """Parse out the archive from header."""
+
+            #Regex1
+            arch = re.search(r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header))
+            if arch:
+                return arch.group(1)
+
+            #Regex2
             arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
             if arch:
                 return arch.group(1)
+
             raise WaybackError(
                 "No archive url found in the API response. Visit https://github.com/akamhy/waybackpy for latest version of waybackpy.\nHeader:\n%s" % str(header)
             )