From 8a4b631c133230574d9413ec66c970c4b41c0ff3 Mon Sep 17 00:00:00 2001 From: Akash <64683866+akamhy@users.noreply.github.com> Date: Sun, 9 Aug 2020 10:36:25 +0530 Subject: [PATCH] new regex to parse archive, IA changed the header again :( --- waybackpy/wrapper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 9c69e54..849f4dc 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -19,12 +19,18 @@ default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy" def _archive_url_parser(header): """Parse out the archive from header.""" # Regex1 + arch = re.search( + r"Content-Location: (/web/[0-9]{14}/.*)", str(header) + ) + if arch: + return "web.archive.org" + arch.group(1) + # Regex2 arch = re.search( r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header) ) if arch: return arch.group(1) - # Regex2 + # Regex3 arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header)) if arch: return arch.group(1)