New regex to parse the archive URL; IA (Internet Archive) changed the header again :(

This commit is contained in:
Akash 2020-08-09 10:36:25 +05:30 committed by GitHub
parent ec9ce92f48
commit 8a4b631c13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -19,12 +19,18 @@ default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _archive_url_parser(header): def _archive_url_parser(header):
"""Parse out the archive from header.""" """Parse out the archive from header."""
# Regex1 # Regex1
arch = re.search(
r"Content-Location: (/web/[0-9]{14}/.*)", str(header)
)
if arch:
return "web.archive.org" + arch.group(1)
# Regex2
arch = re.search( arch = re.search(
r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header) r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
) )
if arch: if arch:
return arch.group(1) return arch.group(1)
# Regex2 # Regex3
arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header)) arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
if arch: if arch:
return arch.group(1) return arch.group(1)