API changed again. Updated.

* Update wrapper.py

* Update wrapper.py

* Update wrapper.py

* Update wrapper.py

* Update wrapper.py

* api changed; fix archive url parser

* Update wrapper.py

* - Trailing whitespace

* include the header in exception
This commit is contained in:
Akash 2020-07-19 20:39:07 +05:30 committed by GitHub
parent 6f82041ec9
commit 8ab116f276
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -8,9 +8,9 @@ from waybackpy.exceptions import WaybackError
if sys.version_info >= (3, 0): # If the python ver >= 3 if sys.version_info >= (3, 0): # If the python ver >= 3
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError from urllib.error import URLError
else: # For python2.x else: # For python2.x
from urllib2 import Request, urlopen, HTTPError, URLError from urllib2 import Request, urlopen, URLError
default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy" default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
@ -59,13 +59,6 @@ class Url():
str(kwargs["minute"]).zfill(2) str(kwargs["minute"]).zfill(2)
) )
def handle_HTTPError(self, e):
"""Handle some common HTTPErrors."""
if e.code == 404:
raise HTTPError(e)
if e.code >= 400:
raise WaybackError(e)
def save(self): def save(self):
"""Create a new archives for an URL on the Wayback Machine.""" """Create a new archives for an URL on the Wayback Machine."""
request_url = ("https://web.archive.org/save/" + self.clean_url()) request_url = ("https://web.archive.org/save/" + self.clean_url())
@ -79,32 +72,42 @@ class Url():
except Exception as e: except Exception as e:
raise WaybackError(e) raise WaybackError(e)
header = response.headers header = response.headers
try:
arch = re.search(r"rel=\"memento.*?web\.archive\.org(/web/[0-9]{14}/.*?)>", str(header)).group(1) def archive_url_parser(header):
except KeyError as e: arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
raise WaybackError(e) if arch:
return "https://web.archive.org" + arch return arch.group(1)
raise WaybackError(
"No archive url found in the API response. Visit https://github.com/akamhy/waybackpy for latest version of waybackpy.\nHeader:\n%s" % str(header)
)
return "https://" + archive_url_parser(header)
def get(self, url=None, user_agent=None, encoding=None): def get(self, url=None, user_agent=None, encoding=None):
"""Returns the source code of the supplied URL. Auto detects the encoding if not supplied.""" """Returns the source code of the supplied URL. Auto detects the encoding if not supplied."""
if not url: if not url:
url = self.clean_url() url = self.clean_url()
if not user_agent: if not user_agent:
user_agent = self.user_agent user_agent = self.user_agent
hdr = { 'User-Agent' : '%s' % user_agent } hdr = { 'User-Agent' : '%s' % user_agent }
req = Request(url, headers=hdr) #nosec req = Request(url, headers=hdr) #nosec
try: try:
resp=urlopen(req) #nosec resp=urlopen(req) #nosec
except URLError: except Exception:
try: try:
resp=urlopen(req) #nosec resp=urlopen(req) #nosec
except URLError as e: except Exception as e:
raise HTTPError(e) raise WaybackError(e)
if not encoding: if not encoding:
try: try:
encoding= resp.headers['content-type'].split('charset=')[-1] encoding= resp.headers['content-type'].split('charset=')[-1]
except AttributeError: except AttributeError:
encoding = "UTF-8" encoding = "UTF-8"
return resp.read().decode(encoding.replace("text/html", "UTF-8", 1)) return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))
def near(self, **kwargs): def near(self, **kwargs):
@ -121,10 +124,15 @@ class Url():
request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (self.clean_url(), str(timestamp)) request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (self.clean_url(), str(timestamp))
hdr = { 'User-Agent' : '%s' % self.user_agent } hdr = { 'User-Agent' : '%s' % self.user_agent }
req = Request(request_url, headers=hdr) # nosec req = Request(request_url, headers=hdr) # nosec
try:
response = urlopen(req) #nosec
except Exception:
try: try:
response = urlopen(req) #nosec response = urlopen(req) #nosec
except Exception as e: except Exception as e:
self.handle_HTTPError(e) WaybackError(e)
data = json.loads(response.read().decode("UTF-8")) data = json.loads(response.read().decode("UTF-8"))
if not data["archived_snapshots"]: if not data["archived_snapshots"]:
raise WaybackError("'%s' is not yet archived." % url) raise WaybackError("'%s' is not yet archived." % url)
@ -146,8 +154,13 @@ class Url():
hdr = { 'User-Agent' : '%s' % self.user_agent } hdr = { 'User-Agent' : '%s' % self.user_agent }
request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url() request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url()
req = Request(request_url, headers=hdr) # nosec req = Request(request_url, headers=hdr) # nosec
try:
response = urlopen(req) #nosec
except Exception:
try: try:
response = urlopen(req) #nosec response = urlopen(req) #nosec
except Exception as e: except Exception as e:
self.handle_HTTPError(e) WaybackError(e)
return str(response.read()).count(",") # Most efficient method to count number of archives (yet) return str(response.read()).count(",") # Most efficient method to count number of archives (yet)