Adding attributes to Url class (#28)
* added class functionality * Update wrapper.py * style edits
This commit is contained in:
parent
1a81eb97fb
commit
d465454019
@ -69,21 +69,68 @@ class Url:
|
|||||||
self.url = url
|
self.url = url
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
self._url_check() # checks url validity on init.
|
self._url_check() # checks url validity on init.
|
||||||
|
self.JSON = self._JSON() # JSON of most recent archive
|
||||||
|
self.archive_url = self._archive_url() # URL of archive
|
||||||
|
self.timestamp = self._archive_timestamp() # timestamp for last archive
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
|
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "%s" % self._clean_url()
|
return "%s" % self.archive_url
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self._clean_url())
|
diff = datetime.utcnow() - self.timestamp
|
||||||
|
return diff.days
|
||||||
|
|
||||||
def _url_check(self):
|
def _url_check(self):
|
||||||
"""Check for common URL problems."""
|
"""Check for common URL problems."""
|
||||||
if "." not in self.url:
|
if "." not in self.url:
|
||||||
raise URLError("'%s' is not a vaild URL." % self.url)
|
raise URLError("'%s' is not a vaild URL." % self.url)
|
||||||
|
|
||||||
|
def _JSON(self):
    """Fetch and decode the Wayback availability response for this URL.

    Sends a request to ``https://archive.org/wayback/available`` for the
    cleaned URL using ``self.user_agent`` and returns the parsed JSON body.
    """
    endpoint = "https://archive.org/wayback/available?url=%s" % (
        self._clean_url(),
    )
    headers = {"User-Agent": "%s" % self.user_agent}
    reply = _get_response(Request(endpoint, headers=headers))  # nosec
    payload = reply.read().decode("UTF-8")
    return json.loads(payload)
|
||||||
|
|
||||||
|
def _archive_url(self):
|
||||||
|
"""Get URL of archive."""
|
||||||
|
data = self.JSON
|
||||||
|
|
||||||
|
if not data["archived_snapshots"]:
|
||||||
|
archive_url = None
|
||||||
|
else:
|
||||||
|
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||||
|
archive_url = archive_url.replace(
|
||||||
|
"http://web.archive.org/web/",
|
||||||
|
"https://web.archive.org/web/",
|
||||||
|
1
|
||||||
|
)
|
||||||
|
|
||||||
|
return archive_url
|
||||||
|
|
||||||
|
def _archive_timestamp(self):
|
||||||
|
"""Get timestamp of last archive."""
|
||||||
|
data = self.JSON
|
||||||
|
|
||||||
|
if not data["archived_snapshots"]:
|
||||||
|
time = None
|
||||||
|
else:
|
||||||
|
time = datetime.strptime(data["archived_snapshots"]
|
||||||
|
["closest"]
|
||||||
|
["timestamp"],
|
||||||
|
'%Y%m%d%H%M%S')
|
||||||
|
|
||||||
|
return time
|
||||||
|
|
||||||
def _clean_url(self):
|
def _clean_url(self):
|
||||||
"""Fix the URL, if possible."""
|
"""Fix the URL, if possible."""
|
||||||
return str(self.url).strip().replace(" ", "_")
|
return str(self.url).strip().replace(" ", "_")
|
||||||
@ -94,7 +141,9 @@ class Url:
|
|||||||
hdr = {"User-Agent": "%s" % self.user_agent} # nosec
|
hdr = {"User-Agent": "%s" % self.user_agent} # nosec
|
||||||
req = Request(request_url, headers=hdr) # nosec
|
req = Request(request_url, headers=hdr) # nosec
|
||||||
header = _get_response(req).headers
|
header = _get_response(req).headers
|
||||||
return "https://" + _archive_url_parser(header)
|
self.archive_url = "https://" + _archive_url_parser(header)
|
||||||
|
self.timestamp = datetime.utcnow()
|
||||||
|
return self
|
||||||
|
|
||||||
def get(self, url="", user_agent="", encoding=""):
|
def get(self, url="", user_agent="", encoding=""):
|
||||||
"""Return the source code of the supplied URL.
|
"""Return the source code of the supplied URL.
|
||||||
@ -146,11 +195,18 @@ class Url:
|
|||||||
"to create a new archive." % self._clean_url()
|
"to create a new archive." % self._clean_url()
|
||||||
)
|
)
|
||||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||||
# wayback machine returns http sometimes, idk why? But they support https
|
|
||||||
archive_url = archive_url.replace(
|
archive_url = archive_url.replace(
|
||||||
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
|
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
|
||||||
)
|
)
|
||||||
return archive_url
|
|
||||||
|
self.archive_url = archive_url
|
||||||
|
self.timestamp = datetime.strptime(data["archived_snapshots"]
|
||||||
|
["closest"]
|
||||||
|
["timestamp"],
|
||||||
|
'%Y%m%d%H%M%S')
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
def oldest(self, year=1994):
|
def oldest(self, year=1994):
|
||||||
"""Return the oldest Wayback Machine archive for this URL."""
|
"""Return the oldest Wayback Machine archive for this URL."""
|
||||||
|
Loading…
Reference in New Issue
Block a user