added class functionality

This commit is contained in:
Daniel Valenzuela
2020-10-14 23:52:40 -04:00
parent 1a81eb97fb
commit 8768fb1f22

View File

@@ -3,6 +3,7 @@
import re import re
import sys import sys
import json import json
import warnings
from datetime import datetime from datetime import datetime
from waybackpy.exceptions import WaybackError from waybackpy.exceptions import WaybackError
from waybackpy.__version__ import __version__ from waybackpy.__version__ import __version__
@@ -69,20 +70,67 @@ class Url:
self.url = url self.url = url
self.user_agent = user_agent self.user_agent = user_agent
self._url_check() # checks url validity on init. self._url_check() # checks url validity on init.
self.JSON = self._JSON() # JSON of most recent archive
self.archive_url = self._archive_url() # URL of archive
self.timestamp = self._archive_timestamp() # timestamp for last archive
def __repr__(self): def __repr__(self):
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent) return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
def __str__(self): def __str__(self):
return "%s" % self._clean_url() return "%s" % self.archive_url
def __len__(self): def __len__(self):
return len(self._clean_url()) diff = datetime.utcnow() - self.timestamp
return diff.days
def _url_check(self): def _url_check(self):
"""Check for common URL problems.""" """Check for common URL problems."""
if "." not in self.url: if "." not in self.url:
raise URLError("'%s' is not a vaild URL." % self.url) raise URLError("'%s' is not a vaild URL." % self.url)
def _JSON(self):
    """Fetch the Wayback Machine availability record for this URL.

    Builds a request to the ``wayback/available`` endpoint on archive.org
    for the cleaned URL, sends ``self.user_agent`` as the User-Agent
    header, and returns the response body parsed from JSON.
    """
    endpoint = "https://archive.org/wayback/available?url=%s" % (
        self._clean_url(),
    )
    headers = {"User-Agent": "%s" % self.user_agent}
    response = _get_response(Request(endpoint, headers=headers))  # nosec
    payload = response.read().decode("UTF-8")
    return json.loads(payload)
def _archive_url(self):
"""Get URL of archive."""
data = self.JSON
if not data["archived_snapshots"]:
archive_url = None
else:
archive_url = data["archived_snapshots"]["closest"]["url"]
archive_url = archive_url.replace(
"http://web.archive.org/web/",
"https://web.archive.org/web/",
1
)
return archive_url
def _archive_timestamp(self):
"""Get timestamp of last archive."""
data = self.JSON
if not data["archived_snapshots"]:
time = None
else:
time = datetime.strptime(data["archived_snapshots"]
["closest"]
["timestamp"],
'%Y%m%d%H%M%S')
return time
def _clean_url(self): def _clean_url(self):
"""Fix the URL, if possible.""" """Fix the URL, if possible."""
@@ -94,7 +142,9 @@ class Url:
hdr = {"User-Agent": "%s" % self.user_agent} # nosec hdr = {"User-Agent": "%s" % self.user_agent} # nosec
req = Request(request_url, headers=hdr) # nosec req = Request(request_url, headers=hdr) # nosec
header = _get_response(req).headers header = _get_response(req).headers
return "https://" + _archive_url_parser(header) self.archive_url = "https://" + _archive_url_parser(header)
self.timestamp = datetime.utcnow()
return self
def get(self, url="", user_agent="", encoding=""): def get(self, url="", user_agent="", encoding=""):
"""Return the source code of the supplied URL. """Return the source code of the supplied URL.
@@ -146,11 +196,18 @@ class Url:
"to create a new archive." % self._clean_url() "to create a new archive." % self._clean_url()
) )
archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = data["archived_snapshots"]["closest"]["url"]
# The Wayback Machine sometimes returns http:// links even though it supports https, so upgrade the scheme.
archive_url = archive_url.replace( archive_url = archive_url.replace(
"http://web.archive.org/web/", "https://web.archive.org/web/", 1 "http://web.archive.org/web/", "https://web.archive.org/web/", 1
) )
return archive_url
self.archive_url = archive_url
self.timestamp = datetime.strptime(data["archived_snapshots"]
["closest"]
["timestamp"],
'%Y%m%d%H%M%S')
return self
def oldest(self, year=1994): def oldest(self, year=1994):
"""Return the oldest Wayback Machine archive for this URL.""" """Return the oldest Wayback Machine archive for this URL."""
@@ -221,4 +278,4 @@ class Url:
url_list = tmp_url_list url_list = tmp_url_list
return url_list return url_list