Clean up exceptions, docstrings, and comments

Docstrings on dunder functions are redundant and typically ignored
Limit to reasonable line length
General grammar and style corrections
Clarify docstrings and exceptions
Format docstrings per PEP 257 -- Docstring Conventions
This commit is contained in:
AntiCompositeNumber 2020-07-21 15:38:35 -04:00
parent fd74e62ff9
commit b3c68add55
No known key found for this signature in database
GPG Key ID: A888A323AB506229

View File

@@ -5,6 +5,7 @@ import sys
import json import json
from datetime import datetime from datetime import datetime
from waybackpy.exceptions import WaybackError from waybackpy.exceptions import WaybackError
from waybackpy.__version__ import __version__
if sys.version_info >= (3, 0): # If the python ver >= 3 if sys.version_info >= (3, 0): # If the python ver >= 3
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
@@ -24,21 +25,18 @@ class Url:
self.url_check() # checks url validity on init. self.url_check() # checks url validity on init.
def __repr__(self): def __repr__(self):
"""Representation of the object."""
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent) return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
def __str__(self): def __str__(self):
"""String representation of the object."""
return "%s" % self.clean_url() return "%s" % self.clean_url()
def __len__(self): def __len__(self):
"""Length of the URL."""
return len(self.clean_url()) return len(self.clean_url())
def url_check(self): def url_check(self):
"""Check for common URL problems.""" """Check for common URL problems."""
if "." not in self.url: if "." not in self.url:
raise URLError("'%s' is not a vaild url." % self.url) raise URLError("'%s' is not a vaild URL." % self.url)
return True return True
def clean_url(self): def clean_url(self):
@@ -46,7 +44,7 @@ class Url:
return str(self.url).strip().replace(" ", "_") return str(self.url).strip().replace(" ", "_")
def wayback_timestamp(self, **kwargs): def wayback_timestamp(self, **kwargs):
"""Return the formatted the timestamp.""" """Return a formatted timestamp."""
return ( return (
str(kwargs["year"]) str(kwargs["year"])
+ str(kwargs["month"]).zfill(2) + str(kwargs["month"]).zfill(2)
@@ -56,7 +54,7 @@ class Url:
) )
def save(self): def save(self):
"""Create a new archives for an URL on the Wayback Machine.""" """Create a new Wayback Machine archive for this URL."""
request_url = "https://web.archive.org/save/" + self.clean_url() request_url = "https://web.archive.org/save/" + self.clean_url()
hdr = {"User-Agent": "%s" % self.user_agent} # nosec hdr = {"User-Agent": "%s" % self.user_agent} # nosec
req = Request(request_url, headers=hdr) # nosec req = Request(request_url, headers=hdr) # nosec
@@ -75,14 +73,18 @@ class Url:
if arch: if arch:
return arch.group(1) return arch.group(1)
raise WaybackError( raise WaybackError(
"No archive url found in the API response. Visit https://github.com/akamhy/waybackpy for latest version of waybackpy.\nHeader:\n%s" "No archive URL found in the API response. "
% str(header) "This version of waybackpy (%s) is likely out of date. Visit "
"https://github.com/akamhy/waybackpy for the latest version "
"of waybackpy.\nHeader:\n%s" % (__version__, str(header))
) )
return "https://" + archive_url_parser(header) return "https://" + archive_url_parser(header)
def get(self, url=None, user_agent=None, encoding=None): def get(self, url=None, user_agent=None, encoding=None):
"""Returns the source code of the supplied URL. Auto detects the encoding if not supplied.""" """Return the source code of the supplied URL.
If encoding is not supplied, it is auto-detected from the response.
"""
if not url: if not url:
url = self.clean_url() url = self.clean_url()
@@ -111,9 +113,10 @@ class Url:
return response return response
def near(self, **kwargs): def near(self, **kwargs):
""" Returns the archived from Wayback Machine for an URL closest to the time supplied. """Return the closest Wayback Machine archive to the time supplied.
Supported params are year, month, day, hour and minute.
The non supplied parameters are default to the runtime time. Supported params are year, month, day, hour and minute.
Any non-supplied parameters default to the current time.
""" """
year = kwargs.get("year", datetime.utcnow().strftime("%Y")) year = kwargs.get("year", datetime.utcnow().strftime("%Y"))
month = kwargs.get("month", datetime.utcnow().strftime("%m")) month = kwargs.get("month", datetime.utcnow().strftime("%m"))
@@ -133,8 +136,8 @@ class Url:
data = json.loads(response.read().decode("UTF-8")) data = json.loads(response.read().decode("UTF-8"))
if not data["archived_snapshots"]: if not data["archived_snapshots"]:
raise WaybackError( raise WaybackError(
"'%s' is not yet archived. Use wayback.Url(url, user_agent).save() to create a new archive." "'%s' is not yet archived. Use wayback.Url(url, user_agent).save() "
% self.clean_url() "to create a new archive." % self.clean_url()
) )
archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = data["archived_snapshots"]["closest"]["url"]
# wayback machine returns http sometimes, idk why? But they support https # wayback machine returns http sometimes, idk why? But they support https
@@ -144,15 +147,19 @@ class Url:
return archive_url return archive_url
def oldest(self, year=1994): def oldest(self, year=1994):
"""Returns the oldest archive from Wayback Machine for an URL.""" """Return the oldest Wayback Machine archive for this URL."""
return self.near(year=year) return self.near(year=year)
def newest(self): def newest(self):
"""Returns the newest archive on Wayback Machine for an URL, sometimes you may not get the newest archive because wayback machine DB lag.""" """Return the newest Wayback Machine archive available for this URL.
Due to Wayback Machine database lag, this may not always be the
most recent archive.
"""
return self.near() return self.near()
def total_archives(self): def total_archives(self):
"""Returns the total number of archives on Wayback Machine for an URL.""" """Returns the total number of Wayback Machine archives for this URL."""
hdr = {"User-Agent": "%s" % self.user_agent} hdr = {"User-Agent": "%s" % self.user_agent}
request_url = ( request_url = (
"https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode"
@@ -160,6 +167,5 @@ class Url:
) )
req = Request(request_url, headers=hdr) # nosec req = Request(request_url, headers=hdr) # nosec
response = self.get_response(req) response = self.get_response(req)
return str(response.read()).count( # Most efficient method to count number of archives (yet)
"," return str(response.read()).count(",")
) # Most efficient method to count number of archives (yet)