Code style improvements (#20)

* Add sane line length to setup.cfg

* Use Black for quick readability improvements

* Clean up exceptions, docstrings, and comments

Docstrings on dunder functions are redundant and typically ignored
Limit to reasonable line length
General grammar and style corrections
Clarify docstrings and exceptions
Format docstrings per PEP 257 -- Docstring Conventions

* Move archive_url_parser out of Url.save()

It's generally poor form to define a function in a function, as it will
be re-defined each time the function is run.

archive_url_parser does not depend on anything in Url, so it makes sense
to move it out of the class.

* move wayback_timestamp out of class, mark private functions

* DRY in _wayback_timestamp

* Url._url_check should return None

There's no point in returning True if it's never checked and won't ever
be False.
Implicitly returning None or raising an exception is more idiomatic.

* Default parameters should be type-consistent with expected values

* Specify parameters to near

* Use datetime.datetime in _wayback_timestamp

* Clean up __init__.py

* Clean up formatting in tests

* Fix names in tests

* Revert "Use datetime.datetime in _wayback_timestamp"

This reverts commit 5b30380865.

Introduced unnecessary complexity

* Move _get_response outside of Url

Because Codacy reminded me that I missed it.

* fix imports in tests
This commit is contained in:
AntiCompositeNumber 2020-07-22 00:39:14 -04:00 committed by GitHub
parent a418a4e464
commit be7642c837
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 208 additions and 124 deletions

View File

@ -1,3 +1,7 @@
[metadata] [metadata]
description-file = README.md description-file = README.md
license_file = LICENSE license_file = LICENSE
[flake8]
max-line-length = 88
extend-ignore = E203,W503

View File

@ -1,10 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import sys import sys
sys.path.append("..")
import waybackpy
import pytest import pytest
import random import random
import time import time
sys.path.append("..")
import waybackpy.wrapper as waybackpy # noqa: E402
if sys.version_info >= (3, 0): # If the python ver >= 3 if sys.version_info >= (3, 0): # If the python ver >= 3
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import URLError from urllib.error import URLError
@ -13,18 +15,21 @@ else: # For python2.x
user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0" user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
def test_clean_url(): def test_clean_url():
test_url = " https://en.wikipedia.org/wiki/Network security " test_url = " https://en.wikipedia.org/wiki/Network security "
answer = "https://en.wikipedia.org/wiki/Network_security" answer = "https://en.wikipedia.org/wiki/Network_security"
target = waybackpy.Url(test_url, user_agent) target = waybackpy.Url(test_url, user_agent)
test_result = target.clean_url() test_result = target._clean_url()
assert answer == test_result assert answer == test_result
def test_url_check(): def test_url_check():
broken_url = "http://wwwgooglecom/" broken_url = "http://wwwgooglecom/"
with pytest.raises(Exception) as e_info: with pytest.raises(Exception):
waybackpy.Url(broken_url, user_agent) waybackpy.Url(broken_url, user_agent)
def test_save(): def test_save():
# Test for urls that exist and can be archived. # Test for urls that exist and can be archived.
time.sleep(10) time.sleep(10)
@ -35,89 +40,139 @@ def test_save():
"commons.wikimedia.org", "commons.wikimedia.org",
"www.wiktionary.org", "www.wiktionary.org",
"www.w3schools.com", "www.w3schools.com",
"www.youtube.com" "www.youtube.com",
] ]
x = random.randint(0, len(url_list)-1) x = random.randint(0, len(url_list) - 1)
url1 = url_list[x] url1 = url_list[x]
target = waybackpy.Url(url1, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36") target = waybackpy.Url(
url1,
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
)
archived_url1 = target.save() archived_url1 = target.save()
assert url1 in archived_url1 assert url1 in archived_url1
if sys.version_info > (3, 6): if sys.version_info > (3, 6):
# Test for urls that are incorrect. # Test for urls that are incorrect.
with pytest.raises(Exception) as e_info: with pytest.raises(Exception):
url2 = "ha ha ha ha" url2 = "ha ha ha ha"
waybackpy.Url(url2, user_agent) waybackpy.Url(url2, user_agent)
time.sleep(5) time.sleep(5)
# Test for urls not allowed to archive by robot.txt. # Test for urls not allowed to archive by robot.txt.
with pytest.raises(Exception) as e_info: with pytest.raises(Exception):
url3 = "http://www.archive.is/faq.html" url3 = "http://www.archive.is/faq.html"
target = waybackpy.Url(url3, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0") target = waybackpy.Url(
url3,
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) "
"Gecko/20100101 Firefox/25.0",
)
target.save() target.save()
time.sleep(5) time.sleep(5)
# Non existent urls, test # Non existent urls, test
with pytest.raises(Exception) as e_info: with pytest.raises(Exception):
url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us" url4 = (
target = waybackpy.Url(url3, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27") "https://githfgdhshajagjstgeths537agajaajgsagudadhuss87623"
"46887adsiugujsdgahub.us"
)
target = waybackpy.Url(
url3,
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) "
"AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 "
"Safari/533.20.27",
)
target.save() target.save()
else: else:
pass pass
def test_near(): def test_near():
time.sleep(10) time.sleep(10)
url = "google.com" url = "google.com"
target = waybackpy.Url(url, "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4") target = waybackpy.Url(
url,
"Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 "
"(KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
)
archive_near_year = target.near(year=2010) archive_near_year = target.near(year=2010)
assert "2010" in archive_near_year assert "2010" in archive_near_year
if sys.version_info > (3, 6): if sys.version_info > (3, 6):
time.sleep(5) time.sleep(5)
archive_near_month_year = target.near( year=2015, month=2) archive_near_month_year = target.near(year=2015, month=2)
assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year) assert (
("201502" in archive_near_month_year)
or ("201501" in archive_near_month_year)
or ("201503" in archive_near_month_year)
)
target = waybackpy.Url("www.python.org", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246") target = waybackpy.Url(
archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15) "www.python.org",
assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year) "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
)
archive_near_hour_day_month_year = target.near(
year=2008, month=5, day=9, hour=15
)
assert (
("2008050915" in archive_near_hour_day_month_year)
or ("2008050914" in archive_near_hour_day_month_year)
or ("2008050913" in archive_near_hour_day_month_year)
)
with pytest.raises(Exception) as e_info: with pytest.raises(Exception):
NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity" NeverArchivedUrl = (
"https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
)
target = waybackpy.Url(NeverArchivedUrl, user_agent) target = waybackpy.Url(NeverArchivedUrl, user_agent)
target.near(year=2010) target.near(year=2010)
else: else:
pass pass
def test_oldest(): def test_oldest():
url = "github.com/akamhy/waybackpy" url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent) target = waybackpy.Url(url, user_agent)
assert "20200504141153" in target.oldest() assert "20200504141153" in target.oldest()
def test_newest(): def test_newest():
url = "github.com/akamhy/waybackpy" url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent) target = waybackpy.Url(url, user_agent)
assert url in target.newest() assert url in target.newest()
def test_get(): def test_get():
target = waybackpy.Url("google.com", user_agent) target = waybackpy.Url("google.com", user_agent)
assert "Welcome to Google" in target.get(target.oldest()) assert "Welcome to Google" in target._get(target.oldest())
def test_wayback_timestamp(): def test_wayback_timestamp():
ts = waybackpy.Url("https://www.google.com","UA").wayback_timestamp(year=2020,month=1,day=2,hour=3,minute=4) ts = waybackpy._wayback_timestamp(
year=2020, month=1, day=2, hour=3, minute=4
)
assert "202001020304" in str(ts) assert "202001020304" in str(ts)
def test_get_response(): def test_get_response():
hdr = { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'} hdr = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) "
"Gecko/20100101 Firefox/78.0"
}
req = Request("https://www.google.com", headers=hdr) # nosec req = Request("https://www.google.com", headers=hdr) # nosec
response = waybackpy.Url("https://www.google.com","UA").get_response(req) response = waybackpy._get_response(req)
assert response.code == 200 assert response.code == 200
def test_total_archives(): def test_total_archives():
if sys.version_info > (3, 6): if sys.version_info > (3, 6):
target = waybackpy.Url(" https://google.com ", user_agent) target = waybackpy.Url(" https://google.com ", user_agent)
assert target.total_archives() > 500000 assert target.total_archives() > 500000
else: else:
pass pass
target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent) target = waybackpy.Url(
" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent
)
assert target.total_archives() == 0 assert target.total_archives() == 0

View File

@ -28,5 +28,13 @@ Full documentation @ <https://akamhy.github.io/waybackpy/>.
""" """
from .wrapper import Url from .wrapper import Url
from .__version__ import __title__, __description__, __url__, __version__ from .__version__ import (
from .__version__ import __author__, __author_email__, __license__, __copyright__ __title__,
__description__,
__url__,
__version__,
__author__,
__author_email__,
__license__,
__copyright__,
)

View File

@ -5,6 +5,7 @@ import sys
import json import json
from datetime import datetime from datetime import datetime
from waybackpy.exceptions import WaybackError from waybackpy.exceptions import WaybackError
from waybackpy.__version__ import __version__
if sys.version_info >= (3, 0): # If the python ver >= 3 if sys.version_info >= (3, 0): # If the python ver >= 3
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
@ -14,138 +15,154 @@ else: # For python2.x
default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy" default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
class Url():
"""waybackpy Url object"""
def _archive_url_parser(header):
def __init__(self, url, user_agent=default_UA):
self.url = url
self.user_agent = user_agent
self.url_check() # checks url validity on init.
def __repr__(self):
"""Representation of the object."""
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
def __str__(self):
"""String representation of the object."""
return "%s" % self.clean_url()
def __len__(self):
"""Length of the URL."""
return len(self.clean_url())
def url_check(self):
"""Check for common URL problems."""
if "." not in self.url:
raise URLError("'%s' is not a vaild url." % self.url)
return True
def clean_url(self):
"""Fix the URL, if possible."""
return str(self.url).strip().replace(" ","_")
def wayback_timestamp(self, **kwargs):
"""Return the formatted the timestamp."""
return (
str(kwargs["year"])
+
str(kwargs["month"]).zfill(2)
+
str(kwargs["day"]).zfill(2)
+
str(kwargs["hour"]).zfill(2)
+
str(kwargs["minute"]).zfill(2)
)
def save(self):
"""Create a new archives for an URL on the Wayback Machine."""
request_url = ("https://web.archive.org/save/" + self.clean_url())
hdr = { 'User-Agent' : '%s' % self.user_agent } #nosec
req = Request(request_url, headers=hdr) #nosec
header = self.get_response(req).headers
def archive_url_parser(header):
"""Parse out the archive from header.""" """Parse out the archive from header."""
#Regex1 # Regex1
arch = re.search(r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)) arch = re.search(
r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
)
if arch: if arch:
return arch.group(1) return arch.group(1)
#Regex2 # Regex2
arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header)) arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
if arch: if arch:
return arch.group(1) return arch.group(1)
raise WaybackError( raise WaybackError(
"No archive url found in the API response. Visit https://github.com/akamhy/waybackpy for latest version of waybackpy.\nHeader:\n%s" % str(header) "No archive URL found in the API response. "
"This version of waybackpy (%s) is likely out of date. Visit "
"https://github.com/akamhy/waybackpy for the latest version "
"of waybackpy.\nHeader:\n%s" % (__version__, str(header))
) )
return "https://" + archive_url_parser(header)
def get(self, url=None, user_agent=None, encoding=None): def _wayback_timestamp(**kwargs):
"""Returns the source code of the supplied URL. Auto detects the encoding if not supplied.""" """Return a formatted timestamp."""
return "".join(
str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
)
if not url:
url = self.clean_url()
if not user_agent:
user_agent = self.user_agent
hdr = { 'User-Agent' : '%s' % user_agent } def _get_response(req):
req = Request(url, headers=hdr) #nosec
response = self.get_response(req)
if not encoding:
try:
encoding= response.headers['content-type'].split('charset=')[-1]
except AttributeError:
encoding = "UTF-8"
return response.read().decode(encoding.replace("text/html", "UTF-8", 1))
def get_response(self, req):
"""Get response for the supplied request.""" """Get response for the supplied request."""
try: try:
response = urlopen(req) #nosec response = urlopen(req) # nosec
except Exception: except Exception:
try: try:
response = urlopen(req) #nosec response = urlopen(req) # nosec
except Exception as e: except Exception as e:
raise WaybackError(e) raise WaybackError(e)
return response return response
def near(self, **kwargs):
""" Returns the archived from Wayback Machine for an URL closest to the time supplied. class Url:
Supported params are year, month, day, hour and minute. """waybackpy Url object"""
The non supplied parameters are default to the runtime time.
""" def __init__(self, url, user_agent=default_UA):
year=kwargs.get("year", datetime.utcnow().strftime('%Y')) self.url = url
month=kwargs.get("month", datetime.utcnow().strftime('%m')) self.user_agent = user_agent
day=kwargs.get("day", datetime.utcnow().strftime('%d')) self._url_check() # checks url validity on init.
hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
minute=kwargs.get("minute", datetime.utcnow().strftime('%M')) def __repr__(self):
timestamp = self.wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute) return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (self.clean_url(), str(timestamp))
hdr = { 'User-Agent' : '%s' % self.user_agent } def __str__(self):
return "%s" % self._clean_url()
def __len__(self):
return len(self._clean_url())
def _url_check(self):
"""Check for common URL problems."""
if "." not in self.url:
raise URLError("'%s' is not a vaild URL." % self.url)
def _clean_url(self):
"""Fix the URL, if possible."""
return str(self.url).strip().replace(" ", "_")
def save(self):
"""Create a new Wayback Machine archive for this URL."""
request_url = "https://web.archive.org/save/" + self._clean_url()
hdr = {"User-Agent": "%s" % self.user_agent} # nosec
req = Request(request_url, headers=hdr) # nosec req = Request(request_url, headers=hdr) # nosec
response = self.get_response(req) header = _get_response(req).headers
return "https://" + _archive_url_parser(header)
def _get(self, url="", user_agent="", encoding=""):
"""Return the source code of the supplied URL.
If encoding is not supplied, it is auto-detected from the response.
"""
if not url:
url = self._clean_url()
if not user_agent:
user_agent = self.user_agent
hdr = {"User-Agent": "%s" % user_agent}
req = Request(url, headers=hdr) # nosec
response = _get_response(req)
if not encoding:
try:
encoding = response.headers["content-type"].split("charset=")[-1]
except AttributeError:
encoding = "UTF-8"
return response.read().decode(encoding.replace("text/html", "UTF-8", 1))
def near(self, year=None, month=None, day=None, hour=None, minute=None):
"""Return the closest Wayback Machine archive to the time supplied.
Supported params are year, month, day, hour and minute.
Any non-supplied parameters default to the current time.
"""
now = datetime.utcnow().timetuple()
timestamp = _wayback_timestamp(
year=year if year else now.tm_year,
month=month if month else now.tm_mon,
day=day if day else now.tm_mday,
hour=hour if hour else now.tm_hour,
minute=minute if minute else now.tm_min,
)
request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (
self._clean_url(),
timestamp,
)
hdr = {"User-Agent": "%s" % self.user_agent}
req = Request(request_url, headers=hdr) # nosec
response = _get_response(req)
data = json.loads(response.read().decode("UTF-8")) data = json.loads(response.read().decode("UTF-8"))
if not data["archived_snapshots"]: if not data["archived_snapshots"]:
raise WaybackError("'%s' is not yet archived. Use wayback.Url(url, user_agent).save() to create a new archive." % self.clean_url()) raise WaybackError(
archive_url = (data["archived_snapshots"]["closest"]["url"]) "'%s' is not yet archived. Use wayback.Url(url, user_agent).save() "
"to create a new archive." % self._clean_url()
)
archive_url = data["archived_snapshots"]["closest"]["url"]
# wayback machine returns http sometimes, idk why? But they support https # wayback machine returns http sometimes, idk why? But they support https
archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1) archive_url = archive_url.replace(
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
)
return archive_url return archive_url
def oldest(self, year=1994): def oldest(self, year=1994):
"""Returns the oldest archive from Wayback Machine for an URL.""" """Return the oldest Wayback Machine archive for this URL."""
return self.near(year=year) return self.near(year=year)
def newest(self): def newest(self):
"""Returns the newest archive on Wayback Machine for an URL, sometimes you may not get the newest archive because wayback machine DB lag.""" """Return the newest Wayback Machine archive available for this URL.
Due to Wayback Machine database lag, this may not always be the
most recent archive.
"""
return self.near() return self.near()
def total_archives(self): def total_archives(self):
"""Returns the total number of archives on Wayback Machine for an URL.""" """Returns the total number of Wayback Machine archives for this URL."""
hdr = { 'User-Agent' : '%s' % self.user_agent } hdr = {"User-Agent": "%s" % self.user_agent}
request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url() request_url = (
"https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode"
% self._clean_url()
)
req = Request(request_url, headers=hdr) # nosec req = Request(request_url, headers=hdr) # nosec
response = self.get_response(req) response = _get_response(req)
return str(response.read()).count(",") # Most efficient method to count number of archives (yet) # Most efficient method to count number of archives (yet)
return str(response.read()).count(",")