From be7642c8377fcb6d52d4c064e6089aaef6285d95 Mon Sep 17 00:00:00 2001 From: AntiCompositeNumber Date: Wed, 22 Jul 2020 00:39:14 -0400 Subject: [PATCH] Code style improvements (#20) * Add sane line length to setup.cfg * Use Black for quick readability improvements * Clean up exceptions, docstrings, and comments Docstrings on dunder functions are redundant and typically ignored Limit to reasonable line length General grammar and style corrections Clarify docstrings and exceptions Format docstrings per PEP 257 -- Docstring Conventions * Move archive_url_parser out of Url.save() It's generally poor form to define a function in a function, as it will be re-defined each time the function is run. archive_url_parser does not depend on anything in Url, so it makes sense to move it out of the class. * move wayback_timestamp out of class, mark private functions * DRY in _wayback_timestamp * Url._url_check should return None There's no point in returning True if it's never checked and won't ever be False. Implicitly returning None or raising an exception is more idiomatic. * Default parameters should be type-consistent with expected values * Specify parameters to near * Use datetime.datetime in _wayback_timestamp * cleanup __init__.py * Cleanup formatting in tests * Fix names in tests * Revert "Use datetime.datetime in _wayback_timestamp" This reverts commit 5b3038086582e913d525200d68abc68ab05458e7. Introduced unnecessary complexity * Move _get_response outside of Url Because Codacy reminded me that I missed it. 
* fix imports in tests --- setup.cfg | 4 + tests/test_1.py | 115 +++++++++++++++++------- waybackpy/__init__.py | 12 ++- waybackpy/wrapper.py | 201 +++++++++++++++++++++++------------------- 4 files changed, 208 insertions(+), 124 deletions(-) diff --git a/setup.cfg b/setup.cfg index f48fdad..8980e9c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,7 @@ [metadata] description-file = README.md license_file = LICENSE + +[flake8] +max-line-length = 88 +extend-ignore = E203,W503 diff --git a/tests/test_1.py b/tests/test_1.py index 47284d5..d043c23 100644 --- a/tests/test_1.py +++ b/tests/test_1.py @@ -1,30 +1,35 @@ # -*- coding: utf-8 -*- import sys -sys.path.append("..") -import waybackpy import pytest import random import time + +sys.path.append("..") +import waybackpy.wrapper as waybackpy # noqa: E402 + if sys.version_info >= (3, 0): # If the python ver >= 3 from urllib.request import Request, urlopen from urllib.error import URLError -else: # For python2.x +else: # For python2.x from urllib2 import Request, urlopen, URLError user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0" + def test_clean_url(): test_url = " https://en.wikipedia.org/wiki/Network security " answer = "https://en.wikipedia.org/wiki/Network_security" target = waybackpy.Url(test_url, user_agent) - test_result = target.clean_url() + test_result = target._clean_url() assert answer == test_result + def test_url_check(): broken_url = "http://wwwgooglecom/" - with pytest.raises(Exception) as e_info: + with pytest.raises(Exception): waybackpy.Url(broken_url, user_agent) + def test_save(): # Test for urls that exist and can be archived. 
time.sleep(10) @@ -35,89 +40,139 @@ def test_save(): "commons.wikimedia.org", "www.wiktionary.org", "www.w3schools.com", - "www.youtube.com" + "www.youtube.com", ] - x = random.randint(0, len(url_list)-1) + x = random.randint(0, len(url_list) - 1) url1 = url_list[x] - target = waybackpy.Url(url1, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36") + target = waybackpy.Url( + url1, + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36", + ) archived_url1 = target.save() assert url1 in archived_url1 if sys.version_info > (3, 6): # Test for urls that are incorrect. - with pytest.raises(Exception) as e_info: + with pytest.raises(Exception): url2 = "ha ha ha ha" waybackpy.Url(url2, user_agent) time.sleep(5) # Test for urls not allowed to archive by robot.txt. - with pytest.raises(Exception) as e_info: + with pytest.raises(Exception): url3 = "http://www.archive.is/faq.html" - target = waybackpy.Url(url3, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0") + target = waybackpy.Url( + url3, + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) " + "Gecko/20100101 Firefox/25.0", + ) target.save() time.sleep(5) # Non existent urls, test - with pytest.raises(Exception) as e_info: - url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us" - target = waybackpy.Url(url3, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27") + with pytest.raises(Exception): + url4 = ( + "https://githfgdhshajagjstgeths537agajaajgsagudadhuss87623" + "46887adsiugujsdgahub.us" + ) + target = waybackpy.Url( + url3, + "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) " + "AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 " + "Safari/533.20.27", + ) target.save() else: pass + def test_near(): time.sleep(10) url = 
"google.com" - target = waybackpy.Url(url, "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4") + target = waybackpy.Url( + url, + "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 " + "(KHTML, like Gecko) Version/5.0.3 Safari/533.19.4", + ) archive_near_year = target.near(year=2010) assert "2010" in archive_near_year if sys.version_info > (3, 6): time.sleep(5) - archive_near_month_year = target.near( year=2015, month=2) - assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year) - - target = waybackpy.Url("www.python.org", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246") - archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15) - assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year) - - with pytest.raises(Exception) as e_info: - NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity" + archive_near_month_year = target.near(year=2015, month=2) + assert ( + ("201502" in archive_near_month_year) + or ("201501" in archive_near_month_year) + or ("201503" in archive_near_month_year) + ) + + target = waybackpy.Url( + "www.python.org", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246", + ) + archive_near_hour_day_month_year = target.near( + year=2008, month=5, day=9, hour=15 + ) + assert ( + ("2008050915" in archive_near_hour_day_month_year) + or ("2008050914" in archive_near_hour_day_month_year) + or ("2008050913" in archive_near_hour_day_month_year) + ) + + with pytest.raises(Exception): + NeverArchivedUrl = ( + 
"https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity" + ) target = waybackpy.Url(NeverArchivedUrl, user_agent) target.near(year=2010) else: pass + def test_oldest(): url = "github.com/akamhy/waybackpy" target = waybackpy.Url(url, user_agent) assert "20200504141153" in target.oldest() + def test_newest(): url = "github.com/akamhy/waybackpy" target = waybackpy.Url(url, user_agent) assert url in target.newest() + def test_get(): target = waybackpy.Url("google.com", user_agent) - assert "Welcome to Google" in target.get(target.oldest()) + assert "Welcome to Google" in target._get(target.oldest()) + def test_wayback_timestamp(): - ts = waybackpy.Url("https://www.google.com","UA").wayback_timestamp(year=2020,month=1,day=2,hour=3,minute=4) + ts = waybackpy._wayback_timestamp( + year=2020, month=1, day=2, hour=3, minute=4 + ) assert "202001020304" in str(ts) + def test_get_response(): - hdr = { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'} - req = Request("https://www.google.com", headers=hdr) # nosec - response = waybackpy.Url("https://www.google.com","UA").get_response(req) + hdr = { + "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) " + "Gecko/20100101 Firefox/78.0" + } + req = Request("https://www.google.com", headers=hdr) # nosec + response = waybackpy._get_response(req) assert response.code == 200 + def test_total_archives(): if sys.version_info > (3, 6): target = waybackpy.Url(" https://google.com ", user_agent) assert target.total_archives() > 500000 else: pass - target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent) + target = waybackpy.Url( + " https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent + ) assert target.total_archives() == 0 diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py index 48ceb2c..0092356 100644 --- a/waybackpy/__init__.py +++ b/waybackpy/__init__.py @@ -28,5 +28,13 @@ Full 
documentation @ . """ from .wrapper import Url -from .__version__ import __title__, __description__, __url__, __version__ -from .__version__ import __author__, __author_email__, __license__, __copyright__ +from .__version__ import ( + __title__, + __description__, + __url__, + __version__, + __author__, + __author_email__, + __license__, + __copyright__, +) diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index d1c5a75..8c2447e 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -5,147 +5,164 @@ import sys import json from datetime import datetime from waybackpy.exceptions import WaybackError +from waybackpy.__version__ import __version__ if sys.version_info >= (3, 0): # If the python ver >= 3 from urllib.request import Request, urlopen from urllib.error import URLError -else: # For python2.x +else: # For python2.x from urllib2 import Request, urlopen, URLError default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy" -class Url(): - """waybackpy Url object""" +def _archive_url_parser(header): + """Parse out the archive from header.""" + # Regex1 + arch = re.search( + r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header) + ) + if arch: + return arch.group(1) + # Regex2 + arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header)) + if arch: + return arch.group(1) + raise WaybackError( + "No archive URL found in the API response. " + "This version of waybackpy (%s) is likely out of date. 
Visit " + "https://github.com/akamhy/waybackpy for the latest version " + "of waybackpy.\nHeader:\n%s" % (__version__, str(header)) + ) + + +def _wayback_timestamp(**kwargs): + """Return a formatted timestamp.""" + return "".join( + str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"] + ) + + +def _get_response(req): + """Get response for the supplied request.""" + try: + response = urlopen(req) # nosec + except Exception: + try: + response = urlopen(req) # nosec + except Exception as e: + raise WaybackError(e) + return response + + +class Url: + """waybackpy Url object""" def __init__(self, url, user_agent=default_UA): self.url = url self.user_agent = user_agent - self.url_check() # checks url validity on init. + self._url_check() # checks url validity on init. def __repr__(self): - """Representation of the object.""" return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent) def __str__(self): - """String representation of the object.""" - return "%s" % self.clean_url() + return "%s" % self._clean_url() def __len__(self): - """Length of the URL.""" - return len(self.clean_url()) + return len(self._clean_url()) - def url_check(self): + def _url_check(self): """Check for common URL problems.""" if "." not in self.url: - raise URLError("'%s' is not a vaild url." % self.url) - return True + raise URLError("'%s' is not a vaild URL." 
% self.url) - def clean_url(self): + def _clean_url(self): """Fix the URL, if possible.""" - return str(self.url).strip().replace(" ","_") - - def wayback_timestamp(self, **kwargs): - """Return the formatted the timestamp.""" - return ( - str(kwargs["year"]) - + - str(kwargs["month"]).zfill(2) - + - str(kwargs["day"]).zfill(2) - + - str(kwargs["hour"]).zfill(2) - + - str(kwargs["minute"]).zfill(2) - ) + return str(self.url).strip().replace(" ", "_") def save(self): - """Create a new archives for an URL on the Wayback Machine.""" - request_url = ("https://web.archive.org/save/" + self.clean_url()) - hdr = { 'User-Agent' : '%s' % self.user_agent } #nosec - req = Request(request_url, headers=hdr) #nosec - header = self.get_response(req).headers - - def archive_url_parser(header): - """Parse out the archive from header.""" - #Regex1 - arch = re.search(r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)) - if arch: - return arch.group(1) - #Regex2 - arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header)) - if arch: - return arch.group(1) - raise WaybackError( - "No archive url found in the API response. Visit https://github.com/akamhy/waybackpy for latest version of waybackpy.\nHeader:\n%s" % str(header) - ) - - return "https://" + archive_url_parser(header) - - def get(self, url=None, user_agent=None, encoding=None): - """Returns the source code of the supplied URL. Auto detects the encoding if not supplied.""" + """Create a new Wayback Machine archive for this URL.""" + request_url = "https://web.archive.org/save/" + self._clean_url() + hdr = {"User-Agent": "%s" % self.user_agent} # nosec + req = Request(request_url, headers=hdr) # nosec + header = _get_response(req).headers + return "https://" + _archive_url_parser(header) + def _get(self, url="", user_agent="", encoding=""): + """Return the source code of the supplied URL. + If encoding is not supplied, it is auto-detected from the response. 
+ """ if not url: - url = self.clean_url() + url = self._clean_url() if not user_agent: user_agent = self.user_agent - hdr = { 'User-Agent' : '%s' % user_agent } - req = Request(url, headers=hdr) #nosec - response = self.get_response(req) + hdr = {"User-Agent": "%s" % user_agent} + req = Request(url, headers=hdr) # nosec + response = _get_response(req) if not encoding: try: - encoding= response.headers['content-type'].split('charset=')[-1] + encoding = response.headers["content-type"].split("charset=")[-1] except AttributeError: encoding = "UTF-8" return response.read().decode(encoding.replace("text/html", "UTF-8", 1)) - def get_response(self, req): - """Get response for the supplied request.""" - try: - response = urlopen(req) #nosec - except Exception: - try: - response = urlopen(req) #nosec - except Exception as e: - raise WaybackError(e) - return response + def near(self, year=None, month=None, day=None, hour=None, minute=None): + """Return the closest Wayback Machine archive to the time supplied. - def near(self, **kwargs): - """ Returns the archived from Wayback Machine for an URL closest to the time supplied. - Supported params are year, month, day, hour and minute. - The non supplied parameters are default to the runtime time. + Supported params are year, month, day, hour and minute. + Any non-supplied parameters default to the current time. 
""" - year=kwargs.get("year", datetime.utcnow().strftime('%Y')) - month=kwargs.get("month", datetime.utcnow().strftime('%m')) - day=kwargs.get("day", datetime.utcnow().strftime('%d')) - hour=kwargs.get("hour", datetime.utcnow().strftime('%H')) - minute=kwargs.get("minute", datetime.utcnow().strftime('%M')) - timestamp = self.wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute) - request_url = "https://archive.org/wayback/available?url=%s×tamp=%s" % (self.clean_url(), str(timestamp)) - hdr = { 'User-Agent' : '%s' % self.user_agent } - req = Request(request_url, headers=hdr) # nosec - response = self.get_response(req) + now = datetime.utcnow().timetuple() + timestamp = _wayback_timestamp( + year=year if year else now.tm_year, + month=month if month else now.tm_mon, + day=day if day else now.tm_mday, + hour=hour if hour else now.tm_hour, + minute=minute if minute else now.tm_min, + ) + + request_url = "https://archive.org/wayback/available?url=%s×tamp=%s" % ( + self._clean_url(), + timestamp, + ) + hdr = {"User-Agent": "%s" % self.user_agent} + req = Request(request_url, headers=hdr) # nosec + response = _get_response(req) data = json.loads(response.read().decode("UTF-8")) if not data["archived_snapshots"]: - raise WaybackError("'%s' is not yet archived. Use wayback.Url(url, user_agent).save() to create a new archive." % self.clean_url()) - archive_url = (data["archived_snapshots"]["closest"]["url"]) + raise WaybackError( + "'%s' is not yet archived. Use wayback.Url(url, user_agent).save() " + "to create a new archive." % self._clean_url() + ) + archive_url = data["archived_snapshots"]["closest"]["url"] # wayback machine returns http sometimes, idk why? 
But they support https - archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1) + archive_url = archive_url.replace( + "http://web.archive.org/web/", "https://web.archive.org/web/", 1 + ) return archive_url def oldest(self, year=1994): - """Returns the oldest archive from Wayback Machine for an URL.""" + """Return the oldest Wayback Machine archive for this URL.""" return self.near(year=year) def newest(self): - """Returns the newest archive on Wayback Machine for an URL, sometimes you may not get the newest archive because wayback machine DB lag.""" + """Return the newest Wayback Machine archive available for this URL. + + Due to Wayback Machine database lag, this may not always be the + most recent archive. + """ return self.near() def total_archives(self): - """Returns the total number of archives on Wayback Machine for an URL.""" - hdr = { 'User-Agent' : '%s' % self.user_agent } - request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url() - req = Request(request_url, headers=hdr) # nosec - response = self.get_response(req) - return str(response.read()).count(",") # Most efficient method to count number of archives (yet) + """Returns the total number of Wayback Machine archives for this URL.""" + hdr = {"User-Agent": "%s" % self.user_agent} + request_url = ( + "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" + % self._clean_url() + ) + req = Request(request_url, headers=hdr) # nosec + response = _get_response(req) + # Most efficient method to count number of archives (yet) + return str(response.read()).count(",")