diff --git a/tests/test_1.py b/tests/test_1.py
index f567318..736966f 100644
--- a/tests/test_1.py
+++ b/tests/test_1.py
@@ -3,81 +3,86 @@ sys.path.append("..")
 import waybackpy
 import pytest
-
 user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
 
 def test_clean_url():
     test_url = " https://en.wikipedia.org/wiki/Network security "
     answer = "https://en.wikipedia.org/wiki/Network_security"
-    test_result = waybackpy.clean_url(test_url)
+    target = waybackpy.Url(test_url, user_agent)
+    test_result = target.clean_url()
     assert answer == test_result
 
 def test_url_check():
-    InvalidUrl = "http://wwwgooglecom/"
+    broken_url = "http://wwwgooglecom/"
     with pytest.raises(Exception) as e_info:
-        waybackpy.url_check(InvalidUrl)
+        waybackpy.Url(broken_url, user_agent)
 
 def test_save():
     # Test for urls that exist and can be archived.
     url1="https://github.com/akamhy/waybackpy"
-    archived_url1 = waybackpy.save(url1, UA=user_agent)
+    target = waybackpy.Url(url1, user_agent)
+    archived_url1 = target.save()
     assert url1 in archived_url1
-
+
     # Test for urls that are incorrect.
     with pytest.raises(Exception) as e_info:
         url2 = "ha ha ha ha"
-        waybackpy.save(url2, UA=user_agent)
+        waybackpy.Url(url2, user_agent)
 
     # Test for urls not allowed to archive by robot.txt.
     with pytest.raises(Exception) as e_info:
         url3 = "http://www.archive.is/faq.html"
-        waybackpy.save(url3, UA=user_agent)
-
+        target = waybackpy.Url(url3, user_agent)
+        target.save()
+
     # Non existent urls, test
     with pytest.raises(Exception) as e_info:
         url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
-        archived_url4 = waybackpy.save(url4, UA=user_agent)
+        target = waybackpy.Url(url4, user_agent)
+        target.save()
 
 def test_near():
     url = "google.com"
-    archive_near_year = waybackpy.near(url, year=2010, UA=user_agent)
+    target = waybackpy.Url(url, user_agent)
+    archive_near_year = target.near(year=2010)
     assert "2010" in archive_near_year
-    archive_near_month_year = waybackpy.near(url, year=2015, month=2, UA=user_agent)
+    archive_near_month_year = target.near(year=2015, month=2)
     assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year)
-    archive_near_day_month_year = waybackpy.near(url, year=2006, month=11, day=15, UA=user_agent)
+    archive_near_day_month_year = target.near(year=2006, month=11, day=15)
     assert ("20061114" in archive_near_day_month_year) or ("20061115" in archive_near_day_month_year) or ("20061116" in archive_near_day_month_year)
-    archive_near_hour_day_month_year = waybackpy.near("www.python.org", year=2008, month=5, day=9, hour=15, UA=user_agent)
+    target = waybackpy.Url("www.python.org", user_agent)
+    archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15)
     assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year)
     with pytest.raises(Exception) as e_info:
         NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
-        waybackpy.near(NeverArchivedUrl, year=2010, UA=user_agent)
+        target = waybackpy.Url(NeverArchivedUrl, user_agent)
+        target.near(year=2010)
 
 def test_oldest():
     url = "github.com/akamhy/waybackpy"
-    archive_oldest = waybackpy.oldest(url, UA=user_agent)
-    assert "20200504141153" in archive_oldest
+    target = waybackpy.Url(url, user_agent)
+    assert "20200504141153" in target.oldest()
 
 def test_newest():
     url = "github.com/akamhy/waybackpy"
-    archive_newest = waybackpy.newest(url, UA=user_agent)
-    assert url in archive_newest
+    target = waybackpy.Url(url, user_agent)
+    assert url in target.newest()
 
 def test_get():
-    oldest_google_archive = waybackpy.oldest("google.com", UA=user_agent)
-    oldest_google_page_text = waybackpy.get(oldest_google_archive, UA=user_agent)
-    assert "Welcome to Google" in oldest_google_page_text
+    target = waybackpy.Url("google.com", user_agent)
+    assert "Welcome to Google" in target.get(target.oldest())
 
 def test_total_archives():
-    count1 = waybackpy.total_archives(" https://google.com ", UA=user_agent)
-    assert count1 > 500000
+    target = waybackpy.Url(" https://google.com ", user_agent)
+    assert target.total_archives() > 500000
 
-    count2 = waybackpy.total_archives("https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8", UA=user_agent)
-    assert count2 == 0
+    target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent)
+    assert target.total_archives() == 0
 
 if __name__ == "__main__":
     test_clean_url()
@@ -96,3 +101,4 @@ if __name__ == "__main__":
     print(".")
     test_total_archives()
     print(".")
+    print("OK")
diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py
index 557fd96..66d8bc6 100644
--- a/waybackpy/__init__.py
+++ b/waybackpy/__init__.py
@@ -25,6 +25,6 @@ Full documentation @ .
 :license: MIT
 """
 
-from .wrapper import save, near, oldest, newest, get, clean_url, url_check, total_archives
+from .wrapper import Url
 from .__version__ import __title__, __description__, __url__, __version__
 from .__version__ import __author__, __author_email__, __license__, __copyright__
diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py
index 5d3bec6..69ddc20 100644
--- a/waybackpy/exceptions.py
+++ b/waybackpy/exceptions.py
@@ -1,43 +1,6 @@
 # -*- coding: utf-8 -*-
 
-class TooManyArchivingRequests(Exception):
-
-    """Error when a single url reqeusted for archiving too many times in a short timespam.
-    Wayback machine doesn't supports archivng any url too many times in a short period of time.
+class WaybackError(Exception):
     """
-
-class ArchivingNotAllowed(Exception):
-
-    """Files like robots.txt are set to deny robot archiving.
-    Wayback machine respects these file, will not archive.
-    """
-
-class PageNotSaved(Exception):
-    """
-    When unable to save a webpage.
-    """
-
-class ArchiveNotFound(Exception):
-    """
-    When a page was never archived but client asks for old archive.
-    """
-
-class UrlNotFound(Exception):
-    """
-    Raised when 404 UrlNotFound.
-    """
-
-class BadGateWay(Exception):
-    """
-    Raised when 502 bad gateway.
-    """
-
-class WaybackUnavailable(Exception):
-    """
-    Raised when 503 API Service Temporarily Unavailable.
-    """
-
-class InvalidUrl(Exception):
-    """
-    Raised when url doesn't follow the standard url format.
+    Raised when the Wayback Machine API returns an error.
     """
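With the eight granular exception classes collapsed into a single WaybackError, callers now catch one type for all Wayback Machine API failures. A minimal sketch of the new pattern, assuming the Url class introduced in waybackpy/wrapper.py below (the URL and user agent are placeholders):

    import waybackpy
    from waybackpy.exceptions import WaybackError

    try:
        # save() raises WaybackError on 5xx/429 responses and on unparseable archive headers.
        archive_url = waybackpy.Url("https://example.com", "my-user-agent").save()
    except WaybackError as error:
        print("Archiving failed: %s" % error)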
""" diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 894b048..fa694cc 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -1,149 +1,160 @@ # -*- coding: utf-8 -*- +import re import sys import json from datetime import datetime -from waybackpy.exceptions import TooManyArchivingRequests, ArchivingNotAllowed, PageNotSaved, ArchiveNotFound, UrlNotFound, BadGateWay, InvalidUrl, WaybackUnavailable +from waybackpy.exceptions import WaybackError version = (3, 0) -cur_version = sys.version_info +python_version = sys.version_info -if cur_version >= version: # If the python ver >= 3 +if python_version >= version: # If the python ver >= 3 from urllib.request import Request, urlopen from urllib.error import HTTPError, URLError else: # For python2.x from urllib2 import Request, urlopen, HTTPError, URLError -default_UA = "waybackpy python package ; ( https://github.com/akamhy/waybackpy )" +default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy" -def url_check(url): - if "." not in url: - raise InvalidUrl("'%s' is not a vaild url." % url) +class Url(): -def clean_url(url): - return str(url).strip().replace(" ","_") + def __init__(self, url, user_agent=default_UA): + self.url = url + self.user_agent = user_agent -def wayback_timestamp(**kwargs): - return ( - str(kwargs["year"]) - + - str(kwargs["month"]).zfill(2) - + - str(kwargs["day"]).zfill(2) - + - str(kwargs["hour"]).zfill(2) - + - str(kwargs["minute"]).zfill(2) - ) + self.url_check() # checks url validity on init. -def handle_HTTPError(e): - if e.code == 502: - raise BadGateWay(e) - elif e.code == 503: - raise WaybackUnavailable(e) - elif e.code == 429: - raise TooManyArchivingRequests(e) - elif e.code == 404: - raise UrlNotFound(e) + def __repr__(self): + return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent) -def save(url, UA=default_UA): - url_check(url) - request_url = ("https://web.archive.org/save/" + clean_url(url)) + def __str__(self): + return "%s" % self.clean_url() - hdr = { 'User-Agent' : '%s' % UA } #nosec - req = Request(request_url, headers=hdr) #nosec + def url_check(self): + if "." not in self.url: + raise URLError("'%s' is not a vaild url." % self.url) + + return True + + def clean_url(self): + return str(self.url).strip().replace(" ","_") + + def wayback_timestamp(self, **kwargs): + return ( + str(kwargs["year"]) + + + str(kwargs["month"]).zfill(2) + + + str(kwargs["day"]).zfill(2) + + + str(kwargs["hour"]).zfill(2) + + + str(kwargs["minute"]).zfill(2) + ) + + def handle_HTTPError(self, e): + if e.code >= 500: + raise WaybackError(e) from None + if e.code == 429: + raise WaybackError(e) from None + if e.code == 404: + raise HTTPError(e) from None + + def save(self): + request_url = ("https://web.archive.org/save/" + self.clean_url()) + hdr = { 'User-Agent' : '%s' % self.user_agent } #nosec + req = Request(request_url, headers=hdr) #nosec - try: - response = urlopen(req) #nosec - except HTTPError as e: - if handle_HTTPError(e) is None: - raise PageNotSaved(e) - except URLError: try: response = urlopen(req) #nosec - except URLError as e: - raise UrlNotFound(e) + except HTTPError as e: + if self.handle_HTTPError(e) is None: + raise WaybackError(e) + except URLError: + try: + response = urlopen(req) #nosec + except URLError as e: + raise HTTPError(e) - header = response.headers + header = response.headers - if "exclusion.robots.policy" in str(header): - raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." 
% (url)) + try: + arch = re.search(r"rel=\"memento.*?web\.archive\.org(/web/[0-9]{14}/.*?)>", str(header)).group(1) + except KeyError as e: + raise WaybackError(e) - return "https://web.archive.org" + header['Content-Location'] + return "https://web.archive.org" + arch -def get(url, encoding=None, UA=default_UA): - url_check(url) - hdr = { 'User-Agent' : '%s' % UA } - req = Request(clean_url(url), headers=hdr) #nosec + def get(self, url=None, user_agent=None, encoding=None): + + if not url: + url = self.clean_url() + + if not user_agent: + user_agent = self.user_agent + + hdr = { 'User-Agent' : '%s' % user_agent } + req = Request(url, headers=hdr) #nosec - try: - resp=urlopen(req) #nosec - except URLError: try: resp=urlopen(req) #nosec - except URLError as e: - raise UrlNotFound(e) + except URLError: + try: + resp=urlopen(req) #nosec + except URLError as e: + raise HTTPError(e) + + if not encoding: + try: + encoding= resp.headers['content-type'].split('charset=')[-1] + except AttributeError: + encoding = "UTF-8" + + return resp.read().decode(encoding.replace("text/html", "UTF-8", 1)) + + def near(self, **kwargs): + year=kwargs.get("year", datetime.utcnow().strftime('%Y')) + month=kwargs.get("month", datetime.utcnow().strftime('%m')) + day=kwargs.get("day", datetime.utcnow().strftime('%d')) + hour=kwargs.get("hour", datetime.utcnow().strftime('%H')) + minute=kwargs.get("minute", datetime.utcnow().strftime('%M')) + + timestamp = self.wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute) + request_url = "https://archive.org/wayback/available?url=%s×tamp=%s" % (self.clean_url(), str(timestamp)) + hdr = { 'User-Agent' : '%s' % self.user_agent } + req = Request(request_url, headers=hdr) # nosec - if encoding is None: try: - encoding= resp.headers['content-type'].split('charset=')[-1] - except AttributeError: - encoding = "UTF-8" + response = urlopen(req) #nosec + except HTTPError as e: + self.handle_HTTPError(e) - return resp.read().decode(encoding.replace("text/html", "UTF-8", 1)) + data = json.loads(response.read().decode("UTF-8")) + if not data["archived_snapshots"]: + raise WaybackError("'%s' is not yet archived." % url) -def near(url, **kwargs): + archive_url = (data["archived_snapshots"]["closest"]["url"]) + # wayback machine returns http sometimes, idk why? 
But they support https + archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1) + return archive_url - try: - url = kwargs["url"] - except KeyError: - url = url + def oldest(self, year=1994): + return self.near(year=year) - year=kwargs.get("year", datetime.utcnow().strftime('%Y')) - month=kwargs.get("month", datetime.utcnow().strftime('%m')) - day=kwargs.get("day", datetime.utcnow().strftime('%d')) - hour=kwargs.get("hour", datetime.utcnow().strftime('%H')) - minute=kwargs.get("minute", datetime.utcnow().strftime('%M')) - UA=kwargs.get("UA", default_UA) + def newest(self): + return self.near() - url_check(url) - timestamp = wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute) - request_url = "https://archive.org/wayback/available?url=%s×tamp=%s" % (clean_url(url), str(timestamp)) - hdr = { 'User-Agent' : '%s' % UA } - req = Request(request_url, headers=hdr) # nosec + def total_archives(self): + hdr = { 'User-Agent' : '%s' % self.user_agent } + request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url() + req = Request(request_url, headers=hdr) # nosec - try: - response = urlopen(req) #nosec - except HTTPError as e: - handle_HTTPError(e) + try: + response = urlopen(req) #nosec + except HTTPError as e: + self.handle_HTTPError(e) - data = json.loads(response.read().decode("UTF-8")) - if not data["archived_snapshots"]: - raise ArchiveNotFound("'%s' is not yet archived." % url) - - archive_url = (data["archived_snapshots"]["closest"]["url"]) - # wayback machine returns http sometimes, idk why? But they support https - archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1) - return archive_url - -def oldest(url, UA=default_UA, year=1994): - return near(url, year=year, UA=UA) - -def newest(url, UA=default_UA): - return near(url, UA=UA) - -def total_archives(url, UA=default_UA): - url_check(url) - - hdr = { 'User-Agent' : '%s' % UA } - request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % clean_url(url) - req = Request(request_url, headers=hdr) # nosec - - try: - response = urlopen(req) #nosec - except HTTPError as e: - handle_HTTPError(e) - - return str(response.read()).count(",") # Most efficient method to count (yet) + return str(response.read()).count(",") # Most efficient method to count number of archives (yet)
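Taken together, the refactor replaces the module-level functions (save, near, oldest, newest, get, total_archives) with methods on a single Url object. A short usage sketch of the new API, assuming the behavior shown in wrapper.py above (the URL, user agent, and comments are illustrative, not asserted output):

    import waybackpy

    target = waybackpy.Url("https://example.com", "my-user-agent")

    snapshot = target.save()                  # archive the page now; returns the new snapshot URL
    earliest = target.oldest()                # snapshot closest to 1994, i.e. the earliest capture
    latest = target.newest()                  # snapshot closest to the current time
    nearby = target.near(year=2015, month=6)  # snapshot closest to June 2015
    count = target.total_archives()           # number of captures in the CDX index
    html = target.get(earliest)               # fetch a page; with no argument, fetches the live URL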