* Update wrapper.py

* Update exceptions.py

* Update __init__.py

* Tests adjusted for the new changes

* Update wrapper.py
Akash 2020-07-17 20:50:00 +05:30 committed by GitHub
parent 9ac1e877c8
commit 9860527d96
4 changed files with 155 additions and 175 deletions


@@ -3,81 +3,86 @@ sys.path.append("..")
 import waybackpy
 import pytest
 
 user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
 
 def test_clean_url():
     test_url = " https://en.wikipedia.org/wiki/Network security "
     answer = "https://en.wikipedia.org/wiki/Network_security"
-    test_result = waybackpy.clean_url(test_url)
+    target = waybackpy.Url(test_url, user_agent)
+    test_result = target.clean_url()
     assert answer == test_result
 
 def test_url_check():
-    InvalidUrl = "http://wwwgooglecom/"
+    broken_url = "http://wwwgooglecom/"
     with pytest.raises(Exception) as e_info:
-        waybackpy.url_check(InvalidUrl)
+        waybackpy.Url(broken_url, user_agent)
 
 def test_save():
     # Test for urls that exist and can be archived.
     url1="https://github.com/akamhy/waybackpy"
-    archived_url1 = waybackpy.save(url1, UA=user_agent)
+    target = waybackpy.Url(url1, user_agent)
+    archived_url1 = target.save()
     assert url1 in archived_url1
     # Test for urls that are incorrect.
     with pytest.raises(Exception) as e_info:
         url2 = "ha ha ha ha"
-        waybackpy.save(url2, UA=user_agent)
+        waybackpy.Url(url2, user_agent)
     # Test for urls not allowed to archive by robot.txt.
     with pytest.raises(Exception) as e_info:
         url3 = "http://www.archive.is/faq.html"
-        waybackpy.save(url3, UA=user_agent)
+        target = waybackpy.Url(url3, user_agent)
+        target.save()
     # Non existent urls, test
     with pytest.raises(Exception) as e_info:
         url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
-        archived_url4 = waybackpy.save(url4, UA=user_agent)
+        target = waybackpy.Url(url3, user_agent)
+        target.save()
 
 def test_near():
     url = "google.com"
-    archive_near_year = waybackpy.near(url, year=2010, UA=user_agent)
+    target = waybackpy.Url(url, user_agent)
+    archive_near_year = target.near(year=2010)
     assert "2010" in archive_near_year
-    archive_near_month_year = waybackpy.near(url, year=2015, month=2, UA=user_agent)
+    archive_near_month_year = target.near(year=2015, month=2)
     assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year)
-    archive_near_day_month_year = waybackpy.near(url, year=2006, month=11, day=15, UA=user_agent)
+    archive_near_day_month_year = target.near(year=2006, month=11, day=15)
     assert ("20061114" in archive_near_day_month_year) or ("20061115" in archive_near_day_month_year) or ("2006116" in archive_near_day_month_year)
-    archive_near_hour_day_month_year = waybackpy.near("www.python.org", year=2008, month=5, day=9, hour=15, UA=user_agent)
+    target = waybackpy.Url("www.python.org", user_agent)
+    archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15)
     assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year)
     with pytest.raises(Exception) as e_info:
         NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
-        waybackpy.near(NeverArchivedUrl, year=2010, UA=user_agent)
+        target = waybackpy.Url(NeverArchivedUrl, user_agent)
+        target.near(year=2010)
 
 def test_oldest():
     url = "github.com/akamhy/waybackpy"
-    archive_oldest = waybackpy.oldest(url, UA=user_agent)
-    assert "20200504141153" in archive_oldest
+    target = waybackpy.Url(url, user_agent)
+    assert "20200504141153" in target.oldest()
 
 def test_newest():
     url = "github.com/akamhy/waybackpy"
-    archive_newest = waybackpy.newest(url, UA=user_agent)
-    assert url in archive_newest
+    target = waybackpy.Url(url, user_agent)
+    assert url in target.newest()
 
 def test_get():
-    oldest_google_archive = waybackpy.oldest("google.com", UA=user_agent)
-    oldest_google_page_text = waybackpy.get(oldest_google_archive, UA=user_agent)
-    assert "Welcome to Google" in oldest_google_page_text
+    target = waybackpy.Url("google.com", user_agent)
+    assert "Welcome to Google" in target.get(target.oldest())
 
 def test_total_archives():
-    count1 = waybackpy.total_archives(" https://google.com ", UA=user_agent)
-    assert count1 > 500000
-    count2 = waybackpy.total_archives("https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8", UA=user_agent)
-    assert count2 == 0
+    target = waybackpy.Url(" https://google.com ", user_agent)
+    assert target.total_archives() > 500000
+    target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent)
+    assert target.total_archives() == 0
 
 if __name__ == "__main__":
     test_clean_url()
@@ -96,3 +101,4 @@ if __name__ == "__main__":
     print(".")
     test_total_archives()
     print(".")
+    print("OK")

__init__.py

@@ -25,6 +25,6 @@ Full documentation @ <https://akamhy.github.io/waybackpy/>.
 :license: MIT
 """
 
-from .wrapper import save, near, oldest, newest, get, clean_url, url_check, total_archives
+from .wrapper import Url
 from .__version__ import __title__, __description__, __url__, __version__
 from .__version__ import __author__, __author_email__, __license__, __copyright__

exceptions.py

@@ -1,43 +1,6 @@
 # -*- coding: utf-8 -*-
 
-class TooManyArchivingRequests(Exception):
-    """Error when a single url reqeusted for archiving too many times in a short timespam.
-    Wayback machine doesn't supports archivng any url too many times in a short period of time.
-    """
-
-class ArchivingNotAllowed(Exception):
-    """Files like robots.txt are set to deny robot archiving.
-    Wayback machine respects these file, will not archive.
-    """
-
-class PageNotSaved(Exception):
-    """
-    When unable to save a webpage.
-    """
-
-class ArchiveNotFound(Exception):
-    """
-    When a page was never archived but client asks for old archive.
-    """
-
-class UrlNotFound(Exception):
-    """
-    Raised when 404 UrlNotFound.
-    """
-
-class BadGateWay(Exception):
-    """
-    Raised when 502 bad gateway.
-    """
-
-class WaybackUnavailable(Exception):
-    """
-    Raised when 503 API Service Temporarily Unavailable.
-    """
-
-class InvalidUrl(Exception):
-    """
-    Raised when url doesn't follow the standard url format.
-    """
+class WaybackError(Exception):
+    """
+    Raised when API Service error.
+    """

wrapper.py

@@ -1,149 +1,160 @@
 # -*- coding: utf-8 -*-
+import re
 import sys
 import json
 from datetime import datetime
-from waybackpy.exceptions import TooManyArchivingRequests, ArchivingNotAllowed, PageNotSaved, ArchiveNotFound, UrlNotFound, BadGateWay, InvalidUrl, WaybackUnavailable
+from waybackpy.exceptions import WaybackError
 
 version = (3, 0)
-cur_version = sys.version_info
+python_version = sys.version_info
 
-if cur_version >= version: # If the python ver >= 3
+if python_version >= version: # If the python ver >= 3
     from urllib.request import Request, urlopen
     from urllib.error import HTTPError, URLError
 else: # For python2.x
     from urllib2 import Request, urlopen, HTTPError, URLError
 
-default_UA = "waybackpy python package ; ( https://github.com/akamhy/waybackpy )"
+default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
 
-def url_check(url):
-    if "." not in url:
-        raise InvalidUrl("'%s' is not a vaild url." % url)
-
-def clean_url(url):
-    return str(url).strip().replace(" ","_")
-
-def wayback_timestamp(**kwargs):
-    return (
-        str(kwargs["year"])
-        +
-        str(kwargs["month"]).zfill(2)
-        +
-        str(kwargs["day"]).zfill(2)
-        +
-        str(kwargs["hour"]).zfill(2)
-        +
-        str(kwargs["minute"]).zfill(2)
-    )
-
-def handle_HTTPError(e):
-    if e.code == 502:
-        raise BadGateWay(e)
-    elif e.code == 503:
-        raise WaybackUnavailable(e)
-    elif e.code == 429:
-        raise TooManyArchivingRequests(e)
-    elif e.code == 404:
-        raise UrlNotFound(e)
-
-def save(url, UA=default_UA):
-    url_check(url)
-    request_url = ("https://web.archive.org/save/" + clean_url(url))
-    hdr = { 'User-Agent' : '%s' % UA } #nosec
-    req = Request(request_url, headers=hdr) #nosec
-    try:
-        response = urlopen(req) #nosec
-    except HTTPError as e:
-        if handle_HTTPError(e) is None:
-            raise PageNotSaved(e)
-    except URLError:
-        try:
-            response = urlopen(req) #nosec
-        except URLError as e:
-            raise UrlNotFound(e)
-    header = response.headers
-    if "exclusion.robots.policy" in str(header):
-        raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." % (url))
-    return "https://web.archive.org" + header['Content-Location']
-
-def get(url, encoding=None, UA=default_UA):
-    url_check(url)
-    hdr = { 'User-Agent' : '%s' % UA }
-    req = Request(clean_url(url), headers=hdr) #nosec
-    try:
-        resp=urlopen(req) #nosec
-    except URLError:
-        try:
-            resp=urlopen(req) #nosec
-        except URLError as e:
-            raise UrlNotFound(e)
-    if encoding is None:
-        try:
-            encoding= resp.headers['content-type'].split('charset=')[-1]
-        except AttributeError:
-            encoding = "UTF-8"
-    return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))
-
-def near(url, **kwargs):
-    try:
-        url = kwargs["url"]
-    except KeyError:
-        url = url
-    year=kwargs.get("year", datetime.utcnow().strftime('%Y'))
-    month=kwargs.get("month", datetime.utcnow().strftime('%m'))
-    day=kwargs.get("day", datetime.utcnow().strftime('%d'))
-    hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
-    minute=kwargs.get("minute", datetime.utcnow().strftime('%M'))
-    UA=kwargs.get("UA", default_UA)
-    url_check(url)
-    timestamp = wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
-    request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (clean_url(url), str(timestamp))
-    hdr = { 'User-Agent' : '%s' % UA }
-    req = Request(request_url, headers=hdr) # nosec
-    try:
-        response = urlopen(req) #nosec
-    except HTTPError as e:
-        handle_HTTPError(e)
-    data = json.loads(response.read().decode("UTF-8"))
-    if not data["archived_snapshots"]:
-        raise ArchiveNotFound("'%s' is not yet archived." % url)
-    archive_url = (data["archived_snapshots"]["closest"]["url"])
-    # wayback machine returns http sometimes, idk why? But they support https
-    archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1)
-    return archive_url
-
-def oldest(url, UA=default_UA, year=1994):
-    return near(url, year=year, UA=UA)
-
-def newest(url, UA=default_UA):
-    return near(url, UA=UA)
-
-def total_archives(url, UA=default_UA):
-    url_check(url)
-    hdr = { 'User-Agent' : '%s' % UA }
-    request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % clean_url(url)
-    req = Request(request_url, headers=hdr) # nosec
-    try:
-        response = urlopen(req) #nosec
-    except HTTPError as e:
-        handle_HTTPError(e)
-    return str(response.read()).count(",") # Most efficient method to count (yet)
+class Url():
+
+    def __init__(self, url, user_agent=default_UA):
+        self.url = url
+        self.user_agent = user_agent
+        self.url_check() # checks url validity on init.
+
+    def __repr__(self):
+        return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
+
+    def __str__(self):
+        return "%s" % self.clean_url()
+
+    def url_check(self):
+        if "." not in self.url:
+            raise URLError("'%s' is not a vaild url." % self.url)
+        return True
+
+    def clean_url(self):
+        return str(self.url).strip().replace(" ","_")
+
+    def wayback_timestamp(self, **kwargs):
+        return (
+            str(kwargs["year"])
+            +
+            str(kwargs["month"]).zfill(2)
+            +
+            str(kwargs["day"]).zfill(2)
+            +
+            str(kwargs["hour"]).zfill(2)
+            +
+            str(kwargs["minute"]).zfill(2)
+        )
+
+    def handle_HTTPError(self, e):
+        if e.code >= 500:
+            raise WaybackError(e) from None
+        if e.code == 429:
+            raise WaybackError(e) from None
+        if e.code == 404:
+            raise HTTPError(e) from None
+
+    def save(self):
+        request_url = ("https://web.archive.org/save/" + self.clean_url())
+        hdr = { 'User-Agent' : '%s' % self.user_agent } #nosec
+        req = Request(request_url, headers=hdr) #nosec
+        try:
+            response = urlopen(req) #nosec
+        except HTTPError as e:
+            if self.handle_HTTPError(e) is None:
+                raise WaybackError(e)
+        except URLError:
+            try:
+                response = urlopen(req) #nosec
+            except URLError as e:
+                raise HTTPError(e)
+        header = response.headers
+        try:
+            arch = re.search(r"rel=\"memento.*?web\.archive\.org(/web/[0-9]{14}/.*?)>", str(header)).group(1)
+        except KeyError as e:
+            raise WaybackError(e)
+        return "https://web.archive.org" + arch
+
+    def get(self, url=None, user_agent=None, encoding=None):
+        if not url:
+            url = self.clean_url()
+        if not user_agent:
+            user_agent = self.user_agent
+        hdr = { 'User-Agent' : '%s' % user_agent }
+        req = Request(url, headers=hdr) #nosec
+        try:
+            resp=urlopen(req) #nosec
+        except URLError:
+            try:
+                resp=urlopen(req) #nosec
+            except URLError as e:
+                raise HTTPError(e)
+        if not encoding:
+            try:
+                encoding= resp.headers['content-type'].split('charset=')[-1]
+            except AttributeError:
+                encoding = "UTF-8"
+        return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))
+
+    def near(self, **kwargs):
+        year=kwargs.get("year", datetime.utcnow().strftime('%Y'))
+        month=kwargs.get("month", datetime.utcnow().strftime('%m'))
+        day=kwargs.get("day", datetime.utcnow().strftime('%d'))
+        hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
+        minute=kwargs.get("minute", datetime.utcnow().strftime('%M'))
+        timestamp = self.wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
+        request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (self.clean_url(), str(timestamp))
+        hdr = { 'User-Agent' : '%s' % self.user_agent }
+        req = Request(request_url, headers=hdr) # nosec
+        try:
+            response = urlopen(req) #nosec
+        except HTTPError as e:
+            self.handle_HTTPError(e)
+        data = json.loads(response.read().decode("UTF-8"))
+        if not data["archived_snapshots"]:
+            raise WaybackError("'%s' is not yet archived." % url)
+        archive_url = (data["archived_snapshots"]["closest"]["url"])
+        # wayback machine returns http sometimes, idk why? But they support https
+        archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1)
+        return archive_url
+
+    def oldest(self, year=1994):
+        return self.near(year=year)
+
+    def newest(self):
+        return self.near()
+
+    def total_archives(self):
+        hdr = { 'User-Agent' : '%s' % self.user_agent }
+        request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url()
+        req = Request(request_url, headers=hdr) # nosec
+        try:
+            response = urlopen(req) #nosec
+        except HTTPError as e:
+            self.handle_HTTPError(e)
+        return str(response.read()).count(",") # Most efficient method to count number of archives (yet)
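Taken together, the rewritten wrapper routes every operation through the Url class. A short usage sketch grounded in the updated tests above (network access assumed; outputs are illustrative):

import waybackpy

user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
target = waybackpy.Url("google.com", user_agent)

print(target.oldest())          # snapshot nearest to 1994, oldest()'s default year
print(target.newest())          # snapshot nearest to now, i.e. near() with no arguments
print(target.near(year=2010))   # snapshot closest to the requested timestamp
print(target.total_archives())  # snapshot count derived from the CDX API response
html = target.get(target.oldest())  # fetch and decode the oldest snapshot's page text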