From 9860527d96da27d31a3eb376bb806dbf6c5ac0f7 Mon Sep 17 00:00:00 2001
From: Akash <64683866+akamhy@users.noreply.github.com>
Date: Fri, 17 Jul 2020 20:50:00 +0530
Subject: [PATCH] OOP (#10)

* Update wrapper.py

* Update exceptions.py

* Update __init__.py

* test adjusted for new changes

* Update wrapper.py
---
 tests/test_1.py         |  58 +++++-----
 waybackpy/__init__.py   |   2 +-
 waybackpy/exceptions.py |  41 +------
 waybackpy/wrapper.py    | 229 +++++++++++++++++++++-------------------
 4 files changed, 155 insertions(+), 175 deletions(-)

diff --git a/tests/test_1.py b/tests/test_1.py
index f567318..736966f 100644
--- a/tests/test_1.py
+++ b/tests/test_1.py
@@ -3,81 +3,86 @@ sys.path.append("..")
 import waybackpy
 import pytest
-
 user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"

 def test_clean_url():
     test_url = " https://en.wikipedia.org/wiki/Network security "
     answer = "https://en.wikipedia.org/wiki/Network_security"
-    test_result = waybackpy.clean_url(test_url)
+    target = waybackpy.Url(test_url, user_agent)
+    test_result = target.clean_url()
     assert answer == test_result

 def test_url_check():
-    InvalidUrl = "http://wwwgooglecom/"
+    broken_url = "http://wwwgooglecom/"
     with pytest.raises(Exception) as e_info:
-        waybackpy.url_check(InvalidUrl)
+        waybackpy.Url(broken_url, user_agent)

 def test_save():
     # Test for urls that exist and can be archived.
     url1="https://github.com/akamhy/waybackpy"
-    archived_url1 = waybackpy.save(url1, UA=user_agent)
+    target = waybackpy.Url(url1, user_agent)
+    archived_url1 = target.save()
     assert url1 in archived_url1
-
+
     # Test for urls that are incorrect.
     with pytest.raises(Exception) as e_info:
         url2 = "ha ha ha ha"
-        waybackpy.save(url2, UA=user_agent)
+        waybackpy.Url(url2, user_agent)

     # Test for urls not allowed to archive by robot.txt.
     with pytest.raises(Exception) as e_info:
         url3 = "http://www.archive.is/faq.html"
-        waybackpy.save(url3, UA=user_agent)
-
+        target = waybackpy.Url(url3, user_agent)
+        target.save()
+
     # Non existent urls, test
     with pytest.raises(Exception) as e_info:
         url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
-        archived_url4 = waybackpy.save(url4, UA=user_agent)
+        target = waybackpy.Url(url3, user_agent)
+        target.save()

 def test_near():
     url = "google.com"
-    archive_near_year = waybackpy.near(url, year=2010, UA=user_agent)
+    target = waybackpy.Url(url, user_agent)
+    archive_near_year = target.near(year=2010)
     assert "2010" in archive_near_year

-    archive_near_month_year = waybackpy.near(url, year=2015, month=2, UA=user_agent)
+    archive_near_month_year = target.near( year=2015, month=2)
     assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year)

-    archive_near_day_month_year = waybackpy.near(url, year=2006, month=11, day=15, UA=user_agent)
+    archive_near_day_month_year = target.near(year=2006, month=11, day=15)
     assert ("20061114" in archive_near_day_month_year) or ("20061115" in archive_near_day_month_year) or ("2006116" in archive_near_day_month_year)

-    archive_near_hour_day_month_year = waybackpy.near("www.python.org", year=2008, month=5, day=9, hour=15, UA=user_agent)
+    target = waybackpy.Url("www.python.org", user_agent)
+    archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15)
     assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year)

     with pytest.raises(Exception) as e_info:
         NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
-        waybackpy.near(NeverArchivedUrl, year=2010, UA=user_agent)
+        target = waybackpy.Url(NeverArchivedUrl, user_agent)
+        target.near(year=2010)

 def test_oldest():
     url = "github.com/akamhy/waybackpy"
-    archive_oldest = waybackpy.oldest(url, UA=user_agent)
-    assert "20200504141153" in archive_oldest
+    target = waybackpy.Url(url, user_agent)
+    assert "20200504141153" in target.oldest()

 def test_newest():
     url = "github.com/akamhy/waybackpy"
-    archive_newest = waybackpy.newest(url, UA=user_agent)
-    assert url in archive_newest
+    target = waybackpy.Url(url, user_agent)
+    assert url in target.newest()

 def test_get():
-    oldest_google_archive = waybackpy.oldest("google.com", UA=user_agent)
-    oldest_google_page_text = waybackpy.get(oldest_google_archive, UA=user_agent)
-    assert "Welcome to Google" in oldest_google_page_text
+    target = waybackpy.Url("google.com", user_agent)
+    assert "Welcome to Google" in target.get(target.oldest())

 def test_total_archives():
-    count1 = waybackpy.total_archives(" https://google.com ", UA=user_agent)
-    assert count1 > 500000
+    target = waybackpy.Url(" https://google.com ", user_agent)
+    assert target.total_archives() > 500000

-    count2 = waybackpy.total_archives("https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8", UA=user_agent)
-    assert count2 == 0
+    target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent)
+    assert target.total_archives() == 0

 if __name__ == "__main__":
     test_clean_url()
@@ -96,3 +101,4 @@ if __name__ == "__main__":
     print(".")
     test_total_archives()
     print(".")
+    print("OK")
diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py
index 557fd96..66d8bc6 100644
--- a/waybackpy/__init__.py
+++ b/waybackpy/__init__.py
@@ -25,6 +25,6 @@
 Full documentation @ .
 :license: MIT
 """
-from .wrapper import save, near, oldest, newest, get, clean_url, url_check, total_archives
+from .wrapper import Url
 from .__version__ import __title__, __description__, __url__, __version__
 from .__version__ import __author__, __author_email__, __license__, __copyright__
diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py
index 5d3bec6..69ddc20 100644
--- a/waybackpy/exceptions.py
+++ b/waybackpy/exceptions.py
@@ -1,43 +1,6 @@
 # -*- coding: utf-8 -*-

-class TooManyArchivingRequests(Exception):
-
-    """Error when a single url reqeusted for archiving too many times in a short timespam.
-    Wayback machine doesn't supports archivng any url too many times in a short period of time.
+class WaybackError(Exception):
     """
-
-class ArchivingNotAllowed(Exception):
-
-    """Files like robots.txt are set to deny robot archiving.
-    Wayback machine respects these file, will not archive.
-    """
-
-class PageNotSaved(Exception):
-    """
-    When unable to save a webpage.
-    """
-
-class ArchiveNotFound(Exception):
-    """
-    When a page was never archived but client asks for old archive.
-    """
-
-class UrlNotFound(Exception):
-    """
-    Raised when 404 UrlNotFound.
-    """
-
-class BadGateWay(Exception):
-    """
-    Raised when 502 bad gateway.
-    """
-
-class WaybackUnavailable(Exception):
-    """
-    Raised when 503 API Service Temporarily Unavailable.
-    """
-
-class InvalidUrl(Exception):
-    """
-    Raised when url doesn't follow the standard url format.
+    Raised when API Service error.
     """
diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index 894b048..fa694cc 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -1,149 +1,160 @@
 # -*- coding: utf-8 -*-

+import re
 import sys
 import json
 from datetime import datetime
-from waybackpy.exceptions import TooManyArchivingRequests, ArchivingNotAllowed, PageNotSaved, ArchiveNotFound, UrlNotFound, BadGateWay, InvalidUrl, WaybackUnavailable
+from waybackpy.exceptions import WaybackError

 version = (3, 0)
-cur_version = sys.version_info
+python_version = sys.version_info

-if cur_version >= version:  # If the python ver >= 3
+if python_version >= version:  # If the python ver >= 3
     from urllib.request import Request, urlopen
     from urllib.error import HTTPError, URLError
 else:  # For python2.x
     from urllib2 import Request, urlopen, HTTPError, URLError

-default_UA = "waybackpy python package ; ( https://github.com/akamhy/waybackpy )"
+default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"

-def url_check(url):
-    if "." not in url:
-        raise InvalidUrl("'%s' is not a vaild url." % url)
+class Url():

-def clean_url(url):
-    return str(url).strip().replace(" ","_")
+    def __init__(self, url, user_agent=default_UA):
+        self.url = url
+        self.user_agent = user_agent

-def wayback_timestamp(**kwargs):
-    return (
-        str(kwargs["year"])
-        +
-        str(kwargs["month"]).zfill(2)
-        +
-        str(kwargs["day"]).zfill(2)
-        +
-        str(kwargs["hour"]).zfill(2)
-        +
-        str(kwargs["minute"]).zfill(2)
-        )
+        self.url_check() # checks url validity on init.

-def handle_HTTPError(e):
-    if e.code == 502:
-        raise BadGateWay(e)
-    elif e.code == 503:
-        raise WaybackUnavailable(e)
-    elif e.code == 429:
-        raise TooManyArchivingRequests(e)
-    elif e.code == 404:
-        raise UrlNotFound(e)
+    def __repr__(self):
+        return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)

-def save(url, UA=default_UA):
-    url_check(url)
-    request_url = ("https://web.archive.org/save/" + clean_url(url))
+    def __str__(self):
+        return "%s" % self.clean_url()

-    hdr = { 'User-Agent' : '%s' % UA } #nosec
-    req = Request(request_url, headers=hdr) #nosec
+    def url_check(self):
+        if "." not in self.url:
+            raise URLError("'%s' is not a vaild url." % self.url)
+
+        return True
+
+    def clean_url(self):
+        return str(self.url).strip().replace(" ","_")
+
+    def wayback_timestamp(self, **kwargs):
+        return (
+            str(kwargs["year"])
+            +
+            str(kwargs["month"]).zfill(2)
+            +
+            str(kwargs["day"]).zfill(2)
+            +
+            str(kwargs["hour"]).zfill(2)
+            +
+            str(kwargs["minute"]).zfill(2)
+            )
+
+    def handle_HTTPError(self, e):
+        if e.code >= 500:
+            raise WaybackError(e) from None
+        if e.code == 429:
+            raise WaybackError(e) from None
+        if e.code == 404:
+            raise HTTPError(e) from None
+
+    def save(self):
+        request_url = ("https://web.archive.org/save/" + self.clean_url())
+        hdr = { 'User-Agent' : '%s' % self.user_agent } #nosec
+        req = Request(request_url, headers=hdr) #nosec

-    try:
-        response = urlopen(req) #nosec
-    except HTTPError as e:
-        if handle_HTTPError(e) is None:
-            raise PageNotSaved(e)
-    except URLError:
         try:
             response = urlopen(req) #nosec
-        except URLError as e:
-            raise UrlNotFound(e)
+        except HTTPError as e:
+            if self.handle_HTTPError(e) is None:
+                raise WaybackError(e)
+        except URLError:
+            try:
+                response = urlopen(req) #nosec
+            except URLError as e:
+                raise HTTPError(e)

-    header = response.headers
+        header = response.headers

-    if "exclusion.robots.policy" in str(header):
-        raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." % (url))
+        try:
+            arch = re.search(r"rel=\"memento.*?web\.archive\.org(/web/[0-9]{14}/.*?)>", str(header)).group(1)
+        except KeyError as e:
+            raise WaybackError(e)

-    return "https://web.archive.org" + header['Content-Location']
+        return "https://web.archive.org" + arch

-def get(url, encoding=None, UA=default_UA):
-    url_check(url)
-    hdr = { 'User-Agent' : '%s' % UA }
-    req = Request(clean_url(url), headers=hdr) #nosec
+    def get(self, url=None, user_agent=None, encoding=None):
+
+        if not url:
+            url = self.clean_url()
+
+        if not user_agent:
+            user_agent = self.user_agent
+
+        hdr = { 'User-Agent' : '%s' % user_agent }
+        req = Request(url, headers=hdr) #nosec

-    try:
-        resp=urlopen(req) #nosec
-    except URLError:
         try:
             resp=urlopen(req) #nosec
-        except URLError as e:
-            raise UrlNotFound(e)
+        except URLError:
+            try:
+                resp=urlopen(req) #nosec
+            except URLError as e:
+                raise HTTPError(e)
+
+        if not encoding:
+            try:
+                encoding= resp.headers['content-type'].split('charset=')[-1]
+            except AttributeError:
+                encoding = "UTF-8"
+
+        return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))
+
+    def near(self, **kwargs):
+        year=kwargs.get("year", datetime.utcnow().strftime('%Y'))
+        month=kwargs.get("month", datetime.utcnow().strftime('%m'))
+        day=kwargs.get("day", datetime.utcnow().strftime('%d'))
+        hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
+        minute=kwargs.get("minute", datetime.utcnow().strftime('%M'))
+
+        timestamp = self.wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
+        request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (self.clean_url(), str(timestamp))
+        hdr = { 'User-Agent' : '%s' % self.user_agent }
+        req = Request(request_url, headers=hdr) # nosec

-    if encoding is None:
         try:
-            encoding= resp.headers['content-type'].split('charset=')[-1]
-        except AttributeError:
-            encoding = "UTF-8"
+            response = urlopen(req) #nosec
+        except HTTPError as e:
+            self.handle_HTTPError(e)

-    return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))
+        data = json.loads(response.read().decode("UTF-8"))
+        if not data["archived_snapshots"]:
+            raise WaybackError("'%s' is not yet archived." % url)

-def near(url, **kwargs):
+        archive_url = (data["archived_snapshots"]["closest"]["url"])
+        # wayback machine returns http sometimes, idk why? But they support https
+        archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1)
+        return archive_url

-    try:
-        url = kwargs["url"]
-    except KeyError:
-        url = url
+    def oldest(self, year=1994):
+        return self.near(year=year)

-    year=kwargs.get("year", datetime.utcnow().strftime('%Y'))
-    month=kwargs.get("month", datetime.utcnow().strftime('%m'))
-    day=kwargs.get("day", datetime.utcnow().strftime('%d'))
-    hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
-    minute=kwargs.get("minute", datetime.utcnow().strftime('%M'))
-    UA=kwargs.get("UA", default_UA)
+    def newest(self):
+        return self.near()

-    url_check(url)
-    timestamp = wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
-    request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (clean_url(url), str(timestamp))
-    hdr = { 'User-Agent' : '%s' % UA }
-    req = Request(request_url, headers=hdr) # nosec
+    def total_archives(self):
+        hdr = { 'User-Agent' : '%s' % self.user_agent }
+        request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url()
+        req = Request(request_url, headers=hdr) # nosec

-    try:
-        response = urlopen(req) #nosec
-    except HTTPError as e:
-        handle_HTTPError(e)
+        try:
+            response = urlopen(req) #nosec
+        except HTTPError as e:
+            self.handle_HTTPError(e)

-    data = json.loads(response.read().decode("UTF-8"))
-    if not data["archived_snapshots"]:
-        raise ArchiveNotFound("'%s' is not yet archived." % url)
-
-    archive_url = (data["archived_snapshots"]["closest"]["url"])
-    # wayback machine returns http sometimes, idk why? But they support https
-    archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1)
-    return archive_url
-
-def oldest(url, UA=default_UA, year=1994):
-    return near(url, year=year, UA=UA)
-
-def newest(url, UA=default_UA):
-    return near(url, UA=UA)
-
-def total_archives(url, UA=default_UA):
-    url_check(url)
-
-    hdr = { 'User-Agent' : '%s' % UA }
-    request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % clean_url(url)
-    req = Request(request_url, headers=hdr) # nosec
-
-    try:
-        response = urlopen(req) #nosec
-    except HTTPError as e:
-        handle_HTTPError(e)
-
-    return str(response.read()).count(",") # Most efficient method to count (yet)
+        return str(response.read()).count(",") # Most efficient method to count number of archives (yet)
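
For reference, a minimal usage sketch of the object-oriented interface this patch introduces. The target URL ("https://example.com") is an illustrative placeholder, not a value from the commit; the user agent is the one used in the updated tests, and network error handling is omitted for brevity.

    import waybackpy

    # Example values; any archivable URL and a descriptive user agent will do.
    user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
    target = waybackpy.Url("https://example.com", user_agent)

    oldest_snapshot = target.oldest()         # wraps near(year=1994), the earliest snapshot
    newest_snapshot = target.newest()         # wraps near() with the current UTC time
    snapshot_count = target.total_archives()  # counts entries returned by the CDX API

    print(oldest_snapshot)
    print(newest_snapshot)
    print(snapshot_count)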