OOP (#10)

* Update wrapper.py * Update exceptions.py * Update __init__.py * test adjusted for new changes * Update wrapper.py
2020-07-17 20:50:00 +05:30
parent 9ac1e877c8
commit 9860527d96
4 changed files with 155 additions and 175 deletions
@@ -3,81 +3,86 @@ sys.path.append("..")
 import waybackpy
 import pytest

-
 user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"

 def test_clean_url():
    test_url = " https://en.wikipedia.org/wiki/Network security "
    answer = "https://en.wikipedia.org/wiki/Network_security"
-    test_result = waybackpy.clean_url(test_url)
+    target = waybackpy.Url(test_url, user_agent)
+    test_result = target.clean_url()
    assert answer == test_result

 def test_url_check():
-    InvalidUrl = "http://wwwgooglecom/"
+    broken_url = "http://wwwgooglecom/"
    with pytest.raises(Exception) as e_info:
-        waybackpy.url_check(InvalidUrl)
+        waybackpy.Url(broken_url, user_agent)

 def test_save():
    # Test for urls that exist and can be archived.
    url1="https://github.com/akamhy/waybackpy"
-    archived_url1 = waybackpy.save(url1, UA=user_agent)
+    target = waybackpy.Url(url1, user_agent)
+    archived_url1 = target.save()
    assert url1 in archived_url1

    # Test for urls that are incorrect.
    with pytest.raises(Exception) as e_info:
        url2 = "ha ha ha ha"
-        waybackpy.save(url2, UA=user_agent)
+        waybackpy.Url(url2, user_agent)

    # Test for urls not allowed to archive by robots.txt.
    with pytest.raises(Exception) as e_info:
        url3 = "http://www.archive.is/faq.html"
-        waybackpy.save(url3, UA=user_agent)
+        target = waybackpy.Url(url3, user_agent)
+        target.save()

    # Test for non-existent urls.
    with pytest.raises(Exception) as e_info:
        url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
-        archived_url4 = waybackpy.save(url4, UA=user_agent)
+        target = waybackpy.Url(url4, user_agent)
+        target.save()

 def test_near():
    url = "google.com"
-    archive_near_year = waybackpy.near(url, year=2010, UA=user_agent)
+    target = waybackpy.Url(url, user_agent)
+    archive_near_year = target.near(year=2010)
    assert "2010" in archive_near_year

-    archive_near_month_year = waybackpy.near(url, year=2015, month=2, UA=user_agent)
+    archive_near_month_year = target.near( year=2015, month=2)
    assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year)

-    archive_near_day_month_year = waybackpy.near(url, year=2006, month=11, day=15, UA=user_agent)
+    archive_near_day_month_year = target.near(year=2006, month=11, day=15)
    assert ("20061114" in archive_near_day_month_year) or ("20061115" in archive_near_day_month_year) or ("20061116" in archive_near_day_month_year)

-    archive_near_hour_day_month_year = waybackpy.near("www.python.org", year=2008, month=5, day=9, hour=15, UA=user_agent)
+    target = waybackpy.Url("www.python.org", user_agent)
+    archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15)
    assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year)

    with pytest.raises(Exception) as e_info:
        NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
-        waybackpy.near(NeverArchivedUrl, year=2010, UA=user_agent)
+        target = waybackpy.Url(NeverArchivedUrl, user_agent)
+        target.near(year=2010)

 def test_oldest():
    url = "github.com/akamhy/waybackpy"
-    archive_oldest = waybackpy.oldest(url, UA=user_agent)
-    assert "20200504141153" in archive_oldest
+    target = waybackpy.Url(url, user_agent)
+    assert "20200504141153" in target.oldest()

 def test_newest():
    url = "github.com/akamhy/waybackpy"
-    archive_newest = waybackpy.newest(url, UA=user_agent)
-    assert url in archive_newest
+    target = waybackpy.Url(url, user_agent)
+    assert url in target.newest()

 def test_get():
-    oldest_google_archive = waybackpy.oldest("google.com", UA=user_agent)
-    oldest_google_page_text =  waybackpy.get(oldest_google_archive, UA=user_agent)
-    assert "Welcome to Google" in oldest_google_page_text
+    target = waybackpy.Url("google.com", user_agent)
+    assert "Welcome to Google" in target.get(target.oldest())

 def test_total_archives():

-    count1 = waybackpy.total_archives(" https://google.com ", UA=user_agent)
-    assert count1 > 500000
+    target = waybackpy.Url(" https://google.com ", user_agent)
+    assert target.total_archives() > 500000

-    count2 = waybackpy.total_archives("https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8", UA=user_agent)
-    assert count2 == 0
+    target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent)
+    assert target.total_archives() == 0

 if __name__ == "__main__":
    test_clean_url()
@@ -96,3 +101,4 @@ if __name__ == "__main__":
    print(".")
    test_total_archives()
    print(".")
+    print("OK")
@@ -25,6 +25,6 @@ Full documentation @ <https://akamhy.github.io/waybackpy/>.
 :license: MIT
 """

-from .wrapper import save, near, oldest, newest, get, clean_url, url_check, total_archives
+from .wrapper import Url
 from .__version__ import __title__, __description__, __url__, __version__
 from .__version__ import __author__, __author_email__, __license__, __copyright__
@@ -1,43 +1,6 @@
 # -*- coding: utf-8 -*-

-class TooManyArchivingRequests(Exception):
-
-    """Error when a single url reqeusted for archiving too many times in a short timespam.
-    Wayback machine doesn't supports archivng any url too many times in a short period of time.
+class WaybackError(Exception):
    """
-
-class ArchivingNotAllowed(Exception):
-
-    """Files like robots.txt are set to deny robot archiving.
-    Wayback machine respects these file, will not archive.
-    """
-
-class PageNotSaved(Exception):
-    """
-    When unable to save a webpage.
-    """
-
-class ArchiveNotFound(Exception):
-    """
-    When a page was never archived but client asks for old archive.
-    """
-
-class UrlNotFound(Exception):
-    """
-    Raised when 404 UrlNotFound.
-    """
-
-class BadGateWay(Exception):
-    """
-    Raised when 502 bad gateway.
-    """
-
-class WaybackUnavailable(Exception):
-    """
-    Raised when 503 API Service Temporarily Unavailable.
-    """
-
-class InvalidUrl(Exception):
-    """
-    Raised when url doesn't follow the standard url format.
+    Raised when API Service error.
    """
@@ -1,30 +1,47 @@
 # -*- coding: utf-8 -*-

+import re
 import sys
 import json
 from datetime import datetime
-from waybackpy.exceptions import TooManyArchivingRequests, ArchivingNotAllowed, PageNotSaved, ArchiveNotFound, UrlNotFound, BadGateWay, InvalidUrl, WaybackUnavailable
+from waybackpy.exceptions import WaybackError

 version = (3, 0)
-cur_version = sys.version_info
+python_version = sys.version_info


-if cur_version >= version:  # If the python ver >= 3
+if python_version >= version:  # If the python ver >= 3
    from urllib.request import Request, urlopen
    from urllib.error import HTTPError, URLError
 else: # For python2.x
    from urllib2 import Request, urlopen, HTTPError, URLError

-default_UA = "waybackpy python package ; ( https://github.com/akamhy/waybackpy )"
+default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"

-def url_check(url):
-    if "." not in url:
-        raise InvalidUrl("'%s' is not a vaild url." % url)
+class Url():

-def clean_url(url):
-    return str(url).strip().replace(" ","_")
+    def __init__(self, url, user_agent=default_UA):
+        self.url = url
+        self.user_agent = user_agent

-def wayback_timestamp(**kwargs):
+        self.url_check() # checks url validity on init.
+
+    def __repr__(self):
+        return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
+
+    def __str__(self):
+        return "%s" % self.clean_url()
+
+    def url_check(self):
+        if "." not in self.url:
+            raise URLError("'%s' is not a valid url." % self.url)
+
+        return True
+
+    def clean_url(self):
+        return str(self.url).strip().replace(" ","_")
+
+    def wayback_timestamp(self, **kwargs):
        return (
          str(kwargs["year"])
          +
@@ -37,46 +54,50 @@ def wayback_timestamp(**kwargs):
          str(kwargs["minute"]).zfill(2)
          )

-def handle_HTTPError(e):
-    if e.code == 502:
-        raise BadGateWay(e)
-    elif e.code == 503:
-        raise WaybackUnavailable(e)
-    elif e.code == 429:
-        raise TooManyArchivingRequests(e)
-    elif e.code == 404:
-        raise UrlNotFound(e)
+    def handle_HTTPError(self, e):
+        if e.code >= 500:
+            raise WaybackError(e) from None
+        if e.code == 429:
+            raise WaybackError(e) from None
+        if e.code == 404:
+            raise HTTPError(e) from None

-def save(url, UA=default_UA):
-    url_check(url)
-    request_url = ("https://web.archive.org/save/" + clean_url(url))
-
-    hdr = { 'User-Agent' : '%s' % UA } #nosec
+    def save(self):
+        request_url = ("https://web.archive.org/save/" + self.clean_url())
+        hdr = { 'User-Agent' : '%s' % self.user_agent } #nosec
        req = Request(request_url, headers=hdr) #nosec


        try:
            response = urlopen(req) #nosec
        except HTTPError as e:
-        if handle_HTTPError(e) is None:
-            raise PageNotSaved(e)
+            if self.handle_HTTPError(e) is None:
+                raise WaybackError(e)
        except URLError:
            try:
                response = urlopen(req) #nosec
            except URLError as e:
-            raise UrlNotFound(e)
+                raise HTTPError(e)

        header = response.headers

-    if "exclusion.robots.policy" in str(header):
-        raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." % (url))
+        try:
+            arch = re.search(r"rel=\"memento.*?web\.archive\.org(/web/[0-9]{14}/.*?)>", str(header)).group(1)
+        except AttributeError as e:
+            raise WaybackError(e)

-    return "https://web.archive.org" + header['Content-Location']
+        return "https://web.archive.org" + arch

-def get(url, encoding=None, UA=default_UA):
-    url_check(url)
-    hdr = { 'User-Agent' : '%s' % UA }
-    req = Request(clean_url(url), headers=hdr) #nosec
+    def get(self, url=None, user_agent=None, encoding=None):
+
+        if not url:
+            url = self.clean_url()
+
+        if not user_agent:
+            user_agent = self.user_agent
+
+        hdr = { 'User-Agent' : '%s' % user_agent }
+        req = Request(url, headers=hdr) #nosec

        try:
            resp=urlopen(req) #nosec
@@ -84,9 +105,9 @@ def get(url, encoding=None, UA=default_UA):
            try:
                resp=urlopen(req) #nosec
            except URLError as e:
-            raise UrlNotFound(e)
+                raise HTTPError(e)

-    if encoding is None:
+        if not encoding:
            try:
                encoding= resp.headers['content-type'].split('charset=')[-1]
            except AttributeError:
@@ -94,56 +115,46 @@ def get(url, encoding=None, UA=default_UA):

        return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))

-def near(url, **kwargs):
-
-    try:
-        url = kwargs["url"]
-    except KeyError:
-        url = url
-
+    def near(self, **kwargs):
        year=kwargs.get("year", datetime.utcnow().strftime('%Y'))
        month=kwargs.get("month", datetime.utcnow().strftime('%m'))
        day=kwargs.get("day", datetime.utcnow().strftime('%d'))
        hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
        minute=kwargs.get("minute", datetime.utcnow().strftime('%M'))
-    UA=kwargs.get("UA", default_UA)

-    url_check(url)
-    timestamp = wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
-    request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (clean_url(url), str(timestamp))
-    hdr = { 'User-Agent' : '%s' % UA }
+        timestamp = self.wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
+        request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (self.clean_url(), str(timestamp))
+        hdr = { 'User-Agent' : '%s' % self.user_agent }
        req = Request(request_url, headers=hdr) # nosec

        try:
            response = urlopen(req) #nosec
        except HTTPError as e:
-        handle_HTTPError(e)
+            self.handle_HTTPError(e)

        data = json.loads(response.read().decode("UTF-8"))
        if not data["archived_snapshots"]:
-        raise ArchiveNotFound("'%s' is not yet archived." % url)
+            raise WaybackError("'%s' is not yet archived." % self.url)

        archive_url = (data["archived_snapshots"]["closest"]["url"])
        # wayback machine returns http sometimes, idk why? But they support https
        archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1)
        return archive_url

-def oldest(url, UA=default_UA, year=1994):
-    return near(url, year=year, UA=UA)
+    def oldest(self, year=1994):
+        return self.near(year=year)

-def newest(url, UA=default_UA):
-    return near(url, UA=UA)
+    def newest(self):
+        return self.near()

-def total_archives(url, UA=default_UA):
-    url_check(url)
-
-    hdr = { 'User-Agent' : '%s' % UA }
-    request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % clean_url(url)
+    def total_archives(self):
+        hdr = { 'User-Agent' : '%s' % self.user_agent }
+        request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url()
        req = Request(request_url, headers=hdr) # nosec

        try:
            response = urlopen(req) #nosec
        except HTTPError as e:
-        handle_HTTPError(e)
+            self.handle_HTTPError(e)

-    return str(response.read()).count(",") # Most efficient method to count (yet)
+        return str(response.read()).count(",") # Most efficient method to count number of archives (yet)