improve docs

2021-01-05 01:46:12 +05:30
parent 6fb6b2deee
commit e0a4b007d5
3 changed files with 84 additions and 27 deletions
--- a/waybackpy/init.py
+++ b/waybackpy/init.py
@@ -1,5 +1,3 @@
 # -*- coding: utf-8 -*-
 # ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
 # ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
 # ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
@@ -10,24 +8,43 @@
 # ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
 """
-Waybackpy is a Python package that interfaces with the Internet Archive's Wayback Machine API.
+Waybackpy is a Python package & command-line program that interfaces with the Internet Archive's Wayback Machine API.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Archive pages and retrieve archived pages easily.
+Archive webpages and retrieve archived URLs easily.
 Usage:
    >>> import waybackpy
   >>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
   >>> new_archive = target_url.save()
   >>> print(new_archive)
   https://web.archive.org/web/20200502170312/https://www.python.org/
-Full documentation @ <https://akamhy.github.io/waybackpy/>.
+    >>> url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
-:copyright: (c) 2020 by akamhy.
+    >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
    >>> wayback = waybackpy.Url(url, user_agent)
    >>> archive = wayback.save()
    >>> str(archive)
    'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
    >>> archive.timestamp
    datetime.datetime(2021, 1, 4, 17, 35, 12, 691741)
    >>> oldest_archive = wayback.oldest()
    >>> str(oldest_archive)
    'https://web.archive.org/web/20050422130129/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
    >>> archive_close_to_2010_feb = wayback.near(year=2010, month=2)
    >>> str(archive_close_to_2010_feb)
    'https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
    >>> str(wayback.newest())
    'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
 Full documentation @ <https://github.com/akamhy/waybackpy/wiki>.
 :copyright: (c) 2020-2021 AKash Mahanty Et al.
 :license: MIT
 """
-from .wrapper import Url
+from .wrapper import Url, Cdx
 from .__version__ import (
    __title__,
    __description__,
--- a/waybackpy/exceptions.py
+++ b/waybackpy/exceptions.py
@@ -1,6 +1,3 @@
 # -*- coding: utf-8 -*-
 class WaybackError(Exception):
    """
    Raised when Wayback Machine API Service is unreachable/down.
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -11,10 +11,10 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
 def _get_total_pages(url, user_agent):
    """
-    If showNumPages is passed in cdx API, it returns 'number of pages of'
+    If showNumPages is passed in cdx API, it returns
-    and each page has many archives.
+    'number of archive pages' and each page has many archives.
-    This func returns number of pages (type int).
+    This func returns number of pages of archives (type int).
    """
    total_pages_url = (
        "https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url
@@ -167,6 +167,19 @@ class Url:
        return "%s" % self._archive_url
    def __len__(self):
        """
        Why do we have len here?
        Applying len() on <class 'waybackpy.wrapper.Url'>
        will calculate the number of days between today and
        the archive timestamp.
        Can be applied on return values of near and its
        children (e.g. oldest) and if applied on waybackpy.Url()
        without using any functions, it just grabs
        self._timestamp and def _timestamp gets it
        from def JSON.
        """
        td_max = timedelta(
            days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
        )
@@ -194,7 +207,12 @@ class Url:
    @property
    def JSON(self):
        """
-        Returns JSON data from 'https://archive.org/wayback/available?url=YOUR-URL'.
+        If the end user has used near() or its children like oldest, newest
        and archive_url, then the JSON responses of these are cached in self._JSON.
        If we find that self._JSON is not None, we return it.
        Otherwise we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
        and return it.
        """
        if self._JSON:
@@ -235,8 +253,12 @@ class Url:
    def _timestamp(self):
        """
        Get timestamp of last fetched archive.
-        If used before fetching any archive, This
+        If used before fetching any archive, will
-        randomly picks archive.
+        use whatever self.JSON returns.
        self.timestamp being None implies that
        self.JSON will return whichever archive's JSON
        the Wayback Machine provides.
        """
        if self.timestamp:
@@ -256,13 +278,25 @@ class Url:
    def _cleaned_url(self):
        """
-        Remove newlines
+        Strip leading/trailing whitespace (including EOL) and
        replace " " with "_"
        """
        return str(self.url).strip().replace(" ", "_")
    def save(self):
-        """Create a new Wayback Machine archive for this URL."""
+        """
        To save a webpage on the Wayback Machine we
        need to send a GET request to https://web.archive.org/save/
        and, to get the archive URL, we are required to read the
        header of the API response.
        _get_response() takes care of the GET requests. It uses the requests
        package.
        _archive_url_parser() parses the archive URL from the header.
        """
        request_url = "https://web.archive.org/save/" + self._cleaned_url()
        headers = {"User-Agent": self.user_agent}
        response = _get_response(request_url, params=None, headers=headers)
@@ -271,8 +305,10 @@ class Url:
        return self
    def get(self, url="", user_agent="", encoding=""):
-        """Return the source code of the supplied URL.
+        """
-        If encoding is not supplied, it is auto-detected from the response.
+        Return the source code of the supplied URL.
        If encoding is not supplied, it is auto-detected
         from the response itself by the requests package.
        """
        if not url:
@@ -452,8 +488,10 @@ class Url:
 class CdxSnapshot:
    """
-    [["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
+    This class helps to handle the Cdx Snapshots easily.
-    ["org,archive)/", "19970126045828", "http://www.archive.org:80/", "text/html", "200", "Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY", "1415"]]
+
    What the raw data looks like:
    org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
    """
    def __init__(
@@ -478,6 +516,7 @@ class CdxSnapshot:
 class Cdx:
    """
    waybackpy Cdx class, Type : <class 'waybackpy.wrapper.Cdx'>
    Cdx keys are :
    urlkey
    timestamp
@@ -501,6 +540,10 @@ class Cdx:
        self.end_timestamp = str(end_timestamp) if end_timestamp else None
    def snapshots(self):
        """
        This function yields snapshots encapsulated
        in CdxSnapshot for more usability.
        """
        payload = {}
        endpoint = "https://web.archive.org/cdx/search/cdx"
        total_pages = _get_total_pages(self.url, self.user_agent)