Add doc strings (#90)

* Added some docstrings in utils.py * renamed some func/meth to better names and added doc strings + lint * added more docstrings * more docstrings * improve docstrings * docstrings * added more docstrings, lint * fix import error
2021-01-26 11:56:03 +05:30
parent 88cda94c0b
commit db8f902cff
9 changed files with 443 additions and 121 deletions
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -1,5 +1,6 @@
 import re
 from datetime import datetime, timedelta
+
 from .exceptions import WaybackError
 from .cdx import Cdx
 from .utils import (
@@ -9,13 +10,85 @@ from .utils import (
    default_user_agent,
    _url_check,
    _cleaned_url,
-    _ts,
-    _unix_ts_to_wayback_ts,
+    _timestamp_manager,
+    _unix_timestamp_to_wayback_timestamp,
    _latest_version,
 )


 class Url:
+    """
+
+    Attributes
+    ----------
+    url : str
+        The input URL, wayback machine API operations are performed
+        on this URL after sanitizing it.
+
+    user_agent : str
+        The user_agent used while making the GET requests to the
+        Wayback machine APIs
+
+    _archive_url : str
+        Caches the last fetched archive.
+
+    timestamp : datetime.datetime
+        timestamp of the archive URL as datetime object for
+        greater usability
+
+    _JSON : dict
+        Caches the last fetched availability API data
+
+    latest_version : str
+        The latest version of waybackpy on PyPi
+
+    cached_save : bool
+        Flag to check if WayBack machine returned a cached
+        archive instead of creating a new archive. WayBack
+        machine allows only one archive for a URL every
+        30 minutes. If the archive returned by WayBack machine
+        is older than 3 minutes then this flag is set to True
+
+    Methods turned properties
+    ----------
+    JSON : dict
+        JSON response of availability API as dictionary / loaded JSON
+
+    archive_url : str
+        Return the archive url, returns str
+
+    _timestamp : datetime.datetime
+        Sets the value of self.timestamp if still not set
+
+    Methods
+    -------
+    save()
+        Archives the URL on WayBack machine
+
+    get(url="", user_agent="", encoding="")
+        Gets the source of archive url, can also be used to get source
+        of any URL if passed into it.
+
+    near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
+        Wayback Machine can have many archives for a URL/webpage, sometimes we want
+        archive close to a specific time.
+        This method takes year, month, day, hour, minute and unix_timestamp as input.
+
+    oldest(year=1994)
+        The oldest archive of an URL.
+
+    newest()
+        The newest archive of an URL
+
+    total_archives(start_timestamp=None, end_timestamp=None)
+        total number of archives of an URL, the timeframe can be confined by
+        start_timestamp and end_timestamp
+
+    known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
+        Known URLs for an URL, subdomain, URL as prefix etc.
+
+    """
+
    def __init__(self, url, user_agent=default_user_agent):
        self.url = url
        self.user_agent = str(user_agent)
@@ -32,29 +105,17 @@ class Url:
        )

    def __str__(self):
-        """
-        Output when print() is used on <class 'waybackpy.wrapper.Url'>
-        This should print an archive URL.
-
-        We check if self._archive_url is not None.
-        If not None, good. We return string of self._archive_url.
-
-        If self._archive_url is None, it means we ain't used any method that
-        sets self._archive_url, we now set self._archive_url to self.archive_url
-        and return it.
-        """
-
        if not self._archive_url:
            self._archive_url = self.archive_url
+
        return "{archive_url}".format(archive_url=self._archive_url)

    def __len__(self):
-        """
-        Why do we have len here?
+        """Number of days between today and the date of archive based on the timestamp

-        Applying len() on <class 'waybackpy.wrapper.Url'>
-        will calculate the number of days between today and
-        the archive timestamp.
+        len() of waybackpy.wrapper.Url should return
+        the number of days between today and the
+        archive timestamp.

        Can be applied on return values of near and its
        childs (e.g. oldest) and if applied on waybackpy.Url()
@@ -76,32 +137,30 @@ class Url:

    @property
    def JSON(self):
-        """
-        If the end user has used near() or its childs like oldest, newest
-        and archive_url then the JSON response of these are cached in self._JSON
+        """Returns JSON response of availability API as dictionary / loaded JSON

-        If we find that self._JSON is not None we return it.
-        else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
-        and return it.
+        return type : dict
        """

+        # If user used the near method or any method that depends on near, we
+        # are certain that we have a loaded dictionary cached in self._JSON.
+        # Return the loaded JSON data.
        if self._JSON:
            return self._JSON

+        # If no cached data found, get data and return + cache it.
        endpoint = "https://archive.org/wayback/available"
        headers = {"User-Agent": self.user_agent}
        payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
        response = _get_response(endpoint, params=payload, headers=headers)
-        return response.json()
+        self._JSON = response.json()
+        return self._JSON

    @property
    def archive_url(self):
-        """
-        Returns any random archive for the instance.
-        But if near, oldest, newest were used before
-        then it returns the same archive again.
+        """Return the archive url.

-        We cache archive in self._archive_url
+        return type : str
        """

        if self._archive_url:
@@ -121,11 +180,16 @@ class Url:

    @property
    def _timestamp(self):
-        self.timestamp = _ts(self.timestamp, self.JSON)
-        return self.timestamp
+        """Return the timestamp resolved from self.timestamp and self.JSON.
+
+        Return type : datetime.datetime
+
+        """
+        return _timestamp_manager(self.timestamp, self.JSON)

    def save(self):
-        """
+        """Save/archive the URL.
+
        To save a webpage on WayBack machine we
        need to send get request to https://web.archive.org/save/

@@ -136,6 +200,8 @@ class Url:

        _archive_url_parser() parses the archive from the header.

+        return type : waybackpy.wrapper.Url
+
        """
        request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
        headers = {"User-Agent": self.user_agent}
@@ -161,7 +227,9 @@ class Url:
            instance=self,
        )

-        m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url)
+        m = re.search(
+            r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
+        )
        str_ts = m.group(1)
        ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
        now = datetime.utcnow()
@@ -175,9 +243,22 @@ class Url:
        return self

    def get(self, url="", user_agent="", encoding=""):
-        """
-        Return the source code of the last archived URL,
-        if no URL is passed to this method.
+        """GET the source of archive or any other URL.
+
+        url : str, waybackpy.wrapper.Url
+            The method will return the source code of
+            this URL instead of last fetched archive.
+
+        user_agent : str
+            The user_agent for GET request to API
+
+        encoding : str
+            If user is using any other encoding that
+            can't be detected by response.encoding
+
+        Return the source code of the last fetched
+        archive URL if no URL is passed to this method
+        else it returns the source code of url passed.

        If encoding is not supplied, it is auto-detected
         from the response itself by requests package.
@@ -213,6 +294,27 @@ class Url:
        unix_timestamp=None,
    ):
        """
+        Parameters
+        ----------
+
+        year : int
+            Archive close to year
+
+        month : int
+            Archive close to month
+
+        day : int
+            Archive close to day
+
+        hour : int
+            Archive close to hour
+
+        minute : int
+            Archive close to minute
+
+        unix_timestamp : str, float or int
+            Archive close to this unix_timestamp
+
        Wayback Machine can have many archives of a webpage,
        sometimes we want archive close to a specific time.

@@ -235,7 +337,7 @@ class Url:
        """

        if unix_timestamp:
-            timestamp = _unix_ts_to_wayback_ts(unix_timestamp)
+            timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
        else:
            now = datetime.utcnow().timetuple()
            timestamp = _wayback_timestamp(
@@ -285,28 +387,45 @@ class Url:

        We simply pass the year in near() and return it.
        """
+
        return self.near(year=year)

    def newest(self):
-        """
-        Return the newest Wayback Machine archive available for this URL.
+        """Return the newest Wayback Machine archive available.

-        We return the output of self.near() as it deafults to current utc time.
+        We return the return value of self.near() as it defaults to current UTC time.

        Due to Wayback Machine database lag, this may not always be the
        most recent archive.
+
+        return type : waybackpy.wrapper.Url
        """
+
        return self.near()

    def total_archives(self, start_timestamp=None, end_timestamp=None):
-        """
+        """Returns the total number of archives for an URL
+
+        Parameters
+        ----------
+        start_timestamp : str
+            1 to 14 digit string of numbers, you are not required to
+            pass a full 14 digit timestamp.
+
+        end_timestamp : str
+            1 to 14 digit string of numbers, you are not required to
+            pass a full 14 digit timestamp.
+
+
+        return type : int
+
+
        A webpage can have multiple archives on the wayback machine
        If someone wants to count the total number of archives of a
        webpage on wayback machine they can use this method.

        Returns the total number of Wayback Machine archives for the URL.

-        Return type in integer.
        """

        cdx = Cdx(
@@ -315,6 +434,8 @@ class Url:
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
        )
+
+        # cdx.snapshots() is a generator, not a list, so count by iterating.
        i = 0
        for _ in cdx.snapshots():
            i = i + 1
@@ -328,15 +449,36 @@ class Url:
        end_timestamp=None,
        match_type="prefix",
    ):
-        """
+        """Yields known URLs from the CDX API.
+
+        Parameters
+        ----------
+
+        subdomain : bool
+            If True fetch subdomain URLs along with the host URLs.
+
+        host : bool
+            Only fetch host URLs.
+
+        start_timestamp : str
+            1 to 14 digit string of numbers, you are not required to
+            pass a full 14 digit timestamp.
+
+        end_timestamp : str
+            1 to 14 digit string of numbers, you are not required to
+            pass a full 14 digit timestamp.
+
+        match_type : str
+            One of exact, prefix, host or domain.
+
+        return type : generator yielding the ``original`` attribute of each CDX snapshot
+
        Yields list of URLs known to exist for given input.
        Defaults to input URL as prefix.

-        This method is kept for compatibility, use the Cdx class instead.
-        This method itself depends on Cdx.
-
-         Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
-         https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
+        Based on:
+        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
+        By Mohammed Diaa (https://github.com/mhmdiaa)
        """

        if subdomain:
@@ -353,7 +495,5 @@ class Url:
            collapses=["urlkey"],
        )

-        snapshots = cdx.snapshots()
-
-        for snapshot in snapshots:
+        for snapshot in cdx.snapshots():
            yield (snapshot.original)