added more docstrings, lint

2021-01-26 11:42:29 +05:30
parent 817c0ee844
commit acebfabc3e
5 changed files with 118 additions and 47 deletions
--- a/tests/test_cdx.py
+++ b/tests/test_cdx.py
@@ -79,7 +79,7 @@ def test_all_cdx():
    c = 0
    for snapshot in snapshots:
        c += 1
-        if c > 30_529:  # deafult limit is 10k
+        if c > 30529:  # default limit is 10k
            break
    url = "https://github.com/*"
@@ -89,5 +89,5 @@ def test_all_cdx():
    for snapshot in snapshots:
        c += 1
-        if c > 100_529:
+        if c > 100529:
            break
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -5,8 +5,7 @@ import random
 import string
 import argparse
-sys.path.append("..")
+import waybackpy.cli as cli
 import waybackpy.cli as cli  # noqa: E402
 from waybackpy.wrapper import Url  # noqa: E402
 from waybackpy.__version__ import __version__
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -61,10 +61,10 @@ def test_check_collapses():
 def test_check_match_type():
-    assert None == _check_match_type(None, "url")
+    assert _check_match_type(None, "url") is None
    match_type = "exact"
    url = "test_url"
-    assert None == _check_match_type(match_type, url)
+    assert _check_match_type(match_type, url) is None
    url = "has * in it"
    with pytest.raises(WaybackError):
@@ -82,7 +82,7 @@ def test_cleaned_url():
 def test_url_check():
    good_url = "https://akamhy.github.io"
-    assert None == _url_check(good_url)
+    assert _url_check(good_url) is None
    bad_url = "https://github-com"
    with pytest.raises(URLError):
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@@ -1,8 +1,4 @@
 import sys
 import pytest
 import random
 import requests
 from datetime import datetime
 from waybackpy.wrapper import Url
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -17,6 +17,78 @@ from .utils import (
 class Url:
    """
    Attributes
    ----------
    url : str
        The input URL; Wayback Machine API operations are performed
        on this URL after sanitizing it.
    user_agent : str
        The user_agent used while making the GET requests to the
        Wayback machine APIs
    _archive_url : str
        Caches the last fetched archive.
    timestamp : datetime.datetime
        timestamp of the archive URL as datetime object for
        greater usability
    _JSON : dict
        Caches the last fetched availability API data
    latest_version : str
        The latest version of waybackpy on PyPi
    cached_save : bool
        Flag to check if WayBack machine returned a cached
        archive instead of creating a new archive. WayBack
        machine allows only one archive for an URL in
        30 minutes. If the archive returned by WayBack machine
        is older than 3 minutes, then this flag is set to True
    Methods turned properties
    ----------
    JSON : dict
        JSON response of availability API as dictionary / loaded JSON
    archive_url : str
        Return the archive url, returns str
    _timestamp : datetime.datetime
        Sets the value of self.timestamp if still not set
    Methods
    -------
    save()
        Archives the URL on WayBack machine
    get(url="", user_agent="", encoding="")
        Gets the source of archive url, can also be used to get source
        of any URL if passed into it.
    near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
        Wayback Machine can have many archives for a URL/webpage, sometimes we want
        archive close to a specific time.
        This method takes year, month, day, hour, minute and unix_timestamp as input.
    oldest(year=1994)
        The oldest archive of an URL.
    newest()
        The newest archive of an URL
    total_archives(start_timestamp=None, end_timestamp=None)
        total number of archives of an URL, the timeframe can be confined by
        start_timestamp and end_timestamp
    known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
        Known URLs for an URL, subdomain, URL as prefix etc.
    """
    def __init__(self, url, user_agent=default_user_agent):
        self.url = url
        self.user_agent = str(user_agent)
@@ -65,32 +137,30 @@ class Url:
    @property
    def JSON(self):
-        """
+        """Returns JSON response of availability API as dictionary / loaded JSON
        If the end user has used near() or methods that depend on it, like oldest, newest
        and archive_url then the JSON response of these are cached in self._JSON
-        If we find that self._JSON is not None we return it.
+        return type : dict
        else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
        and return it.
        """
        # If user used the near method or any method that depends on near, we
        # are certain that we have a loaded dictionary cached in self._JSON.
        # Return the loaded JSON data.
        if self._JSON:
            return self._JSON
        # If no cached data found, get data and return + cache it.
        endpoint = "https://archive.org/wayback/available"
        headers = {"User-Agent": self.user_agent}
        payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
        response = _get_response(endpoint, params=payload, headers=headers)
-        return response.json()
+        self._JSON = response.json()
        return self._JSON
    @property
    def archive_url(self):
-        """Return the string form of the Url object.
+        """Return the archive url.
-        Parameters
+        return type : str
        ----------
        self : waybackpy.wrapper.Url
            The instance itself.
        """
        if self._archive_url:
@@ -112,10 +182,7 @@ class Url:
    def _timestamp(self):
        """Sets the value of self.timestamp if still not set.
-        Parameters
+        Return type : datetime.datetime
        ----------
        self : waybackpy.wrapper.Url
            The instance itself.
        """
        return _timestamp_manager(self.timestamp, self.JSON)
@@ -123,11 +190,6 @@ class Url:
    def save(self):
        """Saves/Archive the URL.
        Parameters
        ----------
        self : waybackpy.wrapper.Url
            The instance itself.
        To save a webpage on WayBack machine we
        need to send get request to https://web.archive.org/save/
@@ -138,6 +200,8 @@ class Url:
        _archive_url_parser() parses the archive from the header.
        return type : waybackpy.wrapper.Url
        """
        request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
        headers = {"User-Agent": self.user_agent}
@@ -179,9 +243,22 @@ class Url:
        return self
    def get(self, url="", user_agent="", encoding=""):
-        """
+        """GET the source of archive or any other URL.
-        Return the source code of the last archived URL,
+
-        if no URL is passed to this method.
+        url : str, waybackpy.wrapper.Url
            The method will return the source code of
            this URL instead of last fetched archive.
        user_agent : str
            The user_agent for GET request to API
        encoding : str
            If user is using any other encoding that
            can't be detected by response.encoding
        Return the source code of the last fetched
        archive URL if no URL is passed to this method
        else it returns the source code of url passed.
        If encoding is not supplied, it is auto-detected
         from the response itself by requests package.
@@ -219,8 +296,6 @@ class Url:
        """
        Parameters
        ----------
        self : waybackpy.wrapper.Url
            The instance itself.
        year : int
            Archive close to year
@@ -316,13 +391,14 @@ class Url:
        return self.near(year=year)
    def newest(self):
-        """
+        """Return the newest Wayback Machine archive available.
        Return the newest Wayback Machine archive available for this URL.
-        We return the output of self.near() as it deafults to current utc time.
+        We return the return value of self.near() as it defaults to current UTC time.
        Due to Wayback Machine database lag, this may not always be the
        most recent archive.
        return type : waybackpy.wrapper.Url
        """
        return self.near()
@@ -332,9 +408,6 @@ class Url:
        Parameters
        ----------
        self : waybackpy.wrapper.Url
            The instance itself
        start_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.
@@ -344,13 +417,15 @@ class Url:
            pass a full 14 digit timestamp.
        return type : int
        A webpage can have multiple archives on the wayback machine
        If someone wants to count the total number of archives of a
        webpage on wayback machine they can use this method.
        Returns the total number of Wayback Machine archives for the URL.
        Return type is an integer.
        """
        cdx = Cdx(
@@ -359,6 +434,8 @@ class Url:
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
        )
        # cdx.snapshots() is generator not list.
        i = 0
        for _ in cdx.snapshots():
            i = i + 1
@@ -377,9 +454,6 @@ class Url:
        Parameters
        ----------
        self : waybackpy.wrapper.Url
            The instance itself
        subdomain : bool
            If True fetch subdomain URLs along with the host URLs.
@@ -397,6 +471,8 @@ class Url:
        match_type : str
            One of  (exact, prefix, host and domain)
        return type : waybackpy.snapshot.CdxSnapshot
        Yields list of URLs known to exist for given input.
        Defaults to input URL as prefix.