Added doc strings in wrapper.py, documenting code and improving docs.

2021-01-03 17:11:32 +05:30
parent db5737a857
commit 9823c809e9
1 changed files with 131 additions and 14 deletions
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -12,7 +12,30 @@ default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
 def _archive_url_parser(header):
-    """Parse out the archive from header."""
+    """
    This method has some regexen (or regexes)
    that search for archive url in header.
    This method is used when you try to
    save a webpage on wayback machine.
    The wayback machine's save API doesn't
    return JSON response, we are required
    to read the header of the API response
    and look for the archive URL.
    Two cases are possible:
    1) Either we find the archive url in
       the header.
    2) We didn't find the archive url in
       API header.
    If we found the archive we return it.
    And if we couldn't find it we raise
    WaybackError with a standard Error message.
    """
    # Regex1
    arch = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
    if arch:
@@ -36,14 +59,49 @@ def _archive_url_parser(header):
 def _wayback_timestamp(**kwargs):
-    """Return a formatted timestamp."""
+    """
    Wayback Machine archive URLs
    have a timestamp in them.
    The standard archive URL format is
    https://web.archive.org/web/20191214041711/https://www.youtube.com
    If we break it down in three parts:
    1 ) The start (https://web.archive.org/web/)
    2 ) timestamp (20191214041711)
    3 ) https://www.youtube.com, the original URL
    The near method takes year, month, day, hour and minute
    as Arguments, their type is int.
    This method takes those integers and converts it to
    wayback machine timestamp and returns it.
    Return format is string.
    """
    return "".join(
        str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
    )
 def _get_response(endpoint, params=None, headers=None):
-    """Get response for the supplied request."""
+    """
    This function is used make get request.
    We use the requests package to make the
    requests.
    We try twice and if both the times is fails And
    raises exceptions we give-up and raise WaybackError.
    You can handles WaybackError by importing:
    from waybackpy.exceptions import WaybackError
    try:
        ...
    except WaybackError as e:
        # handle it
    """
    try:
        response = requests.get(endpoint, params=params, headers=headers)
@@ -58,14 +116,16 @@ def _get_response(endpoint, params=None, headers=None):
 class Url:
-    """waybackpy Url object"""
+    """
    waybackpy Url object <class 'waybackpy.wrapper.Url'>
    """
    def __init__(self, url, user_agent=default_UA):
        self.url = url
        self.user_agent = user_agent
        self._url_check()  # checks url validity on init.
-        self._archive_url = None  # URL of archive
+        self._archive_url = None
-        self.timestamp = None  # timestamp for last archive
+        self.timestamp = None
        self._JSON = None
        self._alive_url_list = []
@@ -73,6 +133,17 @@ class Url:
        return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
    def __str__(self):
        """
        Output when print() is used on <class 'waybackpy.wrapper.Url'>
        This should print an archive URL.
        We check if self._archive_url is not None.
        If not None, good. We return string of self._archive_url.
        If self._archive_url is None, it means we ain't used any method that
        sets self._archive_url, we now set self._archive_url to self.archive_url
        and return it.
        """
        if not self._archive_url:
            self._archive_url = self.archive_url
        return "%s" % self._archive_url
@@ -92,7 +163,13 @@ class Url:
        return diff.days
    def _url_check(self):
-        """Check for common URL problems."""
+        """
        Check for common URL problems.
        What we are checking:
        1) '.' in self.url, no url that ain't '.' in it.
        If you known any others, please create a PR on the github repo.
        """
        if "." not in self.url:
            raise URLError("'%s' is not a vaild URL." % self.url)
@@ -198,10 +275,26 @@ class Url:
        return response.content.decode(encoding.replace("text/html", "UTF-8", 1))
    def near(self, year=None, month=None, day=None, hour=None, minute=None):
-        """Return the closest Wayback Machine archive to the time supplied.
+        """
-        Supported params are year, month, day, hour and minute.
+        Wayback Machine can have many archives of a webpage,
-        Any non-supplied parameters default to the current time.
+        sometimes we want archive close to a specific time.
        This method takes year, month, day, hour and minute as input.
        The input type must be integer. Any non-supplied parameters
        default to the current time.
        We convert the input to a wayback machine timestamp using
        _wayback_timestamp(), it returns a string.
        We use the wayback machine's availability API
        (https://archive.org/wayback/available)
        to get the closest archive from the timestamp.
        We set self._archive_url to the archive found, if any.
        If archive found, we set self.timestamp to its timestamp.
        We self._JSON to the response of the availability API.
        And finally return self.
        """
        now = datetime.utcnow().timetuple()
        timestamp = _wayback_timestamp(
@@ -237,11 +330,22 @@ class Url:
        return self
    def oldest(self, year=1994):
-        """Return the oldest Wayback Machine archive for this URL."""
+        """
        Returns the earliest/oldest Wayback Machine archive for the webpage.
        Wayback machine has started archiving the internet around 1997 and
        therefore we can't have any archive older than 1997, we use 1994 as the
        deafult year to look for the oldest archive.
        We simply pass the year in near() and return it.
        """
        return self.near(year=year)
    def newest(self):
-        """Return the newest Wayback Machine archive available for this URL.
+        """
        Return the newest Wayback Machine archive available for this URL.
        We return the output of self.near() as it deafults to current utc time.
        Due to Wayback Machine database lag, this may not always be the
        most recent archive.
@@ -249,7 +353,15 @@ class Url:
        return self.near()
    def total_archives(self):
-        """Returns the total number of Wayback Machine archives for this URL."""
+        """
        A webpage can have multiple archives on the wayback machine
        If someone wants to count the total number of archives of a
        webpage on wayback machine they can use this method.
        Returns the total number of Wayback Machine archives for the URL.
        Return type in integer.
        """
        endpoint = "https://web.archive.org/cdx/search/cdx"
        headers = {
@@ -264,6 +376,10 @@ class Url:
        return response.text.count(",")
    def live_urls_picker(self, url):
        """
        This method is used to check if supplied url
        is >= 400.
        """
        try:
            response_code = requests.get(url).status_code
@@ -277,7 +393,8 @@ class Url:
        self._alive_url_list.append(url)
    def known_urls(self, alive=False, subdomain=False):
-        """Returns list of URLs known to exist for given domain name
+        """
        Returns list of URLs known to exist for given domain name
        because these URLs were crawled by WayBack Machine bots.
        Useful for pen-testers and others.
        Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: