Add doc strings (#90)
* Added some docstrings in utils.py * renamed some func/meth to better names and added doc strings + lint * added more docstrings * more docstrings * improve docstrings * docstrings * added more docstrings, lint * fix import error
This commit is contained in:
		@@ -11,6 +11,7 @@ from .utils import (
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# TODO : Threading support for pagination API. It's designed for Threading.
 | 
			
		||||
# TODO : Add get method here if type is valid HTML, SVG or other, but not - or warc. Test it.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Cdx:
 | 
			
		||||
@@ -42,7 +43,22 @@ class Cdx:
 | 
			
		||||
        self.use_page = False
 | 
			
		||||
 | 
			
		||||
    def cdx_api_manager(self, payload, headers, use_page=False):
 | 
			
		||||
        """
 | 
			
		||||
        """Act as button, we can choose between the normal API and pagination API.
 | 
			
		||||
 | 
			
		||||
        Parameters
 | 
			
		||||
        ----------
 | 
			
		||||
        self : waybackpy.cdx.Cdx
 | 
			
		||||
            The instance itself
 | 
			
		||||
 | 
			
		||||
        payload : dict
 | 
			
		||||
            Get request parameters name value pairs
 | 
			
		||||
 | 
			
		||||
        headers : dict
 | 
			
		||||
            The headers for making the GET request.
 | 
			
		||||
 | 
			
		||||
        use_page : bool
 | 
			
		||||
            If True use pagination API else use normal resume key based API.
 | 
			
		||||
 | 
			
		||||
        We have two options to get the snapshots, we use this
 | 
			
		||||
        method to make a selection between pagination API and
 | 
			
		||||
        the normal one with Resumption Key, sequential querying
 | 
			
		||||
@@ -141,7 +157,7 @@ class Cdx:
 | 
			
		||||
    def snapshots(self):
 | 
			
		||||
        """
 | 
			
		||||
        This function yields snapshots encapsulated
 | 
			
		||||
        in CdxSnapshot for more usability.
 | 
			
		||||
        in CdxSnapshot for increased usability.
 | 
			
		||||
 | 
			
		||||
        All the get request values are set if the conditions match
 | 
			
		||||
 | 
			
		||||
@@ -188,10 +204,9 @@ class Cdx:
 | 
			
		||||
 | 
			
		||||
                prop_values = snapshot.split(" ")
 | 
			
		||||
 | 
			
		||||
                # Making sure that we get the same number of
 | 
			
		||||
                # property values as the number of properties
 | 
			
		||||
                prop_values_len = len(prop_values)
 | 
			
		||||
                properties_len = len(properties)
 | 
			
		||||
 | 
			
		||||
                if prop_values_len != properties_len:
 | 
			
		||||
                    raise WaybackError(
 | 
			
		||||
                        "Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(
 | 
			
		||||
 
 | 
			
		||||
@@ -5,6 +5,7 @@ import json
 | 
			
		||||
import random
 | 
			
		||||
import string
 | 
			
		||||
import argparse
 | 
			
		||||
 | 
			
		||||
from .wrapper import Url
 | 
			
		||||
from .exceptions import WaybackError
 | 
			
		||||
from .__version__ import __version__
 | 
			
		||||
 
 | 
			
		||||
@@ -3,15 +3,24 @@ from datetime import datetime
 | 
			
		||||
 | 
			
		||||
class CdxSnapshot:
 | 
			
		||||
    """
 | 
			
		||||
    This class helps to use the Cdx Snapshots easily.
 | 
			
		||||
    This class encapsulates the snapshots for greater usability.
 | 
			
		||||
 | 
			
		||||
    Raw Snapshot data looks like:
 | 
			
		||||
    org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
 | 
			
		||||
 | 
			
		||||
    properties is a dict containing all of the 7 cdx snapshot properties.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(self, properties):
 | 
			
		||||
        """
 | 
			
		||||
        Parameters
 | 
			
		||||
        ----------
 | 
			
		||||
        self : waybackpy.snapshot.CdxSnapshot
 | 
			
		||||
            The instance itself
 | 
			
		||||
 | 
			
		||||
        properties : dict
 | 
			
		||||
            Properties is a dict containing all of the 7 cdx snapshot properties.
 | 
			
		||||
 | 
			
		||||
        """
 | 
			
		||||
        self.urlkey = properties["urlkey"]
 | 
			
		||||
        self.timestamp = properties["timestamp"]
 | 
			
		||||
        self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
 | 
			
		||||
@@ -25,6 +34,12 @@ class CdxSnapshot:
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def __str__(self):
 | 
			
		||||
        """Returns the Cdx snapshot line.
 | 
			
		||||
 | 
			
		||||
        Output format:
 | 
			
		||||
        org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
 | 
			
		||||
 | 
			
		||||
        """
 | 
			
		||||
        return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
 | 
			
		||||
            urlkey=self.urlkey,
 | 
			
		||||
            timestamp=self.timestamp,
 | 
			
		||||
 
 | 
			
		||||
@@ -1,28 +1,72 @@
 | 
			
		||||
import re
 | 
			
		||||
import time
 | 
			
		||||
import requests
 | 
			
		||||
from .exceptions import WaybackError, URLError
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
 | 
			
		||||
from .exceptions import WaybackError, URLError
 | 
			
		||||
from .__version__ import __version__
 | 
			
		||||
 | 
			
		||||
from urllib3.util.retry import Retry
 | 
			
		||||
from requests.adapters import HTTPAdapter
 | 
			
		||||
from .__version__ import __version__
 | 
			
		||||
 | 
			
		||||
quote = requests.utils.quote
 | 
			
		||||
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _latest_version(package_name, headers):
    """Returns the latest version of package_name published on PyPI.

    Parameters
    ----------
    package_name : str
        The name of the python package

    headers : dict
        Headers that will be used while making the GET request

    Return type is str

    Queries the JSON API <https://pypi.org/pypi/<package_name>/json> and
    returns the value stored under ["info"]["version"]. Used to look up
    the latest version of waybackpy, but works for any package name.
    """

    request_url = "https://pypi.org/pypi/" + package_name + "/json"
    response = _get_response(request_url, headers=headers)
    # Named ``data`` (not ``json``) so the local does not shadow the json module name.
    data = response.json()
    return data["info"]["version"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _unix_ts_to_wayback_ts(unix_ts):
 | 
			
		||||
    return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
 | 
			
		||||
def _unix_timestamp_to_wayback_timestamp(unix_timestamp):
 | 
			
		||||
    """Returns unix timestamp converted to datetime.datetime
 | 
			
		||||
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    unix_timestamp : str, int or float
 | 
			
		||||
        Unix-timestamp that needs to be converted to datetime.datetime
 | 
			
		||||
 | 
			
		||||
    Converts and returns input unix_timestamp to datetime.datetime object.
 | 
			
		||||
    Does not matter if unix_timestamp is str, float or int.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _add_payload(instance, payload):
 | 
			
		||||
    """Adds payload from instance that can be used to make get requests.
 | 
			
		||||
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    instance : waybackpy.cdx.Cdx
 | 
			
		||||
        instance of the Cdx class
 | 
			
		||||
 | 
			
		||||
    payload : dict
 | 
			
		||||
        A dict onto which we need to add keys and values based on instance.
 | 
			
		||||
 | 
			
		||||
    instance is object of Cdx class and it contains the data required to fill
 | 
			
		||||
    the payload dictionary.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if instance.start_timestamp:
 | 
			
		||||
        payload["from"] = instance.start_timestamp
 | 
			
		||||
 | 
			
		||||
@@ -43,18 +87,27 @@ def _add_payload(instance, payload):
 | 
			
		||||
        for i, f in enumerate(instance.collapses):
 | 
			
		||||
            payload["collapse" + str(i)] = f
 | 
			
		||||
 | 
			
		||||
    # Don't need to return anything as it's dictionary.
 | 
			
		||||
    payload["url"] = instance.url
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _ts(timestamp, data):
 | 
			
		||||
    """
 | 
			
		||||
    Get timestamp of last fetched archive.
 | 
			
		||||
    If used before fetching any archive, will
 | 
			
		||||
    use whatever self.JSON returns.
 | 
			
		||||
def _timestamp_manager(timestamp, data):
 | 
			
		||||
    """Returns the timestamp.
 | 
			
		||||
 | 
			
		||||
    self.timestamp is None implies that
 | 
			
		||||
    self.JSON will return any archive's JSON
 | 
			
		||||
    that wayback machine provides it.
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    timestamp : datetime.datetime
 | 
			
		||||
        datetime object
 | 
			
		||||
 | 
			
		||||
    data : dict
 | 
			
		||||
        A python dictionary, which is loaded JSON of the availability API.
 | 
			
		||||
 | 
			
		||||
    Return type:
 | 
			
		||||
        datetime.datetime
 | 
			
		||||
 | 
			
		||||
     If timestamp is not None then sets the value to timestamp itself.
 | 
			
		||||
     If timestamp is None then returns the value from the last fetched API data.
 | 
			
		||||
     If not timestamp and can not read the archived_snapshots from data return datetime.max
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if timestamp:
 | 
			
		||||
@@ -69,6 +122,21 @@ def _ts(timestamp, data):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_match_type(match_type, url):
 | 
			
		||||
    """Checks the validity of match_type parameter of the CDX GET requests.
 | 
			
		||||
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    match_type : list
 | 
			
		||||
        list  that may contain any or all from  ["exact", "prefix", "host", "domain"]
 | 
			
		||||
        See https://github.com/akamhy/waybackpy/wiki/Python-package-docs#url-match-scope
 | 
			
		||||
 | 
			
		||||
    url : str
 | 
			
		||||
        The URL used to create the waybackpy Url object.
 | 
			
		||||
 | 
			
		||||
    If not valid match_type raise Exception.
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if not match_type:
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
@@ -85,6 +153,19 @@ def _check_match_type(match_type, url):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_collapses(collapses):
 | 
			
		||||
    """Checks the validity of collapse parameter of the CDX GET request.
 | 
			
		||||
 | 
			
		||||
    One or more field or field:N to 'collapses=[]' where
 | 
			
		||||
    field is one of (urlkey, timestamp, original, mimetype, statuscode,
 | 
			
		||||
    digest and length) and N is the first N characters of field to test.
 | 
			
		||||
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    collapses : list
 | 
			
		||||
 | 
			
		||||
    If not valid collapses raise Exception.
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if not isinstance(collapses, list):
 | 
			
		||||
        raise WaybackError("collapses must be a list.")
 | 
			
		||||
@@ -119,12 +200,26 @@ def _check_collapses(collapses):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _check_filters(filters):
 | 
			
		||||
    """Checks the validity of filter parameter of the CDX GET request.
 | 
			
		||||
 | 
			
		||||
    Any number of filter params of the following form may be specified:
 | 
			
		||||
        filters=["[!]field:regex"] may be specified..
 | 
			
		||||
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    filters : list
 | 
			
		||||
 | 
			
		||||
    If not valid filters raise Exception.
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if not isinstance(filters, list):
 | 
			
		||||
        raise WaybackError("filters must be a list.")
 | 
			
		||||
 | 
			
		||||
    # [!]field:regex
 | 
			
		||||
    for _filter in filters:
 | 
			
		||||
        try:
 | 
			
		||||
 | 
			
		||||
            match = re.search(
 | 
			
		||||
                r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
 | 
			
		||||
                _filter,
 | 
			
		||||
@@ -134,8 +229,9 @@ def _check_filters(filters):
 | 
			
		||||
            val = match.group(2)
 | 
			
		||||
 | 
			
		||||
        except Exception:
 | 
			
		||||
 | 
			
		||||
            exc_message = (
 | 
			
		||||
                "Filter '{_filter}' not following the cdx filter syntax.".format(
 | 
			
		||||
                "Filter '{_filter}' is not following the cdx filter syntax.".format(
 | 
			
		||||
                    _filter=_filter
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
@@ -143,6 +239,9 @@ def _check_filters(filters):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _cleaned_url(url):
 | 
			
		||||
    """Sanatize the url
 | 
			
		||||
    Remove and replace illegal whitespace characters from the URL.
 | 
			
		||||
    """
 | 
			
		||||
    return str(url).strip().replace(" ", "%20")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -161,16 +260,29 @@ def _url_check(url):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _full_url(endpoint, params):
 | 
			
		||||
    full_url = endpoint
 | 
			
		||||
    if params:
 | 
			
		||||
        full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
 | 
			
		||||
        for key, val in params.items():
 | 
			
		||||
            key = "filter" if key.startswith("filter") else key
 | 
			
		||||
            key = "collapse" if key.startswith("collapse") else key
 | 
			
		||||
            amp = "" if full_url.endswith("?") else "&"
 | 
			
		||||
            full_url = (
 | 
			
		||||
                full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
 | 
			
		||||
            )
 | 
			
		||||
    """API endpoint + GET parameters = full_url
 | 
			
		||||
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    endpoint : str
 | 
			
		||||
        The API endpoint
 | 
			
		||||
 | 
			
		||||
    params : dict
 | 
			
		||||
        Dictionary that has name-value pairs.
 | 
			
		||||
 | 
			
		||||
    Return type is str
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if not params:
 | 
			
		||||
        return endpoint
 | 
			
		||||
 | 
			
		||||
    full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
 | 
			
		||||
    for key, val in params.items():
 | 
			
		||||
        key = "filter" if key.startswith("filter") else key
 | 
			
		||||
        key = "collapse" if key.startswith("collapse") else key
 | 
			
		||||
        amp = "" if full_url.endswith("?") else "&"
 | 
			
		||||
        full_url = full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
 | 
			
		||||
    return full_url
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -191,17 +303,31 @@ def _get_total_pages(url, user_agent):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _archive_url_parser(header, url, latest_version=__version__, instance=None):
 | 
			
		||||
    """
 | 
			
		||||
    """Returns the archive after parsing it from the response header.
 | 
			
		||||
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    header : str
 | 
			
		||||
        The response header of WayBack Machine's Save API
 | 
			
		||||
 | 
			
		||||
    url : str
 | 
			
		||||
        The input url, the one used to create the Url object.
 | 
			
		||||
 | 
			
		||||
    latest_version : str
 | 
			
		||||
        The latest version of waybackpy (default is __version__)
 | 
			
		||||
 | 
			
		||||
    instance : waybackpy.wrapper.Url
 | 
			
		||||
        Instance of Url class
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    The wayback machine's save API doesn't
 | 
			
		||||
    return JSON response, we are required
 | 
			
		||||
    to read the header of the API response
 | 
			
		||||
    and look for the archive URL.
 | 
			
		||||
    and find the archive URL.
 | 
			
		||||
 | 
			
		||||
    This method has some regexen (or regexes)
 | 
			
		||||
    that search for archive url in header.
 | 
			
		||||
 | 
			
		||||
    This method is used when you try to
 | 
			
		||||
    save a webpage on wayback machine.
 | 
			
		||||
    This method has some regular expressions
 | 
			
		||||
    that are used to search for the archive url
 | 
			
		||||
    in the response header of Save API.
 | 
			
		||||
 | 
			
		||||
    Two cases are possible:
 | 
			
		||||
    1) Either we find the archive url in
 | 
			
		||||
@@ -213,7 +339,6 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
 | 
			
		||||
    If we found the archive URL we return it.
 | 
			
		||||
 | 
			
		||||
    Return format:
 | 
			
		||||
 | 
			
		||||
    web.archive.org/web/<TIMESTAMP>/<URL>
 | 
			
		||||
 | 
			
		||||
    And if we couldn't find it, we raise
 | 
			
		||||
@@ -304,9 +429,7 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _wayback_timestamp(**kwargs):
 | 
			
		||||
    """
 | 
			
		||||
    Wayback Machine archive URLs
 | 
			
		||||
    have a timestamp in them.
 | 
			
		||||
    """Returns a valid waybackpy timestamp.
 | 
			
		||||
 | 
			
		||||
    The standard archive URL format is
 | 
			
		||||
    https://web.archive.org/web/20191214041711/https://www.youtube.com
 | 
			
		||||
@@ -316,13 +439,17 @@ def _wayback_timestamp(**kwargs):
 | 
			
		||||
    2 ) timestamp (20191214041711)
 | 
			
		||||
    3 ) https://www.youtube.com, the original URL
 | 
			
		||||
 | 
			
		||||
    The near method takes year, month, day, hour and minute
 | 
			
		||||
    as Arguments, their type is int.
 | 
			
		||||
 | 
			
		||||
    The near method of Url class in wrapper.py takes year, month, day, hour
 | 
			
		||||
    and minute as arguments, their type is int.
 | 
			
		||||
 | 
			
		||||
    This method takes those integers and converts it to
 | 
			
		||||
    wayback machine timestamp and returns it.
 | 
			
		||||
 | 
			
		||||
    Return format is string.
 | 
			
		||||
 | 
			
		||||
    zfill(2) adds 1 zero in front of single digit days, months hour etc.
 | 
			
		||||
 | 
			
		||||
    Return type is string.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    return "".join(
 | 
			
		||||
@@ -339,16 +466,37 @@ def _get_response(
 | 
			
		||||
    backoff_factor=0.5,
 | 
			
		||||
    no_raise_on_redirects=False,
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    This function is used make get request.
 | 
			
		||||
    We use the requests package to make the
 | 
			
		||||
    requests.
 | 
			
		||||
    """Makes get requests.
 | 
			
		||||
 | 
			
		||||
    Parameters
 | 
			
		||||
    ----------
 | 
			
		||||
    endpoint : str
 | 
			
		||||
        The API endpoint.
 | 
			
		||||
 | 
			
		||||
    params : dict
 | 
			
		||||
        The get request parameters. (default is None)
 | 
			
		||||
 | 
			
		||||
    headers : dict
 | 
			
		||||
        Headers for the get request. (default is None)
 | 
			
		||||
 | 
			
		||||
    return_full_url : bool
 | 
			
		||||
        Determines whether the full url is returned along with the
 | 
			
		||||
        response. (default is False)
 | 
			
		||||
 | 
			
		||||
    retries : int
 | 
			
		||||
        Maximum number of retries for the get request. (default is 5)
 | 
			
		||||
 | 
			
		||||
    backoff_factor : float
 | 
			
		||||
        The factor by which we determine the next retry time after wait.
 | 
			
		||||
        https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
 | 
			
		||||
        (default is 0.5)
 | 
			
		||||
 | 
			
		||||
    no_raise_on_redirects : bool
 | 
			
		||||
        If maximum 30 (default for requests) times redirected then instead of
 | 
			
		||||
        exceptions return. (default is False)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    We try five times and if it fails it raises
 | 
			
		||||
    WaybackError exception.
 | 
			
		||||
 | 
			
		||||
    You can handles WaybackError by importing:
 | 
			
		||||
    To handle WaybackError:
 | 
			
		||||
    from waybackpy.exceptions import WaybackError
 | 
			
		||||
 | 
			
		||||
    try:
 | 
			
		||||
@@ -370,20 +518,28 @@ def _get_response(
 | 
			
		||||
 | 
			
		||||
    s.mount("https://", HTTPAdapter(max_retries=retries))
 | 
			
		||||
 | 
			
		||||
    # The URL with parameters required for the get request
 | 
			
		||||
    url = _full_url(endpoint, params)
 | 
			
		||||
 | 
			
		||||
    try:
 | 
			
		||||
 | 
			
		||||
        if not return_full_url:
 | 
			
		||||
            return s.get(url, headers=headers)
 | 
			
		||||
 | 
			
		||||
        return (url, s.get(url, headers=headers))
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
 | 
			
		||||
        reason = str(e)
 | 
			
		||||
 | 
			
		||||
        if no_raise_on_redirects:
 | 
			
		||||
            if "Exceeded 30 redirects" in reason:
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
        exc_message = "Error while retrieving {url}.\n{reason}".format(
 | 
			
		||||
            url=url, reason=reason
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        exc = WaybackError(exc_message)
 | 
			
		||||
        exc.__cause__ = e
 | 
			
		||||
        raise exc
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,6 @@
 | 
			
		||||
import re
 | 
			
		||||
from datetime import datetime, timedelta
 | 
			
		||||
 | 
			
		||||
from .exceptions import WaybackError
 | 
			
		||||
from .cdx import Cdx
 | 
			
		||||
from .utils import (
 | 
			
		||||
@@ -9,13 +10,85 @@ from .utils import (
 | 
			
		||||
    default_user_agent,
 | 
			
		||||
    _url_check,
 | 
			
		||||
    _cleaned_url,
 | 
			
		||||
    _ts,
 | 
			
		||||
    _unix_ts_to_wayback_ts,
 | 
			
		||||
    _timestamp_manager,
 | 
			
		||||
    _unix_timestamp_to_wayback_timestamp,
 | 
			
		||||
    _latest_version,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Url:
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    Attributes
 | 
			
		||||
    ----------
 | 
			
		||||
    url : str
 | 
			
		||||
        The input URL, wayback machine API operations are performed
 | 
			
		||||
        on this URL after sanitizing it.
 | 
			
		||||
 | 
			
		||||
    user_agent : str
 | 
			
		||||
        The user_agent used while making the GET requests to the
 | 
			
		||||
        Wayback machine APIs
 | 
			
		||||
 | 
			
		||||
    _archive_url : str
 | 
			
		||||
        Caches the last fetched archive.
 | 
			
		||||
 | 
			
		||||
    timestamp : datetime.datetime
 | 
			
		||||
        timestamp of the archive URL as datetime object for
 | 
			
		||||
        greater usability
 | 
			
		||||
 | 
			
		||||
    _JSON : dict
 | 
			
		||||
        Caches the last fetched availability API data
 | 
			
		||||
 | 
			
		||||
    latest_version : str
 | 
			
		||||
        The latest version of waybackpy on PyPi
 | 
			
		||||
 | 
			
		||||
    cached_save : bool
 | 
			
		||||
        Flag to check if WayBack machine returned a cached
 | 
			
		||||
        archive instead of creating a new archive. WayBack
 | 
			
		||||
        machine allows only 1 archive for a URL in
 | 
			
		||||
        30 minutes. If the archive returned by WayBack machine
 | 
			
		||||
        is older than 3 minutes then this flag is set to True
 | 
			
		||||
 | 
			
		||||
    Methods turned properties
 | 
			
		||||
    ----------
 | 
			
		||||
    JSON : dict
 | 
			
		||||
        JSON response of availability API as dictionary / loaded JSON
 | 
			
		||||
 | 
			
		||||
    archive_url : str
 | 
			
		||||
        Return the archive url, returns str
 | 
			
		||||
 | 
			
		||||
    _timestamp : datetime.datetime
 | 
			
		||||
        Sets the value of self.timestamp if still not set
 | 
			
		||||
 | 
			
		||||
    Methods
 | 
			
		||||
    -------
 | 
			
		||||
    save()
 | 
			
		||||
        Archives the URL on WayBack machine
 | 
			
		||||
 | 
			
		||||
    get(url="", user_agent="", encoding="")
 | 
			
		||||
        Gets the source of archive url, can also be used to get source
 | 
			
		||||
        of any URL if passed into it.
 | 
			
		||||
 | 
			
		||||
    near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
 | 
			
		||||
        Wayback Machine can have many archives for a URL/webpage, sometimes we want
 | 
			
		||||
        archive close to a specific time.
 | 
			
		||||
        This method takes year, month, day, hour, minute and unix_timestamp as input.
 | 
			
		||||
 | 
			
		||||
    oldest(year=1994)
 | 
			
		||||
        The oldest archive of an URL.
 | 
			
		||||
 | 
			
		||||
    newest()
 | 
			
		||||
        The newest archive of an URL
 | 
			
		||||
 | 
			
		||||
    total_archives(start_timestamp=None, end_timestamp=None)
 | 
			
		||||
        total number of archives of an URL, the timeframe can be confined by
 | 
			
		||||
        start_timestamp and end_timestamp
 | 
			
		||||
 | 
			
		||||
    known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
 | 
			
		||||
        Known URLs for an URL, subdomain, URL as prefix etc.
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(self, url, user_agent=default_user_agent):
 | 
			
		||||
        self.url = url
 | 
			
		||||
        self.user_agent = str(user_agent)
 | 
			
		||||
@@ -32,29 +105,17 @@ class Url:
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def __str__(self):
 | 
			
		||||
        """
 | 
			
		||||
        Output when print() is used on <class 'waybackpy.wrapper.Url'>
 | 
			
		||||
        This should print an archive URL.
 | 
			
		||||
 | 
			
		||||
        We check if self._archive_url is not None.
 | 
			
		||||
        If not None, good. We return string of self._archive_url.
 | 
			
		||||
 | 
			
		||||
        If self._archive_url is None, it means we ain't used any method that
 | 
			
		||||
        sets self._archive_url, we now set self._archive_url to self.archive_url
 | 
			
		||||
        and return it.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if not self._archive_url:
 | 
			
		||||
            self._archive_url = self.archive_url
 | 
			
		||||
 | 
			
		||||
        return "{archive_url}".format(archive_url=self._archive_url)
 | 
			
		||||
 | 
			
		||||
    def __len__(self):
 | 
			
		||||
        """
 | 
			
		||||
        Why do we have len here?
 | 
			
		||||
        """Number of days between today and the date of archive based on the timestamp
 | 
			
		||||
 | 
			
		||||
        Applying len() on <class 'waybackpy.wrapper.Url'>
 | 
			
		||||
        will calculate the number of days between today and
 | 
			
		||||
        the archive timestamp.
 | 
			
		||||
        len() of waybackpy.wrapper.Url should return
 | 
			
		||||
        the number of days between today and the
 | 
			
		||||
        archive timestamp.
 | 
			
		||||
 | 
			
		||||
        Can be applied on return values of near and its
 | 
			
		||||
        childs (e.g. oldest) and if applied on waybackpy.Url()
 | 
			
		||||
@@ -76,32 +137,30 @@ class Url:
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def JSON(self):
 | 
			
		||||
        """
 | 
			
		||||
        If the end user has used near() or its childs like oldest, newest
 | 
			
		||||
        and archive_url then the JSON response of these are cached in self._JSON
 | 
			
		||||
        """Returns JSON response of availability API as dictionary / loaded JSON
 | 
			
		||||
 | 
			
		||||
        If we find that self._JSON is not None we return it.
 | 
			
		||||
        else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
 | 
			
		||||
        and return it.
 | 
			
		||||
        return type : dict
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        # If user used the near method or any method that depends on near, we
 | 
			
		||||
        # are certain that we have a loaded dictionary cached in self._JSON.
 | 
			
		||||
        # Return the loaded JSON data.
 | 
			
		||||
        if self._JSON:
 | 
			
		||||
            return self._JSON
 | 
			
		||||
 | 
			
		||||
        # If no cached data found, get data and return + cache it.
 | 
			
		||||
        endpoint = "https://archive.org/wayback/available"
 | 
			
		||||
        headers = {"User-Agent": self.user_agent}
 | 
			
		||||
        payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
 | 
			
		||||
        response = _get_response(endpoint, params=payload, headers=headers)
 | 
			
		||||
        return response.json()
 | 
			
		||||
        self._JSON = response.json()
 | 
			
		||||
        return self._JSON
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def archive_url(self):
 | 
			
		||||
        """
 | 
			
		||||
        Returns any random archive for the instance.
 | 
			
		||||
        But if near, oldest, newest were used before
 | 
			
		||||
        then it returns the same archive again.
 | 
			
		||||
        """Return the archive url.
 | 
			
		||||
 | 
			
		||||
        We cache archive in self._archive_url
 | 
			
		||||
        return type : str
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if self._archive_url:
 | 
			
		||||
@@ -121,11 +180,16 @@ class Url:
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def _timestamp(self):
 | 
			
		||||
        self.timestamp = _ts(self.timestamp, self.JSON)
 | 
			
		||||
        return self.timestamp
 | 
			
		||||
        """Sets the value of self.timestamp if still not set.
 | 
			
		||||
 | 
			
		||||
        Return type : datetime.datetime
 | 
			
		||||
 | 
			
		||||
        """
 | 
			
		||||
        return _timestamp_manager(self.timestamp, self.JSON)
 | 
			
		||||
 | 
			
		||||
    def save(self):
 | 
			
		||||
        """
 | 
			
		||||
        """Saves/Archive the URL.
 | 
			
		||||
 | 
			
		||||
        To save a webpage on WayBack machine we
 | 
			
		||||
        need to send get request to https://web.archive.org/save/
 | 
			
		||||
 | 
			
		||||
@@ -136,6 +200,8 @@ class Url:
 | 
			
		||||
 | 
			
		||||
        _archive_url_parser() parses the archive from the header.
 | 
			
		||||
 | 
			
		||||
        return type : waybackpy.wrapper.Url
 | 
			
		||||
 | 
			
		||||
        """
 | 
			
		||||
        request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
 | 
			
		||||
        headers = {"User-Agent": self.user_agent}
 | 
			
		||||
@@ -161,7 +227,9 @@ class Url:
 | 
			
		||||
            instance=self,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url)
 | 
			
		||||
        m = re.search(
 | 
			
		||||
            r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
 | 
			
		||||
        )
 | 
			
		||||
        str_ts = m.group(1)
 | 
			
		||||
        ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
 | 
			
		||||
        now = datetime.utcnow()
 | 
			
		||||
@@ -175,9 +243,22 @@ class Url:
 | 
			
		||||
        return self
 | 
			
		||||
 | 
			
		||||
    def get(self, url="", user_agent="", encoding=""):
 | 
			
		||||
        """
 | 
			
		||||
        Return the source code of the last archived URL,
 | 
			
		||||
        if no URL is passed to this method.
 | 
			
		||||
        """GET the source of archive or any other URL.
 | 
			
		||||
 | 
			
		||||
        url : str, waybackpy.wrapper.Url
 | 
			
		||||
            The method will return the source code of
 | 
			
		||||
            this URL instead of last fetched archive.
 | 
			
		||||
 | 
			
		||||
        user_agent : str
 | 
			
		||||
            The user_agent for GET request to API
 | 
			
		||||
 | 
			
		||||
        encoding : str
 | 
			
		||||
            If user is using any other encoding that
 | 
			
		||||
            can't be detected by response.encoding
 | 
			
		||||
 | 
			
		||||
        Return the source code of the last fetched
 | 
			
		||||
        archive URL if no URL is passed to this method
 | 
			
		||||
        else it returns the source code of url passed.
 | 
			
		||||
 | 
			
		||||
        If encoding is not supplied, it is auto-detected
 | 
			
		||||
         from the response itself by requests package.
 | 
			
		||||
@@ -213,6 +294,27 @@ class Url:
 | 
			
		||||
        unix_timestamp=None,
 | 
			
		||||
    ):
 | 
			
		||||
        """
 | 
			
		||||
        Parameters
 | 
			
		||||
        ----------
 | 
			
		||||
 | 
			
		||||
        year : int
 | 
			
		||||
            Archive close to year
 | 
			
		||||
 | 
			
		||||
        month : int
 | 
			
		||||
            Archive close to month
 | 
			
		||||
 | 
			
		||||
        day : int
 | 
			
		||||
            Archive close to day
 | 
			
		||||
 | 
			
		||||
        hour : int
 | 
			
		||||
            Archive close to hour
 | 
			
		||||
 | 
			
		||||
        minute : int
 | 
			
		||||
            Archive close to minute
 | 
			
		||||
 | 
			
		||||
        unix_timestamp : str, float or int
 | 
			
		||||
            Archive close to this unix_timestamp
 | 
			
		||||
 | 
			
		||||
        Wayback Machine can have many archives of a webpage,
 | 
			
		||||
        sometimes we want archive close to a specific time.
 | 
			
		||||
 | 
			
		||||
@@ -235,7 +337,7 @@ class Url:
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if unix_timestamp:
 | 
			
		||||
            timestamp = _unix_ts_to_wayback_ts(unix_timestamp)
 | 
			
		||||
            timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
 | 
			
		||||
        else:
 | 
			
		||||
            now = datetime.utcnow().timetuple()
 | 
			
		||||
            timestamp = _wayback_timestamp(
 | 
			
		||||
@@ -285,28 +387,45 @@ class Url:
 | 
			
		||||
 | 
			
		||||
        We simply pass the year in near() and return it.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        return self.near(year=year)
 | 
			
		||||
 | 
			
		||||
    def newest(self):
 | 
			
		||||
        """
 | 
			
		||||
        Return the newest Wayback Machine archive available for this URL.
 | 
			
		||||
        """Return the newest Wayback Machine archive available.
 | 
			
		||||
 | 
			
		||||
        We return the output of self.near() as it defaults to current UTC time.
 | 
			
		||||
        We return the return value of self.near() as it defaults to current UTC time.
 | 
			
		||||
 | 
			
		||||
        Due to Wayback Machine database lag, this may not always be the
 | 
			
		||||
        most recent archive.
 | 
			
		||||
 | 
			
		||||
        return type : waybackpy.wrapper.Url
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        return self.near()
 | 
			
		||||
 | 
			
		||||
    def total_archives(self, start_timestamp=None, end_timestamp=None):
 | 
			
		||||
        """
 | 
			
		||||
        """Returns the total number of archives for an URL
 | 
			
		||||
 | 
			
		||||
        Parameters
 | 
			
		||||
        ----------
 | 
			
		||||
        start_timestamp : str
 | 
			
		||||
            1 to 14 digit string of numbers, you are not required to
 | 
			
		||||
            pass a full 14 digit timestamp.
 | 
			
		||||
 | 
			
		||||
        end_timestamp : str
 | 
			
		||||
            1 to 14 digit string of numbers, you are not required to
 | 
			
		||||
            pass a full 14 digit timestamp.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        return type : int
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        A webpage can have multiple archives on the wayback machine
 | 
			
		||||
        If someone wants to count the total number of archives of a
 | 
			
		||||
        webpage on wayback machine they can use this method.
 | 
			
		||||
 | 
			
		||||
        Returns the total number of Wayback Machine archives for the URL.
 | 
			
		||||
 | 
			
		||||
        Return type is integer.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        cdx = Cdx(
 | 
			
		||||
@@ -315,6 +434,8 @@ class Url:
 | 
			
		||||
            start_timestamp=start_timestamp,
 | 
			
		||||
            end_timestamp=end_timestamp,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # cdx.snapshots() is generator not list.
 | 
			
		||||
        i = 0
 | 
			
		||||
        for _ in cdx.snapshots():
 | 
			
		||||
            i = i + 1
 | 
			
		||||
@@ -328,15 +449,36 @@ class Url:
 | 
			
		||||
        end_timestamp=None,
 | 
			
		||||
        match_type="prefix",
 | 
			
		||||
    ):
 | 
			
		||||
        """
 | 
			
		||||
        """Yields known_urls URLs from the CDX API.
 | 
			
		||||
 | 
			
		||||
        Parameters
 | 
			
		||||
        ----------
 | 
			
		||||
 | 
			
		||||
        subdomain : bool
 | 
			
		||||
            If True fetch subdomain URLs along with the host URLs.
 | 
			
		||||
 | 
			
		||||
        host : bool
 | 
			
		||||
            Only fetch host URLs.
 | 
			
		||||
 | 
			
		||||
        start_timestamp : str
 | 
			
		||||
            1 to 14 digit string of numbers, you are not required to
 | 
			
		||||
            pass a full 14 digit timestamp.
 | 
			
		||||
 | 
			
		||||
        end_timestamp : str
 | 
			
		||||
            1 to 14 digit string of numbers, you are not required to
 | 
			
		||||
            pass a full 14 digit timestamp.
 | 
			
		||||
 | 
			
		||||
        match_type : str
 | 
			
		||||
            One of: exact, prefix, host or domain.
 | 
			
		||||
 | 
			
		||||
        return type : waybackpy.snapshot.CdxSnapshot
 | 
			
		||||
 | 
			
		||||
        Yields list of URLs known to exist for given input.
 | 
			
		||||
        Defaults to input URL as prefix.
 | 
			
		||||
 | 
			
		||||
        This method is kept for compatibility, use the Cdx class instead.
 | 
			
		||||
        This method itself depends on Cdx.
 | 
			
		||||
 | 
			
		||||
         Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
 | 
			
		||||
         https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
 | 
			
		||||
        Based on:
 | 
			
		||||
        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
 | 
			
		||||
        By Mohammed Diaa (https://github.com/mhmdiaa)
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if subdomain:
 | 
			
		||||
@@ -353,7 +495,5 @@ class Url:
 | 
			
		||||
            collapses=["urlkey"],
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        snapshots = cdx.snapshots()
 | 
			
		||||
 | 
			
		||||
        for snapshot in snapshots:
 | 
			
		||||
        for snapshot in cdx.snapshots():
 | 
			
		||||
            yield (snapshot.original)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user