Add doc strings (#90)

* Added some docstrings in utils.py

* renamed some func/meth to better names and added doc strings + lint

* added more docstrings

* more docstrings

* improve docstrings

* docstrings

* added more docstrings, lint

* fix import error
This commit is contained in:
Akash Mahanty 2021-01-26 11:56:03 +05:30 committed by GitHub
parent 88cda94c0b
commit db8f902cff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 443 additions and 121 deletions

View File

@ -79,7 +79,7 @@ def test_all_cdx():
c = 0
for snapshot in snapshots:
c += 1
if c > 30_529: # deafult limit is 10k
if c > 30529: # deafult limit is 10k
break
url = "https://github.com/*"
@ -89,5 +89,5 @@ def test_all_cdx():
for snapshot in snapshots:
c += 1
if c > 100_529:
if c > 100529:
break

View File

@ -5,8 +5,7 @@ import random
import string
import argparse
sys.path.append("..")
import waybackpy.cli as cli # noqa: E402
import waybackpy.cli as cli
from waybackpy.wrapper import Url # noqa: E402
from waybackpy.__version__ import __version__

View File

@ -14,14 +14,14 @@ from waybackpy.utils import (
_check_match_type,
_check_collapses,
_check_filters,
_ts,
_timestamp_manager,
)
def test_ts():
def test_timestamp_manager():
timestamp = True
data = {}
assert _ts(timestamp, data)
assert _timestamp_manager(timestamp, data)
data = """
{"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"}
@ -61,10 +61,10 @@ def test_check_collapses():
def test_check_match_type():
assert None == _check_match_type(None, "url")
assert _check_match_type(None, "url") is None
match_type = "exact"
url = "test_url"
assert None == _check_match_type(match_type, url)
assert _check_match_type(match_type, url) is None
url = "has * in it"
with pytest.raises(WaybackError):
@ -82,7 +82,7 @@ def test_cleaned_url():
def test_url_check():
good_url = "https://akamhy.github.io"
assert None == _url_check(good_url)
assert _url_check(good_url) is None
bad_url = "https://github-com"
with pytest.raises(URLError):

View File

@ -1,8 +1,4 @@
import sys
import pytest
import random
import requests
from datetime import datetime
from waybackpy.wrapper import Url

View File

@ -11,6 +11,7 @@ from .utils import (
)
# TODO : Threading support for pagination API. It's designed for Threading.
# TODO : Add get method here if type is valid HTML, SVG other but not - or warc. Test it.
class Cdx:
@ -42,7 +43,22 @@ class Cdx:
self.use_page = False
def cdx_api_manager(self, payload, headers, use_page=False):
"""
"""Act as button, we can choose between the normal API and pagination API.
Parameters
----------
self : waybackpy.cdx.Cdx
The instance itself
payload : dict
Get request parameters name value pairs
headers : dict
The headers for making the GET request.
use_page : bool
If True use pagination API else use normal resume key based API.
We have two options to get the snapshots, we use this
method to make a selection between pagination API and
the normal one with Resumption Key, sequential querying
@ -141,7 +157,7 @@ class Cdx:
def snapshots(self):
"""
This function yields snapshots encapsulated
in CdxSnapshot for more usability.
in CdxSnapshot for increased usability.
All the get request values are set if the conditions match
@ -188,10 +204,9 @@ class Cdx:
prop_values = snapshot.split(" ")
# Making sure that we get the same number of
# property values as the number of properties
prop_values_len = len(prop_values)
properties_len = len(properties)
if prop_values_len != properties_len:
raise WaybackError(
"Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(

View File

@ -5,6 +5,7 @@ import json
import random
import string
import argparse
from .wrapper import Url
from .exceptions import WaybackError
from .__version__ import __version__

View File

@ -3,15 +3,24 @@ from datetime import datetime
class CdxSnapshot:
"""
This class helps to use the Cdx Snapshots easily.
This class encapsulates the snapshots for greater usability.
Raw Snapshot data looks like:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
properties is a dict containg all of the 7 cdx snapshot properties.
"""
def __init__(self, properties):
"""
Parameters
----------
self : waybackpy.snapshot.CdxSnapshot
The instance itself
properties : dict
Properties is a dict containing all of the 7 cdx snapshot properties.
"""
self.urlkey = properties["urlkey"]
self.timestamp = properties["timestamp"]
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
@ -25,6 +34,12 @@ class CdxSnapshot:
)
def __str__(self):
"""Returns the Cdx snapshot line.
Output format:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
urlkey=self.urlkey,
timestamp=self.timestamp,

View File

@ -1,28 +1,72 @@
import re
import time
import requests
from .exceptions import WaybackError, URLError
from datetime import datetime
from .exceptions import WaybackError, URLError
from .__version__ import __version__
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from .__version__ import __version__
quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _latest_version(package_name, headers):
endpoint = "https://pypi.org/pypi/" + package_name + "/json"
json = _get_response(endpoint, headers=headers).json()
return json["info"]["version"]
"""Returns the latest version of package_name.
Parameters
----------
package_name : str
The name of the python package
headers : dict
Headers that will be used while making get requests
Return type is str
Use API <https://pypi.org/pypi/> to get the latest version of
waybackpy, but can be used to get latest version of any package
on PyPi.
"""
request_url = "https://pypi.org/pypi/" + package_name + "/json"
response = _get_response(request_url, headers=headers)
data = response.json()
return data["info"]["version"]
def _unix_ts_to_wayback_ts(unix_ts):
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
def _unix_timestamp_to_wayback_timestamp(unix_timestamp):
"""Returns unix timestamp converted to datetime.datetime
Parameters
----------
unix_timestamp : str, int or float
Unix-timestamp that needs to be converted to datetime.datetime
Converts and returns input unix_timestamp to datetime.datetime object.
Does not matter if unix_timestamp is str, float or int.
"""
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def _add_payload(instance, payload):
"""Adds payload from instance that can be used to make get requests.
Parameters
----------
instance : waybackpy.cdx.Cdx
instance of the Cdx class
payload : dict
A dict onto which we need to add keys and values based on instance.
instance is object of Cdx class and it contains the data required to fill
the payload dictionary.
"""
if instance.start_timestamp:
payload["from"] = instance.start_timestamp
@ -43,18 +87,27 @@ def _add_payload(instance, payload):
for i, f in enumerate(instance.collapses):
payload["collapse" + str(i)] = f
# Don't need to return anything as it's dictionary.
payload["url"] = instance.url
def _ts(timestamp, data):
"""
Get timestamp of last fetched archive.
If used before fetching any archive, will
use whatever self.JSON returns.
def _timestamp_manager(timestamp, data):
"""Returns the timestamp.
self.timestamp is None implies that
self.JSON will return any archive's JSON
that wayback machine provides it.
Parameters
----------
timestamp : datetime.datetime
datetime object
data : dict
A Python dictionary, which is the loaded JSON of the availability API.
Return type:
datetime.datetime
If timestamp is not None then sets the value to timestamp itself.
If timestamp is None then returns the value from the last fetched API data.
If not timestamp and can not read the archived_snapshots from data, return datetime.max
"""
if timestamp:
@ -69,6 +122,21 @@ def _ts(timestamp, data):
def _check_match_type(match_type, url):
"""Checks the validity of match_type parameter of the CDX GET requests.
Parameters
----------
match_type : str
One of "exact", "prefix", "host" or "domain"
See https://github.com/akamhy/waybackpy/wiki/Python-package-docs#url-match-scope
url : str
The URL used to create the waybackpy Url object.
If match_type is not valid, raise an exception.
"""
if not match_type:
return
@ -85,6 +153,19 @@ def _check_match_type(match_type, url):
def _check_collapses(collapses):
"""Checks the validity of collapse parameter of the CDX GET request.
One or more field or field:N to 'collapses=[]' where
field is one of (urlkey, timestamp, original, mimetype, statuscode,
digest and length) and N is the first N characters of field to test.
Parameters
----------
collapses : list
If collapses is not valid, raise an exception.
"""
if not isinstance(collapses, list):
raise WaybackError("collapses must be a list.")
@ -119,12 +200,26 @@ def _check_collapses(collapses):
def _check_filters(filters):
"""Checks the validity of filter parameter of the CDX GET request.
Any number of filter params of the following form may be specified:
filters=["[!]field:regex"]
Parameters
----------
filters : list
If filters is not valid, raise an exception.
"""
if not isinstance(filters, list):
raise WaybackError("filters must be a list.")
# [!]field:regex
for _filter in filters:
try:
match = re.search(
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
_filter,
@ -134,8 +229,9 @@ def _check_filters(filters):
val = match.group(2)
except Exception:
exc_message = (
"Filter '{_filter}' not following the cdx filter syntax.".format(
"Filter '{_filter}' is not following the cdx filter syntax.".format(
_filter=_filter
)
)
@ -143,6 +239,9 @@ def _check_filters(filters):
def _cleaned_url(url):
"""Sanatize the url
Remove and replace illegal whitespace characters from the URL.
"""
return str(url).strip().replace(" ", "%20")
@ -161,16 +260,29 @@ def _url_check(url):
def _full_url(endpoint, params):
full_url = endpoint
if params:
"""API endpoint + GET parameters = full_url
Parameters
----------
endpoint : str
The API endpoint
params : dict
Dictionary that has name-value pairs.
Return type is str
"""
if not params:
return endpoint
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
for key, val in params.items():
key = "filter" if key.startswith("filter") else key
key = "collapse" if key.startswith("collapse") else key
amp = "" if full_url.endswith("?") else "&"
full_url = (
full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
)
full_url = full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
return full_url
@ -191,17 +303,31 @@ def _get_total_pages(url, user_agent):
def _archive_url_parser(header, url, latest_version=__version__, instance=None):
"""
"""Returns the archive after parsing it from the response header.
Parameters
----------
header : str
The response header of WayBack Machine's Save API
url : str
The input url, the one used to created the Url object.
latest_version : str
The latest version of waybackpy (default is __version__)
instance : waybackpy.wrapper.Url
Instance of Url class
The wayback machine's save API doesn't
return JSON response, we are required
to read the header of the API response
and look for the archive URL.
and find the archive URL.
This method has some regexen (or regexes)
that search for archive url in header.
This method is used when you try to
save a webpage on wayback machine.
This method has some regular expressions
that are used to search for the archive url
in the response header of Save API.
Two cases are possible:
1) Either we find the archive url in
@ -213,7 +339,6 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
If we found the archive URL we return it.
Return format:
web.archive.org/web/<TIMESTAMP>/<URL>
And if we couldn't find it, we raise
@ -304,9 +429,7 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
def _wayback_timestamp(**kwargs):
"""
Wayback Machine archive URLs
have a timestamp in them.
"""Returns a valid waybackpy timestamp.
The standard archive URL format is
https://web.archive.org/web/20191214041711/https://www.youtube.com
@ -316,13 +439,17 @@ def _wayback_timestamp(**kwargs):
2 ) timestamp (20191214041711)
3 ) https://www.youtube.com, the original URL
The near method takes year, month, day, hour and minute
as Arguments, their type is int.
The near method of Url class in wrapper.py takes year, month, day, hour
and minute as arguments, their type is int.
This method takes those integers and converts it to
wayback machine timestamp and returns it.
Return format is string.
zfill(2) adds 1 zero in front of single digit days, months hour etc.
Return type is string.
"""
return "".join(
@ -339,16 +466,37 @@ def _get_response(
backoff_factor=0.5,
no_raise_on_redirects=False,
):
"""
This function is used make get request.
We use the requests package to make the
requests.
"""Makes get requests.
Parameters
----------
endpoint : str
The API endpoint.
params : dict
The get request parameters. (default is None)
headers : dict
Headers for the get request. (default is None)
return_full_url : bool
Determines whether the full URL is returned along with the
response, as a (url, response) tuple. (default is False)
retries : int
Maximum number of retries for the get request. (default is 5)
backoff_factor : float
The factor by which we determine the next retry time after wait.
https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
(default is 0.5)
no_raise_on_redirects : bool
If True and the request is redirected more than 30 times (the default
limit for requests), return None instead of raising. (default is False)
We try five times and if it fails it raises
WaybackError exception.
You can handles WaybackError by importing:
To handle WaybackError:
from waybackpy.exceptions import WaybackError
try:
@ -370,20 +518,28 @@ def _get_response(
s.mount("https://", HTTPAdapter(max_retries=retries))
# The URL with parameters required for the get request
url = _full_url(endpoint, params)
try:
if not return_full_url:
return s.get(url, headers=headers)
return (url, s.get(url, headers=headers))
except Exception as e:
reason = str(e)
if no_raise_on_redirects:
if "Exceeded 30 redirects" in reason:
return
exc_message = "Error while retrieving {url}.\n{reason}".format(
url=url, reason=reason
)
exc = WaybackError(exc_message)
exc.__cause__ = e
raise exc

View File

@ -1,5 +1,6 @@
import re
from datetime import datetime, timedelta
from .exceptions import WaybackError
from .cdx import Cdx
from .utils import (
@ -9,13 +10,85 @@ from .utils import (
default_user_agent,
_url_check,
_cleaned_url,
_ts,
_unix_ts_to_wayback_ts,
_timestamp_manager,
_unix_timestamp_to_wayback_timestamp,
_latest_version,
)
class Url:
"""
Attributes
----------
url : str
The input URL, wayback machine API operations are performed
on this URL after sanitizing it.
user_agent : str
The user_agent used while making the GET requests to the
Wayback machine APIs
_archive_url : str
Caches the last fetched archive.
timestamp : datetime.datetime
timestamp of the archive URL as datetime object for
greater usability
_JSON : dict
Caches the last fetched availability API data
latest_version : str
The latest version of waybackpy on PyPi
cached_save : bool
Flag to check if WayBack machine returned a cached
archive instead of creating a new archive. WayBack
machine allows only one archive for a URL in
30 minutes. If the archive returned by WayBack machine
is older than 3 minutes then this flag is set to True
Methods turned properties
----------
JSON : dict
JSON response of availability API as dictionary / loaded JSON
archive_url : str
Return the archive url, returns str
_timestamp : datetime.datetime
Sets the value of self.timestamp if still not set
Methods
-------
save()
Archives the URL on WayBack machine
get(url="", user_agent="", encoding="")
Gets the source of archive url, can also be used to get source
of any URL if passed into it.
near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
Wayback Machine can have many archives for a URL/webpage, sometimes we want
archive close to a specific time.
This method takes year, month, day, hour, minute and unix_timestamp as input.
oldest(year=1994)
The oldest archive of an URL.
newest()
The newest archive of an URL
total_archives(start_timestamp=None, end_timestamp=None)
total number of archives of an URL, the timeframe can be confined by
start_timestamp and end_timestamp
known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
Known URLs for an URL, subdomain, URL as prefix etc.
"""
def __init__(self, url, user_agent=default_user_agent):
self.url = url
self.user_agent = str(user_agent)
@ -32,29 +105,17 @@ class Url:
)
def __str__(self):
"""
Output when print() is used on <class 'waybackpy.wrapper.Url'>
This should print an archive URL.
We check if self._archive_url is not None.
If not None, good. We return string of self._archive_url.
If self._archive_url is None, it means we ain't used any method that
sets self._archive_url, we now set self._archive_url to self.archive_url
and return it.
"""
if not self._archive_url:
self._archive_url = self.archive_url
return "{archive_url}".format(archive_url=self._archive_url)
def __len__(self):
"""
Why do we have len here?
"""Number of days between today and the date of archive based on the timestamp
Applying len() on <class 'waybackpy.wrapper.Url'>
will calculate the number of days between today and
the archive timestamp.
len() of waybackpy.wrapper.Url should return
the number of days between today and the
archive timestamp.
Can be applied on return values of near and its
childs (e.g. oldest) and if applied on waybackpy.Url()
@ -76,32 +137,30 @@ class Url:
@property
def JSON(self):
"""
If the end user has used near() or its childs like oldest, newest
and archive_url then the JSON response of these are cached in self._JSON
"""Returns JSON response of availability API as dictionary / loaded JSON
If we find that self._JSON is not None we return it.
else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
and return it.
return type : dict
"""
# If user used the near method or any method that depends on near, we
# are certain that we have a loaded dictionary cached in self._JSON.
# Return the loaded JSON data.
if self._JSON:
return self._JSON
# If no cached data found, get data and return + cache it.
endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": self.user_agent}
payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
response = _get_response(endpoint, params=payload, headers=headers)
return response.json()
self._JSON = response.json()
return self._JSON
@property
def archive_url(self):
"""
Returns any random archive for the instance.
But if near, oldest, newest were used before
then it returns the same archive again.
"""Return the archive url.
We cache archive in self._archive_url
return type : str
"""
if self._archive_url:
@ -121,11 +180,16 @@ class Url:
@property
def _timestamp(self):
self.timestamp = _ts(self.timestamp, self.JSON)
return self.timestamp
"""Sets the value of self.timestamp if still not set.
Return type : datetime.datetime
"""
return _timestamp_manager(self.timestamp, self.JSON)
def save(self):
"""
"""Saves/Archive the URL.
To save a webpage on WayBack machine we
need to send get request to https://web.archive.org/save/
@ -136,6 +200,8 @@ class Url:
_archive_url_parser() parses the archive from the header.
return type : waybackpy.wrapper.Url
"""
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
headers = {"User-Agent": self.user_agent}
@ -161,7 +227,9 @@ class Url:
instance=self,
)
m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url)
m = re.search(
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
)
str_ts = m.group(1)
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
now = datetime.utcnow()
@ -175,9 +243,22 @@ class Url:
return self
def get(self, url="", user_agent="", encoding=""):
"""
Return the source code of the last archived URL,
if no URL is passed to this method.
"""GET the source of archive or any other URL.
url : str, waybackpy.wrapper.Url
The method will return the source code of
this URL instead of last fetched archive.
user_agent : str
The user_agent for GET request to API
encoding : str
If user is using any other encoding that
can't be detected by response.encoding
Return the source code of the last fetched
archive URL if no URL is passed to this method
else it returns the source code of url passed.
If encoding is not supplied, it is auto-detected
from the response itself by requests package.
@ -213,6 +294,27 @@ class Url:
unix_timestamp=None,
):
"""
Parameters
----------
year : int
Archive close to year
month : int
Archive close to month
day : int
Archive close to day
hour : int
Archive close to hour
minute : int
Archive close to minute
unix_timestamp : str, float or int
Archive close to this unix_timestamp
Wayback Machine can have many archives of a webpage,
sometimes we want archive close to a specific time.
@ -235,7 +337,7 @@ class Url:
"""
if unix_timestamp:
timestamp = _unix_ts_to_wayback_ts(unix_timestamp)
timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
else:
now = datetime.utcnow().timetuple()
timestamp = _wayback_timestamp(
@ -285,28 +387,45 @@ class Url:
We simply pass the year in near() and return it.
"""
return self.near(year=year)
def newest(self):
"""
Return the newest Wayback Machine archive available for this URL.
"""Return the newest Wayback Machine archive available.
We return the output of self.near() as it deafults to current utc time.
We return the return value of self.near() as it defaults to current UTC time.
Due to Wayback Machine database lag, this may not always be the
most recent archive.
return type : waybackpy.wrapper.Url
"""
return self.near()
def total_archives(self, start_timestamp=None, end_timestamp=None):
"""
"""Returns the total number of archives for an URL
Parameters
----------
start_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
end_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
return type : int
A webpage can have multiple archives on the wayback machine
If someone wants to count the total number of archives of a
webpage on wayback machine they can use this method.
Returns the total number of Wayback Machine archives for the URL.
Return type in integer.
"""
cdx = Cdx(
@ -315,6 +434,8 @@ class Url:
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
)
# cdx.snapshots() is generator not list.
i = 0
for _ in cdx.snapshots():
i = i + 1
@ -328,15 +449,36 @@ class Url:
end_timestamp=None,
match_type="prefix",
):
"""
"""Yields known_urls URLs from the CDX API.
Parameters
----------
subdomain : bool
If True fetch subdomain URLs along with the host URLs.
host : bool
Only fetch host URLs.
start_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
end_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
match_type : str
One of (exact, prefix, host and domain)
yield type : str, the original URL of each snapshot (snapshot.original)
Yields list of URLs known to exist for given input.
Defaults to input URL as prefix.
This method is kept for compatibility, use the Cdx class instead.
This method itself depends on Cdx.
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
Based on:
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
By Mohammed Diaa (https://github.com/mhmdiaa)
"""
if subdomain:
@ -353,7 +495,5 @@ class Url:
collapses=["urlkey"],
)
snapshots = cdx.snapshots()
for snapshot in snapshots:
for snapshot in cdx.snapshots():
yield (snapshot.original)