Added some docstrings in utils.py

This commit is contained in:
Akash Mahanty
2021-01-25 20:37:59 +05:30
parent 88cda94c0b
commit d1061bfdc8
2 changed files with 89 additions and 23 deletions

View File

@@ -13,16 +13,58 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
def _latest_version(package_name, headers):
    """Return the latest version of package_name published on PyPI.

    Uses the JSON API at <https://pypi.org/pypi/> and reads the
    ``info.version`` field of the response. Written to look up waybackpy,
    but works for any package name.

    Parameters
    ----------
    package_name : str
        The name of the python package.
    headers : dict
        Headers that will be used while making the get request.

    Returns
    -------
    str
        The version string found under ``["info"]["version"]``.
    """
    endpoint = "https://pypi.org/pypi/" + package_name + "/json"
    # Named ``metadata`` rather than ``json`` so the stdlib module name is
    # not shadowed inside this function.
    metadata = _get_response(endpoint, headers=headers).json()
    return metadata["info"]["version"]
def _unix_ts_to_wayback_ts(unix_ts): def _unix_ts_to_wayback_ts(unix_timestamp):
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S") """Returns unix timestamp converted to datetime.datetime
Parameters
----------
unix_timestamp : str, int or float
Unix-timestamp that needs to be converted to datetime.datetime
Converts and returns input unix_timestamp to datetime.datetime object.
Does not matter if unix_timestamp is str, float or int.
"""
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def _add_payload(instance, payload): def _add_payload(instance, payload):
"""Adds payload from instance that can be used to make get requests.
Parameters
----------
instance : waybackpy.cdx.Cdx
instance of the Cdx class
payload : dict
A dict onto which we need to add keys and values based on instance.
instance is object of Cdx class and it contains the data required to fill
the payload dictionary.
"""
if instance.start_timestamp: if instance.start_timestamp:
payload["from"] = instance.start_timestamp payload["from"] = instance.start_timestamp
@@ -143,6 +185,9 @@ def _check_filters(filters):
def _cleaned_url(url): def _cleaned_url(url):
"""Sanatize the url
Remove and replace illegal whitespace characters from the URL.
"""
return str(url).strip().replace(" ", "%20") return str(url).strip().replace(" ", "%20")
@@ -195,13 +240,11 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
The wayback machine's save API doesn't The wayback machine's save API doesn't
return JSON response, we are required return JSON response, we are required
to read the header of the API response to read the header of the API response
and look for the archive URL. and find the archive URL.
This method has some regexen (or regexes) This method has some regular expressions
that search for archive url in header. that are used to search for the archive url
in the response header of Save API.
This method is used when you try to
save a webpage on wayback machine.
Two cases are possible: Two cases are possible:
1) Either we find the archive url in 1) Either we find the archive url in
@@ -213,7 +256,6 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
If we found the archive URL we return it. If we found the archive URL we return it.
Return format: Return format:
web.archive.org/web/<TIMESTAMP>/<URL> web.archive.org/web/<TIMESTAMP>/<URL>
And if we couldn't find it, we raise And if we couldn't find it, we raise
@@ -304,9 +346,8 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
def _wayback_timestamp(**kwargs): def _wayback_timestamp(**kwargs):
""" """Returns a valid waybackpy timestamp.
Wayback Machine archive URLs
have a timestamp in them.
The standard archive URL format is The standard archive URL format is
https://web.archive.org/web/20191214041711/https://www.youtube.com https://web.archive.org/web/20191214041711/https://www.youtube.com
@@ -316,12 +357,14 @@ def _wayback_timestamp(**kwargs):
2 ) timestamp (20191214041711) 2 ) timestamp (20191214041711)
3 ) https://www.youtube.com, the original URL 3 ) https://www.youtube.com, the original URL
The near method takes year, month, day, hour and minute The near method of Url class in wrapper.py takes year, month, day, hour
as Arguments, their type is int. and minute as arguments, their type is int.
This method takes those integers and converts it to This method takes those integers and converts it to
wayback machine timestamp and returns it. wayback machine timestamp and returns it.
zfill(2) pads single-digit days, months, hours etc. with one leading zero.
Return format is string. Return format is string.
""" """
@@ -339,16 +382,37 @@ def _get_response(
backoff_factor=0.5, backoff_factor=0.5,
no_raise_on_redirects=False, no_raise_on_redirects=False,
): ):
""" """Makes get requests.
This function is used make get request.
We use the requests package to make the Parameters
requests. ----------
endpoint : str
The API endpoint.
params : dict
The get request parameters. (default is None)
headers : dict
Headers for the get request. (default is None)
return_full_url : bool
Determines whether the full URL that the call went to is returned along
with the response. (default is False)
retries : int
Maximum number of retries for the get request. (default is 5)
backoff_factor : float
The factor by which we determine the next retry time after wait.
https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
(default is 0.5)
no_raise_on_redirects : bool
If the request is redirected more than the maximum of 30 times (the
default for requests), return instead of raising an exception. (default is False)
We try five times and if it fails it raises To handle WaybackError:
WaybackError exception.
You can handles WaybackError by importing:
from waybackpy.exceptions import WaybackError from waybackpy.exceptions import WaybackError
try: try:

View File

@@ -161,7 +161,9 @@ class Url:
instance=self, instance=self,
) )
m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url) m = re.search(
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
)
str_ts = m.group(1) str_ts = m.group(1)
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S") ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
now = datetime.utcnow() now = datetime.utcnow()