Added some docstrings in utils.py
This commit is contained in:
@@ -13,16 +13,58 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
|
||||
|
||||
|
||||
def _latest_version(package_name, headers):
    """Return the latest version of ``package_name`` published on PyPI.

    Parameters
    ----------
    package_name : str
        The name of the python package.

    headers : dict
        Headers that will be used while making the get request.

    Returns
    -------
    str
        The ``version`` field of the ``info`` object in the JSON document
        served by the PyPI API <https://pypi.org/pypi/>.

    Written to look up the latest version of waybackpy, but works for any
    package on PyPI.
    """
    # Build https://pypi.org/pypi/<package_name>/json
    pypi_url = "".join(["https://pypi.org/pypi/", package_name, "/json"])
    package_info = _get_response(pypi_url, headers=headers).json()
    return package_info["info"]["version"]
|
||||
|
||||
|
||||
def _unix_ts_to_wayback_ts(unix_ts):
|
||||
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
|
||||
def _unix_ts_to_wayback_ts(unix_timestamp):
|
||||
"""Returns unix timestamp converted to datetime.datetime
|
||||
|
||||
Parameters
|
||||
----------
|
||||
unix_timestamp : str, int or float
|
||||
Unix-timestamp that needs to be converted to datetime.datetime
|
||||
|
||||
Converts and returns input unix_timestamp to datetime.datetime object.
|
||||
Does not matter if unix_timestamp is str, float or int.
|
||||
"""
|
||||
|
||||
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
|
||||
|
||||
|
||||
def _add_payload(instance, payload):
|
||||
"""Adds payload from instance that can be used to make get requests.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
instance : waybackpy.cdx.Cdx
|
||||
instance of the Cdx class
|
||||
|
||||
payload : dict
|
||||
A dict onto which we need to add keys and values based on instance.
|
||||
|
||||
instance is object of Cdx class and it contains the data required to fill
|
||||
the payload dictionary.
|
||||
"""
|
||||
|
||||
if instance.start_timestamp:
|
||||
payload["from"] = instance.start_timestamp
|
||||
|
||||
@@ -143,6 +185,9 @@ def _check_filters(filters):
|
||||
|
||||
|
||||
def _cleaned_url(url):
|
||||
"""Sanatize the url
|
||||
Remove and replace illegal whitespace characters from the URL.
|
||||
"""
|
||||
return str(url).strip().replace(" ", "%20")
|
||||
|
||||
|
||||
@@ -195,13 +240,11 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
|
||||
The wayback machine's save API doesn't
|
||||
return JSON response, we are required
|
||||
to read the header of the API response
|
||||
and look for the archive URL.
|
||||
and find the archive URL.
|
||||
|
||||
This method has some regexen (or regexes)
|
||||
that search for archive url in header.
|
||||
|
||||
This method is used when you try to
|
||||
save a webpage on wayback machine.
|
||||
This method has some regular expressions
|
||||
that are used to search for the archive url
|
||||
in the response header of Save API.
|
||||
|
||||
Two cases are possible:
|
||||
1) Either we find the archive url in
|
||||
@@ -213,7 +256,6 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
|
||||
If we found the archive URL we return it.
|
||||
|
||||
Return format:
|
||||
|
||||
web.archive.org/web/<TIMESTAMP>/<URL>
|
||||
|
||||
And if we couldn't find it, we raise
|
||||
@@ -304,9 +346,8 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
|
||||
|
||||
|
||||
def _wayback_timestamp(**kwargs):
|
||||
"""
|
||||
Wayback Machine archive URLs
|
||||
have a timestamp in them.
|
||||
"""Returns a valid waybackpy timestamp.
|
||||
|
||||
|
||||
The standard archive URL format is
|
||||
https://web.archive.org/web/20191214041711/https://www.youtube.com
|
||||
@@ -316,12 +357,14 @@ def _wayback_timestamp(**kwargs):
|
||||
2 ) timestamp (20191214041711)
|
||||
3 ) https://www.youtube.com, the original URL
|
||||
|
||||
The near method takes year, month, day, hour and minute
|
||||
as Arguments, their type is int.
|
||||
The near method of Url class in wrapper.py takes year, month, day, hour
|
||||
and minute as arguments, their type is int.
|
||||
|
||||
This method takes those integers, converts them to a
|
||||
wayback machine timestamp and returns it.
|
||||
|
||||
zfill(2) adds one zero in front of single-digit days, months, hours, etc.
|
||||
|
||||
Return format is string.
|
||||
"""
|
||||
|
||||
@@ -339,16 +382,37 @@ def _get_response(
|
||||
backoff_factor=0.5,
|
||||
no_raise_on_redirects=False,
|
||||
):
|
||||
"""
|
||||
This function is used to make get requests.
|
||||
We use the requests package to make the
|
||||
requests.
|
||||
"""Makes get requests.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
endpoint : str
|
||||
The API endpoint.
|
||||
|
||||
params : dict
|
||||
The get request parameters. (default is None)
|
||||
|
||||
headers : dict
|
||||
Headers for the get request. (default is None)
|
||||
|
||||
return_full_url : bool
|
||||
Determines whether the full url is returned along with the
|
||||
response. (default is False)
|
||||
|
||||
retries : int
|
||||
Maximum number of retries for the get request. (default is 5)
|
||||
|
||||
backoff_factor : float
|
||||
The factor by which we determine the next retry time after wait.
|
||||
https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
|
||||
(default is 0.5)
|
||||
|
||||
no_raise_on_redirects : bool
|
||||
If redirected the maximum of 30 times (the default for requests), return
|
||||
instead of raising an exception. (default is False)
|
||||
|
||||
|
||||
We try five times and if it fails it raises
|
||||
WaybackError exception.
|
||||
|
||||
You can handle WaybackError by importing:
|
||||
To handle WaybackError:
|
||||
from waybackpy.exceptions import WaybackError
|
||||
|
||||
try:
|
||||
|
@@ -161,7 +161,9 @@ class Url:
|
||||
instance=self,
|
||||
)
|
||||
|
||||
m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url)
|
||||
m = re.search(
|
||||
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
|
||||
)
|
||||
str_ts = m.group(1)
|
||||
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
|
||||
now = datetime.utcnow()
|
||||
|
Reference in New Issue
Block a user