Add doc strings (#90)

* Added some docstrings in utils.py

* renamed some func/meth to better names and added doc strings + lint

* added more docstrings

* more docstrings

* improve docstrings

* docstrings

* added more docstrings, lint

* fix import error
This commit is contained in:
Akash Mahanty
2021-01-26 11:56:03 +05:30
committed by GitHub
parent 88cda94c0b
commit db8f902cff
9 changed files with 443 additions and 121 deletions

View File

@@ -1,5 +1,6 @@
import re
from datetime import datetime, timedelta
from .exceptions import WaybackError
from .cdx import Cdx
from .utils import (
@@ -9,13 +10,85 @@ from .utils import (
default_user_agent,
_url_check,
_cleaned_url,
_ts,
_unix_ts_to_wayback_ts,
_timestamp_manager,
_unix_timestamp_to_wayback_timestamp,
_latest_version,
)
class Url:
"""
Attributes
----------
url : str
The input URL; Wayback Machine API operations are performed
on this URL after sanitizing it.
user_agent : str
The user_agent used while making the GET requests to the
Wayback machine APIs
_archive_url : str
Caches the last fetched archive.
timestamp : datetime.datetime
timestamp of the archive URL as datetime object for
greater usability
_JSON : dict
Caches the last fetched availability API data
latest_version : str
The latest version of waybackpy on PyPi
cached_save : bool
Flag to check if WayBack machine returned a cached
archive instead of creating a new archive. WayBack
machine allows only one archive for a URL in
30 minutes. If the archive returned by WayBack machine
is older than 3 minutes then this flag is set to True
Methods turned properties
----------
JSON : dict
JSON response of availability API as dictionary / loaded JSON
archive_url : str
Return the archive url, returns str
_timestamp : datetime.datetime
Sets the value of self.timestamp if still not set
Methods
-------
save()
Archives the URL on WayBack machine
get(url="", user_agent="", encoding="")
Gets the source of archive url, can also be used to get source
of any URL if passed into it.
near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
Wayback Machine can have many archives for a URL/webpage, sometimes we want
archive close to a specific time.
This method takes year, month, day, hour, minute and unix_timestamp as input.
oldest(year=1994)
The oldest archive of an URL.
newest()
The newest archive of an URL
total_archives(start_timestamp=None, end_timestamp=None)
total number of archives of an URL, the timeframe can be confined by
start_timestamp and end_timestamp
known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
Known URLs for an URL, subdomain, URL as prefix etc.
"""
def __init__(self, url, user_agent=default_user_agent):
self.url = url
self.user_agent = str(user_agent)
@@ -32,29 +105,17 @@ class Url:
)
def __str__(self):
"""
Output when print() is used on <class 'waybackpy.wrapper.Url'>
This should print an archive URL.
We check if self._archive_url is not None.
If not None, good. We return string of self._archive_url.
If self._archive_url is None, no method that sets self._archive_url
has been used yet, so we set self._archive_url to self.archive_url
and return it.
"""
if not self._archive_url:
self._archive_url = self.archive_url
return "{archive_url}".format(archive_url=self._archive_url)
def __len__(self):
"""
Why do we have len here?
"""Number of days between today and the date of archive based on the timestamp
Applying len() on <class 'waybackpy.wrapper.Url'>
will calculate the number of days between today and
the archive timestamp.
len() of waybackpy.wrapper.Url should return
the number of days between today and the
archive timestamp.
Can be applied on return values of near and its
children (e.g. oldest) and if applied on waybackpy.Url()
@@ -76,32 +137,30 @@ class Url:
@property
def JSON(self):
"""
If the end user has used near() or its children like oldest, newest
and archive_url then the JSON response of these is cached in self._JSON
"""Returns JSON response of availability API as dictionary / loaded JSON
If we find that self._JSON is not None we return it;
otherwise we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
and return it.
return type : dict
"""
# If user used the near method or any method that depends on near, we
# are certain that we have a loaded dictionary cached in self._JSON.
# Return the loaded JSON data.
if self._JSON:
return self._JSON
# If no cached data found, get data and return + cache it.
endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": self.user_agent}
payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
response = _get_response(endpoint, params=payload, headers=headers)
return response.json()
self._JSON = response.json()
return self._JSON
@property
def archive_url(self):
"""
Returns any random archive for the instance.
But if near, oldest, newest were used before
then it returns the same archive again.
"""Return the archive url.
We cache archive in self._archive_url
return type : str
"""
if self._archive_url:
@@ -121,11 +180,16 @@ class Url:
@property
def _timestamp(self):
self.timestamp = _ts(self.timestamp, self.JSON)
return self.timestamp
"""Sets the value of self.timestamp if still not set.
Return type : datetime.datetime
"""
return _timestamp_manager(self.timestamp, self.JSON)
def save(self):
"""
"""Saves/Archive the URL.
To save a webpage on WayBack machine we
need to send a GET request to https://web.archive.org/save/
@@ -136,6 +200,8 @@ class Url:
_archive_url_parser() parses the archive from the header.
return type : waybackpy.wrapper.Url
"""
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
headers = {"User-Agent": self.user_agent}
@@ -161,7 +227,9 @@ class Url:
instance=self,
)
m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url)
m = re.search(
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
)
str_ts = m.group(1)
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
now = datetime.utcnow()
@@ -175,9 +243,22 @@ class Url:
return self
def get(self, url="", user_agent="", encoding=""):
"""
Return the source code of the last archived URL,
if no URL is passed to this method.
"""GET the source of archive or any other URL.
url : str, waybackpy.wrapper.Url
The method will return the source code of
this URL instead of last fetched archive.
user_agent : str
The user_agent for GET request to API
encoding : str
If user is using any other encoding that
can't be detected by response.encoding
Return the source code of the last fetched
archive URL if no URL is passed to this method
else it returns the source code of url passed.
If encoding is not supplied, it is auto-detected
from the response itself by requests package.
@@ -213,6 +294,27 @@ class Url:
unix_timestamp=None,
):
"""
Parameters
----------
year : int
Archive close to year
month : int
Archive close to month
day : int
Archive close to day
hour : int
Archive close to hour
minute : int
Archive close to minute
unix_timestamp : str, float or int
Archive close to this unix_timestamp
Wayback Machine can have many archives of a webpage,
sometimes we want archive close to a specific time.
@@ -235,7 +337,7 @@ class Url:
"""
if unix_timestamp:
timestamp = _unix_ts_to_wayback_ts(unix_timestamp)
timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
else:
now = datetime.utcnow().timetuple()
timestamp = _wayback_timestamp(
@@ -285,28 +387,45 @@ class Url:
We simply pass the year in near() and return it.
"""
return self.near(year=year)
def newest(self):
"""
Return the newest Wayback Machine archive available for this URL.
"""Return the newest Wayback Machine archive available.
We return the output of self.near() as it defaults to current utc time.
We return the return value of self.near() as it defaults to current UTC time.
Due to Wayback Machine database lag, this may not always be the
most recent archive.
return type : waybackpy.wrapper.Url
"""
return self.near()
def total_archives(self, start_timestamp=None, end_timestamp=None):
"""
"""Returns the total number of archives for an URL
Parameters
----------
start_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
end_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
return type : int
A webpage can have multiple archives on the wayback machine
If someone wants to count the total number of archives of a
webpage on wayback machine they can use this method.
Returns the total number of Wayback Machine archives for the URL.
Return type is integer.
"""
cdx = Cdx(
@@ -315,6 +434,8 @@ class Url:
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
)
# cdx.snapshots() is generator not list.
i = 0
for _ in cdx.snapshots():
i = i + 1
@@ -328,15 +449,36 @@ class Url:
end_timestamp=None,
match_type="prefix",
):
"""
"""Yields known URLs from the CDX API.
Parameters
----------
subdomain : bool
If True fetch subdomain URLs along with the host URLs.
host : bool
Only fetch host URLs.
start_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
end_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
match_type : str
One of (exact, prefix, host and domain)
return type : generator yielding the `original` attribute (the original URL) of each waybackpy.snapshot.CdxSnapshot
Yields list of URLs known to exist for given input.
Defaults to input URL as prefix.
This method is kept for compatibility, use the Cdx class instead.
This method itself depends on Cdx.
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
Based on:
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
By Mohammed Diaa (https://github.com/mhmdiaa)
"""
if subdomain:
@@ -353,7 +495,5 @@ class Url:
collapses=["urlkey"],
)
snapshots = cdx.snapshots()
for snapshot in snapshots:
for snapshot in cdx.snapshots():
yield (snapshot.original)