Add doc strings (#90)
* Added some docstrings in utils.py * renamed some func/meth to better names and added doc strings + lint * added more docstrings * more docstrings * improve docstrings * docstrings * added more docstrings, lint * fix import error
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .exceptions import WaybackError
|
||||
from .cdx import Cdx
|
||||
from .utils import (
|
||||
@@ -9,13 +10,85 @@ from .utils import (
|
||||
default_user_agent,
|
||||
_url_check,
|
||||
_cleaned_url,
|
||||
_ts,
|
||||
_unix_ts_to_wayback_ts,
|
||||
_timestamp_manager,
|
||||
_unix_timestamp_to_wayback_timestamp,
|
||||
_latest_version,
|
||||
)
|
||||
|
||||
|
||||
class Url:
|
||||
"""
|
||||
|
||||
Attributes
|
||||
----------
|
||||
url : str
|
||||
The input URL, wayback machine API operations are performed
|
||||
on this URL after sanitizing it.
|
||||
|
||||
user_agent : str
|
||||
The user_agent used while making the GET requests to the
|
||||
Wayback machine APIs
|
||||
|
||||
_archive_url : str
|
||||
Caches the last fetched archive.
|
||||
|
||||
timestamp : datetime.datetime
|
||||
timestamp of the archive URL as datetime object for
|
||||
greater usability
|
||||
|
||||
_JSON : dict
|
||||
Caches the last fetched availability API data
|
||||
|
||||
latest_version : str
|
||||
The latest version of waybackpy on PyPi
|
||||
|
||||
cached_save : bool
|
||||
Flag to check if WayBack machine returned a cached
|
||||
archive instead of creating a new archive. WayBack
|
||||
machine allows only one archive for a URL in
|
||||
30 minutes. If the archive returned by WayBack machine
|
||||
is older than 3 minutes then this flag is set to True
|
||||
|
||||
Methods turned properties
|
||||
----------
|
||||
JSON : dict
|
||||
JSON response of availability API as dictionary / loaded JSON
|
||||
|
||||
archive_url : str
|
||||
Return the archive url, returns str
|
||||
|
||||
_timestamp : datetime.datetime
|
||||
Sets the value of self.timestamp if still not set
|
||||
|
||||
Methods
|
||||
-------
|
||||
save()
|
||||
Archives the URL on WayBack machine
|
||||
|
||||
get(url="", user_agent="", encoding="")
|
||||
Gets the source of archive url, can also be used to get source
|
||||
of any URL if passed into it.
|
||||
|
||||
near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
|
||||
Wayback Machine can have many archives for a URL/webpage, sometimes we want
|
||||
archive close to a specific time.
|
||||
This method takes year, month, day, hour, minute and unix_timestamp as input.
|
||||
|
||||
oldest(year=1994)
|
||||
The oldest archive of an URL.
|
||||
|
||||
newest()
|
||||
The newest archive of an URL
|
||||
|
||||
total_archives(start_timestamp=None, end_timestamp=None)
|
||||
total number of archives of an URL, the timeframe can be confined by
|
||||
start_timestamp and end_timestamp
|
||||
|
||||
known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
|
||||
Known URLs for an URL, subdomain, URL as prefix etc.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, url, user_agent=default_user_agent):
|
||||
self.url = url
|
||||
self.user_agent = str(user_agent)
|
||||
@@ -32,29 +105,17 @@ class Url:
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
"""
|
||||
Output when print() is used on <class 'waybackpy.wrapper.Url'>
|
||||
This should print an archive URL.
|
||||
|
||||
We check if self._archive_url is not None.
|
||||
If not None, good. We return string of self._archive_url.
|
||||
|
||||
If self._archive_url is None, it means we ain't used any method that
|
||||
sets self._archive_url, we now set self._archive_url to self.archive_url
|
||||
and return it.
|
||||
"""
|
||||
|
||||
if not self._archive_url:
|
||||
self._archive_url = self.archive_url
|
||||
|
||||
return "{archive_url}".format(archive_url=self._archive_url)
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
Why do we have len here?
|
||||
"""Number of days between today and the date of archive based on the timestamp
|
||||
|
||||
Applying len() on <class 'waybackpy.wrapper.Url'>
|
||||
will calculate the number of days between today and
|
||||
the archive timestamp.
|
||||
len() of waybackpy.wrapper.Url should return
|
||||
the number of days between today and the
|
||||
archive timestamp.
|
||||
|
||||
Can be applied on return values of near and its
|
||||
children (e.g. oldest) and if applied on waybackpy.Url()
|
||||
@@ -76,32 +137,30 @@ class Url:
|
||||
|
||||
@property
|
||||
def JSON(self):
|
||||
"""
|
||||
If the end user has used near() or its childs like oldest, newest
|
||||
and archive_url then the JSON response of these are cached in self._JSON
|
||||
"""Returns JSON response of availability API as dictionary / loaded JSON
|
||||
|
||||
If we find that self._JSON is not None we return it.
|
||||
else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
|
||||
and return it.
|
||||
return type : dict
|
||||
"""
|
||||
|
||||
# If user used the near method or any method that depends on near, we
|
||||
# are certain that we have a loaded dictionary cached in self._JSON.
|
||||
# Return the loaded JSON data.
|
||||
if self._JSON:
|
||||
return self._JSON
|
||||
|
||||
# If no cached data found, get data and return + cache it.
|
||||
endpoint = "https://archive.org/wayback/available"
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
|
||||
response = _get_response(endpoint, params=payload, headers=headers)
|
||||
return response.json()
|
||||
self._JSON = response.json()
|
||||
return self._JSON
|
||||
|
||||
@property
|
||||
def archive_url(self):
|
||||
"""
|
||||
Returns any random archive for the instance.
|
||||
But if near, oldest, newest were used before
|
||||
then it returns the same archive again.
|
||||
"""Return the archive url.
|
||||
|
||||
We cache archive in self._archive_url
|
||||
return type : str
|
||||
"""
|
||||
|
||||
if self._archive_url:
|
||||
@@ -121,11 +180,16 @@ class Url:
|
||||
|
||||
@property
|
||||
def _timestamp(self):
|
||||
self.timestamp = _ts(self.timestamp, self.JSON)
|
||||
return self.timestamp
|
||||
"""Sets the value of self.timestamp if still not set.
|
||||
|
||||
Return type : datetime.datetime
|
||||
|
||||
"""
|
||||
return _timestamp_manager(self.timestamp, self.JSON)
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
"""Saves/Archive the URL.
|
||||
|
||||
To save a webpage on WayBack machine we
|
||||
need to send get request to https://web.archive.org/save/
|
||||
|
||||
@@ -136,6 +200,8 @@ class Url:
|
||||
|
||||
_archive_url_parser() parses the archive from the header.
|
||||
|
||||
return type : waybackpy.wrapper.Url
|
||||
|
||||
"""
|
||||
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
@@ -161,7 +227,9 @@ class Url:
|
||||
instance=self,
|
||||
)
|
||||
|
||||
m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url)
|
||||
m = re.search(
|
||||
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
|
||||
)
|
||||
str_ts = m.group(1)
|
||||
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
|
||||
now = datetime.utcnow()
|
||||
@@ -175,9 +243,22 @@ class Url:
|
||||
return self
|
||||
|
||||
def get(self, url="", user_agent="", encoding=""):
|
||||
"""
|
||||
Return the source code of the last archived URL,
|
||||
if no URL is passed to this method.
|
||||
"""GET the source of archive or any other URL.
|
||||
|
||||
url : str, waybackpy.wrapper.Url
|
||||
The method will return the source code of
|
||||
this URL instead of last fetched archive.
|
||||
|
||||
user_agent : str
|
||||
The user_agent for GET request to API
|
||||
|
||||
encoding : str
|
||||
If user is using any other encoding that
|
||||
can't be detected by response.encoding
|
||||
|
||||
Return the source code of the last fetched
|
||||
archive URL if no URL is passed to this method
|
||||
else it returns the source code of url passed.
|
||||
|
||||
If encoding is not supplied, it is auto-detected
|
||||
from the response itself by requests package.
|
||||
@@ -213,6 +294,27 @@ class Url:
|
||||
unix_timestamp=None,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
|
||||
year : int
|
||||
Archive close to year
|
||||
|
||||
month : int
|
||||
Archive close to month
|
||||
|
||||
day : int
|
||||
Archive close to day
|
||||
|
||||
hour : int
|
||||
Archive close to hour
|
||||
|
||||
minute : int
|
||||
Archive close to minute
|
||||
|
||||
unix_timestamp : str, float or int
|
||||
Archive close to this unix_timestamp
|
||||
|
||||
Wayback Machine can have many archives of a webpage,
|
||||
sometimes we want archive close to a specific time.
|
||||
|
||||
@@ -235,7 +337,7 @@ class Url:
|
||||
"""
|
||||
|
||||
if unix_timestamp:
|
||||
timestamp = _unix_ts_to_wayback_ts(unix_timestamp)
|
||||
timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
||||
else:
|
||||
now = datetime.utcnow().timetuple()
|
||||
timestamp = _wayback_timestamp(
|
||||
@@ -285,28 +387,45 @@ class Url:
|
||||
|
||||
We simply pass the year in near() and return it.
|
||||
"""
|
||||
|
||||
return self.near(year=year)
|
||||
|
||||
def newest(self):
|
||||
"""
|
||||
Return the newest Wayback Machine archive available for this URL.
|
||||
"""Return the newest Wayback Machine archive available.
|
||||
|
||||
We return the output of self.near() as it defaults to current utc time.
|
||||
We return the return value of self.near() as it defaults to current UTC time.
|
||||
|
||||
Due to Wayback Machine database lag, this may not always be the
|
||||
most recent archive.
|
||||
|
||||
return type : waybackpy.wrapper.Url
|
||||
"""
|
||||
|
||||
return self.near()
|
||||
|
||||
def total_archives(self, start_timestamp=None, end_timestamp=None):
|
||||
"""
|
||||
"""Returns the total number of archives for an URL
|
||||
|
||||
Parameters
|
||||
----------
|
||||
start_timestamp : str
|
||||
1 to 14 digit string of numbers, you are not required to
|
||||
pass a full 14 digit timestamp.
|
||||
|
||||
end_timestamp : str
|
||||
1 to 14 digit string of numbers, you are not required to
|
||||
pass a full 14 digit timestamp.
|
||||
|
||||
|
||||
return type : int
|
||||
|
||||
|
||||
A webpage can have multiple archives on the wayback machine
|
||||
If someone wants to count the total number of archives of a
|
||||
webpage on wayback machine they can use this method.
|
||||
|
||||
Returns the total number of Wayback Machine archives for the URL.
|
||||
|
||||
Return type is integer.
|
||||
"""
|
||||
|
||||
cdx = Cdx(
|
||||
@@ -315,6 +434,8 @@ class Url:
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
)
|
||||
|
||||
# cdx.snapshots() is generator not list.
|
||||
i = 0
|
||||
for _ in cdx.snapshots():
|
||||
i = i + 1
|
||||
@@ -328,15 +449,36 @@ class Url:
|
||||
end_timestamp=None,
|
||||
match_type="prefix",
|
||||
):
|
||||
"""
|
||||
"""Yields known_urls URLs from the CDX API.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
subdomain : bool
|
||||
If True fetch subdomain URLs along with the host URLs.
|
||||
|
||||
host : bool
|
||||
Only fetch host URLs.
|
||||
|
||||
start_timestamp : str
|
||||
1 to 14 digit string of numbers, you are not required to
|
||||
pass a full 14 digit timestamp.
|
||||
|
||||
end_timestamp : str
|
||||
1 to 14 digit string of numbers, you are not required to
|
||||
pass a full 14 digit timestamp.
|
||||
|
||||
match_type : str
|
||||
One of (exact, prefix, host and domain)
|
||||
|
||||
return type : generator yielding the `original` attribute of each waybackpy.snapshot.CdxSnapshot (presumably the original URL as str)
|
||||
|
||||
Yields list of URLs known to exist for given input.
|
||||
Defaults to input URL as prefix.
|
||||
|
||||
This method is kept for compatibility, use the Cdx class instead.
|
||||
This method itself depends on Cdx.
|
||||
|
||||
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
||||
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||
Based on:
|
||||
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||
By Mohammed Diaa (https://github.com/mhmdiaa)
|
||||
"""
|
||||
|
||||
if subdomain:
|
||||
@@ -353,7 +495,5 @@ class Url:
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
|
||||
snapshots = cdx.snapshots()
|
||||
|
||||
for snapshot in snapshots:
|
||||
for snapshot in cdx.snapshots():
|
||||
yield (snapshot.original)
|
||||
|
Reference in New Issue
Block a user