Added doc strings in wrapper.py, documenting code and improving docs.

This commit is contained in:
Akash Mahanty 2021-01-03 17:11:32 +05:30
parent db5737a857
commit 9823c809e9

View File

@ -12,7 +12,30 @@ default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _archive_url_parser(header): def _archive_url_parser(header):
"""Parse out the archive from header.""" """
This method has some regexen (or regexes)
that search for archive url in header.
This method is used when you try to
save a webpage on wayback machine.
The wayback machine's save API doesn't
return JSON response, we are required
to read the header of the API response
and look for the archive URL.
Two cases are possible:
1) Either we find the archive url in
the header.
2) We didn't find the archive url in
API header.
If we found the archive we return it.
And if we couldn't find it we raise
WaybackError with a standard Error message.
"""
# Regex1 # Regex1
arch = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header)) arch = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
if arch: if arch:
@ -36,14 +59,49 @@ def _archive_url_parser(header):
def _wayback_timestamp(**kwargs): def _wayback_timestamp(**kwargs):
"""Return a formatted timestamp.""" """
Wayback Machine archive URLs
have a timestamp in them.
The standard archive URL format is
https://web.archive.org/web/20191214041711/https://www.youtube.com
If we break it down in three parts:
1 ) The start (https://web.archive.org/web/)
2 ) timestamp (20191214041711)
3 ) https://www.youtube.com, the original URL
The near method takes year, month, day, hour and minute
as Arguments, their type is int.
This method takes those integers and converts them to a
wayback machine timestamp and returns it.
Return format is string.
"""
return "".join( return "".join(
str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"] str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
) )
def _get_response(endpoint, params=None, headers=None): def _get_response(endpoint, params=None, headers=None):
"""Get response for the supplied request.""" """
This function is used to make a GET request.
We use the requests package to make the
requests.
We try twice, and if both attempts fail and
raise exceptions, we give up and raise WaybackError.
You can handle WaybackError by importing it:
from waybackpy.exceptions import WaybackError
try:
...
except WaybackError as e:
# handle it
"""
try: try:
response = requests.get(endpoint, params=params, headers=headers) response = requests.get(endpoint, params=params, headers=headers)
@ -58,14 +116,16 @@ def _get_response(endpoint, params=None, headers=None):
class Url: class Url:
"""waybackpy Url object""" """
waybackpy Url object <class 'waybackpy.wrapper.Url'>
"""
def __init__(self, url, user_agent=default_UA): def __init__(self, url, user_agent=default_UA):
self.url = url self.url = url
self.user_agent = user_agent self.user_agent = user_agent
self._url_check() # checks url validity on init. self._url_check() # checks url validity on init.
self._archive_url = None # URL of archive self._archive_url = None
self.timestamp = None # timestamp for last archive self.timestamp = None
self._JSON = None self._JSON = None
self._alive_url_list = [] self._alive_url_list = []
@ -73,6 +133,17 @@ class Url:
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent) return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
def __str__(self): def __str__(self):
"""
Output when print() is used on <class 'waybackpy.wrapper.Url'>
This should print an archive URL.
We check if self._archive_url is not None.
If not None, good. We return string of self._archive_url.
If self._archive_url is None, it means we haven't used any method that
sets self._archive_url; we now set self._archive_url to self.archive_url
and return it.
"""
if not self._archive_url: if not self._archive_url:
self._archive_url = self.archive_url self._archive_url = self.archive_url
return "%s" % self._archive_url return "%s" % self._archive_url
@ -92,7 +163,13 @@ class Url:
return diff.days return diff.days
def _url_check(self): def _url_check(self):
"""Check for common URL problems.""" """
Check for common URL problems.
What we are checking:
1) '.' is in self.url; no valid URL lacks a '.'.
If you know of any other checks, please create a PR on the GitHub repo.
"""
if "." not in self.url: if "." not in self.url:
raise URLError("'%s' is not a vaild URL." % self.url) raise URLError("'%s' is not a vaild URL." % self.url)
@ -198,10 +275,26 @@ class Url:
return response.content.decode(encoding.replace("text/html", "UTF-8", 1)) return response.content.decode(encoding.replace("text/html", "UTF-8", 1))
def near(self, year=None, month=None, day=None, hour=None, minute=None): def near(self, year=None, month=None, day=None, hour=None, minute=None):
"""Return the closest Wayback Machine archive to the time supplied. """
Supported params are year, month, day, hour and minute. Wayback Machine can have many archives of a webpage,
Any non-supplied parameters default to the current time. sometimes we want archive close to a specific time.
This method takes year, month, day, hour and minute as input.
The input type must be integer. Any non-supplied parameters
default to the current time.
We convert the input to a wayback machine timestamp using
_wayback_timestamp(), it returns a string.
We use the wayback machine's availability API
(https://archive.org/wayback/available)
to get the closest archive from the timestamp.
We set self._archive_url to the archive found, if any.
If archive found, we set self.timestamp to its timestamp.
We set self._JSON to the response of the availability API.
And finally return self.
""" """
now = datetime.utcnow().timetuple() now = datetime.utcnow().timetuple()
timestamp = _wayback_timestamp( timestamp = _wayback_timestamp(
@ -237,11 +330,22 @@ class Url:
return self return self
def oldest(self, year=1994): def oldest(self, year=1994):
"""Return the oldest Wayback Machine archive for this URL.""" """
Returns the earliest/oldest Wayback Machine archive for the webpage.
The Wayback Machine started archiving the internet around 1997, and
therefore there can't be any archive older than 1997; we use 1994 as the
default year to look for the oldest archive.
We simply pass the year in near() and return it.
"""
return self.near(year=year) return self.near(year=year)
def newest(self): def newest(self):
"""Return the newest Wayback Machine archive available for this URL. """
Return the newest Wayback Machine archive available for this URL.
We return the output of self.near() as it defaults to the current UTC time.
Due to Wayback Machine database lag, this may not always be the Due to Wayback Machine database lag, this may not always be the
most recent archive. most recent archive.
@ -249,7 +353,15 @@ class Url:
return self.near() return self.near()
def total_archives(self): def total_archives(self):
"""Returns the total number of Wayback Machine archives for this URL.""" """
A webpage can have multiple archives on the wayback machine.
If someone wants to count the total number of archives of a
webpage on wayback machine they can use this method.
Returns the total number of Wayback Machine archives for the URL.
Return type is integer.
"""
endpoint = "https://web.archive.org/cdx/search/cdx" endpoint = "https://web.archive.org/cdx/search/cdx"
headers = { headers = {
@ -264,6 +376,10 @@ class Url:
return response.text.count(",") return response.text.count(",")
def live_urls_picker(self, url): def live_urls_picker(self, url):
"""
This method is used to check if the HTTP status code
returned for the supplied url is >= 400.
"""
try: try:
response_code = requests.get(url).status_code response_code = requests.get(url).status_code
@ -277,7 +393,8 @@ class Url:
self._alive_url_list.append(url) self._alive_url_list.append(url)
def known_urls(self, alive=False, subdomain=False): def known_urls(self, alive=False, subdomain=False):
"""Returns list of URLs known to exist for given domain name """
Returns list of URLs known to exist for given domain name
because these URLs were crawled by WayBack Machine bots. because these URLs were crawled by WayBack Machine bots.
Useful for pen-testers and others. Useful for pen-testers and others.
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: