From 9823c809e9c82b7c3a87a6c343c8f437658a8916 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Sun, 3 Jan 2021 17:11:32 +0530 Subject: [PATCH] Added doc strings in wrapper.py, documenting code and improving docs. --- waybackpy/wrapper.py | 145 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 14 deletions(-) diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 956230c..390a852 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -12,7 +12,30 @@ default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy" def _archive_url_parser(header): - """Parse out the archive from header.""" + """ + This method has some regexen (or regexes) + that search for the archive url in the header. + + This method is used when you try to + save a webpage on wayback machine. + + The wayback machine's save API doesn't + return a JSON response, so we are required + to read the header of the API response + and look for the archive URL. + + Two cases are possible: + 1) Either we find the archive url in + the header. + + 2) We didn't find the archive url in + API header. + + If we found the archive we return it. + + And if we couldn't find it we raise + WaybackError with a standard Error message. + """ # Regex1 arch = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header)) if arch: @@ -36,14 +59,49 @@ def _archive_url_parser(header): def _wayback_timestamp(**kwargs): - """Return a formatted timestamp.""" + """ + Wayback Machine archive URLs + have a timestamp in them. + + The standard archive URL format is + https://web.archive.org/web/20191214041711/https://www.youtube.com + + If we break it down in three parts: + 1 ) The start (https://web.archive.org/web/) + 2 ) timestamp (20191214041711) + 3 ) https://www.youtube.com, the original URL + + The near method takes year, month, day, hour and minute + as arguments; their type is int. + + This method takes those integers and converts them to a + wayback machine timestamp and returns it. 
+ + Return format is string. + """ return "".join( str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"] ) def _get_response(endpoint, params=None, headers=None): - """Get response for the supplied request.""" + """ + This function is used to make a GET request. + We use the requests package to make the + requests. + + + We try twice and if both attempts fail and + raise exceptions, we give up and raise WaybackError. + + You can handle WaybackError by importing: + from waybackpy.exceptions import WaybackError + + try: + ... + except WaybackError as e: + # handle it + """ try: response = requests.get(endpoint, params=params, headers=headers) @@ -58,14 +116,16 @@ def _get_response(endpoint, params=None, headers=None): class Url: - """waybackpy Url object""" + """ + waybackpy Url object + """ def __init__(self, url, user_agent=default_UA): self.url = url self.user_agent = user_agent self._url_check() # checks url validity on init. - self._archive_url = None # URL of archive - self.timestamp = None # timestamp for last archive + self._archive_url = None + self.timestamp = None self._JSON = None self._alive_url_list = [] @@ -73,6 +133,17 @@ class Url: return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent) def __str__(self): + """ + Output when print() is used on a Url object. + This should print an archive URL. + + We check if self._archive_url is not None. + If not None, good. We return string of self._archive_url. + + If self._archive_url is None, it means we haven't used any method that + sets self._archive_url, we now set self._archive_url to self.archive_url + and return it. + """ if not self._archive_url: self._archive_url = self.archive_url return "%s" % self._archive_url @@ -92,7 +163,13 @@ class Url: return diff.days def _url_check(self): - """Check for common URL problems.""" + """ + Check for common URL problems. + What we are checking: + 1) '.' in self.url; a url without a '.' in it is rejected. 
+ + If you know of any others, please create a PR on the github repo. + """ if "." not in self.url: raise URLError("'%s' is not a vaild URL." % self.url) @@ -198,10 +275,26 @@ class Url: return response.content.decode(encoding.replace("text/html", "UTF-8", 1)) def near(self, year=None, month=None, day=None, hour=None, minute=None): - """Return the closest Wayback Machine archive to the time supplied. - Supported params are year, month, day, hour and minute. - Any non-supplied parameters default to the current time. + """ + Wayback Machine can have many archives of a webpage, + sometimes we want an archive close to a specific time. + This method takes year, month, day, hour and minute as input. + The input type must be integer. Any non-supplied parameters + default to the current time. + + We convert the input to a wayback machine timestamp using + _wayback_timestamp(), it returns a string. + + We use the wayback machine's availability API + (https://archive.org/wayback/available) + to get the closest archive from the timestamp. + + We set self._archive_url to the archive found, if any. + If an archive is found, we set self.timestamp to its timestamp. + We set self._JSON to the response of the availability API. + + And finally return self. """ now = datetime.utcnow().timetuple() timestamp = _wayback_timestamp( @@ -237,11 +330,22 @@ class Url: return self def oldest(self, year=1994): - """Return the oldest Wayback Machine archive for this URL.""" + """ + Returns the earliest/oldest Wayback Machine archive for the webpage. + + Wayback machine has started archiving the internet around 1997 and + therefore we can't have any archive older than 1997, we use 1994 as the + default year to look for the oldest archive. + + We simply pass the year in near() and return it. + """ return self.near(year=year) def newest(self): - """Return the newest Wayback Machine archive available for this URL. + """ + Return the newest Wayback Machine archive available for this URL. 
+ + We return the output of self.near() as it defaults to the current UTC time. Due to Wayback Machine database lag, this may not always be the most recent archive. @@ -249,7 +353,15 @@ class Url: return self.near() def total_archives(self): - """Returns the total number of Wayback Machine archives for this URL.""" + """ + A webpage can have multiple archives on the wayback machine. + If someone wants to count the total number of archives of a + webpage on wayback machine they can use this method. + + Returns the total number of Wayback Machine archives for the URL. + + Return type is integer. + """ endpoint = "https://web.archive.org/cdx/search/cdx" headers = { @@ -264,6 +376,10 @@ class Url: return response.text.count(",") def live_urls_picker(self, url): + """ + This method is used to check if the response status code + of the supplied url is >= 400. + """ try: response_code = requests.get(url).status_code @@ -277,7 +393,8 @@ class Url: self._alive_url_list.append(url) def known_urls(self, alive=False, subdomain=False): - """Returns list of URLs known to exist for given domain name + """ + Returns list of URLs known to exist for given domain name because these URLs were crawled by WayBack Machine bots. Useful for pen-testers and others. Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: