improve docs

This commit is contained in:
Akash Mahanty 2021-01-05 01:46:12 +05:30
parent 6fb6b2deee
commit e0a4b007d5
3 changed files with 84 additions and 27 deletions

View File

@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
# ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━ # ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━ # ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓ # ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
@ -10,24 +8,43 @@
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━ # ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
""" """
Waybackpy is a Python package that interfaces with the Internet Archive's Wayback Machine API. Waybackpy is a Python package & command-line program that interfaces with the Internet Archive's Wayback Machine API.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Archive pages and retrieve archived pages easily. Archive webpages and retrieve archived URLs easily.
Usage: Usage:
>>> import waybackpy >>> import waybackpy
>>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
>>> new_archive = target_url.save()
>>> print(new_archive)
https://web.archive.org/web/20200502170312/https://www.python.org/
Full documentation @ <https://akamhy.github.io/waybackpy/>. >>> url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
:copyright: (c) 2020 by akamhy. >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
>>> wayback = waybackpy.Url(url, user_agent)
>>> archive = wayback.save()
>>> str(archive)
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
>>> archive.timestamp
datetime.datetime(2021, 1, 4, 17, 35, 12, 691741)
>>> oldest_archive = wayback.oldest()
>>> str(oldest_archive)
'https://web.archive.org/web/20050422130129/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
>>> archive_close_to_2010_feb = wayback.near(year=2010, month=2)
>>> str(archive_close_to_2010_feb)
'https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
>>> str(wayback.newest())
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
Full documentation @ <https://github.com/akamhy/waybackpy/wiki>.
:copyright: (c) 2020-2021 Akash Mahanty et al.
:license: MIT :license: MIT
""" """
from .wrapper import Url from .wrapper import Url, Cdx
from .__version__ import ( from .__version__ import (
__title__, __title__,
__description__, __description__,

View File

@ -1,6 +1,3 @@
# -*- coding: utf-8 -*-
class WaybackError(Exception): class WaybackError(Exception):
""" """
Raised when Wayback Machine API Service is unreachable/down. Raised when Wayback Machine API Service is unreachable/down.

View File

@ -11,10 +11,10 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
def _get_total_pages(url, user_agent): def _get_total_pages(url, user_agent):
""" """
If showNumPages is passed in cdx API, it returns 'number of pages of' If showNumPages is passed in cdx API, it returns
and each page has many archives. 'number of archive pages' and each page has many archives.
This func returns number of pages (type int). This func returns number of pages of archives (type int).
""" """
total_pages_url = ( total_pages_url = (
"https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url "https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url
@ -167,6 +167,19 @@ class Url:
return "%s" % self._archive_url return "%s" % self._archive_url
def __len__(self): def __len__(self):
"""
Why do we have len here?
Applying len() on <class 'waybackpy.wrapper.Url'>
will calculate the number of days between today and
the archive timestamp.
Can be applied on return values of near and its
children (e.g. oldest) and if applied on waybackpy.Url()
without using any functions, it just grabs
self._timestamp and def _timestamp gets it
from def JSON.
"""
td_max = timedelta( td_max = timedelta(
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999 days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
) )
@ -194,7 +207,12 @@ class Url:
@property @property
def JSON(self): def JSON(self):
""" """
Returns JSON data from 'https://archive.org/wayback/available?url=YOUR-URL'. If the end user has used near() or its children like oldest, newest
and archive_url then the JSON response of these are cached in self._JSON
If we find that self._JSON is not None we return it.
else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
and return it.
""" """
if self._JSON: if self._JSON:
@ -235,8 +253,12 @@ class Url:
def _timestamp(self): def _timestamp(self):
""" """
Get timestamp of last fetched archive. Get timestamp of last fetched archive.
If used before fetching any archive, This If used before fetching any archive, will
randomly picks archive. use whatever self.JSON returns.
self.timestamp is None implies that
self.JSON will return any archive's JSON
that the Wayback Machine provides.
""" """
if self.timestamp: if self.timestamp:
@ -256,13 +278,25 @@ class Url:
def _cleaned_url(self): def _cleaned_url(self):
""" """
Remove newlines Remove EOL
replace " " with "_" replace " " with "_"
""" """
return str(self.url).strip().replace(" ", "_") return str(self.url).strip().replace(" ", "_")
def save(self): def save(self):
"""Create a new Wayback Machine archive for this URL.""" """
To save a webpage on WayBack machine we
need to send get request to https://web.archive.org/save/
And to get the archive URL we are required to read the
header of the API response.
_get_response() takes care of the get requests. It uses requests
package.
_archive_url_parser() parses the archive from the header.
"""
request_url = "https://web.archive.org/save/" + self._cleaned_url() request_url = "https://web.archive.org/save/" + self._cleaned_url()
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
response = _get_response(request_url, params=None, headers=headers) response = _get_response(request_url, params=None, headers=headers)
@ -271,8 +305,10 @@ class Url:
return self return self
def get(self, url="", user_agent="", encoding=""): def get(self, url="", user_agent="", encoding=""):
"""Return the source code of the supplied URL. """
If encoding is not supplied, it is auto-detected from the response. Return the source code of the supplied URL.
If encoding is not supplied, it is auto-detected
from the response itself by requests package.
""" """
if not url: if not url:
@ -452,8 +488,10 @@ class Url:
class CdxSnapshot: class CdxSnapshot:
""" """
[["urlkey","timestamp","original","mimetype","statuscode","digest","length"], This class helps to handle the Cdx Snapshots easily.
["org,archive)/", "19970126045828", "http://www.archive.org:80/", "text/html", "200", "Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY", "1415"]]
What the raw data looks like:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
""" """
def __init__( def __init__(
@ -478,6 +516,7 @@ class CdxSnapshot:
class Cdx: class Cdx:
""" """
waybackpy Cdx class, Type : <class 'waybackpy.wrapper.Cdx'> waybackpy Cdx class, Type : <class 'waybackpy.wrapper.Cdx'>
Cdx keys are : Cdx keys are :
urlkey urlkey
timestamp timestamp
@ -501,6 +540,10 @@ class Cdx:
self.end_timestamp = str(end_timestamp) if end_timestamp else None self.end_timestamp = str(end_timestamp) if end_timestamp else None
def snapshots(self): def snapshots(self):
"""
This function yields snapshots encapsulated
in CdxSnapshot for more usability.
"""
payload = {} payload = {}
endpoint = "https://web.archive.org/cdx/search/cdx" endpoint = "https://web.archive.org/cdx/search/cdx"
total_pages = _get_total_pages(self.url, self.user_agent) total_pages = _get_total_pages(self.url, self.user_agent)