improve docs

This commit is contained in:
Akash Mahanty 2021-01-05 01:46:12 +05:30
parent 6fb6b2deee
commit e0a4b007d5
3 changed files with 84 additions and 27 deletions

View File

@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
# ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
@ -10,24 +8,43 @@
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
"""
Waybackpy is a Python package that interfaces with the Internet Archive's Wayback Machine API.
Waybackpy is a Python package & command-line program that interfaces with the Internet Archive's Wayback Machine API.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Archive pages and retrieve archived pages easily.
Archive webpages and retrieve archived URLs easily.
Usage:
>>> import waybackpy
>>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
>>> new_archive = target_url.save()
>>> print(new_archive)
https://web.archive.org/web/20200502170312/https://www.python.org/
>>> import waybackpy
Full documentation @ <https://akamhy.github.io/waybackpy/>.
:copyright: (c) 2020 by akamhy.
>>> url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
>>> wayback = waybackpy.Url(url, user_agent)
>>> archive = wayback.save()
>>> str(archive)
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
>>> archive.timestamp
datetime.datetime(2021, 1, 4, 17, 35, 12, 691741)
>>> oldest_archive = wayback.oldest()
>>> str(oldest_archive)
'https://web.archive.org/web/20050422130129/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
>>> archive_close_to_2010_feb = wayback.near(year=2010, month=2)
>>> str(archive_close_to_2010_feb)
'https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
>>> str(wayback.newest())
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
Full documentation @ <https://github.com/akamhy/waybackpy/wiki>.
:copyright: (c) 2020-2021 Akash Mahanty et al.
:license: MIT
"""
from .wrapper import Url
from .wrapper import Url, Cdx
from .__version__ import (
__title__,
__description__,

View File

@ -1,6 +1,3 @@
# -*- coding: utf-8 -*-
class WaybackError(Exception):
"""
Raised when Wayback Machine API Service is unreachable/down.

View File

@ -11,10 +11,10 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
def _get_total_pages(url, user_agent):
"""
If showNumPages is passed in cdx API, it returns 'number of pages of'
and each page has many archives.
If showNumPages is passed in cdx API, it returns
'number of archive pages' and each page has many archives.
This func returns number of pages (type int).
This func returns number of pages of archives (type int).
"""
total_pages_url = (
"https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url
@ -167,6 +167,19 @@ class Url:
return "%s" % self._archive_url
def __len__(self):
"""
Why do we have len here?
Applying len() on <class 'waybackpy.wrapper.Url'>
will calculate the number of days between today and
the archive timestamp.
Can be applied on return values of near and its
children (e.g. oldest) and if applied on waybackpy.Url()
without using any functions, it just grabs
self._timestamp and def _timestamp gets it
from def JSON.
"""
td_max = timedelta(
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
)
@ -194,7 +207,12 @@ class Url:
@property
def JSON(self):
"""
Returns JSON data from 'https://archive.org/wayback/available?url=YOUR-URL'.
If the end user has used near() or its children like oldest, newest
and archive_url then the JSON response of that call is cached in self._JSON.
If we find that self._JSON is not None we return it.
else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
and return it.
"""
if self._JSON:
@ -235,8 +253,12 @@ class Url:
def _timestamp(self):
"""
Get timestamp of last fetched archive.
If used before fetching any archive, This
randomly picks archive.
If used before fetching any archive, this will
use whatever self.JSON returns.
self.timestamp being None implies that
self.JSON will return the JSON of whichever archive
the Wayback Machine provides.
"""
if self.timestamp:
@ -256,13 +278,25 @@ class Url:
def _cleaned_url(self):
"""
Remove newlines
Remove EOL
replace " " with "_"
"""
return str(self.url).strip().replace(" ", "_")
def save(self):
"""Create a new Wayback Machine archive for this URL."""
"""
To save a webpage on the Wayback Machine we
need to send a GET request to https://web.archive.org/save/
And to get the archive URL we are required to read the
header of the API response.
_get_response() takes care of the GET requests. It uses the requests
package.
_archive_url_parser() parses the archive URL from the header.
"""
request_url = "https://web.archive.org/save/" + self._cleaned_url()
headers = {"User-Agent": self.user_agent}
response = _get_response(request_url, params=None, headers=headers)
@ -271,8 +305,10 @@ class Url:
return self
def get(self, url="", user_agent="", encoding=""):
"""Return the source code of the supplied URL.
If encoding is not supplied, it is auto-detected from the response.
"""
Return the source code of the supplied URL.
If encoding is not supplied, it is auto-detected
from the response itself by the requests package.
"""
if not url:
@ -452,8 +488,10 @@ class Url:
class CdxSnapshot:
"""
[["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
["org,archive)/", "19970126045828", "http://www.archive.org:80/", "text/html", "200", "Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY", "1415"]]
This class helps to handle the Cdx Snapshots easily.
What the raw data looks like:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
def __init__(
@ -478,6 +516,7 @@ class CdxSnapshot:
class Cdx:
"""
waybackpy Cdx class, Type : <class 'waybackpy.wrapper.Cdx'>
Cdx keys are :
urlkey
timestamp
@ -501,6 +540,10 @@ class Cdx:
self.end_timestamp = str(end_timestamp) if end_timestamp else None
def snapshots(self):
"""
This function yields snapshots encapsulated
in CdxSnapshot for more usability.
"""
payload = {}
endpoint = "https://web.archive.org/cdx/search/cdx"
total_pages = _get_total_pages(self.url, self.user_agent)