improve docs
This commit is contained in:
parent
6fb6b2deee
commit
e0a4b007d5
@ -1,5 +1,3 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
# ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
|
# ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
|
||||||
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
|
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
|
||||||
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
|
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
|
||||||
@ -10,24 +8,43 @@
|
|||||||
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
|
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Waybackpy is a Python package that interfaces with the Internet Archive's Wayback Machine API.
|
Waybackpy is a Python package & command-line program that interfaces with the Internet Archive's Wayback Machine API.
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
Archive pages and retrieve archived pages easily.
|
Archive webpage and retrieve archived URLs easily.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
>>> import waybackpy
|
>>> import waybackpy
|
||||||
>>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
|
|
||||||
>>> new_archive = target_url.save()
|
|
||||||
>>> print(new_archive)
|
|
||||||
https://web.archive.org/web/20200502170312/https://www.python.org/
|
|
||||||
|
|
||||||
Full documentation @ <https://akamhy.github.io/waybackpy/>.
|
>>> url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
|
||||||
:copyright: (c) 2020 by akamhy.
|
>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||||
|
|
||||||
|
>>> wayback = waybackpy.Url(url, user_agent)
|
||||||
|
|
||||||
|
>>> archive = wayback.save()
|
||||||
|
>>> str(archive)
|
||||||
|
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
|
||||||
|
|
||||||
|
>>> archive.timestamp
|
||||||
|
datetime.datetime(2021, 1, 4, 17, 35, 12, 691741)
|
||||||
|
|
||||||
|
>>> oldest_archive = wayback.oldest()
|
||||||
|
>>> str(oldest_archive)
|
||||||
|
'https://web.archive.org/web/20050422130129/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
|
||||||
|
|
||||||
|
>>> archive_close_to_2010_feb = wayback.near(year=2010, month=2)
|
||||||
|
>>> str(archive_close_to_2010_feb)
|
||||||
|
'https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
|
||||||
|
|
||||||
|
>>> str(wayback.newest())
|
||||||
|
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
|
||||||
|
|
||||||
|
Full documentation @ <https://github.com/akamhy/waybackpy/wiki>.
|
||||||
|
:copyright: (c) 2020-2021 AKash Mahanty Et al.
|
||||||
:license: MIT
|
:license: MIT
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from .wrapper import Url
|
from .wrapper import Url, Cdx
|
||||||
from .__version__ import (
|
from .__version__ import (
|
||||||
__title__,
|
__title__,
|
||||||
__description__,
|
__description__,
|
||||||
|
@ -1,6 +1,3 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
|
|
||||||
class WaybackError(Exception):
|
class WaybackError(Exception):
|
||||||
"""
|
"""
|
||||||
Raised when Wayback Machine API Service is unreachable/down.
|
Raised when Wayback Machine API Service is unreachable/down.
|
||||||
|
@ -11,10 +11,10 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
|
|||||||
|
|
||||||
def _get_total_pages(url, user_agent):
|
def _get_total_pages(url, user_agent):
|
||||||
"""
|
"""
|
||||||
If showNumPages is passed in cdx API, it returns 'number of pages of'
|
If showNumPages is passed in cdx API, it returns
|
||||||
and each page has many archives.
|
'number of archive pages'and each page has many archives.
|
||||||
|
|
||||||
This func returns number of pages (type int).
|
This func returns number of pages of archives (type int).
|
||||||
"""
|
"""
|
||||||
total_pages_url = (
|
total_pages_url = (
|
||||||
"https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url
|
"https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url
|
||||||
@ -167,6 +167,19 @@ class Url:
|
|||||||
return "%s" % self._archive_url
|
return "%s" % self._archive_url
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
"""
|
||||||
|
Why do we have len here?
|
||||||
|
|
||||||
|
Applying len() on <class 'waybackpy.wrapper.Url'>
|
||||||
|
will calculate the number of days between today and
|
||||||
|
the archive timestamp.
|
||||||
|
|
||||||
|
Can be applied on return values of near and its
|
||||||
|
childs (e.g. oldest) and if applied on waybackpy.Url()
|
||||||
|
whithout using any functions, it just grabs
|
||||||
|
self._timestamp and def _timestamp gets it
|
||||||
|
from def JSON.
|
||||||
|
"""
|
||||||
td_max = timedelta(
|
td_max = timedelta(
|
||||||
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
|
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
|
||||||
)
|
)
|
||||||
@ -194,7 +207,12 @@ class Url:
|
|||||||
@property
|
@property
|
||||||
def JSON(self):
|
def JSON(self):
|
||||||
"""
|
"""
|
||||||
Returns JSON data from 'https://archive.org/wayback/available?url=YOUR-URL'.
|
If the end user has used near() or its childs like oldest, newest
|
||||||
|
and archive_url then the JSON response of these are cached in self._JSON
|
||||||
|
|
||||||
|
If we find that self._JSON is not None we return it.
|
||||||
|
else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
|
||||||
|
and return it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self._JSON:
|
if self._JSON:
|
||||||
@ -235,8 +253,12 @@ class Url:
|
|||||||
def _timestamp(self):
|
def _timestamp(self):
|
||||||
"""
|
"""
|
||||||
Get timestamp of last fetched archive.
|
Get timestamp of last fetched archive.
|
||||||
If used before fetching any archive, This
|
If used before fetching any archive, will
|
||||||
randomly picks archive.
|
use whatever self.JSON returns.
|
||||||
|
|
||||||
|
self.timestamp is None implies that
|
||||||
|
self.JSON will return any archive's JSON
|
||||||
|
that wayback machine provides it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.timestamp:
|
if self.timestamp:
|
||||||
@ -256,13 +278,25 @@ class Url:
|
|||||||
|
|
||||||
def _cleaned_url(self):
|
def _cleaned_url(self):
|
||||||
"""
|
"""
|
||||||
Remove newlines
|
Remove EOL
|
||||||
replace " " with "_"
|
replace " " with "_"
|
||||||
"""
|
"""
|
||||||
return str(self.url).strip().replace(" ", "_")
|
return str(self.url).strip().replace(" ", "_")
|
||||||
|
|
||||||
def save(self):
|
def save(self):
|
||||||
"""Create a new Wayback Machine archive for this URL."""
|
"""
|
||||||
|
To save a webpage on WayBack machine we
|
||||||
|
need to send get request to https://web.archive.org/save/
|
||||||
|
|
||||||
|
And to get the archive URL we are required to read the
|
||||||
|
header of the API response.
|
||||||
|
|
||||||
|
_get_response() takes care of the get requests. It uses requests
|
||||||
|
package.
|
||||||
|
|
||||||
|
_archive_url_parser() parses the archive from the header.
|
||||||
|
|
||||||
|
"""
|
||||||
request_url = "https://web.archive.org/save/" + self._cleaned_url()
|
request_url = "https://web.archive.org/save/" + self._cleaned_url()
|
||||||
headers = {"User-Agent": self.user_agent}
|
headers = {"User-Agent": self.user_agent}
|
||||||
response = _get_response(request_url, params=None, headers=headers)
|
response = _get_response(request_url, params=None, headers=headers)
|
||||||
@ -271,8 +305,10 @@ class Url:
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def get(self, url="", user_agent="", encoding=""):
|
def get(self, url="", user_agent="", encoding=""):
|
||||||
"""Return the source code of the supplied URL.
|
"""
|
||||||
If encoding is not supplied, it is auto-detected from the response.
|
Return the source code of the supplied URL.
|
||||||
|
If encoding is not supplied, it is auto-detected
|
||||||
|
from the response itself by requests package.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not url:
|
if not url:
|
||||||
@ -452,8 +488,10 @@ class Url:
|
|||||||
|
|
||||||
class CdxSnapshot:
|
class CdxSnapshot:
|
||||||
"""
|
"""
|
||||||
[["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
|
This class helps to handle the Cdx Snapshots easily.
|
||||||
["org,archive)/", "19970126045828", "http://www.archive.org:80/", "text/html", "200", "Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY", "1415"]]
|
|
||||||
|
What the raw data looks like:
|
||||||
|
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -478,6 +516,7 @@ class CdxSnapshot:
|
|||||||
class Cdx:
|
class Cdx:
|
||||||
"""
|
"""
|
||||||
waybackpy Cdx class, Type : <class 'waybackpy.wrapper.Cdx'>
|
waybackpy Cdx class, Type : <class 'waybackpy.wrapper.Cdx'>
|
||||||
|
|
||||||
Cdx keys are :
|
Cdx keys are :
|
||||||
urlkey
|
urlkey
|
||||||
timestamp
|
timestamp
|
||||||
@ -501,6 +540,10 @@ class Cdx:
|
|||||||
self.end_timestamp = str(end_timestamp) if end_timestamp else None
|
self.end_timestamp = str(end_timestamp) if end_timestamp else None
|
||||||
|
|
||||||
def snapshots(self):
|
def snapshots(self):
|
||||||
|
"""
|
||||||
|
This function yeilds snapshots encapsulated
|
||||||
|
in CdxSnapshot for more usability.
|
||||||
|
"""
|
||||||
payload = {}
|
payload = {}
|
||||||
endpoint = "https://web.archive.org/cdx/search/cdx"
|
endpoint = "https://web.archive.org/cdx/search/cdx"
|
||||||
total_pages = _get_total_pages(self.url, self.user_agent)
|
total_pages = _get_total_pages(self.url, self.user_agent)
|
||||||
|
Loading…
Reference in New Issue
Block a user