diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py
index 8ec185e..223d098 100644
--- a/waybackpy/__init__.py
+++ b/waybackpy/__init__.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
# ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
@@ -10,24 +8,43 @@
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
"""
-Waybackpy is a Python package that interfaces with the Internet Archive's Wayback Machine API.
+Waybackpy is a Python package & command-line program that interfaces with the Internet Archive's Wayback Machine API.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Archive pages and retrieve archived pages easily.
+Archive webpages and retrieve archived URLs easily.
Usage:
- >>> import waybackpy
- >>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
- >>> new_archive = target_url.save()
- >>> print(new_archive)
- https://web.archive.org/web/20200502170312/https://www.python.org/
+ >>> import waybackpy
-Full documentation @ .
-:copyright: (c) 2020 by akamhy.
+ >>> url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
+ >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
+
+ >>> wayback = waybackpy.Url(url, user_agent)
+
+ >>> archive = wayback.save()
+ >>> str(archive)
+ 'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
+
+ >>> archive.timestamp
+ datetime.datetime(2021, 1, 4, 17, 35, 12, 691741)
+
+ >>> oldest_archive = wayback.oldest()
+ >>> str(oldest_archive)
+ 'https://web.archive.org/web/20050422130129/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
+
+ >>> archive_close_to_2010_feb = wayback.near(year=2010, month=2)
+ >>> str(archive_close_to_2010_feb)
+ 'https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus'
+
+ >>> str(wayback.newest())
+ 'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
+
+Full documentation @ .
+:copyright: (c) 2020-2021 Akash Mahanty et al.
:license: MIT
"""
-from .wrapper import Url
+from .wrapper import Url, Cdx
from .__version__ import (
__title__,
__description__,
diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py
index 2de2b6f..b08f6d6 100644
--- a/waybackpy/exceptions.py
+++ b/waybackpy/exceptions.py
@@ -1,6 +1,3 @@
-# -*- coding: utf-8 -*-
-
-
class WaybackError(Exception):
"""
Raised when Wayback Machine API Service is unreachable/down.
diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index 79b61ee..d305960 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -11,10 +11,10 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
def _get_total_pages(url, user_agent):
"""
- If showNumPages is passed in cdx API, it returns 'number of pages of'
- and each page has many archives.
+ If showNumPages is passed in cdx API, it returns
+ 'number of archive pages' and each page has many archives.
- This func returns number of pages (type int).
+ This func returns number of pages of archives (type int).
"""
total_pages_url = (
"https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url
@@ -167,6 +167,19 @@ class Url:
return "%s" % self._archive_url
def __len__(self):
+ """
+ Why do we have len here?
+
+ Applying len() on a waybackpy.Url object
+ will calculate the number of days between today and
+ the archive timestamp.
+
+ Can be applied on return values of near and its
+ children (e.g. oldest) and if applied on waybackpy.Url()
+ without using any functions, it just grabs
+ self._timestamp and def _timestamp gets it
+ from def JSON.
+ """
td_max = timedelta(
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
)
@@ -194,7 +207,12 @@ class Url:
@property
def JSON(self):
"""
- Returns JSON data from 'https://archive.org/wayback/available?url=YOUR-URL'.
+ If the end user has used near() or its children like oldest, newest
+ and archive_url then the JSON response of these are cached in self._JSON
+
+ If we find that self._JSON is not None we return it.
+ Otherwise we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
+ and return it.
"""
if self._JSON:
@@ -235,8 +253,12 @@ class Url:
def _timestamp(self):
"""
Get timestamp of last fetched archive.
- If used before fetching any archive, This
- randomly picks archive.
+ If used before fetching any archive, will
+ use whatever self.JSON returns.
+
+ self.timestamp being None implies that
+ self.JSON will return the JSON of whichever
+ archive the Wayback Machine provides.
"""
if self.timestamp:
@@ -256,13 +278,25 @@ class Url:
def _cleaned_url(self):
"""
- Remove newlines
+ Strip surrounding whitespace (including EOL) and
replace " " with "_"
"""
return str(self.url).strip().replace(" ", "_")
def save(self):
- """Create a new Wayback Machine archive for this URL."""
+ """
+ To save a webpage on the Wayback Machine we
+ need to send a GET request to https://web.archive.org/save/
+
+ And to get the archive URL we are required to read the
+ header of the API response.
+
+ _get_response() takes care of the get requests. It uses requests
+ package.
+
+ _archive_url_parser() parses the archive from the header.
+
+ """
request_url = "https://web.archive.org/save/" + self._cleaned_url()
headers = {"User-Agent": self.user_agent}
response = _get_response(request_url, params=None, headers=headers)
@@ -271,8 +305,10 @@ class Url:
return self
def get(self, url="", user_agent="", encoding=""):
- """Return the source code of the supplied URL.
- If encoding is not supplied, it is auto-detected from the response.
+ """
+ Return the source code of the supplied URL.
+ If encoding is not supplied, it is auto-detected
+ from the response itself by requests package.
"""
if not url:
@@ -452,8 +488,10 @@ class Url:
class CdxSnapshot:
"""
- [["urlkey","timestamp","original","mimetype","statuscode","digest","length"],
- ["org,archive)/", "19970126045828", "http://www.archive.org:80/", "text/html", "200", "Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY", "1415"]]
+ This class helps to handle the Cdx Snapshots easily.
+
+ What the raw data looks like:
+ org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
def __init__(
@@ -478,6 +516,7 @@ class CdxSnapshot:
class Cdx:
"""
waybackpy Cdx class, Type :
+
Cdx keys are :
urlkey
timestamp
@@ -501,6 +540,10 @@ class Cdx:
self.end_timestamp = str(end_timestamp) if end_timestamp else None
def snapshots(self):
+ """
+ This function yields snapshots encapsulated
+ in CdxSnapshot for more usability.
+ """
payload = {}
endpoint = "https://web.archive.org/cdx/search/cdx"
total_pages = _get_total_pages(self.url, self.user_agent)