diff --git a/tests/test_cdx.py b/tests/test_cdx.py
index 887afd7..fdc6bbe 100644
--- a/tests/test_cdx.py
+++ b/tests/test_cdx.py
@@ -79,7 +79,7 @@ def test_all_cdx():
c = 0
for snapshot in snapshots:
c += 1
- if c > 30_529: # deafult limit is 10k
+ if c > 30529: # default limit is 10k
break
url = "https://github.com/*"
@@ -89,5 +89,5 @@ def test_all_cdx():
for snapshot in snapshots:
c += 1
- if c > 100_529:
+ if c > 100529:
break
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d8593c7..f788c2e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -5,8 +5,7 @@ import random
import string
import argparse
-sys.path.append("..")
-import waybackpy.cli as cli # noqa: E402
+import waybackpy.cli as cli
from waybackpy.wrapper import Url # noqa: E402
from waybackpy.__version__ import __version__
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 08cfaec..4c869d7 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -14,14 +14,14 @@ from waybackpy.utils import (
_check_match_type,
_check_collapses,
_check_filters,
- _ts,
+ _timestamp_manager,
)
-def test_ts():
+def test_timestamp_manager():
timestamp = True
data = {}
- assert _ts(timestamp, data)
+ assert _timestamp_manager(timestamp, data)
data = """
{"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"}
@@ -61,10 +61,10 @@ def test_check_collapses():
def test_check_match_type():
- assert None == _check_match_type(None, "url")
+ assert _check_match_type(None, "url") is None
match_type = "exact"
url = "test_url"
- assert None == _check_match_type(match_type, url)
+ assert _check_match_type(match_type, url) is None
url = "has * in it"
with pytest.raises(WaybackError):
@@ -82,7 +82,7 @@ def test_cleaned_url():
def test_url_check():
good_url = "https://akamhy.github.io"
- assert None == _url_check(good_url)
+ assert _url_check(good_url) is None
bad_url = "https://github-com"
with pytest.raises(URLError):
diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py
index 359ba91..100608f 100644
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@@ -1,8 +1,4 @@
-import sys
import pytest
-import random
-import requests
-from datetime import datetime
from waybackpy.wrapper import Url
diff --git a/waybackpy/cdx.py b/waybackpy/cdx.py
index 3ce30bf..b2295c7 100644
--- a/waybackpy/cdx.py
+++ b/waybackpy/cdx.py
@@ -11,6 +11,7 @@ from .utils import (
)
# TODO : Threading support for pagination API. It's designed for Threading.
+# TODO : Add a get method here if the type is valid HTML, SVG or other, but not - or warc. Test it.
class Cdx:
@@ -42,7 +43,22 @@ class Cdx:
self.use_page = False
def cdx_api_manager(self, payload, headers, use_page=False):
- """
+ """Acts as a switch that selects between the normal API and the pagination API.
+
+ Parameters
+ ----------
+ self : waybackpy.cdx.Cdx
+ The instance itself
+
+ payload : dict
+ Get request parameters name value pairs
+
+ headers : dict
+ The headers for making the GET request.
+
+ use_page : bool
+ If True use pagination API else use normal resume key based API.
+
We have two options to get the snapshots, we use this
method to make a selection between pagination API and
the normal one with Resumption Key, sequential querying
@@ -141,7 +157,7 @@ class Cdx:
def snapshots(self):
"""
This function yeilds snapshots encapsulated
- in CdxSnapshot for more usability.
+ in CdxSnapshot for increased usability.
All the get request values are set if the conditions match
@@ -188,10 +204,9 @@ class Cdx:
prop_values = snapshot.split(" ")
- # Making sure that we get the same number of
- # property values as the number of properties
prop_values_len = len(prop_values)
properties_len = len(properties)
+
if prop_values_len != properties_len:
raise WaybackError(
"Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(
diff --git a/waybackpy/cli.py b/waybackpy/cli.py
index adbf1aa..45f305a 100644
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -5,6 +5,7 @@ import json
import random
import string
import argparse
+
from .wrapper import Url
from .exceptions import WaybackError
from .__version__ import __version__
diff --git a/waybackpy/snapshot.py b/waybackpy/snapshot.py
index 992ad2e..e3dc027 100644
--- a/waybackpy/snapshot.py
+++ b/waybackpy/snapshot.py
@@ -3,15 +3,24 @@ from datetime import datetime
class CdxSnapshot:
"""
- This class helps to use the Cdx Snapshots easily.
+ This class encapsulates the snapshots for greater usability.
Raw Snapshot data looks like:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
- properties is a dict containg all of the 7 cdx snapshot properties.
"""
def __init__(self, properties):
+ """
+ Parameters
+ ----------
+ self : waybackpy.snapshot.CdxSnapshot
+ The instance itself
+
+ properties : dict
+ Properties is a dict containing all of the 7 cdx snapshot properties.
+
+ """
self.urlkey = properties["urlkey"]
self.timestamp = properties["timestamp"]
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
@@ -25,6 +34,12 @@ class CdxSnapshot:
)
def __str__(self):
+ """Returns the Cdx snapshot line.
+
+ Output format:
+ org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
+
+ """
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
urlkey=self.urlkey,
timestamp=self.timestamp,
diff --git a/waybackpy/utils.py b/waybackpy/utils.py
index 8bfee70..7c6958d 100644
--- a/waybackpy/utils.py
+++ b/waybackpy/utils.py
@@ -1,28 +1,72 @@
import re
import time
import requests
-from .exceptions import WaybackError, URLError
from datetime import datetime
+from .exceptions import WaybackError, URLError
+from .__version__ import __version__
+
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
-from .__version__ import __version__
quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _latest_version(package_name, headers):
- endpoint = "https://pypi.org/pypi/" + package_name + "/json"
- json = _get_response(endpoint, headers=headers).json()
- return json["info"]["version"]
+ """Returns the latest version of package_name.
+
+ Parameters
+ ----------
+ package_name : str
+ The name of the python package
+
+ headers : dict
+ Headers that will be used while making get requests
+
+ Return type is str
+
+ Uses the PyPI JSON API to get the latest version of
+ waybackpy, but can be used to get the latest version of any package
+ on PyPI.
+ """
+
+ request_url = "https://pypi.org/pypi/" + package_name + "/json"
+ response = _get_response(request_url, headers=headers)
+ data = response.json()
+ return data["info"]["version"]
-def _unix_ts_to_wayback_ts(unix_ts):
- return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
+def _unix_timestamp_to_wayback_timestamp(unix_timestamp):
+ """Returns unix timestamp converted to a wayback machine timestamp string.
+
+ Parameters
+ ----------
+ unix_timestamp : str, int or float
+ Unix-timestamp that needs to be converted to a 14 digit timestamp (YYYYMMDDhhmmss).
+
+ Converts the input unix_timestamp to a UTC "%Y%m%d%H%M%S" formatted string and returns it.
+ Does not matter if unix_timestamp is str, float or int.
+ """
+
+ return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def _add_payload(instance, payload):
+ """Adds payload from instance that can be used to make get requests.
+
+ Parameters
+ ----------
+ instance : waybackpy.cdx.Cdx
+ instance of the Cdx class
+
+ payload : dict
+ A dict onto which we need to add keys and values based on instance.
+
+ instance is object of Cdx class and it contains the data required to fill
+ the payload dictionary.
+ """
+
if instance.start_timestamp:
payload["from"] = instance.start_timestamp
@@ -43,18 +87,27 @@ def _add_payload(instance, payload):
for i, f in enumerate(instance.collapses):
payload["collapse" + str(i)] = f
+ # No need to return anything, the payload dictionary is modified in place.
payload["url"] = instance.url
-def _ts(timestamp, data):
- """
- Get timestamp of last fetched archive.
- If used before fetching any archive, will
- use whatever self.JSON returns.
+def _timestamp_manager(timestamp, data):
+ """Returns the timestamp.
- self.timestamp is None implies that
- self.JSON will return any archive's JSON
- that wayback machine provides it.
+ Parameters
+ ----------
+ timestamp : datetime.datetime
+ datetime object
+
+ data : dict
+ A python dictionary, which is the loaded JSON of the availability API.
+
+ Return type:
+ datetime.datetime
+
+ If timestamp is not None then sets the value to timestamp itself.
+ If timestamp is None then returns the value from the last fetched API data.
+ If not timestamp and can not read the archived_snapshots from data return datetime.max
"""
if timestamp:
@@ -69,6 +122,21 @@ def _ts(timestamp, data):
def _check_match_type(match_type, url):
+ """Checks the validity of match_type parameter of the CDX GET requests.
+
+ Parameters
+ ----------
+ match_type : str
+ One of "exact", "prefix", "host" or "domain".
+ See https://github.com/akamhy/waybackpy/wiki/Python-package-docs#url-match-scope
+
+ url : str
+ The URL used to create the waybackpy Url object.
+
+ If match_type is not valid raises WaybackError.
+
+ """
+
if not match_type:
return
@@ -85,6 +153,19 @@ def _check_match_type(match_type, url):
def _check_collapses(collapses):
+ """Checks the validity of collapse parameter of the CDX GET request.
+
+ One or more field or field:N to 'collapses=[]' where
+ field is one of (urlkey, timestamp, original, mimetype, statuscode,
+ digest and length) and N is the first N characters of field to test.
+
+ Parameters
+ ----------
+ collapses : list
+
+ If collapses is not valid raises WaybackError.
+
+ """
if not isinstance(collapses, list):
raise WaybackError("collapses must be a list.")
@@ -119,12 +200,26 @@ def _check_collapses(collapses):
def _check_filters(filters):
+ """Checks the validity of filter parameter of the CDX GET request.
+
+ Any number of filter params of the following form may be specified:
+ filters=["[!]field:regex"]
+
+ Parameters
+ ----------
+ filters : list
+
+ If filters is not valid raises WaybackError.
+
+ """
+
if not isinstance(filters, list):
raise WaybackError("filters must be a list.")
# [!]field:regex
for _filter in filters:
try:
+
match = re.search(
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
_filter,
@@ -134,8 +229,9 @@ def _check_filters(filters):
val = match.group(2)
except Exception:
+
exc_message = (
- "Filter '{_filter}' not following the cdx filter syntax.".format(
+ "Filter '{_filter}' is not following the cdx filter syntax.".format(
_filter=_filter
)
)
@@ -143,6 +239,9 @@ def _check_filters(filters):
def _cleaned_url(url):
+ """Sanitize the URL.
+ Remove and replace illegal whitespace characters from the URL.
+ """
return str(url).strip().replace(" ", "%20")
@@ -161,16 +260,29 @@ def _url_check(url):
def _full_url(endpoint, params):
- full_url = endpoint
- if params:
- full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
- for key, val in params.items():
- key = "filter" if key.startswith("filter") else key
- key = "collapse" if key.startswith("collapse") else key
- amp = "" if full_url.endswith("?") else "&"
- full_url = (
- full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
- )
+ """API endpoint + GET parameters = full_url
+
+ Parameters
+ ----------
+ endpoint : str
+ The API endpoint
+
+ params : dict
+ Dictionary that has name-value pairs.
+
+ Return type is str
+
+ """
+
+ if not params:
+ return endpoint
+
+ full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
+ for key, val in params.items():
+ key = "filter" if key.startswith("filter") else key
+ key = "collapse" if key.startswith("collapse") else key
+ amp = "" if full_url.endswith("?") else "&"
+ full_url = full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
return full_url
@@ -191,17 +303,31 @@ def _get_total_pages(url, user_agent):
def _archive_url_parser(header, url, latest_version=__version__, instance=None):
- """
+ """Returns the archive after parsing it from the response header.
+
+ Parameters
+ ----------
+ header : str
+ The response header of WayBack Machine's Save API
+
+ url : str
+ The input url, the one used to create the Url object.
+
+ latest_version : str
+ The latest version of waybackpy (default is __version__)
+
+ instance : waybackpy.wrapper.Url
+ Instance of Url class
+
+
The wayback machine's save API doesn't
return JSON response, we are required
to read the header of the API response
- and look for the archive URL.
+ and find the archive URL.
- This method has some regexen (or regexes)
- that search for archive url in header.
-
- This method is used when you try to
- save a webpage on wayback machine.
+ This method has some regular expressions
+ that are used to search for the archive url
+ in the response header of Save API.
Two cases are possible:
1) Either we find the archive url in
@@ -213,7 +339,6 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
If we found the archive URL we return it.
Return format:
-
web.archive.org/web//
And if we couldn't find it, we raise
@@ -304,9 +429,7 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
def _wayback_timestamp(**kwargs):
- """
- Wayback Machine archive URLs
- have a timestamp in them.
+ """Returns a valid waybackpy timestamp.
The standard archive URL format is
https://web.archive.org/web/20191214041711/https://www.youtube.com
@@ -316,13 +439,17 @@ def _wayback_timestamp(**kwargs):
2 ) timestamp (20191214041711)
3 ) https://www.youtube.com, the original URL
- The near method takes year, month, day, hour and minute
- as Arguments, their type is int.
+
+ The near method of Url class in wrapper.py takes year, month, day, hour
+ and minute as arguments, their type is int.
This method takes those integers and converts it to
wayback machine timestamp and returns it.
- Return format is string.
+
+ zfill(2) left-pads single digit months, days, hours etc. with one zero.
+
+ Return type is string.
"""
return "".join(
@@ -339,16 +466,37 @@ def _get_response(
backoff_factor=0.5,
no_raise_on_redirects=False,
):
- """
- This function is used make get request.
- We use the requests package to make the
- requests.
+ """Makes get requests.
+
+ Parameters
+ ----------
+ endpoint : str
+ The API endpoint.
+
+ params : dict
+ The get request parameters. (default is None)
+
+ headers : dict
+ Headers for the get request. (default is None)
+
+ return_full_url : bool
+ Determines whether the full url is returned along with the
+ response. (default is False)
+
+ retries : int
+ Maximum number of retries for the get request. (default is 5)
+
+ backoff_factor : float
+ The factor used to compute the wait time between successive retries.
+ https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
+ (default is 0.5)
+
+ no_raise_on_redirects : bool
+ If redirected more than the maximum 30 (default for requests) times then
+ return None instead of raising an exception. (default is False)
- We try five times and if it fails it raises
- WaybackError exception.
-
- You can handles WaybackError by importing:
+ To handle WaybackError:
from waybackpy.exceptions import WaybackError
try:
@@ -370,20 +518,28 @@ def _get_response(
s.mount("https://", HTTPAdapter(max_retries=retries))
+ # The URL with parameters required for the get request
url = _full_url(endpoint, params)
try:
+
if not return_full_url:
return s.get(url, headers=headers)
+
return (url, s.get(url, headers=headers))
+
except Exception as e:
+
reason = str(e)
+
if no_raise_on_redirects:
if "Exceeded 30 redirects" in reason:
return
+
exc_message = "Error while retrieving {url}.\n{reason}".format(
url=url, reason=reason
)
+
exc = WaybackError(exc_message)
exc.__cause__ = e
raise exc
diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index 77add29..ef24a81 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -1,5 +1,6 @@
import re
from datetime import datetime, timedelta
+
from .exceptions import WaybackError
from .cdx import Cdx
from .utils import (
@@ -9,13 +10,85 @@ from .utils import (
default_user_agent,
_url_check,
_cleaned_url,
- _ts,
- _unix_ts_to_wayback_ts,
+ _timestamp_manager,
+ _unix_timestamp_to_wayback_timestamp,
_latest_version,
)
class Url:
+ """
+
+ Attributes
+ ----------
+ url : str
+ The input URL, wayback machine API operations are performed
+ on this URL after sanitizing it.
+
+ user_agent : str
+ The user_agent used while making the GET requests to the
+ Wayback machine APIs
+
+ _archive_url : str
+ Caches the last fetched archive.
+
+ timestamp : datetime.datetime
+ timestamp of the archive URL as datetime object for
+ greater usability
+
+ _JSON : dict
+ Caches the last fetched availability API data
+
+ latest_version : str
+ The latest version of waybackpy on PyPi
+
+ cached_save : bool
+ Flag to check if WayBack machine returned a cached
+ archive instead of creating a new archive. WayBack
+ machine allows only 1 archive for a URL in
+ 30 minutes. If the archive returned by WayBack machine
+ is older than 3 minutes then this flag is set to True
+
+ Methods turned properties
+ ----------
+ JSON : dict
+ JSON response of availability API as dictionary / loaded JSON
+
+ archive_url : str
+ Return the archive url, returns str
+
+ _timestamp : datetime.datetime
+ The timestamp resolved from self.timestamp and self.JSON (self.timestamp is not assigned)
+
+ Methods
+ -------
+ save()
+ Archives the URL on WayBack machine
+
+ get(url="", user_agent="", encoding="")
+ Gets the source of archive url, can also be used to get source
+ of any URL if passed into it.
+
+ near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
+ Wayback Machine can have many archives for a URL/webpage, sometimes we want
+ archive close to a specific time.
+ This method takes year, month, day, hour, minute and unix_timestamp as input.
+
+ oldest(year=1994)
+ The oldest archive of an URL.
+
+ newest()
+ The newest archive of an URL
+
+ total_archives(start_timestamp=None, end_timestamp=None)
+ total number of archives of an URL, the timeframe can be confined by
+ start_timestamp and end_timestamp
+
+ known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
+ Known URLs for an URL, subdomain, URL as prefix etc.
+
+ """
+
def __init__(self, url, user_agent=default_user_agent):
self.url = url
self.user_agent = str(user_agent)
@@ -32,29 +105,17 @@ class Url:
)
def __str__(self):
- """
- Output when print() is used on
- This should print an archive URL.
-
- We check if self._archive_url is not None.
- If not None, good. We return string of self._archive_url.
-
- If self._archive_url is None, it means we ain't used any method that
- sets self._archive_url, we now set self._archive_url to self.archive_url
- and return it.
- """
-
if not self._archive_url:
self._archive_url = self.archive_url
+
return "{archive_url}".format(archive_url=self._archive_url)
def __len__(self):
- """
- Why do we have len here?
+ """Number of days between today and the date of archive based on the timestamp
- Applying len() on
- will calculate the number of days between today and
- the archive timestamp.
+ len() of waybackpy.wrapper.Url should return
+ the number of days between today and the
+ archive timestamp.
Can be applied on return values of near and its
childs (e.g. oldest) and if applied on waybackpy.Url()
@@ -76,32 +137,30 @@ class Url:
@property
def JSON(self):
- """
- If the end user has used near() or its childs like oldest, newest
- and archive_url then the JSON response of these are cached in self._JSON
+ """Returns JSON response of availability API as dictionary / loaded JSON
- If we find that self._JSON is not None we return it.
- else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
- and return it.
+ return type : dict
"""
+ # If user used the near method or any method that depends on near, we
+ # are certain that we have a loaded dictionary cached in self._JSON.
+ # Return the loaded JSON data.
if self._JSON:
return self._JSON
+ # If no cached data is found, fetch the data, cache it and return it.
endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": self.user_agent}
payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
response = _get_response(endpoint, params=payload, headers=headers)
- return response.json()
+ self._JSON = response.json()
+ return self._JSON
@property
def archive_url(self):
- """
- Returns any random archive for the instance.
- But if near, oldest, newest were used before
- then it returns the same archive again.
+ """Return the archive url.
- We cache archive in self._archive_url
+ return type : str
"""
if self._archive_url:
@@ -121,11 +180,16 @@ class Url:
@property
def _timestamp(self):
- self.timestamp = _ts(self.timestamp, self.JSON)
- return self.timestamp
+ """Returns the timestamp from _timestamp_manager without assigning self.timestamp.
+
+ Return type : datetime.datetime
+
+ """
+ return _timestamp_manager(self.timestamp, self.JSON)
def save(self):
- """
+ """Saves/Archives the URL.
+
To save a webpage on WayBack machine we
need to send get request to https://web.archive.org/save/
@@ -136,6 +200,8 @@ class Url:
_archive_url_parser() parses the archive from the header.
+ return type : waybackpy.wrapper.Url
+
"""
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
headers = {"User-Agent": self.user_agent}
@@ -161,7 +227,9 @@ class Url:
instance=self,
)
- m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url)
+ m = re.search(
+ r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
+ )
str_ts = m.group(1)
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
now = datetime.utcnow()
@@ -175,9 +243,22 @@ class Url:
return self
def get(self, url="", user_agent="", encoding=""):
- """
- Return the source code of the last archived URL,
- if no URL is passed to this method.
+ """GET the source of archive or any other URL.
+
+ url : str, waybackpy.wrapper.Url
+ The method will return the source code of
+ this URL instead of last fetched archive.
+
+ user_agent : str
+ The user_agent for GET request to API
+
+ encoding : str
+ If user is using any other encoding that
+ can't be detected by response.encoding
+
+ Return the source code of the last fetched
+ archive URL if no URL is passed to this method
+ else it returns the source code of url passed.
If encoding is not supplied, it is auto-detected
from the response itself by requests package.
@@ -213,6 +294,27 @@ class Url:
unix_timestamp=None,
):
"""
+ Parameters
+ ----------
+
+ year : int
+ Archive close to year
+
+ month : int
+ Archive close to month
+
+ day : int
+ Archive close to day
+
+ hour : int
+ Archive close to hour
+
+ minute : int
+ Archive close to minute
+
+ unix_timestamp : str, float or int
+ Archive close to this unix_timestamp
+
Wayback Machine can have many archives of a webpage,
sometimes we want archive close to a specific time.
@@ -235,7 +337,7 @@ class Url:
"""
if unix_timestamp:
- timestamp = _unix_ts_to_wayback_ts(unix_timestamp)
+ timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
else:
now = datetime.utcnow().timetuple()
timestamp = _wayback_timestamp(
@@ -285,28 +387,45 @@ class Url:
We simply pass the year in near() and return it.
"""
+
return self.near(year=year)
def newest(self):
- """
- Return the newest Wayback Machine archive available for this URL.
+ """Return the newest Wayback Machine archive available.
- We return the output of self.near() as it deafults to current utc time.
+ We return the return value of self.near() as it defaults to current UTC time.
Due to Wayback Machine database lag, this may not always be the
most recent archive.
+
+ return type : waybackpy.wrapper.Url
"""
+
return self.near()
def total_archives(self, start_timestamp=None, end_timestamp=None):
- """
+ """Returns the total number of archives for an URL
+
+ Parameters
+ ----------
+ start_timestamp : str
+ 1 to 14 digit string of numbers, you are not required to
+ pass a full 14 digit timestamp.
+
+ end_timestamp : str
+ 1 to 14 digit string of numbers, you are not required to
+ pass a full 14 digit timestamp.
+
+
+ return type : int
+
+
A webpage can have multiple archives on the wayback machine
If someone wants to count the total number of archives of a
webpage on wayback machine they can use this method.
Returns the total number of Wayback Machine archives for the URL.
- Return type in integer.
"""
cdx = Cdx(
@@ -315,6 +434,8 @@ class Url:
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
)
+
+ # cdx.snapshots() is a generator, not a list.
i = 0
for _ in cdx.snapshots():
i = i + 1
@@ -328,15 +449,36 @@ class Url:
end_timestamp=None,
match_type="prefix",
):
- """
+ """Yields known URLs from the CDX API.
+
+ Parameters
+ ----------
+
+ subdomain : bool
+ If True fetch subdomain URLs along with the host URLs.
+
+ host : bool
+ Only fetch host URLs.
+
+ start_timestamp : str
+ 1 to 14 digit string of numbers, you are not required to
+ pass a full 14 digit timestamp.
+
+ end_timestamp : str
+ 1 to 14 digit string of numbers, you are not required to
+ pass a full 14 digit timestamp.
+
+ match_type : str
+ One of (exact, prefix, host and domain)
+
+ yield type : str (the original URL of each snapshot)
+
Yields list of URLs known to exist for given input.
Defaults to input URL as prefix.
- This method is kept for compatibility, use the Cdx class instead.
- This method itself depends on Cdx.
-
- Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
- https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
+ Based on:
+ https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
+ By Mohammed Diaa (https://github.com/mhmdiaa)
"""
if subdomain:
@@ -353,7 +495,5 @@ class Url:
collapses=["urlkey"],
)
- snapshots = cdx.snapshots()
-
- for snapshot in snapshots:
+ for snapshot in cdx.snapshots():
yield (snapshot.original)