From db8f902cfff592f07386bcc92c7910ccb50c98d0 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Tue, 26 Jan 2021 11:56:03 +0530 Subject: [PATCH] Add doc strings (#90) * Added some docstrings in utils.py * renamed some func/meth to better names and added doc strings + lint * added more docstrings * more docstrings * improve docstrings * docstrings * added more docstrings, lint * fix import error --- tests/test_cdx.py | 4 +- tests/test_cli.py | 3 +- tests/test_utils.py | 12 +- tests/test_wrapper.py | 4 - waybackpy/cdx.py | 23 +++- waybackpy/cli.py | 1 + waybackpy/snapshot.py | 19 +++- waybackpy/utils.py | 252 ++++++++++++++++++++++++++++++++++-------- waybackpy/wrapper.py | 246 ++++++++++++++++++++++++++++++++--------- 9 files changed, 443 insertions(+), 121 deletions(-) diff --git a/tests/test_cdx.py b/tests/test_cdx.py index 887afd7..fdc6bbe 100644 --- a/tests/test_cdx.py +++ b/tests/test_cdx.py @@ -79,7 +79,7 @@ def test_all_cdx(): c = 0 for snapshot in snapshots: c += 1 - if c > 30_529: # deafult limit is 10k + if c > 30529: # deafult limit is 10k break url = "https://github.com/*" @@ -89,5 +89,5 @@ def test_all_cdx(): for snapshot in snapshots: c += 1 - if c > 100_529: + if c > 100529: break diff --git a/tests/test_cli.py b/tests/test_cli.py index d8593c7..f788c2e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,8 +5,7 @@ import random import string import argparse -sys.path.append("..") -import waybackpy.cli as cli # noqa: E402 +import waybackpy.cli as cli from waybackpy.wrapper import Url # noqa: E402 from waybackpy.__version__ import __version__ diff --git a/tests/test_utils.py b/tests/test_utils.py index 08cfaec..4c869d7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -14,14 +14,14 @@ from waybackpy.utils import ( _check_match_type, _check_collapses, _check_filters, - _ts, + _timestamp_manager, ) -def test_ts(): +def test_timestamp_manager(): timestamp = True data = {} - assert _ts(timestamp, data) + assert _timestamp_manager(timestamp, data) data = """ {"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"} @@ -61,10 +61,10 @@ def test_check_collapses(): def test_check_match_type(): - assert None == _check_match_type(None, "url") + assert _check_match_type(None, "url") is None match_type = "exact" url = "test_url" - assert None == _check_match_type(match_type, url) + assert _check_match_type(match_type, url) is None url = "has * in it" with pytest.raises(WaybackError): @@ -82,7 +82,7 @@ def test_cleaned_url(): def test_url_check(): good_url = "https://akamhy.github.io" - assert None == _url_check(good_url) + assert _url_check(good_url) is None bad_url = "https://github-com" with pytest.raises(URLError): diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py index 359ba91..100608f 100644 --- a/tests/test_wrapper.py +++ b/tests/test_wrapper.py @@ -1,8 +1,4 @@ -import sys import pytest -import random -import requests -from datetime import datetime from waybackpy.wrapper import Url diff --git a/waybackpy/cdx.py b/waybackpy/cdx.py index 3ce30bf..b2295c7 100644 --- a/waybackpy/cdx.py +++ b/waybackpy/cdx.py @@ -11,6 +11,7 @@ from .utils import ( ) # TODO : Threading support for pagination API. It's designed for Threading. +# TODO : Add get method here if type is Vaild HTML, SVG other but not - or warc. Test it. class Cdx: @@ -42,7 +43,22 @@ class Cdx: self.use_page = False def cdx_api_manager(self, payload, headers, use_page=False): - """ + """Act as button, we can choose between the normal API and pagination API. + + Parameters + ---------- + self : waybackpy.cdx.Cdx + The instance itself + + payload : dict + Get request parameters name value pairs + + headers : dict + The headers for making the GET request. + + use_page : bool + If True use pagination API else use normal resume key based API. + We have two options to get the snapshots, we use this method to make a selection between pagination API and the normal one with Resumption Key, sequential querying @@ -141,7 +157,7 @@ class Cdx: def snapshots(self): """ This function yeilds snapshots encapsulated - in CdxSnapshot for more usability. + in CdxSnapshot for increased usability. All the get request values are set if the conditions match @@ -188,10 +204,9 @@ class Cdx: prop_values = snapshot.split(" ") - # Making sure that we get the same number of - # property values as the number of properties prop_values_len = len(prop_values) properties_len = len(properties) + if prop_values_len != properties_len: raise WaybackError( "Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format( diff --git a/waybackpy/cli.py b/waybackpy/cli.py index adbf1aa..45f305a 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -5,6 +5,7 @@ import json import random import string import argparse + from .wrapper import Url from .exceptions import WaybackError from .__version__ import __version__ diff --git a/waybackpy/snapshot.py b/waybackpy/snapshot.py index 992ad2e..e3dc027 100644 --- a/waybackpy/snapshot.py +++ b/waybackpy/snapshot.py @@ -3,15 +3,24 @@ from datetime import datetime class CdxSnapshot: """ - This class helps to use the Cdx Snapshots easily. + This class encapsulates the snapshots for greater usability. Raw Snapshot data looks like: org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415 - properties is a dict containg all of the 7 cdx snapshot properties. """ def __init__(self, properties): + """ + Parameters + ---------- + self : waybackpy.snapshot.CdxSnapshot + The instance itself + + properties : dict + Properties is a dict containg all of the 7 cdx snapshot properties. + + """ self.urlkey = properties["urlkey"] self.timestamp = properties["timestamp"] self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S") @@ -25,6 +34,12 @@ class CdxSnapshot: ) def __str__(self): + """Returns the Cdx snapshot line. + + Output format: + org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415 + + """ return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format( urlkey=self.urlkey, timestamp=self.timestamp, diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 8bfee70..7c6958d 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -1,28 +1,72 @@ import re import time import requests -from .exceptions import WaybackError, URLError from datetime import datetime +from .exceptions import WaybackError, URLError +from .__version__ import __version__ + from urllib3.util.retry import Retry from requests.adapters import HTTPAdapter -from .__version__ import __version__ quote = requests.utils.quote default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy" def _latest_version(package_name, headers): - endpoint = "https://pypi.org/pypi/" + package_name + "/json" - json = _get_response(endpoint, headers=headers).json() - return json["info"]["version"] + """Returns the latest version of package_name. + + Parameters + ---------- + package_name : str + The name of the python package + + headers : dict + Headers that will be used while making get requests + + Return type is str + + Use API to get the latest version of + waybackpy, but can be used to get latest version of any package + on PyPi. + """ + + request_url = "https://pypi.org/pypi/" + package_name + "/json" + response = _get_response(request_url, headers=headers) + data = response.json() + return data["info"]["version"] -def _unix_ts_to_wayback_ts(unix_ts): - return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S") +def _unix_timestamp_to_wayback_timestamp(unix_timestamp): + """Returns unix timestamp converted to datetime.datetime + + Parameters + ---------- + unix_timestamp : str, int or float + Unix-timestamp that needs to be converted to datetime.datetime + + Converts and returns input unix_timestamp to datetime.datetime object. + Does not matter if unix_timestamp is str, float or int. + """ + + return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") def _add_payload(instance, payload): + """Adds payload from instance that can be used to make get requests. + + Parameters + ---------- + instance : waybackpy.cdx.Cdx + instance of the Cdx class + + payload : dict + A dict onto which we need to add keys and values based on instance. + + instance is object of Cdx class and it contains the data required to fill + the payload dictionary. + """ + if instance.start_timestamp: payload["from"] = instance.start_timestamp @@ -43,18 +87,27 @@ def _add_payload(instance, payload): for i, f in enumerate(instance.collapses): payload["collapse" + str(i)] = f + # Don't need to return anything as it's dictionary. payload["url"] = instance.url -def _ts(timestamp, data): - """ - Get timestamp of last fetched archive. - If used before fetching any archive, will - use whatever self.JSON returns. +def _timestamp_manager(timestamp, data): + """Returns the timestamp. - self.timestamp is None implies that - self.JSON will return any archive's JSON - that wayback machine provides it. + Parameters + ---------- + timestamp : datetime.datetime + datetime object + + data : dict + A python dictionary, which is loaded JSON os the availability API. + + Return type: + datetime.datetime + + If timestamp is not None then sets the value to timestamp itself. + If timestamp is None the returns the value from the last fetched API data. + If not timestamp and can not read the archived_snapshots form data return datetime.max """ if timestamp: @@ -69,6 +122,21 @@ def _ts(timestamp, data): def _check_match_type(match_type, url): + """Checks the validity of match_type parameter of the CDX GET requests. + + Parameters + ---------- + match_type : list + list that may contain any or all from ["exact", "prefix", "host", "domain"] + See https://github.com/akamhy/waybackpy/wiki/Python-package-docs#url-match-scope + + url : str + The URL used to create the waybackpy Url object. + + If not vaild match_type raise Exception. + + """ + if not match_type: return @@ -85,6 +153,19 @@ def _check_match_type(match_type, url): def _check_collapses(collapses): + """Checks the validity of collapse parameter of the CDX GET request. + + One or more field or field:N to 'collapses=[]' where + field is one of (urlkey, timestamp, original, mimetype, statuscode, + digest and length) and N is the first N characters of field to test. + + Parameters + ---------- + collapses : list + + If not vaild collapses raise Exception. + + """ if not isinstance(collapses, list): raise WaybackError("collapses must be a list.") @@ -119,12 +200,26 @@ def _check_collapses(collapses): def _check_filters(filters): + """Checks the validity of filter parameter of the CDX GET request. + + Any number of filter params of the following form may be specified: + filters=["[!]field:regex"] may be specified.. + + Parameters + ---------- + filters : list + + If not vaild filters raise Exception. + + """ + if not isinstance(filters, list): raise WaybackError("filters must be a list.") # [!]field:regex for _filter in filters: try: + match = re.search( r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)", _filter, @@ -134,8 +229,9 @@ def _check_filters(filters): val = match.group(2) except Exception: + exc_message = ( - "Filter '{_filter}' not following the cdx filter syntax.".format( + "Filter '{_filter}' is not following the cdx filter syntax.".format( _filter=_filter ) ) @@ -143,6 +239,9 @@ def _check_filters(filters): def _cleaned_url(url): + """Sanatize the url + Remove and replace illegal whitespace characters from the URL. + """ return str(url).strip().replace(" ", "%20") @@ -161,16 +260,29 @@ def _url_check(url): def _full_url(endpoint, params): - full_url = endpoint - if params: - full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") - for key, val in params.items(): - key = "filter" if key.startswith("filter") else key - key = "collapse" if key.startswith("collapse") else key - amp = "" if full_url.endswith("?") else "&" - full_url = ( - full_url + amp + "{key}={val}".format(key=key, val=quote(str(val))) - ) + """API endpoint + GET parameters = full_url + + Parameters + ---------- + endpoint : str + The API endpoint + + params : dict + Dictionary that has name-value pairs. + + Return type is str + + """ + + if not params: + return endpoint + + full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") + for key, val in params.items(): + key = "filter" if key.startswith("filter") else key + key = "collapse" if key.startswith("collapse") else key + amp = "" if full_url.endswith("?") else "&" + full_url = full_url + amp + "{key}={val}".format(key=key, val=quote(str(val))) return full_url @@ -191,17 +303,31 @@ def _get_total_pages(url, user_agent): def _archive_url_parser(header, url, latest_version=__version__, instance=None): - """ + """Returns the archive after parsing it from the response header. + + Parameters + ---------- + header : str + The response header of WayBack Machine's Save API + + url : str + The input url, the one used to created the Url object. + + latest_version : str + The latest version of waybackpy (default is __version__) + + instance : waybackpy.wrapper.Url + Instance of Url class + + The wayback machine's save API doesn't return JSON response, we are required to read the header of the API response - and look for the archive URL. + and find the archive URL. - This method has some regexen (or regexes) - that search for archive url in header. - - This method is used when you try to - save a webpage on wayback machine. + This method has some regular expressions + that are used to search for the archive url + in the response header of Save API. Two cases are possible: 1) Either we find the archive url in @@ -213,7 +339,6 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None): If we found the archive URL we return it. Return format: - web.archive.org/web// And if we couldn't find it, we raise @@ -304,9 +429,7 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None): def _wayback_timestamp(**kwargs): - """ - Wayback Machine archive URLs - have a timestamp in them. + """Returns a valid waybackpy timestamp. The standard archive URL format is https://web.archive.org/web/20191214041711/https://www.youtube.com @@ -316,13 +439,17 @@ def _wayback_timestamp(**kwargs): 2 ) timestamp (20191214041711) 3 ) https://www.youtube.com, the original URL - The near method takes year, month, day, hour and minute - as Arguments, their type is int. + + The near method of Url class in wrapper.py takes year, month, day, hour + and minute as arguments, their type is int. This method takes those integers and converts it to wayback machine timestamp and returns it. - Return format is string. + + zfill(2) adds 1 zero in front of single digit days, months hour etc. + + Return type is string. """ return "".join( @@ -339,16 +466,37 @@ def _get_response( backoff_factor=0.5, no_raise_on_redirects=False, ): - """ - This function is used make get request. - We use the requests package to make the - requests. + """Makes get requests. + + Parameters + ---------- + endpoint : str + The API endpoint. + + params : dict + The get request parameters. (default is None) + + headers : dict + Headers for the get request. (default is None) + + return_full_url : bool + Determines whether the call went full url returned along with the + response. (default is False) + + retries : int + Maximum number of retries for the get request. (default is 5) + + backoff_factor : float + The factor by which we determine the next retry time after wait. + https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html + (default is 0.5) + + no_raise_on_redirects : bool + If maximum 30(default for requests) times redirected than instead of + exceptions return. (default is False) - We try five times and if it fails it raises - WaybackError exception. - - You can handles WaybackError by importing: + To handle WaybackError: from waybackpy.exceptions import WaybackError try: @@ -370,20 +518,28 @@ def _get_response( s.mount("https://", HTTPAdapter(max_retries=retries)) + # The URL with parameters required for the get request url = _full_url(endpoint, params) try: + if not return_full_url: return s.get(url, headers=headers) + return (url, s.get(url, headers=headers)) + except Exception as e: + reason = str(e) + if no_raise_on_redirects: if "Exceeded 30 redirects" in reason: return + exc_message = "Error while retrieving {url}.\n{reason}".format( url=url, reason=reason ) + exc = WaybackError(exc_message) exc.__cause__ = e raise exc diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 77add29..ef24a81 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -1,5 +1,6 @@ import re from datetime import datetime, timedelta + from .exceptions import WaybackError from .cdx import Cdx from .utils import ( @@ -9,13 +10,85 @@ from .utils import ( default_user_agent, _url_check, _cleaned_url, - _ts, - _unix_ts_to_wayback_ts, + _timestamp_manager, + _unix_timestamp_to_wayback_timestamp, _latest_version, ) class Url: + """ + + Attributes + ---------- + url : str + The input URL, wayback machine API operations are performed + on this URL after sanatizing it. + + user_agent : str + The user_agent used while making the GET requests to the + Wayback machine APIs + + _archive_url : str + Caches the last fetched archive. + + timestamp : datetime.datetime + timestamp of the archive URL as datetime object for + greater usability + + _JSON : dict + Caches the last fetched availability API data + + latest_version : str + The latest version of waybackpy on PyPi + + cached_save : bool + Flag to check if WayBack machine returned a cached + archive instead of creating a new archive. WayBack + machine allows only one 1 archive for an URL in + 30 minutes. If the archive returned by WayBack machine + is older than 3 minutes than this flag is set to True + + Methods turned properties + ---------- + JSON : dict + JSON response of availability API as dictionary / loaded JSON + + archive_url : str + Return the archive url, returns str + + _timestamp : datetime.datetime + Sets the value of self.timestamp if still not set + + Methods + ------- + save() + Archives the URL on WayBack machine + + get(url="", user_agent="", encoding="") + Gets the source of archive url, can also be used to get source + of any URL if passed into it. + + near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None) + Wayback Machine can have many archives for a URL/webpage, sometimes we want + archive close to a specific time. + This method takes year, month, day, hour, minute and unix_timestamp as input. + + oldest(year=1994) + The oldest archive of an URL. + + newest() + The newest archive of an URL + + total_archives(start_timestamp=None, end_timestamp=None) + total number of archives of an URL, the timeframe can be confined by + start_timestamp and end_timestamp + + known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix") + Known URLs for an URL, subdomain, URL as prefix etc. + + """ + def __init__(self, url, user_agent=default_user_agent): self.url = url self.user_agent = str(user_agent) @@ -32,29 +105,17 @@ class Url: ) def __str__(self): - """ - Output when print() is used on - This should print an archive URL. - - We check if self._archive_url is not None. - If not None, good. We return string of self._archive_url. - - If self._archive_url is None, it means we ain't used any method that - sets self._archive_url, we now set self._archive_url to self.archive_url - and return it. - """ - if not self._archive_url: self._archive_url = self.archive_url + return "{archive_url}".format(archive_url=self._archive_url) def __len__(self): - """ - Why do we have len here? + """Number of days between today and the date of archive based on the timestamp - Applying len() on - will calculate the number of days between today and - the archive timestamp. + len() of waybackpy.wrapper.Url should return + the number of days between today and the + archive timestamp. Can be applied on return values of near and its childs (e.g. oldest) and if applied on waybackpy.Url() @@ -76,32 +137,30 @@ class Url: @property def JSON(self): - """ - If the end user has used near() or its childs like oldest, newest - and archive_url then the JSON response of these are cached in self._JSON + """Returns JSON response of availability API as dictionary / loaded JSON - If we find that self._JSON is not None we return it. - else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL' - and return it. + return type : dict """ + # If user used the near method or any method that depends on near, we + # are certain that we have a loaded dictionary cached in self._JSON. + # Return the loaded JSON data. if self._JSON: return self._JSON + # If no cached data found, get data and return + cache it. endpoint = "https://archive.org/wayback/available" headers = {"User-Agent": self.user_agent} payload = {"url": "{url}".format(url=_cleaned_url(self.url))} response = _get_response(endpoint, params=payload, headers=headers) - return response.json() + self._JSON = response.json() + return self._JSON @property def archive_url(self): - """ - Returns any random archive for the instance. - But if near, oldest, newest were used before - then it returns the same archive again. + """Return the archive url. - We cache archive in self._archive_url + return type : str """ if self._archive_url: @@ -121,11 +180,16 @@ class Url: @property def _timestamp(self): - self.timestamp = _ts(self.timestamp, self.JSON) - return self.timestamp + """Sets the value of self.timestamp if still not set. + + Return type : datetime.datetime + + """ + return _timestamp_manager(self.timestamp, self.JSON) def save(self): - """ + """Saves/Archive the URL. + To save a webpage on WayBack machine we need to send get request to https://web.archive.org/save/ @@ -136,6 +200,8 @@ class Url: _archive_url_parser() parses the archive from the header. + return type : waybackpy.wrapper.Url + """ request_url = "https://web.archive.org/save/" + _cleaned_url(self.url) headers = {"User-Agent": self.user_agent} @@ -161,7 +227,9 @@ class Url: instance=self, ) - m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url) + m = re.search( + r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url + ) str_ts = m.group(1) ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S") now = datetime.utcnow() @@ -175,9 +243,22 @@ class Url: return self def get(self, url="", user_agent="", encoding=""): - """ - Return the source code of the last archived URL, - if no URL is passed to this method. + """GET the source of archive or any other URL. + + url : str, waybackpy.wrapper.Url + The method will return the source code of + this URL instead of last fetched archive. + + user_agent : str + The user_agent for GET request to API + + encoding : str + If user is using any other encoding that + can't be detected by response.encoding + + Return the source code of the last fetched + archive URL if no URL is passed to this method + else it returns the source code of url passed. If encoding is not supplied, it is auto-detected from the response itself by requests package. @@ -213,6 +294,27 @@ class Url: unix_timestamp=None, ): """ + Parameters + ---------- + + year : int + Archive close to year + + month : int + Archive close to month + + day : int + Archive close to day + + hour : int + Archive close to hour + + minute : int + Archive close to minute + + unix_timestamp : str, float or int + Archive close to this unix_timestamp + Wayback Machine can have many archives of a webpage, sometimes we want archive close to a specific time. @@ -235,7 +337,7 @@ class Url: """ if unix_timestamp: - timestamp = _unix_ts_to_wayback_ts(unix_timestamp) + timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp) else: now = datetime.utcnow().timetuple() timestamp = _wayback_timestamp( @@ -285,28 +387,45 @@ class Url: We simply pass the year in near() and return it. """ + return self.near(year=year) def newest(self): - """ - Return the newest Wayback Machine archive available for this URL. + """Return the newest Wayback Machine archive available. - We return the output of self.near() as it deafults to current utc time. + We return the return value of self.near() as it deafults to current UTC time. Due to Wayback Machine database lag, this may not always be the most recent archive. + + return type : waybackpy.wrapper.Url """ + return self.near() def total_archives(self, start_timestamp=None, end_timestamp=None): - """ + """Returns the total number of archives for an URL + + Parameters + ---------- + start_timestamp : str + 1 to 14 digit string of numbers, you are not required to + pass a full 14 digit timestamp. + + end_timestamp : str + 1 to 14 digit string of numbers, you are not required to + pass a full 14 digit timestamp. + + + return type : int + + A webpage can have multiple archives on the wayback machine If someone wants to count the total number of archives of a webpage on wayback machine they can use this method. Returns the total number of Wayback Machine archives for the URL. - Return type in integer. """ cdx = Cdx( @@ -315,6 +434,8 @@ class Url: start_timestamp=start_timestamp, end_timestamp=end_timestamp, ) + + # cdx.snapshots() is generator not list. i = 0 for _ in cdx.snapshots(): i = i + 1 @@ -328,15 +449,36 @@ class Url: end_timestamp=None, match_type="prefix", ): - """ + """Yields known_urls URLs from the CDX API. + + Parameters + ---------- + + subdomain : bool + If True fetch subdomain URLs along with the host URLs. + + host : bool + Only fetch host URLs. + + start_timestamp : str + 1 to 14 digit string of numbers, you are not required to + pass a full 14 digit timestamp. + + end_timestamp : str + 1 to 14 digit string of numbers, you are not required to + pass a full 14 digit timestamp. + + match_type : str + One of (exact, prefix, host and domain) + + return type : waybackpy.snapshot.CdxSnapshot + Yields list of URLs known to exist for given input. Defaults to input URL as prefix. - This method is kept for compatibility, use the Cdx class instead. - This method itself depends on Cdx. - - Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: - https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050 + Based on: + https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050 + By Mohammed Diaa (https://github.com/mhmdiaa) """ if subdomain: @@ -353,7 +495,5 @@ class Url: collapses=["urlkey"], ) - snapshots = cdx.snapshots() - - for snapshot in snapshots: + for snapshot in cdx.snapshots(): yield (snapshot.original)