diff --git a/tests/test_cdx.py b/tests/test_cdx.py index 887afd7..fdc6bbe 100644 --- a/tests/test_cdx.py +++ b/tests/test_cdx.py @@ -79,7 +79,7 @@ def test_all_cdx(): c = 0 for snapshot in snapshots: c += 1 - if c > 30_529: # deafult limit is 10k + if c > 30529: # default limit is 10k break url = "https://github.com/*" @@ -89,5 +89,5 @@ def test_all_cdx(): for snapshot in snapshots: c += 1 - if c > 100_529: + if c > 100529: break diff --git a/tests/test_cli.py b/tests/test_cli.py index d8593c7..f788c2e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,8 +5,7 @@ import random import string import argparse -sys.path.append("..") -import waybackpy.cli as cli # noqa: E402 +import waybackpy.cli as cli from waybackpy.wrapper import Url # noqa: E402 from waybackpy.__version__ import __version__ diff --git a/tests/test_utils.py b/tests/test_utils.py index 08cfaec..dc8a466 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -61,10 +61,10 @@ def test_check_collapses(): def test_check_match_type(): - assert None == _check_match_type(None, "url") + assert _check_match_type(None, "url") is None match_type = "exact" url = "test_url" - assert None == _check_match_type(match_type, url) + assert _check_match_type(match_type, url) is None url = "has * in it" with pytest.raises(WaybackError): @@ -82,7 +82,7 @@ def test_cleaned_url(): def test_url_check(): good_url = "https://akamhy.github.io" - assert None == _url_check(good_url) + assert _url_check(good_url) is None bad_url = "https://github-com" with pytest.raises(URLError): diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py index 359ba91..100608f 100644 --- a/tests/test_wrapper.py +++ b/tests/test_wrapper.py @@ -1,8 +1,4 @@ -import sys import pytest -import random -import requests -from datetime import datetime from waybackpy.wrapper import Url diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 0872d33..ef24a81 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -17,6 +17,78 @@ 
from .utils import ( class Url: + """ + + Attributes + ---------- + url : str + The input URL, wayback machine API operations are performed + on this URL after sanitizing it. + + user_agent : str + The user_agent used while making the GET requests to the + Wayback machine APIs + + _archive_url : str + Caches the last fetched archive. + + timestamp : datetime.datetime + timestamp of the archive URL as datetime object for + greater usability + + _JSON : dict + Caches the last fetched availability API data + + latest_version : str + The latest version of waybackpy on PyPi + + cached_save : bool + Flag to check if WayBack machine returned a cached + archive instead of creating a new archive. WayBack + machine allows only one archive for an URL in + 30 minutes. If the archive returned by WayBack machine + is older than 3 minutes then this flag is set to True + + Methods turned properties + ---------- + JSON : dict + JSON response of availability API as dictionary / loaded JSON + + archive_url : str + Return the archive url, returns str + + _timestamp : datetime.datetime + Sets the value of self.timestamp if still not set + + Methods + ------- + save() + Archives the URL on WayBack machine + + get(url="", user_agent="", encoding="") + Gets the source of archive url, can also be used to get source + of any URL if passed into it. + + near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None) + Wayback Machine can have many archives for a URL/webpage, sometimes we want + archive close to a specific time. + This method takes year, month, day, hour, minute and unix_timestamp as input. + + oldest(year=1994) + The oldest archive of an URL. 
+ + newest() + The newest archive of an URL + + total_archives(start_timestamp=None, end_timestamp=None) + total number of archives of an URL, the timeframe can be confined by + start_timestamp and end_timestamp + + known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix") + Known URLs for an URL, subdomain, URL as prefix etc. + + """ + def __init__(self, url, user_agent=default_user_agent): self.url = url self.user_agent = str(user_agent) @@ -65,32 +137,30 @@ class Url: @property def JSON(self): - """ - If the end user has used near() or its childs like oldest, newest - and archive_url then the JSON response of these are cached in self._JSON + """Returns JSON response of availability API as dictionary / loaded JSON - If we find that self._JSON is not None we return it. - else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL' - and return it. + return type : dict """ + # If user used the near method or any method that depends on near, we + # are certain that we have a loaded dictionary cached in self._JSON. + # Return the loaded JSON data. if self._JSON: return self._JSON + # If no cached data found, get data and return + cache it. endpoint = "https://archive.org/wayback/available" headers = {"User-Agent": self.user_agent} payload = {"url": "{url}".format(url=_cleaned_url(self.url))} response = _get_response(endpoint, params=payload, headers=headers) - return response.json() + self._JSON = response.json() + return self._JSON @property def archive_url(self): - """Return the string form of the Url object. + """Return the archive url. - Parameters - ---------- - self : waybackpy.wrapper.Url - The instance itself. + return type : str """ if self._archive_url: @@ -112,10 +182,7 @@ class Url: def _timestamp(self): """Sets the value of self.timestamp if still not set. - Parameters - ---------- - self : waybackpy.wrapper.Url - The instance itself. 
+ Return type : datetime.datetime """ return _timestamp_manager(self.timestamp, self.JSON) @@ -123,11 +190,6 @@ class Url: def save(self): """Saves/Archive the URL. - Parameters - ---------- - self : waybackpy.wrapper.Url - The instance itself. - To save a webpage on WayBack machine we need to send get request to https://web.archive.org/save/ @@ -138,6 +200,8 @@ class Url: _archive_url_parser() parses the archive from the header. + return type : waybackpy.wrapper.Url + """ request_url = "https://web.archive.org/save/" + _cleaned_url(self.url) headers = {"User-Agent": self.user_agent} @@ -179,9 +243,22 @@ class Url: return self def get(self, url="", user_agent="", encoding=""): - """ - Return the source code of the last archived URL, - if no URL is passed to this method. + """GET the source of archive or any other URL. + + url : str, waybackpy.wrapper.Url + The method will return the source code of + this URL instead of last fetched archive. + + user_agent : str + The user_agent for GET request to API + + encoding : str + If user is using any other encoding that + can't be detected by response.encoding + + Return the source code of the last fetched + archive URL if no URL is passed to this method + else it returns the source code of url passed. If encoding is not supplied, it is auto-detected from the response itself by requests package. @@ -219,8 +296,6 @@ class Url: """ Parameters ---------- - self : waybackpy.wrapper.Url - The instance itself. year : int Archive close to year @@ -316,13 +391,14 @@ class Url: return self.near(year=year) def newest(self): - """ - Return the newest Wayback Machine archive available for this URL. + """Return the newest Wayback Machine archive available. - We return the output of self.near() as it deafults to current utc time. + We return the return value of self.near() as it defaults to current UTC time. Due to Wayback Machine database lag, this may not always be the most recent archive. 
+ + return type : waybackpy.wrapper.Url """ return self.near() @@ -332,9 +408,6 @@ class Url: Parameters ---------- - self : waybackpy.wrapper.Url - The instance itself - start_timestamp : str 1 to 14 digit string of numbers, you are not required to pass a full 14 digit timestamp. @@ -344,13 +417,15 @@ class Url: pass a full 14 digit timestamp. + return type : int + + A webpage can have multiple archives on the wayback machine If someone wants to count the total number of archives of a webpage on wayback machine they can use this method. Returns the total number of Wayback Machine archives for the URL. - Return type in integer. """ cdx = Cdx( @@ -359,6 +434,8 @@ class Url: start_timestamp=start_timestamp, end_timestamp=end_timestamp, ) + + # cdx.snapshots() is generator not list. i = 0 for _ in cdx.snapshots(): i = i + 1 @@ -377,9 +454,6 @@ class Url: Parameters ---------- - self : waybackpy.wrapper.Url - The instance itself - subdomain : bool If True fetch subdomain URLs along with the host URLs. @@ -397,6 +471,8 @@ class Url: match_type : str One of (exact, prefix, host and domain) + return type : waybackpy.snapshot.CdxSnapshot + Yields list of URLs known to exist for given input. Defaults to input URL as prefix.