added more docstrings, lint

This commit is contained in:
Akash Mahanty
2021-01-26 11:42:29 +05:30
parent 817c0ee844
commit acebfabc3e
5 changed files with 118 additions and 47 deletions

View File

@@ -79,7 +79,7 @@ def test_all_cdx():
c = 0 c = 0
for snapshot in snapshots: for snapshot in snapshots:
c += 1 c += 1
if c > 30_529: # deafult limit is 10k if c > 30529: # deafult limit is 10k
break break
url = "https://github.com/*" url = "https://github.com/*"
@@ -89,5 +89,5 @@ def test_all_cdx():
for snapshot in snapshots: for snapshot in snapshots:
c += 1 c += 1
if c > 100_529: if c > 100529:
break break

View File

@@ -5,8 +5,7 @@ import random
import string import string
import argparse import argparse
sys.path.append("..") import waybackpy.cli as cli
import waybackpy.cli as cli # noqa: E402
from waybackpy.wrapper import Url # noqa: E402 from waybackpy.wrapper import Url # noqa: E402
from waybackpy.__version__ import __version__ from waybackpy.__version__ import __version__

View File

@@ -61,10 +61,10 @@ def test_check_collapses():
def test_check_match_type(): def test_check_match_type():
assert None == _check_match_type(None, "url") assert _check_match_type(None, "url") is None
match_type = "exact" match_type = "exact"
url = "test_url" url = "test_url"
assert None == _check_match_type(match_type, url) assert _check_match_type(match_type, url) is None
url = "has * in it" url = "has * in it"
with pytest.raises(WaybackError): with pytest.raises(WaybackError):
@@ -82,7 +82,7 @@ def test_cleaned_url():
def test_url_check(): def test_url_check():
good_url = "https://akamhy.github.io" good_url = "https://akamhy.github.io"
assert None == _url_check(good_url) assert _url_check(good_url) is None
bad_url = "https://github-com" bad_url = "https://github-com"
with pytest.raises(URLError): with pytest.raises(URLError):

View File

@@ -1,8 +1,4 @@
import sys
import pytest import pytest
import random
import requests
from datetime import datetime
from waybackpy.wrapper import Url from waybackpy.wrapper import Url

View File

@@ -17,6 +17,78 @@ from .utils import (
class Url: class Url:
"""
Attributes
----------
url : str
The input URL; Wayback Machine API operations are performed
on this URL after sanitizing it.
user_agent : str
The user_agent used while making the GET requests to the
Wayback machine APIs
_archive_url : str
Caches the last fetched archive.
timestamp : datetime.datetime
timestamp of the archive URL as datetime object for
greater usability
_JSON : dict
Caches the last fetched availability API data
latest_version : str
The latest version of waybackpy on PyPi
cached_save : bool
Flag to check if WayBack machine returned a cached
archive instead of creating a new archive. WayBack
machine allows only one 1 archive for an URL in
30 minutes. If the archive returned by WayBack machine
is older than 3 minutes than this flag is set to True
Methods turned properties
----------
JSON : dict
JSON response of availability API as dictionary / loaded JSON
archive_url : str
Return the archive url, returns str
_timestamp : datetime.datetime
Sets the value of self.timestamp if still not set
Methods
-------
save()
Archives the URL on WayBack machine
get(url="", user_agent="", encoding="")
Gets the source of archive url, can also be used to get source
of any URL if passed into it.
near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
The Wayback Machine can have many archives for a URL/webpage; sometimes we want
the archive closest to a specific time.
This method takes year, month, day, hour, minute and unix_timestamp as input.
oldest(year=1994)
The oldest archive of an URL.
newest()
The newest archive of an URL
total_archives(start_timestamp=None, end_timestamp=None)
total number of archives of an URL, the timeframe can be confined by
start_timestamp and end_timestamp
known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
Known URLs for an URL, subdomain, URL as prefix etc.
"""
def __init__(self, url, user_agent=default_user_agent): def __init__(self, url, user_agent=default_user_agent):
self.url = url self.url = url
self.user_agent = str(user_agent) self.user_agent = str(user_agent)
@@ -65,32 +137,30 @@ class Url:
@property @property
def JSON(self): def JSON(self):
""" """Returns JSON response of availability API as dictionary / loaded JSON
If the end user has used near() or its childs like oldest, newest
and archive_url then the JSON response of these are cached in self._JSON
If we find that self._JSON is not None we return it. return type : dict
else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
and return it.
""" """
# If user used the near method or any method that depends on near, we
# are certain that we have a loaded dictionary cached in self._JSON.
# Return the loaded JSON data.
if self._JSON: if self._JSON:
return self._JSON return self._JSON
# If no cached data found, get data and return + cache it.
endpoint = "https://archive.org/wayback/available" endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
payload = {"url": "{url}".format(url=_cleaned_url(self.url))} payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
response = _get_response(endpoint, params=payload, headers=headers) response = _get_response(endpoint, params=payload, headers=headers)
return response.json() self._JSON = response.json()
return self._JSON
@property @property
def archive_url(self): def archive_url(self):
"""Return the string form of the Url object. """Return the archive url.
Parameters return type : str
----------
self : waybackpy.wrapper.Url
The instance itself.
""" """
if self._archive_url: if self._archive_url:
@@ -112,10 +182,7 @@ class Url:
def _timestamp(self): def _timestamp(self):
"""Sets the value of self.timestamp if still not set. """Sets the value of self.timestamp if still not set.
Parameters Return type : datetime.datetime
----------
self : waybackpy.wrapper.Url
The instance itself.
""" """
return _timestamp_manager(self.timestamp, self.JSON) return _timestamp_manager(self.timestamp, self.JSON)
@@ -123,11 +190,6 @@ class Url:
def save(self): def save(self):
"""Saves/Archive the URL. """Saves/Archive the URL.
Parameters
----------
self : waybackpy.wrapper.Url
The instance itself.
To save a webpage on WayBack machine we To save a webpage on WayBack machine we
need to send get request to https://web.archive.org/save/ need to send get request to https://web.archive.org/save/
@@ -138,6 +200,8 @@ class Url:
_archive_url_parser() parses the archive from the header. _archive_url_parser() parses the archive from the header.
return type : waybackpy.wrapper.Url
""" """
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url) request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
@@ -179,9 +243,22 @@ class Url:
return self return self
def get(self, url="", user_agent="", encoding=""): def get(self, url="", user_agent="", encoding=""):
""" """GET the source of archive or any other URL.
Return the source code of the last archived URL,
if no URL is passed to this method. url : str, waybackpy.wrapper.Url
The method will return the source code of
this URL instead of last fetched archive.
user_agent : str
The user_agent for GET request to API
encoding : str
If user is using any other encoding that
can't be detected by response.encoding
Return the source code of the last fetched
archive URL if no URL is passed to this method
else it returns the source code of url passed.
If encoding is not supplied, it is auto-detected If encoding is not supplied, it is auto-detected
from the response itself by requests package. from the response itself by requests package.
@@ -219,8 +296,6 @@ class Url:
""" """
Parameters Parameters
---------- ----------
self : waybackpy.wrapper.Url
The instance itself.
year : int year : int
Archive close to year Archive close to year
@@ -316,13 +391,14 @@ class Url:
return self.near(year=year) return self.near(year=year)
def newest(self): def newest(self):
""" """Return the newest Wayback Machine archive available.
Return the newest Wayback Machine archive available for this URL.
We return the output of self.near() as it deafults to current utc time. We return the return value of self.near() as it deafults to current UTC time.
Due to Wayback Machine database lag, this may not always be the Due to Wayback Machine database lag, this may not always be the
most recent archive. most recent archive.
return type : waybackpy.wrapper.Url
""" """
return self.near() return self.near()
@@ -332,9 +408,6 @@ class Url:
Parameters Parameters
---------- ----------
self : waybackpy.wrapper.Url
The instance itself
start_timestamp : str start_timestamp : str
1 to 14 digit string of numbers, you are not required to 1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp. pass a full 14 digit timestamp.
@@ -344,13 +417,15 @@ class Url:
pass a full 14 digit timestamp. pass a full 14 digit timestamp.
return type : int
A webpage can have multiple archives on the wayback machine A webpage can have multiple archives on the wayback machine
If someone wants to count the total number of archives of a If someone wants to count the total number of archives of a
webpage on wayback machine they can use this method. webpage on wayback machine they can use this method.
Returns the total number of Wayback Machine archives for the URL. Returns the total number of Wayback Machine archives for the URL.
Return type in integer.
""" """
cdx = Cdx( cdx = Cdx(
@@ -359,6 +434,8 @@ class Url:
start_timestamp=start_timestamp, start_timestamp=start_timestamp,
end_timestamp=end_timestamp, end_timestamp=end_timestamp,
) )
# cdx.snapshots() is a generator, not a list.
i = 0 i = 0
for _ in cdx.snapshots(): for _ in cdx.snapshots():
i = i + 1 i = i + 1
@@ -377,9 +454,6 @@ class Url:
Parameters Parameters
---------- ----------
self : waybackpy.wrapper.Url
The instance itself
subdomain : bool subdomain : bool
If True fetch subdomain URLs along with the host URLs. If True fetch subdomain URLs along with the host URLs.
@@ -397,6 +471,8 @@ class Url:
match_type : str match_type : str
One of (exact, prefix, host and domain) One of (exact, prefix, host and domain)
return type : waybackpy.snapshot.CdxSnapshot
Yields list of URLs known to exist for given input. Yields list of URLs known to exist for given input.
Defaults to input URL as prefix. Defaults to input URL as prefix.