Add doc strings (#90)
* Added some docstrings in utils.py * renamed some func/meth to better names and added doc strings + lint * added more docstrings * more docstrings * improve docstrings * docstrings * added more docstrings, lint * fix import error
This commit is contained in:
parent
88cda94c0b
commit
db8f902cff
@ -79,7 +79,7 @@ def test_all_cdx():
|
|||||||
c = 0
|
c = 0
|
||||||
for snapshot in snapshots:
|
for snapshot in snapshots:
|
||||||
c += 1
|
c += 1
|
||||||
if c > 30_529: # deafult limit is 10k
|
if c > 30529: # deafult limit is 10k
|
||||||
break
|
break
|
||||||
|
|
||||||
url = "https://github.com/*"
|
url = "https://github.com/*"
|
||||||
@ -89,5 +89,5 @@ def test_all_cdx():
|
|||||||
|
|
||||||
for snapshot in snapshots:
|
for snapshot in snapshots:
|
||||||
c += 1
|
c += 1
|
||||||
if c > 100_529:
|
if c > 100529:
|
||||||
break
|
break
|
||||||
|
@ -5,8 +5,7 @@ import random
|
|||||||
import string
|
import string
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
sys.path.append("..")
|
import waybackpy.cli as cli
|
||||||
import waybackpy.cli as cli # noqa: E402
|
|
||||||
from waybackpy.wrapper import Url # noqa: E402
|
from waybackpy.wrapper import Url # noqa: E402
|
||||||
from waybackpy.__version__ import __version__
|
from waybackpy.__version__ import __version__
|
||||||
|
|
||||||
|
@ -14,14 +14,14 @@ from waybackpy.utils import (
|
|||||||
_check_match_type,
|
_check_match_type,
|
||||||
_check_collapses,
|
_check_collapses,
|
||||||
_check_filters,
|
_check_filters,
|
||||||
_ts,
|
_timestamp_manager,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_ts():
|
def test_timestamp_manager():
|
||||||
timestamp = True
|
timestamp = True
|
||||||
data = {}
|
data = {}
|
||||||
assert _ts(timestamp, data)
|
assert _timestamp_manager(timestamp, data)
|
||||||
|
|
||||||
data = """
|
data = """
|
||||||
{"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"}
|
{"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"}
|
||||||
@ -61,10 +61,10 @@ def test_check_collapses():
|
|||||||
|
|
||||||
|
|
||||||
def test_check_match_type():
|
def test_check_match_type():
|
||||||
assert None == _check_match_type(None, "url")
|
assert _check_match_type(None, "url") is None
|
||||||
match_type = "exact"
|
match_type = "exact"
|
||||||
url = "test_url"
|
url = "test_url"
|
||||||
assert None == _check_match_type(match_type, url)
|
assert _check_match_type(match_type, url) is None
|
||||||
|
|
||||||
url = "has * in it"
|
url = "has * in it"
|
||||||
with pytest.raises(WaybackError):
|
with pytest.raises(WaybackError):
|
||||||
@ -82,7 +82,7 @@ def test_cleaned_url():
|
|||||||
|
|
||||||
def test_url_check():
|
def test_url_check():
|
||||||
good_url = "https://akamhy.github.io"
|
good_url = "https://akamhy.github.io"
|
||||||
assert None == _url_check(good_url)
|
assert _url_check(good_url) is None
|
||||||
|
|
||||||
bad_url = "https://github-com"
|
bad_url = "https://github-com"
|
||||||
with pytest.raises(URLError):
|
with pytest.raises(URLError):
|
||||||
|
@ -1,8 +1,4 @@
|
|||||||
import sys
|
|
||||||
import pytest
|
import pytest
|
||||||
import random
|
|
||||||
import requests
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from waybackpy.wrapper import Url
|
from waybackpy.wrapper import Url
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@ from .utils import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
# TODO : Threading support for pagination API. It's designed for Threading.
|
# TODO : Threading support for pagination API. It's designed for Threading.
|
||||||
|
# TODO : Add get method here if type is Valid HTML, SVG other but not - or warc. Test it.
|
||||||
|
|
||||||
|
|
||||||
class Cdx:
|
class Cdx:
|
||||||
@ -42,7 +43,22 @@ class Cdx:
|
|||||||
self.use_page = False
|
self.use_page = False
|
||||||
|
|
||||||
def cdx_api_manager(self, payload, headers, use_page=False):
|
def cdx_api_manager(self, payload, headers, use_page=False):
|
||||||
"""
|
"""Act as button, we can choose between the normal API and pagination API.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
self : waybackpy.cdx.Cdx
|
||||||
|
The instance itself
|
||||||
|
|
||||||
|
payload : dict
|
||||||
|
Get request parameters name value pairs
|
||||||
|
|
||||||
|
headers : dict
|
||||||
|
The headers for making the GET request.
|
||||||
|
|
||||||
|
use_page : bool
|
||||||
|
If True use pagination API else use normal resume key based API.
|
||||||
|
|
||||||
We have two options to get the snapshots, we use this
|
We have two options to get the snapshots, we use this
|
||||||
method to make a selection between pagination API and
|
method to make a selection between pagination API and
|
||||||
the normal one with Resumption Key, sequential querying
|
the normal one with Resumption Key, sequential querying
|
||||||
@ -141,7 +157,7 @@ class Cdx:
|
|||||||
def snapshots(self):
|
def snapshots(self):
|
||||||
"""
|
"""
|
||||||
This function yields snapshots encapsulated
|
This function yields snapshots encapsulated
|
||||||
in CdxSnapshot for more usability.
|
in CdxSnapshot for increased usability.
|
||||||
|
|
||||||
All the get request values are set if the conditions match
|
All the get request values are set if the conditions match
|
||||||
|
|
||||||
@ -188,10 +204,9 @@ class Cdx:
|
|||||||
|
|
||||||
prop_values = snapshot.split(" ")
|
prop_values = snapshot.split(" ")
|
||||||
|
|
||||||
# Making sure that we get the same number of
|
|
||||||
# property values as the number of properties
|
|
||||||
prop_values_len = len(prop_values)
|
prop_values_len = len(prop_values)
|
||||||
properties_len = len(properties)
|
properties_len = len(properties)
|
||||||
|
|
||||||
if prop_values_len != properties_len:
|
if prop_values_len != properties_len:
|
||||||
raise WaybackError(
|
raise WaybackError(
|
||||||
"Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(
|
"Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(
|
||||||
|
@ -5,6 +5,7 @@ import json
|
|||||||
import random
|
import random
|
||||||
import string
|
import string
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
from .wrapper import Url
|
from .wrapper import Url
|
||||||
from .exceptions import WaybackError
|
from .exceptions import WaybackError
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
|
@ -3,15 +3,24 @@ from datetime import datetime
|
|||||||
|
|
||||||
class CdxSnapshot:
|
class CdxSnapshot:
|
||||||
"""
|
"""
|
||||||
This class helps to use the Cdx Snapshots easily.
|
This class encapsulates the snapshots for greater usability.
|
||||||
|
|
||||||
Raw Snapshot data looks like:
|
Raw Snapshot data looks like:
|
||||||
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
|
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
|
||||||
|
|
||||||
properties is a dict containg all of the 7 cdx snapshot properties.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, properties):
|
def __init__(self, properties):
|
||||||
|
"""
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
self : waybackpy.snapshot.CdxSnapshot
|
||||||
|
The instance itself
|
||||||
|
|
||||||
|
properties : dict
|
||||||
|
Properties is a dict containing all of the 7 cdx snapshot properties.
|
||||||
|
|
||||||
|
"""
|
||||||
self.urlkey = properties["urlkey"]
|
self.urlkey = properties["urlkey"]
|
||||||
self.timestamp = properties["timestamp"]
|
self.timestamp = properties["timestamp"]
|
||||||
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
|
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
|
||||||
@ -25,6 +34,12 @@ class CdxSnapshot:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
"""Returns the Cdx snapshot line.
|
||||||
|
|
||||||
|
Output format:
|
||||||
|
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
|
||||||
|
|
||||||
|
"""
|
||||||
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
|
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
|
||||||
urlkey=self.urlkey,
|
urlkey=self.urlkey,
|
||||||
timestamp=self.timestamp,
|
timestamp=self.timestamp,
|
||||||
|
@ -1,28 +1,72 @@
|
|||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import requests
|
import requests
|
||||||
from .exceptions import WaybackError, URLError
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
from .exceptions import WaybackError, URLError
|
||||||
|
from .__version__ import __version__
|
||||||
|
|
||||||
from urllib3.util.retry import Retry
|
from urllib3.util.retry import Retry
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
from .__version__ import __version__
|
|
||||||
|
|
||||||
quote = requests.utils.quote
|
quote = requests.utils.quote
|
||||||
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
||||||
|
|
||||||
|
|
||||||
def _latest_version(package_name, headers):
|
def _latest_version(package_name, headers):
|
||||||
endpoint = "https://pypi.org/pypi/" + package_name + "/json"
|
"""Returns the latest version of package_name.
|
||||||
json = _get_response(endpoint, headers=headers).json()
|
|
||||||
return json["info"]["version"]
|
Parameters
|
||||||
|
----------
|
||||||
|
package_name : str
|
||||||
|
The name of the python package
|
||||||
|
|
||||||
|
headers : dict
|
||||||
|
Headers that will be used while making get requests
|
||||||
|
|
||||||
|
Return type is str
|
||||||
|
|
||||||
|
Use API <https://pypi.org/pypi/> to get the latest version of
|
||||||
|
waybackpy, but can be used to get latest version of any package
|
||||||
|
on PyPi.
|
||||||
|
"""
|
||||||
|
|
||||||
|
request_url = "https://pypi.org/pypi/" + package_name + "/json"
|
||||||
|
response = _get_response(request_url, headers=headers)
|
||||||
|
data = response.json()
|
||||||
|
return data["info"]["version"]
|
||||||
|
|
||||||
|
|
||||||
def _unix_ts_to_wayback_ts(unix_ts):
|
def _unix_timestamp_to_wayback_timestamp(unix_timestamp):
|
||||||
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
|
"""Returns unix timestamp converted to datetime.datetime
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
unix_timestamp : str, int or float
|
||||||
|
Unix-timestamp that needs to be converted to a Wayback Machine timestamp
|
||||||
|
|
||||||
|
Converts the input unix_timestamp to a 14-digit Wayback Machine timestamp string (YYYYMMDDhhmmss) and returns it.
|
||||||
|
Does not matter if unix_timestamp is str, float or int.
|
||||||
|
"""
|
||||||
|
|
||||||
|
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
|
||||||
|
|
||||||
|
|
||||||
def _add_payload(instance, payload):
|
def _add_payload(instance, payload):
|
||||||
|
"""Adds payload from instance that can be used to make get requests.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
instance : waybackpy.cdx.Cdx
|
||||||
|
instance of the Cdx class
|
||||||
|
|
||||||
|
payload : dict
|
||||||
|
A dict onto which we need to add keys and values based on instance.
|
||||||
|
|
||||||
|
instance is object of Cdx class and it contains the data required to fill
|
||||||
|
the payload dictionary.
|
||||||
|
"""
|
||||||
|
|
||||||
if instance.start_timestamp:
|
if instance.start_timestamp:
|
||||||
payload["from"] = instance.start_timestamp
|
payload["from"] = instance.start_timestamp
|
||||||
|
|
||||||
@ -43,18 +87,27 @@ def _add_payload(instance, payload):
|
|||||||
for i, f in enumerate(instance.collapses):
|
for i, f in enumerate(instance.collapses):
|
||||||
payload["collapse" + str(i)] = f
|
payload["collapse" + str(i)] = f
|
||||||
|
|
||||||
|
# Don't need to return anything as it's dictionary.
|
||||||
payload["url"] = instance.url
|
payload["url"] = instance.url
|
||||||
|
|
||||||
|
|
||||||
def _ts(timestamp, data):
|
def _timestamp_manager(timestamp, data):
|
||||||
"""
|
"""Returns the timestamp.
|
||||||
Get timestamp of last fetched archive.
|
|
||||||
If used before fetching any archive, will
|
|
||||||
use whatever self.JSON returns.
|
|
||||||
|
|
||||||
self.timestamp is None implies that
|
Parameters
|
||||||
self.JSON will return any archive's JSON
|
----------
|
||||||
that wayback machine provides it.
|
timestamp : datetime.datetime
|
||||||
|
datetime object
|
||||||
|
|
||||||
|
data : dict
|
||||||
|
A python dictionary, which is loaded JSON of the availability API.
|
||||||
|
|
||||||
|
Return type:
|
||||||
|
datetime.datetime
|
||||||
|
|
||||||
|
If timestamp is not None then sets the value to timestamp itself.
|
||||||
|
If timestamp is None then returns the value from the last fetched API data.
|
||||||
|
If timestamp is not set and the archived_snapshots can not be read from data, return datetime.max
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if timestamp:
|
if timestamp:
|
||||||
@ -69,6 +122,21 @@ def _ts(timestamp, data):
|
|||||||
|
|
||||||
|
|
||||||
def _check_match_type(match_type, url):
|
def _check_match_type(match_type, url):
|
||||||
|
"""Checks the validity of match_type parameter of the CDX GET requests.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
match_type : list
|
||||||
|
list that may contain any or all from ["exact", "prefix", "host", "domain"]
|
||||||
|
See https://github.com/akamhy/waybackpy/wiki/Python-package-docs#url-match-scope
|
||||||
|
|
||||||
|
url : str
|
||||||
|
The URL used to create the waybackpy Url object.
|
||||||
|
|
||||||
|
If not valid match_type raise Exception.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
if not match_type:
|
if not match_type:
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -85,6 +153,19 @@ def _check_match_type(match_type, url):
|
|||||||
|
|
||||||
|
|
||||||
def _check_collapses(collapses):
|
def _check_collapses(collapses):
|
||||||
|
"""Checks the validity of collapse parameter of the CDX GET request.
|
||||||
|
|
||||||
|
One or more field or field:N to 'collapses=[]' where
|
||||||
|
field is one of (urlkey, timestamp, original, mimetype, statuscode,
|
||||||
|
digest and length) and N is the first N characters of field to test.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
collapses : list
|
||||||
|
|
||||||
|
If not valid collapses raise Exception.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
if not isinstance(collapses, list):
|
if not isinstance(collapses, list):
|
||||||
raise WaybackError("collapses must be a list.")
|
raise WaybackError("collapses must be a list.")
|
||||||
@ -119,12 +200,26 @@ def _check_collapses(collapses):
|
|||||||
|
|
||||||
|
|
||||||
def _check_filters(filters):
|
def _check_filters(filters):
|
||||||
|
"""Checks the validity of filter parameter of the CDX GET request.
|
||||||
|
|
||||||
|
Any number of filter params of the following form may be specified:
|
||||||
|
filters=["[!]field:regex"] may be specified..
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
filters : list
|
||||||
|
|
||||||
|
If not valid filters raise Exception.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
if not isinstance(filters, list):
|
if not isinstance(filters, list):
|
||||||
raise WaybackError("filters must be a list.")
|
raise WaybackError("filters must be a list.")
|
||||||
|
|
||||||
# [!]field:regex
|
# [!]field:regex
|
||||||
for _filter in filters:
|
for _filter in filters:
|
||||||
try:
|
try:
|
||||||
|
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
|
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
|
||||||
_filter,
|
_filter,
|
||||||
@ -134,8 +229,9 @@ def _check_filters(filters):
|
|||||||
val = match.group(2)
|
val = match.group(2)
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
||||||
exc_message = (
|
exc_message = (
|
||||||
"Filter '{_filter}' not following the cdx filter syntax.".format(
|
"Filter '{_filter}' is not following the cdx filter syntax.".format(
|
||||||
_filter=_filter
|
_filter=_filter
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -143,6 +239,9 @@ def _check_filters(filters):
|
|||||||
|
|
||||||
|
|
||||||
def _cleaned_url(url):
|
def _cleaned_url(url):
|
||||||
|
"""Sanatize the url
|
||||||
|
Remove and replace illegal whitespace characters from the URL.
|
||||||
|
"""
|
||||||
return str(url).strip().replace(" ", "%20")
|
return str(url).strip().replace(" ", "%20")
|
||||||
|
|
||||||
|
|
||||||
@ -161,16 +260,29 @@ def _url_check(url):
|
|||||||
|
|
||||||
|
|
||||||
def _full_url(endpoint, params):
|
def _full_url(endpoint, params):
|
||||||
full_url = endpoint
|
"""API endpoint + GET parameters = full_url
|
||||||
if params:
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
endpoint : str
|
||||||
|
The API endpoint
|
||||||
|
|
||||||
|
params : dict
|
||||||
|
Dictionary that has name-value pairs.
|
||||||
|
|
||||||
|
Return type is str
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not params:
|
||||||
|
return endpoint
|
||||||
|
|
||||||
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
|
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
|
||||||
for key, val in params.items():
|
for key, val in params.items():
|
||||||
key = "filter" if key.startswith("filter") else key
|
key = "filter" if key.startswith("filter") else key
|
||||||
key = "collapse" if key.startswith("collapse") else key
|
key = "collapse" if key.startswith("collapse") else key
|
||||||
amp = "" if full_url.endswith("?") else "&"
|
amp = "" if full_url.endswith("?") else "&"
|
||||||
full_url = (
|
full_url = full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
|
||||||
full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
|
|
||||||
)
|
|
||||||
return full_url
|
return full_url
|
||||||
|
|
||||||
|
|
||||||
@ -191,17 +303,31 @@ def _get_total_pages(url, user_agent):
|
|||||||
|
|
||||||
|
|
||||||
def _archive_url_parser(header, url, latest_version=__version__, instance=None):
|
def _archive_url_parser(header, url, latest_version=__version__, instance=None):
|
||||||
"""
|
"""Returns the archive after parsing it from the response header.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
header : str
|
||||||
|
The response header of WayBack Machine's Save API
|
||||||
|
|
||||||
|
url : str
|
||||||
|
The input url, the one used to create the Url object.
|
||||||
|
|
||||||
|
latest_version : str
|
||||||
|
The latest version of waybackpy (default is __version__)
|
||||||
|
|
||||||
|
instance : waybackpy.wrapper.Url
|
||||||
|
Instance of Url class
|
||||||
|
|
||||||
|
|
||||||
The wayback machine's save API doesn't
|
The wayback machine's save API doesn't
|
||||||
return JSON response, we are required
|
return JSON response, we are required
|
||||||
to read the header of the API response
|
to read the header of the API response
|
||||||
and look for the archive URL.
|
and find the archive URL.
|
||||||
|
|
||||||
This method has some regexen (or regexes)
|
This method has some regular expressions
|
||||||
that search for archive url in header.
|
that are used to search for the archive url
|
||||||
|
in the response header of Save API.
|
||||||
This method is used when you try to
|
|
||||||
save a webpage on wayback machine.
|
|
||||||
|
|
||||||
Two cases are possible:
|
Two cases are possible:
|
||||||
1) Either we find the archive url in
|
1) Either we find the archive url in
|
||||||
@ -213,7 +339,6 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
|
|||||||
If we found the archive URL we return it.
|
If we found the archive URL we return it.
|
||||||
|
|
||||||
Return format:
|
Return format:
|
||||||
|
|
||||||
web.archive.org/web/<TIMESTAMP>/<URL>
|
web.archive.org/web/<TIMESTAMP>/<URL>
|
||||||
|
|
||||||
And if we couldn't find it, we raise
|
And if we couldn't find it, we raise
|
||||||
@ -304,9 +429,7 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
|
|||||||
|
|
||||||
|
|
||||||
def _wayback_timestamp(**kwargs):
|
def _wayback_timestamp(**kwargs):
|
||||||
"""
|
"""Returns a valid waybackpy timestamp.
|
||||||
Wayback Machine archive URLs
|
|
||||||
have a timestamp in them.
|
|
||||||
|
|
||||||
The standard archive URL format is
|
The standard archive URL format is
|
||||||
https://web.archive.org/web/20191214041711/https://www.youtube.com
|
https://web.archive.org/web/20191214041711/https://www.youtube.com
|
||||||
@ -316,13 +439,17 @@ def _wayback_timestamp(**kwargs):
|
|||||||
2 ) timestamp (20191214041711)
|
2 ) timestamp (20191214041711)
|
||||||
3 ) https://www.youtube.com, the original URL
|
3 ) https://www.youtube.com, the original URL
|
||||||
|
|
||||||
The near method takes year, month, day, hour and minute
|
|
||||||
as Arguments, their type is int.
|
The near method of Url class in wrapper.py takes year, month, day, hour
|
||||||
|
and minute as arguments, their type is int.
|
||||||
|
|
||||||
This method takes those integers and converts it to
|
This method takes those integers and converts it to
|
||||||
wayback machine timestamp and returns it.
|
wayback machine timestamp and returns it.
|
||||||
|
|
||||||
Return format is string.
|
|
||||||
|
zfill(2) adds 1 zero in front of single digit days, months hour etc.
|
||||||
|
|
||||||
|
Return type is string.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return "".join(
|
return "".join(
|
||||||
@ -339,16 +466,37 @@ def _get_response(
|
|||||||
backoff_factor=0.5,
|
backoff_factor=0.5,
|
||||||
no_raise_on_redirects=False,
|
no_raise_on_redirects=False,
|
||||||
):
|
):
|
||||||
"""
|
"""Makes get requests.
|
||||||
This function is used make get request.
|
|
||||||
We use the requests package to make the
|
Parameters
|
||||||
requests.
|
----------
|
||||||
|
endpoint : str
|
||||||
|
The API endpoint.
|
||||||
|
|
||||||
|
params : dict
|
||||||
|
The get request parameters. (default is None)
|
||||||
|
|
||||||
|
headers : dict
|
||||||
|
Headers for the get request. (default is None)
|
||||||
|
|
||||||
|
return_full_url : bool
|
||||||
|
Determines whether the full url is returned along with the
|
||||||
|
response. (default is False)
|
||||||
|
|
||||||
|
retries : int
|
||||||
|
Maximum number of retries for the get request. (default is 5)
|
||||||
|
|
||||||
|
backoff_factor : float
|
||||||
|
The factor by which we determine the next retry time after wait.
|
||||||
|
https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
|
||||||
|
(default is 0.5)
|
||||||
|
|
||||||
|
no_raise_on_redirects : bool
|
||||||
|
If the request is redirected the maximum 30 times (default for requests),
|
||||||
|
return instead of raising an exception. (default is False)
|
||||||
|
|
||||||
|
|
||||||
We try five times and if it fails it raises
|
To handle WaybackError:
|
||||||
WaybackError exception.
|
|
||||||
|
|
||||||
You can handles WaybackError by importing:
|
|
||||||
from waybackpy.exceptions import WaybackError
|
from waybackpy.exceptions import WaybackError
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -370,20 +518,28 @@ def _get_response(
|
|||||||
|
|
||||||
s.mount("https://", HTTPAdapter(max_retries=retries))
|
s.mount("https://", HTTPAdapter(max_retries=retries))
|
||||||
|
|
||||||
|
# The URL with parameters required for the get request
|
||||||
url = _full_url(endpoint, params)
|
url = _full_url(endpoint, params)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
if not return_full_url:
|
if not return_full_url:
|
||||||
return s.get(url, headers=headers)
|
return s.get(url, headers=headers)
|
||||||
|
|
||||||
return (url, s.get(url, headers=headers))
|
return (url, s.get(url, headers=headers))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
||||||
reason = str(e)
|
reason = str(e)
|
||||||
|
|
||||||
if no_raise_on_redirects:
|
if no_raise_on_redirects:
|
||||||
if "Exceeded 30 redirects" in reason:
|
if "Exceeded 30 redirects" in reason:
|
||||||
return
|
return
|
||||||
|
|
||||||
exc_message = "Error while retrieving {url}.\n{reason}".format(
|
exc_message = "Error while retrieving {url}.\n{reason}".format(
|
||||||
url=url, reason=reason
|
url=url, reason=reason
|
||||||
)
|
)
|
||||||
|
|
||||||
exc = WaybackError(exc_message)
|
exc = WaybackError(exc_message)
|
||||||
exc.__cause__ = e
|
exc.__cause__ = e
|
||||||
raise exc
|
raise exc
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
from .exceptions import WaybackError
|
from .exceptions import WaybackError
|
||||||
from .cdx import Cdx
|
from .cdx import Cdx
|
||||||
from .utils import (
|
from .utils import (
|
||||||
@ -9,13 +10,85 @@ from .utils import (
|
|||||||
default_user_agent,
|
default_user_agent,
|
||||||
_url_check,
|
_url_check,
|
||||||
_cleaned_url,
|
_cleaned_url,
|
||||||
_ts,
|
_timestamp_manager,
|
||||||
_unix_ts_to_wayback_ts,
|
_unix_timestamp_to_wayback_timestamp,
|
||||||
_latest_version,
|
_latest_version,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class Url:
|
class Url:
|
||||||
|
"""
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
url : str
|
||||||
|
The input URL, wayback machine API operations are performed
|
||||||
|
on this URL after sanitizing it.
|
||||||
|
|
||||||
|
user_agent : str
|
||||||
|
The user_agent used while making the GET requests to the
|
||||||
|
Wayback machine APIs
|
||||||
|
|
||||||
|
_archive_url : str
|
||||||
|
Caches the last fetched archive.
|
||||||
|
|
||||||
|
timestamp : datetime.datetime
|
||||||
|
timestamp of the archive URL as datetime object for
|
||||||
|
greater usability
|
||||||
|
|
||||||
|
_JSON : dict
|
||||||
|
Caches the last fetched availability API data
|
||||||
|
|
||||||
|
latest_version : str
|
||||||
|
The latest version of waybackpy on PyPi
|
||||||
|
|
||||||
|
cached_save : bool
|
||||||
|
Flag to check if WayBack machine returned a cached
|
||||||
|
archive instead of creating a new archive. WayBack
|
||||||
|
machine allows only one archive for an URL in
|
||||||
|
30 minutes. If the archive returned by WayBack machine
|
||||||
|
is older than 3 minutes then this flag is set to True
|
||||||
|
|
||||||
|
Methods turned properties
|
||||||
|
----------
|
||||||
|
JSON : dict
|
||||||
|
JSON response of availability API as dictionary / loaded JSON
|
||||||
|
|
||||||
|
archive_url : str
|
||||||
|
Return the archive url, returns str
|
||||||
|
|
||||||
|
_timestamp : datetime.datetime
|
||||||
|
Sets the value of self.timestamp if still not set
|
||||||
|
|
||||||
|
Methods
|
||||||
|
-------
|
||||||
|
save()
|
||||||
|
Archives the URL on WayBack machine
|
||||||
|
|
||||||
|
get(url="", user_agent="", encoding="")
|
||||||
|
Gets the source of archive url, can also be used to get source
|
||||||
|
of any URL if passed into it.
|
||||||
|
|
||||||
|
near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
|
||||||
|
Wayback Machine can have many archives for a URL/webpage, sometimes we want
|
||||||
|
archive close to a specific time.
|
||||||
|
This method takes year, month, day, hour, minute and unix_timestamp as input.
|
||||||
|
|
||||||
|
oldest(year=1994)
|
||||||
|
The oldest archive of an URL.
|
||||||
|
|
||||||
|
newest()
|
||||||
|
The newest archive of an URL
|
||||||
|
|
||||||
|
total_archives(start_timestamp=None, end_timestamp=None)
|
||||||
|
total number of archives of an URL, the timeframe can be confined by
|
||||||
|
start_timestamp and end_timestamp
|
||||||
|
|
||||||
|
known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
|
||||||
|
Known URLs for an URL, subdomain, URL as prefix etc.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, url, user_agent=default_user_agent):
|
def __init__(self, url, user_agent=default_user_agent):
|
||||||
self.url = url
|
self.url = url
|
||||||
self.user_agent = str(user_agent)
|
self.user_agent = str(user_agent)
|
||||||
@ -32,29 +105,17 @@ class Url:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
"""
|
|
||||||
Output when print() is used on <class 'waybackpy.wrapper.Url'>
|
|
||||||
This should print an archive URL.
|
|
||||||
|
|
||||||
We check if self._archive_url is not None.
|
|
||||||
If not None, good. We return string of self._archive_url.
|
|
||||||
|
|
||||||
If self._archive_url is None, it means we ain't used any method that
|
|
||||||
sets self._archive_url, we now set self._archive_url to self.archive_url
|
|
||||||
and return it.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if not self._archive_url:
|
if not self._archive_url:
|
||||||
self._archive_url = self.archive_url
|
self._archive_url = self.archive_url
|
||||||
|
|
||||||
return "{archive_url}".format(archive_url=self._archive_url)
|
return "{archive_url}".format(archive_url=self._archive_url)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""
|
"""Number of days between today and the date of archive based on the timestamp
|
||||||
Why do we have len here?
|
|
||||||
|
|
||||||
Applying len() on <class 'waybackpy.wrapper.Url'>
|
len() of waybackpy.wrapper.Url should return
|
||||||
will calculate the number of days between today and
|
the number of days between today and the
|
||||||
the archive timestamp.
|
archive timestamp.
|
||||||
|
|
||||||
Can be applied on return values of near and its
|
Can be applied on return values of near and its
|
||||||
childs (e.g. oldest) and if applied on waybackpy.Url()
|
childs (e.g. oldest) and if applied on waybackpy.Url()
|
||||||
@ -76,32 +137,30 @@ class Url:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def JSON(self):
|
def JSON(self):
|
||||||
"""
|
"""Returns JSON response of availability API as dictionary / loaded JSON
|
||||||
If the end user has used near() or its childs like oldest, newest
|
|
||||||
and archive_url then the JSON response of these are cached in self._JSON
|
|
||||||
|
|
||||||
If we find that self._JSON is not None we return it.
|
return type : dict
|
||||||
else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
|
|
||||||
and return it.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# If user used the near method or any method that depends on near, we
|
||||||
|
# are certain that we have a loaded dictionary cached in self._JSON.
|
||||||
|
# Return the loaded JSON data.
|
||||||
if self._JSON:
|
if self._JSON:
|
||||||
return self._JSON
|
return self._JSON
|
||||||
|
|
||||||
|
# If no cached data found, get data and return + cache it.
|
||||||
endpoint = "https://archive.org/wayback/available"
|
endpoint = "https://archive.org/wayback/available"
|
||||||
headers = {"User-Agent": self.user_agent}
|
headers = {"User-Agent": self.user_agent}
|
||||||
payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
|
payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
|
||||||
response = _get_response(endpoint, params=payload, headers=headers)
|
response = _get_response(endpoint, params=payload, headers=headers)
|
||||||
return response.json()
|
self._JSON = response.json()
|
||||||
|
return self._JSON
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def archive_url(self):
|
def archive_url(self):
|
||||||
"""
|
"""Return the archive url.
|
||||||
Returns any random archive for the instance.
|
|
||||||
But if near, oldest, newest were used before
|
|
||||||
then it returns the same archive again.
|
|
||||||
|
|
||||||
We cache archive in self._archive_url
|
return type : str
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self._archive_url:
|
if self._archive_url:
|
||||||
@ -121,11 +180,16 @@ class Url:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def _timestamp(self):
|
def _timestamp(self):
|
||||||
self.timestamp = _ts(self.timestamp, self.JSON)
|
"""Returns the result of _timestamp_manager(self.timestamp, self.JSON); does not itself assign self.timestamp.
|
||||||
return self.timestamp
|
|
||||||
|
Return type : datetime.datetime
|
||||||
|
|
||||||
|
"""
|
||||||
|
return _timestamp_manager(self.timestamp, self.JSON)
|
||||||
|
|
||||||
def save(self):
|
def save(self):
|
||||||
"""
|
"""Saves/Archive the URL.
|
||||||
|
|
||||||
To save a webpage on WayBack machine we
|
To save a webpage on WayBack machine we
|
||||||
need to send get request to https://web.archive.org/save/
|
need to send get request to https://web.archive.org/save/
|
||||||
|
|
||||||
@ -136,6 +200,8 @@ class Url:
|
|||||||
|
|
||||||
_archive_url_parser() parses the archive from the header.
|
_archive_url_parser() parses the archive from the header.
|
||||||
|
|
||||||
|
return type : waybackpy.wrapper.Url
|
||||||
|
|
||||||
"""
|
"""
|
||||||
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
|
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
|
||||||
headers = {"User-Agent": self.user_agent}
|
headers = {"User-Agent": self.user_agent}
|
||||||
@ -161,7 +227,9 @@ class Url:
|
|||||||
instance=self,
|
instance=self,
|
||||||
)
|
)
|
||||||
|
|
||||||
m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url)
|
m = re.search(
|
||||||
|
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
|
||||||
|
)
|
||||||
str_ts = m.group(1)
|
str_ts = m.group(1)
|
||||||
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
|
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
|
||||||
now = datetime.utcnow()
|
now = datetime.utcnow()
|
||||||
@ -175,9 +243,22 @@ class Url:
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def get(self, url="", user_agent="", encoding=""):
|
def get(self, url="", user_agent="", encoding=""):
|
||||||
"""
|
"""GET the source of archive or any other URL.
|
||||||
Return the source code of the last archived URL,
|
|
||||||
if no URL is passed to this method.
|
url : str, waybackpy.wrapper.Url
|
||||||
|
The method will return the source code of
|
||||||
|
this URL instead of last fetched archive.
|
||||||
|
|
||||||
|
user_agent : str
|
||||||
|
The user_agent for GET request to API
|
||||||
|
|
||||||
|
encoding : str
|
||||||
|
If user is using any other encoding that
|
||||||
|
can't be detected by response.encoding
|
||||||
|
|
||||||
|
Return the source code of the last fetched
|
||||||
|
archive URL if no URL is passed to this method
|
||||||
|
else it returns the source code of url passed.
|
||||||
|
|
||||||
If encoding is not supplied, it is auto-detected
|
If encoding is not supplied, it is auto-detected
|
||||||
from the response itself by requests package.
|
from the response itself by requests package.
|
||||||
@ -213,6 +294,27 @@ class Url:
|
|||||||
unix_timestamp=None,
|
unix_timestamp=None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
|
||||||
|
year : int
|
||||||
|
Archive close to year
|
||||||
|
|
||||||
|
month : int
|
||||||
|
Archive close to month
|
||||||
|
|
||||||
|
day : int
|
||||||
|
Archive close to day
|
||||||
|
|
||||||
|
hour : int
|
||||||
|
Archive close to hour
|
||||||
|
|
||||||
|
minute : int
|
||||||
|
Archive close to minute
|
||||||
|
|
||||||
|
unix_timestamp : str, float or int
|
||||||
|
Archive close to this unix_timestamp
|
||||||
|
|
||||||
Wayback Machine can have many archives of a webpage,
|
Wayback Machine can have many archives of a webpage,
|
||||||
sometimes we want archive close to a specific time.
|
sometimes we want archive close to a specific time.
|
||||||
|
|
||||||
@ -235,7 +337,7 @@ class Url:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
if unix_timestamp:
|
if unix_timestamp:
|
||||||
timestamp = _unix_ts_to_wayback_ts(unix_timestamp)
|
timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
||||||
else:
|
else:
|
||||||
now = datetime.utcnow().timetuple()
|
now = datetime.utcnow().timetuple()
|
||||||
timestamp = _wayback_timestamp(
|
timestamp = _wayback_timestamp(
|
||||||
@ -285,28 +387,45 @@ class Url:
|
|||||||
|
|
||||||
We simply pass the year in near() and return it.
|
We simply pass the year in near() and return it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return self.near(year=year)
|
return self.near(year=year)
|
||||||
|
|
||||||
def newest(self):
|
def newest(self):
|
||||||
"""
|
"""Return the newest Wayback Machine archive available.
|
||||||
Return the newest Wayback Machine archive available for this URL.
|
|
||||||
|
|
||||||
We return the output of self.near() as it defaults to current utc time.
|
We return the return value of self.near() as it defaults to current UTC time.
|
||||||
|
|
||||||
Due to Wayback Machine database lag, this may not always be the
|
Due to Wayback Machine database lag, this may not always be the
|
||||||
most recent archive.
|
most recent archive.
|
||||||
|
|
||||||
|
return type : waybackpy.wrapper.Url
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return self.near()
|
return self.near()
|
||||||
|
|
||||||
def total_archives(self, start_timestamp=None, end_timestamp=None):
|
def total_archives(self, start_timestamp=None, end_timestamp=None):
|
||||||
"""
|
"""Returns the total number of archives for a URL
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
start_timestamp : str
|
||||||
|
1 to 14 digit string of numbers, you are not required to
|
||||||
|
pass a full 14 digit timestamp.
|
||||||
|
|
||||||
|
end_timestamp : str
|
||||||
|
1 to 14 digit string of numbers, you are not required to
|
||||||
|
pass a full 14 digit timestamp.
|
||||||
|
|
||||||
|
|
||||||
|
return type : int
|
||||||
|
|
||||||
|
|
||||||
A webpage can have multiple archives on the wayback machine
|
A webpage can have multiple archives on the wayback machine
|
||||||
If someone wants to count the total number of archives of a
|
If someone wants to count the total number of archives of a
|
||||||
webpage on wayback machine they can use this method.
|
webpage on wayback machine they can use this method.
|
||||||
|
|
||||||
Returns the total number of Wayback Machine archives for the URL.
|
Returns the total number of Wayback Machine archives for the URL.
|
||||||
|
|
||||||
Return type is integer.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cdx = Cdx(
|
cdx = Cdx(
|
||||||
@ -315,6 +434,8 @@ class Url:
|
|||||||
start_timestamp=start_timestamp,
|
start_timestamp=start_timestamp,
|
||||||
end_timestamp=end_timestamp,
|
end_timestamp=end_timestamp,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# cdx.snapshots() is a generator, not a list.
|
||||||
i = 0
|
i = 0
|
||||||
for _ in cdx.snapshots():
|
for _ in cdx.snapshots():
|
||||||
i = i + 1
|
i = i + 1
|
||||||
@ -328,15 +449,36 @@ class Url:
|
|||||||
end_timestamp=None,
|
end_timestamp=None,
|
||||||
match_type="prefix",
|
match_type="prefix",
|
||||||
):
|
):
|
||||||
"""
|
"""Yields known URLs from the CDX API.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
|
||||||
|
subdomain : bool
|
||||||
|
If True fetch subdomain URLs along with the host URLs.
|
||||||
|
|
||||||
|
host : bool
|
||||||
|
Only fetch host URLs.
|
||||||
|
|
||||||
|
start_timestamp : str
|
||||||
|
1 to 14 digit string of numbers, you are not required to
|
||||||
|
pass a full 14 digit timestamp.
|
||||||
|
|
||||||
|
end_timestamp : str
|
||||||
|
1 to 14 digit string of numbers, you are not required to
|
||||||
|
pass a full 14 digit timestamp.
|
||||||
|
|
||||||
|
match_type : str
|
||||||
|
One of (exact, prefix, host and domain)
|
||||||
|
|
||||||
|
return type : generator yielding the `original` attribute of each waybackpy.snapshot.CdxSnapshot
|
||||||
|
|
||||||
Yields list of URLs known to exist for given input.
|
Yields list of URLs known to exist for given input.
|
||||||
Defaults to input URL as prefix.
|
Defaults to input URL as prefix.
|
||||||
|
|
||||||
This method is kept for compatibility, use the Cdx class instead.
|
Based on:
|
||||||
This method itself depends on Cdx.
|
|
||||||
|
|
||||||
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
|
|
||||||
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||||
|
By Mohammed Diaa (https://github.com/mhmdiaa)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if subdomain:
|
if subdomain:
|
||||||
@ -353,7 +495,5 @@ class Url:
|
|||||||
collapses=["urlkey"],
|
collapses=["urlkey"],
|
||||||
)
|
)
|
||||||
|
|
||||||
snapshots = cdx.snapshots()
|
for snapshot in cdx.snapshots():
|
||||||
|
|
||||||
for snapshot in snapshots:
|
|
||||||
yield (snapshot.original)
|
yield (snapshot.original)
|
||||||
|
Loading…
Reference in New Issue
Block a user