Add doc strings (#90)

* Added some docstrings in utils.py

* renamed some func/meth to better names and added doc strings + lint

* added more docstrings

* more docstrings

* improve docstrings

* docstrings

* added more docstrings, lint

* fix import error
This commit is contained in:
Akash Mahanty 2021-01-26 11:56:03 +05:30 committed by GitHub
parent 88cda94c0b
commit db8f902cff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 443 additions and 121 deletions

View File

@ -79,7 +79,7 @@ def test_all_cdx():
c = 0 c = 0
for snapshot in snapshots: for snapshot in snapshots:
c += 1 c += 1
if c > 30_529: # default limit is 10k if c > 30529: # default limit is 10k
break break
url = "https://github.com/*" url = "https://github.com/*"
@ -89,5 +89,5 @@ def test_all_cdx():
for snapshot in snapshots: for snapshot in snapshots:
c += 1 c += 1
if c > 100_529: if c > 100529:
break break

View File

@ -5,8 +5,7 @@ import random
import string import string
import argparse import argparse
sys.path.append("..") import waybackpy.cli as cli
import waybackpy.cli as cli # noqa: E402
from waybackpy.wrapper import Url # noqa: E402 from waybackpy.wrapper import Url # noqa: E402
from waybackpy.__version__ import __version__ from waybackpy.__version__ import __version__

View File

@ -14,14 +14,14 @@ from waybackpy.utils import (
_check_match_type, _check_match_type,
_check_collapses, _check_collapses,
_check_filters, _check_filters,
_ts, _timestamp_manager,
) )
def test_ts(): def test_timestamp_manager():
timestamp = True timestamp = True
data = {} data = {}
assert _ts(timestamp, data) assert _timestamp_manager(timestamp, data)
data = """ data = """
{"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"} {"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"}
@ -61,10 +61,10 @@ def test_check_collapses():
def test_check_match_type(): def test_check_match_type():
assert None == _check_match_type(None, "url") assert _check_match_type(None, "url") is None
match_type = "exact" match_type = "exact"
url = "test_url" url = "test_url"
assert None == _check_match_type(match_type, url) assert _check_match_type(match_type, url) is None
url = "has * in it" url = "has * in it"
with pytest.raises(WaybackError): with pytest.raises(WaybackError):
@ -82,7 +82,7 @@ def test_cleaned_url():
def test_url_check(): def test_url_check():
good_url = "https://akamhy.github.io" good_url = "https://akamhy.github.io"
assert None == _url_check(good_url) assert _url_check(good_url) is None
bad_url = "https://github-com" bad_url = "https://github-com"
with pytest.raises(URLError): with pytest.raises(URLError):

View File

@ -1,8 +1,4 @@
import sys
import pytest import pytest
import random
import requests
from datetime import datetime
from waybackpy.wrapper import Url from waybackpy.wrapper import Url

View File

@ -11,6 +11,7 @@ from .utils import (
) )
# TODO : Threading support for pagination API. It's designed for Threading. # TODO : Threading support for pagination API. It's designed for Threading.
# TODO : Add get method here if type is valid HTML, SVG or other, but not - or warc. Test it.
class Cdx: class Cdx:
@ -42,7 +43,22 @@ class Cdx:
self.use_page = False self.use_page = False
def cdx_api_manager(self, payload, headers, use_page=False): def cdx_api_manager(self, payload, headers, use_page=False):
""" """Act as button, we can choose between the normal API and pagination API.
Parameters
----------
self : waybackpy.cdx.Cdx
The instance itself
payload : dict
Get request parameters name value pairs
headers : dict
The headers for making the GET request.
use_page : bool
If True use pagination API else use normal resume key based API.
We have two options to get the snapshots, we use this We have two options to get the snapshots, we use this
method to make a selection between pagination API and method to make a selection between pagination API and
the normal one with Resumption Key, sequential querying the normal one with Resumption Key, sequential querying
@ -141,7 +157,7 @@ class Cdx:
def snapshots(self): def snapshots(self):
""" """
This function yields snapshots encapsulated This function yields snapshots encapsulated
in CdxSnapshot for more usability. in CdxSnapshot for increased usability.
All the get request values are set if the conditions match All the get request values are set if the conditions match
@ -188,10 +204,9 @@ class Cdx:
prop_values = snapshot.split(" ") prop_values = snapshot.split(" ")
# Making sure that we get the same number of
# property values as the number of properties
prop_values_len = len(prop_values) prop_values_len = len(prop_values)
properties_len = len(properties) properties_len = len(properties)
if prop_values_len != properties_len: if prop_values_len != properties_len:
raise WaybackError( raise WaybackError(
"Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format( "Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(

View File

@ -5,6 +5,7 @@ import json
import random import random
import string import string
import argparse import argparse
from .wrapper import Url from .wrapper import Url
from .exceptions import WaybackError from .exceptions import WaybackError
from .__version__ import __version__ from .__version__ import __version__

View File

@ -3,15 +3,24 @@ from datetime import datetime
class CdxSnapshot: class CdxSnapshot:
""" """
This class helps to use the Cdx Snapshots easily. This class encapsulates the snapshots for greater usability.
Raw Snapshot data looks like: Raw Snapshot data looks like:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415 org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
properties is a dict containg all of the 7 cdx snapshot properties.
""" """
def __init__(self, properties): def __init__(self, properties):
"""
Parameters
----------
self : waybackpy.snapshot.CdxSnapshot
The instance itself
properties : dict
Properties is a dict containing all of the 7 cdx snapshot properties.
"""
self.urlkey = properties["urlkey"] self.urlkey = properties["urlkey"]
self.timestamp = properties["timestamp"] self.timestamp = properties["timestamp"]
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S") self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
@ -25,6 +34,12 @@ class CdxSnapshot:
) )
def __str__(self): def __str__(self):
"""Returns the Cdx snapshot line.
Output format:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format( return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
urlkey=self.urlkey, urlkey=self.urlkey,
timestamp=self.timestamp, timestamp=self.timestamp,

View File

@ -1,28 +1,72 @@
import re import re
import time import time
import requests import requests
from .exceptions import WaybackError, URLError
from datetime import datetime from datetime import datetime
from .exceptions import WaybackError, URLError
from .__version__ import __version__
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from .__version__ import __version__
quote = requests.utils.quote quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy" default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _latest_version(package_name, headers): def _latest_version(package_name, headers):
endpoint = "https://pypi.org/pypi/" + package_name + "/json" """Returns the latest version of package_name.
json = _get_response(endpoint, headers=headers).json()
return json["info"]["version"] Parameters
----------
package_name : str
The name of the python package
headers : dict
Headers that will be used while making get requests
Return type is str
Use API <https://pypi.org/pypi/> to get the latest version of
waybackpy, but can be used to get latest version of any package
on PyPi.
"""
request_url = "https://pypi.org/pypi/" + package_name + "/json"
response = _get_response(request_url, headers=headers)
data = response.json()
return data["info"]["version"]
def _unix_ts_to_wayback_ts(unix_ts): def _unix_timestamp_to_wayback_timestamp(unix_timestamp):
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S") """Returns unix timestamp converted to a Wayback Machine timestamp string
Parameters
----------
unix_timestamp : str, int or float
Unix-timestamp that needs to be converted to a 14-digit Wayback Machine timestamp
Converts the input unix_timestamp (interpreted as UTC) to a "%Y%m%d%H%M%S" formatted string and returns it.
Does not matter if unix_timestamp is str, float or int.
"""
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def _add_payload(instance, payload): def _add_payload(instance, payload):
"""Adds payload from instance that can be used to make get requests.
Parameters
----------
instance : waybackpy.cdx.Cdx
instance of the Cdx class
payload : dict
A dict onto which we need to add keys and values based on instance.
instance is object of Cdx class and it contains the data required to fill
the payload dictionary.
"""
if instance.start_timestamp: if instance.start_timestamp:
payload["from"] = instance.start_timestamp payload["from"] = instance.start_timestamp
@ -43,18 +87,27 @@ def _add_payload(instance, payload):
for i, f in enumerate(instance.collapses): for i, f in enumerate(instance.collapses):
payload["collapse" + str(i)] = f payload["collapse" + str(i)] = f
# Don't need to return anything as it's dictionary.
payload["url"] = instance.url payload["url"] = instance.url
def _ts(timestamp, data): def _timestamp_manager(timestamp, data):
""" """Returns the timestamp.
Get timestamp of last fetched archive.
If used before fetching any archive, will
use whatever self.JSON returns.
self.timestamp is None implies that Parameters
self.JSON will return any archive's JSON ----------
that wayback machine provides it. timestamp : datetime.datetime
datetime object
data : dict
A python dictionary, which is loaded JSON of the availability API.
Return type:
datetime.datetime
If timestamp is not None then sets the value to timestamp itself.
If timestamp is None then returns the value from the last fetched API data.
If timestamp is not set and the archived_snapshots can not be read from data, returns datetime.max
""" """
if timestamp: if timestamp:
@ -69,6 +122,21 @@ def _ts(timestamp, data):
def _check_match_type(match_type, url): def _check_match_type(match_type, url):
"""Checks the validity of match_type parameter of the CDX GET requests.
Parameters
----------
match_type : list
list that may contain any or all from ["exact", "prefix", "host", "domain"]
See https://github.com/akamhy/waybackpy/wiki/Python-package-docs#url-match-scope
url : str
The URL used to create the waybackpy Url object.
If match_type is not valid, raises an exception.
"""
if not match_type: if not match_type:
return return
@ -85,6 +153,19 @@ def _check_match_type(match_type, url):
def _check_collapses(collapses): def _check_collapses(collapses):
"""Checks the validity of collapse parameter of the CDX GET request.
One or more field or field:N to 'collapses=[]' where
field is one of (urlkey, timestamp, original, mimetype, statuscode,
digest and length) and N is the first N characters of field to test.
Parameters
----------
collapses : list
If collapses is not valid, raises an exception.
"""
if not isinstance(collapses, list): if not isinstance(collapses, list):
raise WaybackError("collapses must be a list.") raise WaybackError("collapses must be a list.")
@ -119,12 +200,26 @@ def _check_collapses(collapses):
def _check_filters(filters): def _check_filters(filters):
"""Checks the validity of filter parameter of the CDX GET request.
Any number of filter params of the following form may be specified:
filters=["[!]field:regex"] may be specified..
Parameters
----------
filters : list
If filters is not valid, raises an exception.
"""
if not isinstance(filters, list): if not isinstance(filters, list):
raise WaybackError("filters must be a list.") raise WaybackError("filters must be a list.")
# [!]field:regex # [!]field:regex
for _filter in filters: for _filter in filters:
try: try:
match = re.search( match = re.search(
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)", r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
_filter, _filter,
@ -134,8 +229,9 @@ def _check_filters(filters):
val = match.group(2) val = match.group(2)
except Exception: except Exception:
exc_message = ( exc_message = (
"Filter '{_filter}' not following the cdx filter syntax.".format( "Filter '{_filter}' is not following the cdx filter syntax.".format(
_filter=_filter _filter=_filter
) )
) )
@ -143,6 +239,9 @@ def _check_filters(filters):
def _cleaned_url(url): def _cleaned_url(url):
"""Sanitize the url
Remove and replace illegal whitespace characters from the URL.
"""
return str(url).strip().replace(" ", "%20") return str(url).strip().replace(" ", "%20")
@ -161,16 +260,29 @@ def _url_check(url):
def _full_url(endpoint, params): def _full_url(endpoint, params):
full_url = endpoint """API endpoint + GET parameters = full_url
if params:
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") Parameters
for key, val in params.items(): ----------
key = "filter" if key.startswith("filter") else key endpoint : str
key = "collapse" if key.startswith("collapse") else key The API endpoint
amp = "" if full_url.endswith("?") else "&"
full_url = ( params : dict
full_url + amp + "{key}={val}".format(key=key, val=quote(str(val))) Dictionary that has name-value pairs.
)
Return type is str
"""
if not params:
return endpoint
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
for key, val in params.items():
key = "filter" if key.startswith("filter") else key
key = "collapse" if key.startswith("collapse") else key
amp = "" if full_url.endswith("?") else "&"
full_url = full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
return full_url return full_url
@ -191,17 +303,31 @@ def _get_total_pages(url, user_agent):
def _archive_url_parser(header, url, latest_version=__version__, instance=None): def _archive_url_parser(header, url, latest_version=__version__, instance=None):
""" """Returns the archive after parsing it from the response header.
Parameters
----------
header : str
The response header of WayBack Machine's Save API
url : str
The input url, the one used to create the Url object.
latest_version : str
The latest version of waybackpy (default is __version__)
instance : waybackpy.wrapper.Url
Instance of Url class
The wayback machine's save API doesn't The wayback machine's save API doesn't
return JSON response, we are required return JSON response, we are required
to read the header of the API response to read the header of the API response
and look for the archive URL. and find the archive URL.
This method has some regexen (or regexes) This method has some regular expressions
that search for archive url in header. that are used to search for the archive url
in the response header of Save API.
This method is used when you try to
save a webpage on wayback machine.
Two cases are possible: Two cases are possible:
1) Either we find the archive url in 1) Either we find the archive url in
@ -213,7 +339,6 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
If we found the archive URL we return it. If we found the archive URL we return it.
Return format: Return format:
web.archive.org/web/<TIMESTAMP>/<URL> web.archive.org/web/<TIMESTAMP>/<URL>
And if we couldn't find it, we raise And if we couldn't find it, we raise
@ -304,9 +429,7 @@ def _archive_url_parser(header, url, latest_version=__version__, instance=None):
def _wayback_timestamp(**kwargs): def _wayback_timestamp(**kwargs):
""" """Returns a valid waybackpy timestamp.
Wayback Machine archive URLs
have a timestamp in them.
The standard archive URL format is The standard archive URL format is
https://web.archive.org/web/20191214041711/https://www.youtube.com https://web.archive.org/web/20191214041711/https://www.youtube.com
@ -316,13 +439,17 @@ def _wayback_timestamp(**kwargs):
2 ) timestamp (20191214041711) 2 ) timestamp (20191214041711)
3 ) https://www.youtube.com, the original URL 3 ) https://www.youtube.com, the original URL
The near method takes year, month, day, hour and minute
as Arguments, their type is int. The near method of Url class in wrapper.py takes year, month, day, hour
and minute as arguments, their type is int.
This method takes those integers and converts it to This method takes those integers and converts it to
wayback machine timestamp and returns it. wayback machine timestamp and returns it.
Return format is string.
zfill(2) adds 1 zero in front of single digit days, months hour etc.
Return type is string.
""" """
return "".join( return "".join(
@ -339,16 +466,37 @@ def _get_response(
backoff_factor=0.5, backoff_factor=0.5,
no_raise_on_redirects=False, no_raise_on_redirects=False,
): ):
""" """Makes get requests.
This function is used make get request.
We use the requests package to make the Parameters
requests. ----------
endpoint : str
The API endpoint.
params : dict
The get request parameters. (default is None)
headers : dict
Headers for the get request. (default is None)
return_full_url : bool
Determines whether the full url is returned along with the
response. (default is False)
retries : int
Maximum number of retries for the get request. (default is 5)
backoff_factor : float
The factor by which we determine the next retry time after wait.
https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
(default is 0.5)
no_raise_on_redirects : bool
If redirected the maximum of 30 times (default for requests), return
None instead of raising an exception. (default is False)
We try five times and if it fails it raises To handle WaybackError:
WaybackError exception.
You can handles WaybackError by importing:
from waybackpy.exceptions import WaybackError from waybackpy.exceptions import WaybackError
try: try:
@ -370,20 +518,28 @@ def _get_response(
s.mount("https://", HTTPAdapter(max_retries=retries)) s.mount("https://", HTTPAdapter(max_retries=retries))
# The URL with parameters required for the get request
url = _full_url(endpoint, params) url = _full_url(endpoint, params)
try: try:
if not return_full_url: if not return_full_url:
return s.get(url, headers=headers) return s.get(url, headers=headers)
return (url, s.get(url, headers=headers)) return (url, s.get(url, headers=headers))
except Exception as e: except Exception as e:
reason = str(e) reason = str(e)
if no_raise_on_redirects: if no_raise_on_redirects:
if "Exceeded 30 redirects" in reason: if "Exceeded 30 redirects" in reason:
return return
exc_message = "Error while retrieving {url}.\n{reason}".format( exc_message = "Error while retrieving {url}.\n{reason}".format(
url=url, reason=reason url=url, reason=reason
) )
exc = WaybackError(exc_message) exc = WaybackError(exc_message)
exc.__cause__ = e exc.__cause__ = e
raise exc raise exc

View File

@ -1,5 +1,6 @@
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
from .exceptions import WaybackError from .exceptions import WaybackError
from .cdx import Cdx from .cdx import Cdx
from .utils import ( from .utils import (
@ -9,13 +10,85 @@ from .utils import (
default_user_agent, default_user_agent,
_url_check, _url_check,
_cleaned_url, _cleaned_url,
_ts, _timestamp_manager,
_unix_ts_to_wayback_ts, _unix_timestamp_to_wayback_timestamp,
_latest_version, _latest_version,
) )
class Url: class Url:
"""
Attributes
----------
url : str
The input URL, wayback machine API operations are performed
on this URL after sanitizing it.
user_agent : str
The user_agent used while making the GET requests to the
Wayback machine APIs
_archive_url : str
Caches the last fetched archive.
timestamp : datetime.datetime
timestamp of the archive URL as datetime object for
greater usability
_JSON : dict
Caches the last fetched availability API data
latest_version : str
The latest version of waybackpy on PyPi
cached_save : bool
Flag to check if WayBack machine returned a cached
archive instead of creating a new archive. WayBack
machine allows only one archive for a URL in
30 minutes. If the archive returned by WayBack machine
is older than 3 minutes then this flag is set to True
Methods turned properties
----------
JSON : dict
JSON response of availability API as dictionary / loaded JSON
archive_url : str
Return the archive url, returns str
_timestamp : datetime.datetime
Sets the value of self.timestamp if still not set
Methods
-------
save()
Archives the URL on WayBack machine
get(url="", user_agent="", encoding="")
Gets the source of archive url, can also be used to get source
of any URL if passed into it.
near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
Wayback Machine can have many archives for a URL/webpage, sometimes we want
archive close to a specific time.
This method takes year, month, day, hour, minute and unix_timestamp as input.
oldest(year=1994)
The oldest archive of an URL.
newest()
The newest archive of an URL
total_archives(start_timestamp=None, end_timestamp=None)
total number of archives of an URL, the timeframe can be confined by
start_timestamp and end_timestamp
known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
Known URLs for an URL, subdomain, URL as prefix etc.
"""
def __init__(self, url, user_agent=default_user_agent): def __init__(self, url, user_agent=default_user_agent):
self.url = url self.url = url
self.user_agent = str(user_agent) self.user_agent = str(user_agent)
@ -32,29 +105,17 @@ class Url:
) )
def __str__(self): def __str__(self):
"""
Output when print() is used on <class 'waybackpy.wrapper.Url'>
This should print an archive URL.
We check if self._archive_url is not None.
If not None, good. We return string of self._archive_url.
If self._archive_url is None, it means we ain't used any method that
sets self._archive_url, we now set self._archive_url to self.archive_url
and return it.
"""
if not self._archive_url: if not self._archive_url:
self._archive_url = self.archive_url self._archive_url = self.archive_url
return "{archive_url}".format(archive_url=self._archive_url) return "{archive_url}".format(archive_url=self._archive_url)
def __len__(self): def __len__(self):
""" """Number of days between today and the date of archive based on the timestamp
Why do we have len here?
Applying len() on <class 'waybackpy.wrapper.Url'> len() of waybackpy.wrapper.Url should return
will calculate the number of days between today and the number of days between today and the
the archive timestamp. archive timestamp.
Can be applied on return values of near and its Can be applied on return values of near and its
childs (e.g. oldest) and if applied on waybackpy.Url() childs (e.g. oldest) and if applied on waybackpy.Url()
@ -76,32 +137,30 @@ class Url:
@property @property
def JSON(self): def JSON(self):
""" """Returns JSON response of availability API as dictionary / loaded JSON
If the end user has used near() or its childs like oldest, newest
and archive_url then the JSON response of these are cached in self._JSON
If we find that self._JSON is not None we return it. return type : dict
else we get the response of 'https://archive.org/wayback/available?url=YOUR-URL'
and return it.
""" """
# If user used the near method or any method that depends on near, we
# are certain that we have a loaded dictionary cached in self._JSON.
# Return the loaded JSON data.
if self._JSON: if self._JSON:
return self._JSON return self._JSON
# If no cached data found, get data and return + cache it.
endpoint = "https://archive.org/wayback/available" endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
payload = {"url": "{url}".format(url=_cleaned_url(self.url))} payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
response = _get_response(endpoint, params=payload, headers=headers) response = _get_response(endpoint, params=payload, headers=headers)
return response.json() self._JSON = response.json()
return self._JSON
@property @property
def archive_url(self): def archive_url(self):
""" """Return the archive url.
Returns any random archive for the instance.
But if near, oldest, newest were used before
then it returns the same archive again.
We cache archive in self._archive_url return type : str
""" """
if self._archive_url: if self._archive_url:
@ -121,11 +180,16 @@ class Url:
@property @property
def _timestamp(self): def _timestamp(self):
self.timestamp = _ts(self.timestamp, self.JSON) """Sets the value of self.timestamp if still not set.
return self.timestamp
Return type : datetime.datetime
"""
return _timestamp_manager(self.timestamp, self.JSON)
def save(self): def save(self):
""" """Saves/Archive the URL.
To save a webpage on WayBack machine we To save a webpage on WayBack machine we
need to send get request to https://web.archive.org/save/ need to send get request to https://web.archive.org/save/
@ -136,6 +200,8 @@ class Url:
_archive_url_parser() parses the archive from the header. _archive_url_parser() parses the archive from the header.
return type : waybackpy.wrapper.Url
""" """
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url) request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
@ -161,7 +227,9 @@ class Url:
instance=self, instance=self,
) )
m = re.search(r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url) m = re.search(
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
)
str_ts = m.group(1) str_ts = m.group(1)
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S") ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
now = datetime.utcnow() now = datetime.utcnow()
@ -175,9 +243,22 @@ class Url:
return self return self
def get(self, url="", user_agent="", encoding=""): def get(self, url="", user_agent="", encoding=""):
""" """GET the source of archive or any other URL.
Return the source code of the last archived URL,
if no URL is passed to this method. url : str, waybackpy.wrapper.Url
The method will return the source code of
this URL instead of last fetched archive.
user_agent : str
The user_agent for GET request to API
encoding : str
If user is using any other encoding that
can't be detected by response.encoding
Return the source code of the last fetched
archive URL if no URL is passed to this method
else it returns the source code of url passed.
If encoding is not supplied, it is auto-detected If encoding is not supplied, it is auto-detected
from the response itself by requests package. from the response itself by requests package.
@ -213,6 +294,27 @@ class Url:
unix_timestamp=None, unix_timestamp=None,
): ):
""" """
Parameters
----------
year : int
Archive close to year
month : int
Archive close to month
day : int
Archive close to day
hour : int
Archive close to hour
minute : int
Archive close to minute
unix_timestamp : str, float or int
Archive close to this unix_timestamp
Wayback Machine can have many archives of a webpage, Wayback Machine can have many archives of a webpage,
sometimes we want archive close to a specific time. sometimes we want archive close to a specific time.
@ -235,7 +337,7 @@ class Url:
""" """
if unix_timestamp: if unix_timestamp:
timestamp = _unix_ts_to_wayback_ts(unix_timestamp) timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
else: else:
now = datetime.utcnow().timetuple() now = datetime.utcnow().timetuple()
timestamp = _wayback_timestamp( timestamp = _wayback_timestamp(
@ -285,28 +387,45 @@ class Url:
We simply pass the year in near() and return it. We simply pass the year in near() and return it.
""" """
return self.near(year=year) return self.near(year=year)
def newest(self): def newest(self):
""" """Return the newest Wayback Machine archive available.
Return the newest Wayback Machine archive available for this URL.
We return the output of self.near() as it defaults to current utc time. We return the return value of self.near() as it defaults to current UTC time.
Due to Wayback Machine database lag, this may not always be the Due to Wayback Machine database lag, this may not always be the
most recent archive. most recent archive.
return type : waybackpy.wrapper.Url
""" """
return self.near() return self.near()
def total_archives(self, start_timestamp=None, end_timestamp=None): def total_archives(self, start_timestamp=None, end_timestamp=None):
""" """Returns the total number of archives for an URL
Parameters
----------
start_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
end_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
return type : int
A webpage can have multiple archives on the wayback machine A webpage can have multiple archives on the wayback machine
If someone wants to count the total number of archives of a If someone wants to count the total number of archives of a
webpage on wayback machine they can use this method. webpage on wayback machine they can use this method.
Returns the total number of Wayback Machine archives for the URL. Returns the total number of Wayback Machine archives for the URL.
Return type in integer.
""" """
cdx = Cdx( cdx = Cdx(
@ -315,6 +434,8 @@ class Url:
start_timestamp=start_timestamp, start_timestamp=start_timestamp,
end_timestamp=end_timestamp, end_timestamp=end_timestamp,
) )
# cdx.snapshots() is generator not list.
i = 0 i = 0
for _ in cdx.snapshots(): for _ in cdx.snapshots():
i = i + 1 i = i + 1
@ -328,15 +449,36 @@ class Url:
end_timestamp=None, end_timestamp=None,
match_type="prefix", match_type="prefix",
): ):
""" """Yields known_urls URLs from the CDX API.
Parameters
----------
subdomain : bool
If True fetch subdomain URLs along with the host URLs.
host : bool
Only fetch host URLs.
start_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
end_timestamp : str
1 to 14 digit string of numbers, you are not required to
pass a full 14 digit timestamp.
match_type : str
One of (exact, prefix, host and domain)
return type : generator of str (the original URL of each snapshot)
Yields list of URLs known to exist for given input. Yields list of URLs known to exist for given input.
Defaults to input URL as prefix. Defaults to input URL as prefix.
This method is kept for compatibility, use the Cdx class instead. Based on:
This method itself depends on Cdx. https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
By Mohammed Diaa (https://github.com/mhmdiaa)
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
""" """
if subdomain: if subdomain:
@ -353,7 +495,5 @@ class Url:
collapses=["urlkey"], collapses=["urlkey"],
) )
snapshots = cdx.snapshots() for snapshot in cdx.snapshots():
for snapshot in snapshots:
yield (snapshot.original) yield (snapshot.original)