Create separate module for the 3 different APIs also CDX is now CLI supported.
This commit is contained in:
@@ -1,298 +1,27 @@
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from .save_api import WaybackMachineSaveAPI
|
||||
from .availability_api import WaybackMachineAvailabilityAPI
|
||||
from .cdx_api import WaybackMachineCDXServerAPI
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
from .exceptions import WaybackError
|
||||
from .cdx import Cdx
|
||||
from .utils import (
|
||||
_archive_url_parser,
|
||||
_wayback_timestamp,
|
||||
_get_response,
|
||||
default_user_agent,
|
||||
_url_check,
|
||||
_cleaned_url,
|
||||
_timestamp_manager,
|
||||
_unix_timestamp_to_wayback_timestamp,
|
||||
_latest_version,
|
||||
)
|
||||
|
||||
|
||||
class Url:
|
||||
"""
|
||||
|
||||
Attributes
|
||||
----------
|
||||
url : str
|
||||
The input URL, wayback machine API operations are performed
|
||||
on this URL after sanatizing it.
|
||||
|
||||
user_agent : str
|
||||
The user_agent used while making the GET requests to the
|
||||
Wayback machine APIs
|
||||
|
||||
_archive_url : str
|
||||
Caches the last fetched archive.
|
||||
|
||||
timestamp : datetime.datetime
|
||||
timestamp of the archive URL as datetime object for
|
||||
greater usability
|
||||
|
||||
_JSON : dict
|
||||
Caches the last fetched availability API data
|
||||
|
||||
latest_version : str
|
||||
The latest version of waybackpy on PyPi
|
||||
|
||||
cached_save : bool
|
||||
Flag to check if WayBack machine returned a cached
|
||||
archive instead of creating a new archive. WayBack
|
||||
machine allows only one 1 archive for an URL in
|
||||
30 minutes. If the archive returned by WayBack machine
|
||||
is older than 3 minutes than this flag is set to True
|
||||
|
||||
Methods turned properties
|
||||
----------
|
||||
JSON : dict
|
||||
JSON response of availability API as dictionary / loaded JSON
|
||||
|
||||
archive_url : str
|
||||
Return the archive url, returns str
|
||||
|
||||
_timestamp : datetime.datetime
|
||||
Sets the value of self.timestamp if still not set
|
||||
|
||||
Methods
|
||||
-------
|
||||
save()
|
||||
Archives the URL on WayBack machine
|
||||
|
||||
get(url="", user_agent="", encoding="")
|
||||
Gets the source of archive url, can also be used to get source
|
||||
of any URL if passed into it.
|
||||
|
||||
near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
|
||||
Wayback Machine can have many archives for a URL/webpage, sometimes we want
|
||||
archive close to a specific time.
|
||||
This method takes year, month, day, hour, minute and unix_timestamp as input.
|
||||
|
||||
oldest(year=1994)
|
||||
The oldest archive of an URL.
|
||||
|
||||
newest()
|
||||
The newest archive of an URL
|
||||
|
||||
total_archives(start_timestamp=None, end_timestamp=None)
|
||||
total number of archives of an URL, the timeframe can be confined by
|
||||
start_timestamp and end_timestamp
|
||||
|
||||
known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
|
||||
Known URLs for an URL, subdomain, URL as prefix etc.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, url, user_agent=default_user_agent):
|
||||
def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
|
||||
self.url = url
|
||||
self.user_agent = str(user_agent)
|
||||
_url_check(self.url)
|
||||
self._archive_url = None
|
||||
self.timestamp = None
|
||||
self._JSON = None
|
||||
self.latest_version = None
|
||||
self.cached_save = False
|
||||
|
||||
def __repr__(self):
|
||||
return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
|
||||
url=self.url, user_agent=self.user_agent
|
||||
self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
|
||||
self.url, user_agent=self.user_agent
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
if not self._archive_url:
|
||||
self._archive_url = self.archive_url
|
||||
|
||||
return "{archive_url}".format(archive_url=self._archive_url)
|
||||
|
||||
def __len__(self):
|
||||
"""Number of days between today and the date of archive based on the timestamp
|
||||
|
||||
len() of waybackpy.wrapper.Url should return
|
||||
the number of days between today and the
|
||||
archive timestamp.
|
||||
|
||||
Can be applied on return values of near and its
|
||||
childs (e.g. oldest) and if applied on waybackpy.Url()
|
||||
whithout using any functions, it just grabs
|
||||
self._timestamp and def _timestamp gets it
|
||||
from def JSON.
|
||||
"""
|
||||
td_max = timedelta(
|
||||
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
|
||||
)
|
||||
|
||||
if not self.timestamp:
|
||||
self.timestamp = self._timestamp
|
||||
|
||||
if self.timestamp == datetime.max:
|
||||
return td_max.days
|
||||
|
||||
return (datetime.utcnow() - self.timestamp).days
|
||||
|
||||
@property
|
||||
def JSON(self):
|
||||
"""Returns JSON response of availability API as dictionary / loaded JSON
|
||||
|
||||
return type : dict
|
||||
"""
|
||||
|
||||
# If user used the near method or any method that depends on near, we
|
||||
# are certain that we have a loaded dictionary cached in self._JSON.
|
||||
# Return the loaded JSON data.
|
||||
if self._JSON:
|
||||
return self._JSON
|
||||
|
||||
# If no cached data found, get data and return + cache it.
|
||||
endpoint = "https://archive.org/wayback/available"
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
|
||||
response = _get_response(endpoint, params=payload, headers=headers)
|
||||
self._JSON = response.json()
|
||||
return self._JSON
|
||||
|
||||
@property
|
||||
def archive_url(self):
|
||||
"""Return the archive url.
|
||||
|
||||
return type : str
|
||||
"""
|
||||
|
||||
if self._archive_url:
|
||||
return self._archive_url
|
||||
|
||||
data = self.JSON
|
||||
|
||||
if not data["archived_snapshots"]:
|
||||
archive_url = None
|
||||
else:
|
||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||
archive_url = archive_url.replace(
|
||||
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
|
||||
)
|
||||
self._archive_url = archive_url
|
||||
return archive_url
|
||||
|
||||
@property
|
||||
def _timestamp(self):
|
||||
"""Sets the value of self.timestamp if still not set.
|
||||
|
||||
Return type : datetime.datetime
|
||||
|
||||
"""
|
||||
return _timestamp_manager(self.timestamp, self.JSON)
|
||||
|
||||
def save(self):
|
||||
"""Saves/Archive the URL.
|
||||
|
||||
To save a webpage on WayBack machine we
|
||||
need to send get request to https://web.archive.org/save/
|
||||
|
||||
And to get the archive URL we are required to read the
|
||||
header of the API response.
|
||||
|
||||
_get_response() takes care of the get requests.
|
||||
|
||||
_archive_url_parser() parses the archive from the header.
|
||||
|
||||
return type : waybackpy.wrapper.Url
|
||||
|
||||
"""
|
||||
request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
|
||||
response = _get_response(
|
||||
request_url,
|
||||
params=None,
|
||||
headers=headers,
|
||||
backoff_factor=2,
|
||||
no_raise_on_redirects=True,
|
||||
self.wayback_machine_save_api = WaybackMachineSaveAPI(
|
||||
self.url, user_agent=self.user_agent
|
||||
)
|
||||
|
||||
if not self.latest_version:
|
||||
self.latest_version = _latest_version("waybackpy", headers=headers)
|
||||
if response:
|
||||
res_headers = response.headers
|
||||
else:
|
||||
res_headers = "save redirected"
|
||||
self._archive_url = "https://" + _archive_url_parser(
|
||||
res_headers,
|
||||
self.url,
|
||||
latest_version=self.latest_version,
|
||||
instance=self,
|
||||
response=response,
|
||||
)
|
||||
|
||||
if response.status_code == 509:
|
||||
raise WaybackError(
|
||||
"Can not save '{url}'. You have probably reached the limit of active "
|
||||
"sessions. Try later.".format(
|
||||
url=_cleaned_url(self.url), text=response.text
|
||||
)
|
||||
)
|
||||
|
||||
m = re.search(
|
||||
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
|
||||
)
|
||||
str_ts = m.group(1)
|
||||
ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
|
||||
now = datetime.utcnow()
|
||||
total_seconds = int((now - ts).total_seconds())
|
||||
|
||||
if total_seconds > 60 * 3:
|
||||
self.cached_save = True
|
||||
|
||||
self.timestamp = ts
|
||||
|
||||
self.archive_url = self.wayback_machine_save_api.archive_url
|
||||
self.timestamp = self.wayback_machine_save_api.timestamp()
|
||||
self.headers = self.wayback_machine_save_api.headers
|
||||
return self
|
||||
|
||||
def get(self, url="", user_agent="", encoding=""):
|
||||
"""GET the source of archive or any other URL.
|
||||
|
||||
url : str, waybackpy.wrapper.Url
|
||||
The method will return the source code of
|
||||
this URL instead of last fetched archive.
|
||||
|
||||
user_agent : str
|
||||
The user_agent for GET request to API
|
||||
|
||||
encoding : str
|
||||
If user is using any other encoding that
|
||||
can't be detected by response.encoding
|
||||
|
||||
Return the source code of the last fetched
|
||||
archive URL if no URL is passed to this method
|
||||
else it returns the source code of url passed.
|
||||
|
||||
If encoding is not supplied, it is auto-detected
|
||||
from the response itself by requests package.
|
||||
"""
|
||||
|
||||
if not url and self._archive_url:
|
||||
url = self._archive_url
|
||||
|
||||
elif not url and not self._archive_url:
|
||||
url = _cleaned_url(self.url)
|
||||
|
||||
if not user_agent:
|
||||
user_agent = self.user_agent
|
||||
|
||||
headers = {"User-Agent": str(user_agent)}
|
||||
response = _get_response(str(url), params=None, headers=headers)
|
||||
|
||||
if not encoding:
|
||||
try:
|
||||
encoding = response.encoding
|
||||
except AttributeError:
|
||||
encoding = "UTF-8"
|
||||
|
||||
return response.content.decode(encoding.replace("text/html", "UTF-8", 1))
|
||||
|
||||
def near(
|
||||
self,
|
||||
year=None,
|
||||
@@ -302,153 +31,45 @@ class Url:
|
||||
minute=None,
|
||||
unix_timestamp=None,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
|
||||
year : int
|
||||
Archive close to year
|
||||
|
||||
month : int
|
||||
Archive close to month
|
||||
|
||||
day : int
|
||||
Archive close to day
|
||||
|
||||
hour : int
|
||||
Archive close to hour
|
||||
|
||||
minute : int
|
||||
Archive close to minute
|
||||
|
||||
unix_timestamp : str, float or int
|
||||
Archive close to this unix_timestamp
|
||||
|
||||
Wayback Machine can have many archives of a webpage,
|
||||
sometimes we want archive close to a specific time.
|
||||
|
||||
This method takes year, month, day, hour and minute as input.
|
||||
The input type must be integer. Any non-supplied parameters
|
||||
default to the current time.
|
||||
|
||||
We convert the input to a wayback machine timestamp using
|
||||
_wayback_timestamp(), it returns a string.
|
||||
|
||||
We use the wayback machine's availability API
|
||||
(https://archive.org/wayback/available)
|
||||
to get the closest archive from the timestamp.
|
||||
|
||||
We set self._archive_url to the archive found, if any.
|
||||
If archive found, we set self.timestamp to its timestamp.
|
||||
We self._JSON to the response of the availability API.
|
||||
|
||||
And finally return self.
|
||||
"""
|
||||
|
||||
if unix_timestamp:
|
||||
timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
||||
else:
|
||||
now = datetime.utcnow().timetuple()
|
||||
timestamp = _wayback_timestamp(
|
||||
year=year if year else now.tm_year,
|
||||
month=month if month else now.tm_mon,
|
||||
day=day if day else now.tm_mday,
|
||||
hour=hour if hour else now.tm_hour,
|
||||
minute=minute if minute else now.tm_min,
|
||||
)
|
||||
|
||||
endpoint = "https://archive.org/wayback/available"
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
payload = {
|
||||
"url": "{url}".format(url=_cleaned_url(self.url)),
|
||||
"timestamp": timestamp,
|
||||
}
|
||||
response = _get_response(endpoint, params=payload, headers=headers)
|
||||
data = response.json()
|
||||
|
||||
if not data["archived_snapshots"]:
|
||||
raise WaybackError(
|
||||
"Can not find archive for '{url}' try later or use wayback.Url(url, user_agent).save() "
|
||||
"to create a new archive.\nAPI response:\n{text}".format(
|
||||
url=_cleaned_url(self.url), text=response.text
|
||||
)
|
||||
)
|
||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||
archive_url = archive_url.replace(
|
||||
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
|
||||
self.wayback_machine_availability_api.near(
|
||||
year=year,
|
||||
month=month,
|
||||
day=day,
|
||||
hour=hour,
|
||||
minute=minute,
|
||||
unix_timestamp=unix_timestamp,
|
||||
)
|
||||
|
||||
self._archive_url = archive_url
|
||||
self.timestamp = datetime.strptime(
|
||||
data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
|
||||
)
|
||||
self._JSON = data
|
||||
|
||||
self.set_availability_api_attrs()
|
||||
return self
|
||||
|
||||
def oldest(self, year=1994):
|
||||
"""
|
||||
Returns the earliest/oldest Wayback Machine archive for the webpage.
|
||||
|
||||
Wayback machine has started archiving the internet around 1997 and
|
||||
therefore we can't have any archive older than 1997, we use 1994 as the
|
||||
deafult year to look for the oldest archive.
|
||||
|
||||
We simply pass the year in near() and return it.
|
||||
"""
|
||||
|
||||
return self.near(year=year)
|
||||
def oldest(self):
|
||||
self.wayback_machine_availability_api.oldest()
|
||||
self.set_availability_api_attrs()
|
||||
return self
|
||||
|
||||
def newest(self):
|
||||
"""Return the newest Wayback Machine archive available.
|
||||
self.wayback_machine_availability_api.newest()
|
||||
self.set_availability_api_attrs()
|
||||
return self
|
||||
|
||||
We return the return value of self.near() as it deafults to current UTC time.
|
||||
|
||||
Due to Wayback Machine database lag, this may not always be the
|
||||
most recent archive.
|
||||
|
||||
return type : waybackpy.wrapper.Url
|
||||
"""
|
||||
|
||||
return self.near()
|
||||
def set_availability_api_attrs(self):
|
||||
self.archive_url = self.wayback_machine_availability_api.archive_url
|
||||
self.JSON = self.wayback_machine_availability_api.JSON
|
||||
self.timestamp = self.wayback_machine_availability_api.timestamp()
|
||||
|
||||
def total_archives(self, start_timestamp=None, end_timestamp=None):
|
||||
"""Returns the total number of archives for an URL
|
||||
|
||||
Parameters
|
||||
----------
|
||||
start_timestamp : str
|
||||
1 to 14 digit string of numbers, you are not required to
|
||||
pass a full 14 digit timestamp.
|
||||
|
||||
end_timestamp : str
|
||||
1 to 14 digit string of numbers, you are not required to
|
||||
pass a full 14 digit timestamp.
|
||||
|
||||
|
||||
return type : int
|
||||
|
||||
|
||||
A webpage can have multiple archives on the wayback machine
|
||||
If someone wants to count the total number of archives of a
|
||||
webpage on wayback machine they can use this method.
|
||||
|
||||
Returns the total number of Wayback Machine archives for the URL.
|
||||
|
||||
"""
|
||||
|
||||
cdx = Cdx(
|
||||
_cleaned_url(self.url),
|
||||
cdx = WaybackMachineCDXServerAPI(
|
||||
self.url,
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
)
|
||||
|
||||
# cdx.snapshots() is generator not list.
|
||||
i = 0
|
||||
count = 0
|
||||
for _ in cdx.snapshots():
|
||||
i = i + 1
|
||||
return i
|
||||
count = count + 1
|
||||
return count
|
||||
|
||||
def known_urls(
|
||||
self,
|
||||
@@ -458,45 +79,13 @@ class Url:
|
||||
end_timestamp=None,
|
||||
match_type="prefix",
|
||||
):
|
||||
"""Yields known_urls URLs from the CDX API.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
subdomain : bool
|
||||
If True fetch subdomain URLs along with the host URLs.
|
||||
|
||||
host : bool
|
||||
Only fetch host URLs.
|
||||
|
||||
start_timestamp : str
|
||||
1 to 14 digit string of numbers, you are not required to
|
||||
pass a full 14 digit timestamp.
|
||||
|
||||
end_timestamp : str
|
||||
1 to 14 digit string of numbers, you are not required to
|
||||
pass a full 14 digit timestamp.
|
||||
|
||||
match_type : str
|
||||
One of (exact, prefix, host and domain)
|
||||
|
||||
return type : waybackpy.snapshot.CdxSnapshot
|
||||
|
||||
Yields list of URLs known to exist for given input.
|
||||
Defaults to input URL as prefix.
|
||||
|
||||
Based on:
|
||||
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
|
||||
By Mohammed Diaa (https://github.com/mhmdiaa)
|
||||
"""
|
||||
|
||||
if subdomain:
|
||||
match_type = "domain"
|
||||
if host:
|
||||
match_type = "host"
|
||||
|
||||
cdx = Cdx(
|
||||
_cleaned_url(self.url),
|
||||
cdx = WaybackMachineCDXServerAPI(
|
||||
self.url,
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
|
Reference in New Issue
Block a user