424 lines
12 KiB
Python
424 lines
12 KiB
Python
import re
|
|
from datetime import datetime, timedelta
|
|
|
|
from .exceptions import WaybackError
|
|
from .cdx import Cdx
|
|
from .utils import (
|
|
_archive_url_parser,
|
|
_wayback_timestamp,
|
|
_get_response,
|
|
default_user_agent,
|
|
_url_check,
|
|
_cleaned_url,
|
|
_timestamp_manager,
|
|
_unix_timestamp_to_wayback_timestamp,
|
|
_latest_version,
|
|
)
|
|
|
|
|
|
class Url:
    """Wrapper around the Wayback Machine APIs for a single URL.

    Attributes
    ----------
    url : str
        The URL supplied by the caller.
    user_agent : str
        Value sent as the ``User-Agent`` header on every request.
    timestamp : datetime.datetime or None
        Naive datetime of the archive currently held by the instance;
        set by ``save()``, ``near()`` and ``len()``.
    latest_version : object or None
        Result of ``_latest_version("waybackpy", ...)``, fetched on the
        first call to ``save()``.
    cached_save : bool
        After ``save()``: True when the returned archive is more than
        three minutes older than the local UTC time.
    """

    def __init__(self, url, user_agent=default_user_agent):
        self.url = url
        self.user_agent = str(user_agent)
        # NOTE(review): presumably raises on a malformed URL - confirm in utils.
        _url_check(self.url)
        self._archive_url = None
        self.timestamp = None
        self._JSON = None
        self.latest_version = None
        self.cached_save = False

    def __repr__(self):
        return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
            url=self.url, user_agent=self.user_agent
        )

    def __str__(self):
        """Return the archive URL as a string, looking it up if necessary."""
        if not self._archive_url:
            self._archive_url = self.archive_url

        return "{archive_url}".format(archive_url=self._archive_url)

    def __len__(self):
        """Return the number of days between now (UTC) and the archive.

        If no timestamp has been set yet (by ``near()``, its wrappers or
        ``save()``), it is obtained through the ``_timestamp`` property,
        which reads the ``JSON`` property.

        Returns
        -------
        int
            Whole days elapsed since ``self.timestamp``. When the
            timestamp equals ``datetime.max`` the largest day count a
            ``timedelta`` can hold (999999999) is returned. The result
            is never negative.
        """
        if not self.timestamp:
            self.timestamp = self._timestamp

        # NOTE(review): datetime.max is presumably the "no archive" sentinel
        # produced by _timestamp_manager - confirm in utils.
        if self.timestamp == datetime.max:
            return timedelta.max.days

        # A snapshot stamped slightly ahead of the local clock would give a
        # negative day count, and len() raises ValueError on negatives.
        return max((datetime.utcnow() - self.timestamp).days, 0)

    @property
    def JSON(self):
        """Return the availability API response for this URL.

        ``near()`` (and therefore ``oldest()`` and ``newest()``) stores
        the response it receives in ``self._JSON``; when that value is
        set it is returned as is. Otherwise
        'https://archive.org/wayback/available' is queried for the URL
        and the decoded JSON body is returned. A response fetched here
        is not stored on the instance.
        """
        if self._JSON:
            return self._JSON

        endpoint = "https://archive.org/wayback/available"
        headers = {"User-Agent": self.user_agent}
        payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
        response = _get_response(endpoint, params=payload, headers=headers)
        return response.json()

    @property
    def archive_url(self):
        """Return the archive URL held by the instance, resolving it if unset.

        Returns
        -------
        str or None
            The "closest" snapshot URL from the ``JSON`` property with
            its scheme upgraded to https, or None when the response
            lists no archived snapshots. The result is remembered in
            ``self._archive_url``.
        """
        if self._archive_url:
            return self._archive_url

        # .get(): a response without the key is treated as "no snapshots".
        snapshots = self.JSON.get("archived_snapshots")

        if not snapshots:
            archive_url = None
        else:
            archive_url = snapshots["closest"]["url"].replace(
                "http://web.archive.org/web/", "https://web.archive.org/web/", 1
            )

        self._archive_url = archive_url
        return archive_url

    @property
    def _timestamp(self):
        """Return ``_timestamp_manager(self.timestamp, self.JSON)``.

        Used by ``__len__`` to obtain a timestamp when none is set yet.
        """
        return _timestamp_manager(self.timestamp, self.JSON)

    def save(self):
        """Save (archive) the URL on the Wayback Machine.

        A GET request is sent to https://web.archive.org/save/<url>
        through ``_get_response()``. The archive URL is extracted by
        ``_archive_url_parser()`` from the response headers, or from the
        marker string "save redirected" when the response is falsy.

        On success ``self._archive_url`` and ``self.timestamp`` are
        updated, and ``self.cached_save`` is set to True when the
        archive timestamp is more than three minutes older than the
        local UTC time, False otherwise.

        Returns
        -------
        waybackpy.wrapper.Url
            The instance itself.

        Raises
        ------
        WaybackError
            If the parsed archive URL does not contain a 14 digit
            timestamp. The instance's archive URL and timestamp are
            left untouched in that case.
        """
        request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
        headers = {"User-Agent": self.user_agent}

        response = _get_response(
            request_url,
            params=None,
            headers=headers,
            backoff_factor=2,
            no_raise_on_redirects=True,
        )

        if not self.latest_version:
            self.latest_version = _latest_version("waybackpy", headers=headers)

        res_headers = response.headers if response else "save redirected"

        archive_url = "https://" + _archive_url_parser(
            res_headers,
            self.url,
            latest_version=self.latest_version,
            instance=self,
        )

        match = re.search(
            r"https?://web\.archive\.org/web/([0-9]{14})/http", archive_url
        )
        if match is None:
            # Previously this surfaced as AttributeError on None.group().
            raise WaybackError(
                "Can not parse the archive timestamp from '{archive_url}'.".format(
                    archive_url=archive_url
                )
            )

        ts = datetime.strptime(match.group(1), "%Y%m%d%H%M%S")
        age_in_seconds = int((datetime.utcnow() - ts).total_seconds())

        self._archive_url = archive_url
        # Assigned on every call so that a fresh save following a cached one
        # does not keep reporting cached_save=True.
        self.cached_save = age_in_seconds > 60 * 3
        self.timestamp = ts

        return self

    def get(self, url="", user_agent="", encoding=""):
        """Return the decoded body of a URL.

        Parameters
        ----------
        url : str
            URL to fetch. When empty, the archive URL held by the
            instance is used if there is one, otherwise the cleaned
            form of ``self.url``.
        user_agent : str
            User-Agent header value; defaults to ``self.user_agent``.
        encoding : str
            Encoding used to decode the body. When empty, the encoding
            reported by the response is used, falling back to UTF-8 if
            the response reports none.

        Returns
        -------
        str
            The response content decoded with the chosen encoding.
        """
        if not url:
            url = self._archive_url if self._archive_url else _cleaned_url(self.url)

        if not user_agent:
            user_agent = self.user_agent

        headers = {"User-Agent": str(user_agent)}
        response = _get_response(str(url), params=None, headers=headers)

        if not encoding:
            # The encoding attribute may be missing or None (no charset in
            # the response headers); both cases fall back to UTF-8.
            encoding = getattr(response, "encoding", None) or "UTF-8"

        return response.content.decode(encoding.replace("text/html", "UTF-8", 1))

    def near(
        self,
        year=None,
        month=None,
        day=None,
        hour=None,
        minute=None,
        unix_timestamp=None,
    ):
        """Select the archive closest to the requested point in time.

        Parameters
        ----------
        year : int
            Archive close to year
        month : int
            Archive close to month
        day : int
            Archive close to day
        hour : int
            Archive close to hour (0 is honoured as midnight)
        minute : int
            Archive close to minute (0 is honoured)
        unix_timestamp : str, float or int
            Archive close to this unix timestamp; when supplied
            (including 0) it takes precedence over the other arguments.

        Any date/time argument that is not supplied defaults to the
        current UTC time. The values are turned into a Wayback Machine
        timestamp with ``_wayback_timestamp()`` (or
        ``_unix_timestamp_to_wayback_timestamp()``) and sent to the
        availability API (https://archive.org/wayback/available).

        On success ``self._archive_url`` is set to the archive found,
        ``self.timestamp`` to its timestamp and ``self._JSON`` to the
        API response.

        Returns
        -------
        waybackpy.wrapper.Url
            The instance itself.

        Raises
        ------
        WaybackError
            If the API response lists no archived snapshots.
        """
        # "or == 0" keeps the epoch itself (0 / 0.0) from being mistaken
        # for "not supplied"; None and "" still fall through.
        if unix_timestamp or unix_timestamp == 0:
            timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
        else:
            now = datetime.utcnow().timetuple()
            timestamp = _wayback_timestamp(
                # 0 is not a valid year, month or day, so any falsy value
                # falls back to the current date ...
                year=year if year else now.tm_year,
                month=month if month else now.tm_mon,
                day=day if day else now.tm_mday,
                # ... but 0 is a valid hour and minute and must be kept.
                hour=hour if hour is not None else now.tm_hour,
                minute=minute if minute is not None else now.tm_min,
            )

        endpoint = "https://archive.org/wayback/available"
        headers = {"User-Agent": self.user_agent}
        payload = {
            "url": "{url}".format(url=_cleaned_url(self.url)),
            "timestamp": timestamp,
        }
        response = _get_response(endpoint, params=payload, headers=headers)
        data = response.json()

        snapshots = data.get("archived_snapshots")
        if not snapshots:
            raise WaybackError(
                "Can not find archive for '{url}' try later or use waybackpy.Url(url, user_agent).save() "
                "to create a new archive.\nAPI response:\n{text}".format(
                    url=_cleaned_url(self.url), text=response.text
                )
            )

        closest = snapshots["closest"]

        self._archive_url = closest["url"].replace(
            "http://web.archive.org/web/", "https://web.archive.org/web/", 1
        )
        self.timestamp = datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S")
        self._JSON = data

        return self

    def oldest(self, year=1994):
        """Return the earliest/oldest Wayback Machine archive for the webpage.

        The Wayback Machine started archiving the internet around 1997,
        so no archive can be older than that; 1994 is used as the
        default year to look for the oldest archive.

        The year is simply passed to ``near()`` and its result returned.
        """
        return self.near(year=year)

    def newest(self):
        """Return the newest Wayback Machine archive available for this URL.

        This is the result of ``near()`` without arguments, which
        defaults to the current UTC time.

        Due to Wayback Machine database lag, this may not always be the
        most recent archive.
        """
        return self.near()

    def total_archives(self, start_timestamp=None, end_timestamp=None):
        """Return the total number of archives for the URL.

        Parameters
        ----------
        start_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.
        end_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.

        Returns
        -------
        int
            Number of items produced by ``Cdx.snapshots()`` for the URL
            and the given timestamp bounds.
        """
        cdx = Cdx(
            _cleaned_url(self.url),
            user_agent=self.user_agent,
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
        )
        return sum(1 for _ in cdx.snapshots())

    def known_urls(
        self,
        subdomain=False,
        host=False,
        start_timestamp=None,
        end_timestamp=None,
        match_type="prefix",
    ):
        """Yield the URLs known to the CDX API for the input URL.

        Parameters
        ----------
        subdomain : bool
            If True, ``match_type`` is set to "domain" (fetch subdomain
            URLs along with the host URLs).
        host : bool
            If True, ``match_type`` is set to "host" (only fetch host
            URLs). Takes precedence over ``subdomain``.
        start_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.
        end_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.
        match_type : str
            One of (exact, prefix, host and domain). Defaults to the
            input URL as prefix.

        Yields
        ------
        The ``original`` attribute of every snapshot returned by the
        CDX query, which is collapsed on "urlkey".

        Based on:
        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
        By Mohammed Diaa (https://github.com/mhmdiaa)
        """
        if subdomain:
            match_type = "domain"
        if host:
            match_type = "host"

        cdx = Cdx(
            _cleaned_url(self.url),
            user_agent=self.user_agent,
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
            match_type=match_type,
            collapses=["urlkey"],
        )

        for snapshot in cdx.snapshots():
            yield snapshot.original