Files
waybackpy/waybackpy/wrapper.py
Akash Mahanty 817c0ee844 docstrings
2021-01-26 02:13:52 +05:30

424 lines
12 KiB
Python

import re
from datetime import datetime, timedelta
from .exceptions import WaybackError
from .cdx import Cdx
from .utils import (
_archive_url_parser,
_wayback_timestamp,
_get_response,
default_user_agent,
_url_check,
_cleaned_url,
_timestamp_manager,
_unix_timestamp_to_wayback_timestamp,
_latest_version,
)
class Url:
    """Wrapper around the Wayback Machine APIs for a single URL.

    Instance state
    --------------
    url : str
        The URL as supplied by the caller (checked with ``_url_check``).
    user_agent : str
        Value sent in the ``User-Agent`` header of every request.
    timestamp : datetime or None
        Timestamp of the archive currently held by the instance, if any.
    latest_version : object or None
        Result of ``_latest_version("waybackpy", ...)``; filled in by save().
    cached_save : bool
        Set to True by save() when the returned archive is older than
        three minutes, i.e. the save did not create a fresh snapshot.
    """

    def __init__(self, url, user_agent=default_user_agent):
        self.url = url
        self.user_agent = str(user_agent)
        _url_check(self.url)
        self._archive_url = None  # set by archive_url, near() and save()
        self.timestamp = None
        self._JSON = None  # availability API response stored by near()
        self.latest_version = None
        self.cached_save = False

    def __repr__(self):
        return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
            url=self.url, user_agent=self.user_agent
        )

    def __str__(self):
        """Return the archive URL as a string, looking it up if not yet known."""
        if not self._archive_url:
            self._archive_url = self.archive_url
        return "{archive_url}".format(archive_url=self._archive_url)

    def __len__(self):
        """Return the number of days between now (UTC) and the archive timestamp.

        Can be applied to the return value of near() and its wrappers
        (e.g. oldest()). When applied to a fresh instance, the timestamp is
        obtained through the ``_timestamp`` property, which relies on the
        ``JSON`` property.

        If the timestamp equals ``datetime.max``, ``timedelta.max.days``
        (999999999) is returned.
        NOTE(review): datetime.max is presumably the "no archive" sentinel
        produced by _timestamp_manager -- confirm in utils.
        """
        if not self.timestamp:
            self.timestamp = self._timestamp

        if self.timestamp == datetime.max:
            return timedelta.max.days

        return (datetime.utcnow() - self.timestamp).days

    @property
    def JSON(self):
        """Return the availability API response for this URL.

        If near() (or one of its wrappers such as oldest() and newest()) has
        been used, its response is stored in ``self._JSON`` and returned here.
        Otherwise the response of
        'https://archive.org/wayback/available?url=YOUR-URL' is requested and
        returned; that freshly fetched response is not stored on the
        instance.
        """
        if self._JSON:
            return self._JSON

        endpoint = "https://archive.org/wayback/available"
        headers = {"User-Agent": self.user_agent}
        payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
        response = _get_response(endpoint, params=payload, headers=headers)
        return response.json()

    @property
    def archive_url(self):
        """Return the archive URL reported by the availability API.

        The value is remembered in ``self._archive_url``. Returns None when
        the API response contains no archived snapshots. ``http://`` archive
        links are rewritten to ``https://``.
        """
        if self._archive_url:
            return self._archive_url

        data = self.JSON

        if not data["archived_snapshots"]:
            archive_url = None
        else:
            archive_url = data["archived_snapshots"]["closest"]["url"]
            archive_url = archive_url.replace(
                "http://web.archive.org/web/", "https://web.archive.org/web/", 1
            )

        self._archive_url = archive_url
        return archive_url

    @property
    def _timestamp(self):
        """Return the timestamp computed by ``_timestamp_manager``.

        The helper receives the current ``self.timestamp`` and the ``JSON``
        response. This property does not assign ``self.timestamp`` itself;
        callers such as __len__ do that.
        """
        return _timestamp_manager(self.timestamp, self.JSON)

    def save(self):
        """Save/archive the URL on the Wayback Machine and return self.

        A GET request is sent to https://web.archive.org/save/<url> through
        ``_get_response`` and the archive URL is extracted from the response
        headers by ``_archive_url_parser``.

        Side effects: sets ``self._archive_url``, ``self.timestamp`` and, if
        still unset, ``self.latest_version``. ``self.cached_save`` becomes
        True when the archive timestamp is more than three minutes old.

        Raises
        ------
        WaybackError
            If no 14 digit timestamp can be found in the archive URL.
        """
        request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
        headers = {"User-Agent": self.user_agent}

        response = _get_response(
            request_url,
            params=None,
            headers=headers,
            backoff_factor=2,
            no_raise_on_redirects=True,
        )

        if not self.latest_version:
            self.latest_version = _latest_version("waybackpy", headers=headers)

        # NOTE(review): a falsy response is taken to mean the save request
        # was redirected -- confirm against _get_response.
        if response:
            res_headers = response.headers
        else:
            res_headers = "save redirected"

        self._archive_url = "https://" + _archive_url_parser(
            res_headers,
            self.url,
            latest_version=self.latest_version,
            instance=self,
        )

        m = re.search(
            r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
        )
        if m is None:
            # Previously this surfaced as an AttributeError on None.
            raise WaybackError(
                "Can not find the archive timestamp in '{archive_url}'.".format(
                    archive_url=self._archive_url
                )
            )

        ts = datetime.strptime(m.group(1), "%Y%m%d%H%M%S")
        total_seconds = int((datetime.utcnow() - ts).total_seconds())

        # An archive older than 3 minutes was not created by this request.
        if total_seconds > 60 * 3:
            self.cached_save = True

        self.timestamp = ts
        return self

    def get(self, url="", user_agent="", encoding=""):
        """Return the source code of ``url`` as a decoded string.

        Parameters
        ----------
        url : str
            URL to fetch. When empty, the last archive URL held by the
            instance is used, or the cleaned input URL if there is none.
        user_agent : str
            Overrides the instance's User-Agent when given.
        encoding : str
            Encoding used to decode the body. When empty, the encoding
            reported by the response is used, falling back to UTF-8 if the
            response reports none.
        """
        if not url:
            url = self._archive_url if self._archive_url else _cleaned_url(self.url)

        if not user_agent:
            user_agent = self.user_agent

        headers = {"User-Agent": str(user_agent)}
        response = _get_response(str(url), params=None, headers=headers)

        if not encoding:
            # The response may lack the attribute or report None; both
            # cases fall back to UTF-8 instead of failing on None.replace().
            encoding = getattr(response, "encoding", None) or "UTF-8"

        return response.content.decode(encoding.replace("text/html", "UTF-8", 1))

    def near(
        self,
        year=None,
        month=None,
        day=None,
        hour=None,
        minute=None,
        unix_timestamp=None,
    ):
        """Find the archive closest to the given time and return self.

        Parameters
        ----------
        year : int
            Archive close to year
        month : int
            Archive close to month
        day : int
            Archive close to day
        hour : int
            Archive close to hour (0 is honoured as midnight)
        minute : int
            Archive close to minute (0 is honoured)
        unix_timestamp : str, float or int
            Archive close to this unix_timestamp; takes precedence over the
            calendar arguments when supplied (0 is honoured).

        Any calendar argument that is not supplied defaults to the current
        UTC time. The values are converted to a Wayback Machine timestamp
        with ``_wayback_timestamp`` (or
        ``_unix_timestamp_to_wayback_timestamp``) and sent to the
        availability API (https://archive.org/wayback/available).

        On success ``self._archive_url``, ``self.timestamp`` and
        ``self._JSON`` are updated from the API response.

        Raises
        ------
        WaybackError
            If the API response contains no archived snapshots.
        """
        # "or == 0" keeps None and "" meaning "not supplied" while letting a
        # zero epoch value through.
        if unix_timestamp or unix_timestamp == 0:
            timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
        else:
            now = datetime.utcnow().timetuple()
            timestamp = _wayback_timestamp(
                # 0 is not a valid year, month or day, so falsy means unset.
                year=year if year else now.tm_year,
                month=month if month else now.tm_mon,
                day=day if day else now.tm_mday,
                # 0 is a valid hour and minute; only None means unset.
                hour=now.tm_hour if hour is None else hour,
                minute=now.tm_min if minute is None else minute,
            )

        endpoint = "https://archive.org/wayback/available"
        headers = {"User-Agent": self.user_agent}
        payload = {
            "url": "{url}".format(url=_cleaned_url(self.url)),
            "timestamp": timestamp,
        }
        response = _get_response(endpoint, params=payload, headers=headers)
        data = response.json()

        if not data["archived_snapshots"]:
            raise WaybackError(
                "Can not find archive for '{url}' try later or use wayback.Url(url, user_agent).save() "
                "to create a new archive.\nAPI response:\n{text}".format(
                    url=_cleaned_url(self.url), text=response.text
                )
            )

        closest = data["archived_snapshots"]["closest"]
        archive_url = closest["url"].replace(
            "http://web.archive.org/web/", "https://web.archive.org/web/", 1
        )

        self._archive_url = archive_url
        self.timestamp = datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S")
        self._JSON = data
        return self

    def oldest(self, year=1994):
        """Return the earliest/oldest Wayback Machine archive for the webpage.

        The Wayback Machine started archiving around 1997, so no archive can
        be older than that; 1994 is used as the default year to look for the
        oldest archive. The year is simply passed to near().
        """
        return self.near(year=year)

    def newest(self):
        """Return the newest Wayback Machine archive available for this URL.

        Delegates to near() without arguments, which defaults to the current
        UTC time. Due to Wayback Machine database lag, this may not always
        be the most recent archive.
        """
        return self.near()

    def total_archives(self, start_timestamp=None, end_timestamp=None):
        """Return the total number of archives for the URL as an integer.

        Parameters
        ----------
        start_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.
        end_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.

        The count is obtained by iterating over every snapshot yielded by
        ``Cdx.snapshots()`` for the cleaned URL.
        """
        cdx = Cdx(
            _cleaned_url(self.url),
            user_agent=self.user_agent,
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
        )
        return sum(1 for _ in cdx.snapshots())

    def known_urls(
        self,
        subdomain=False,
        host=False,
        start_timestamp=None,
        end_timestamp=None,
        match_type="prefix",
    ):
        """Yield URLs known to the CDX API for the given input.

        Parameters
        ----------
        subdomain : bool
            If True fetch subdomain URLs along with the host URLs
            (forces match_type "domain").
        host : bool
            Only fetch host URLs (forces match_type "host"; wins over
            ``subdomain`` when both are set).
        start_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.
        end_timestamp : str
            1 to 14 digit string of numbers, you are not required to
            pass a full 14 digit timestamp.
        match_type : str
            One of (exact, prefix, host and domain)

        Defaults to the input URL as prefix.

        Based on:
        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
        By Mohammed Diaa (https://github.com/mhmdiaa)
        """
        if subdomain:
            match_type = "domain"
        if host:
            match_type = "host"

        cdx = Cdx(
            _cleaned_url(self.url),
            user_agent=self.user_agent,
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
            match_type=match_type,
            collapses=["urlkey"],
        )

        for snapshot in cdx.snapshots():
            yield snapshot.original