waybackpy/waybackpy/wrapper.py

89 lines
2.9 KiB
Python
Raw Normal View History

2020-05-04 04:35:52 +02:00
# -*- coding: utf-8 -*-
import json
from datetime import datetime

from waybackpy.exceptions import (
    TooManyArchivingRequests,
    ArchivingNotAllowed,
    PageNotSaved,
    ArchiveNotFound,
    UrlNotFound,
    BadGateWay,
    InvalidUrl,
)

try:
    from urllib.request import Request, urlopen
    from urllib.error import HTTPError
except ImportError:
    # Python 2 fallback: the same names live in urllib2.
    from urllib2 import Request, urlopen, HTTPError
2020-05-04 04:35:52 +02:00
# User-Agent header value used by the functions in this module when the
# caller does not pass their own ``UA`` argument.
default_UA = "waybackpy python package"
2020-05-04 05:26:01 +02:00
def clean_url(url):
    """Return ``url`` as a string with outer whitespace removed and spaces replaced.

    The value is converted with ``str()``, leading/trailing whitespace is
    stripped, and every remaining space character becomes an underscore.
    """
    trimmed = str(url).strip()
    # Splitting on a single space and re-joining with "_" swaps each space
    # for an underscore, one for one.
    return "_".join(trimmed.split(" "))
2020-05-04 04:35:52 +02:00
def save(url, UA=default_UA):
    """Ask the Wayback Machine to archive ``url`` and return the archive URL.

    Args:
        url: Address of the page to archive. It is passed through
            ``clean_url`` before being appended to the save endpoint.
        UA: Value sent in the ``User-Agent`` request header.

    Returns:
        ``"https://web.archive.org"`` followed by the value of the
        ``Content-Location`` response header.

    Raises:
        InvalidUrl: ``url`` contains no ``"."``.
        BadGateWay: the save endpoint answered with HTTP 502.
        TooManyArchivingRequests: the save endpoint answered with HTTP 429.
        UrlNotFound: the save endpoint answered with HTTP 404.
        PageNotSaved: any other HTTP error, or a response without a
            ``Content-Location`` header.
        ArchivingNotAllowed: the response headers mention
            ``exclusion.robots.policy``.
    """
    # Validate first so an obviously bad URL never reaches the network layer.
    # str() mirrors clean_url(), which also accepts non-string input.
    if "." not in str(url):
        raise InvalidUrl("'%s' is not a valid url." % url)

    base_save_url = "https://web.archive.org/save/"
    request_url = base_save_url + clean_url(url)
    hdr = {'User-Agent': '%s' % UA}
    req = Request(request_url, headers=hdr)

    try:
        response = urlopen(req)  #nosec
    except HTTPError as e:
        # Map the HTTP status onto the package's own exception types.
        if e.code == 502:
            raise BadGateWay(e)
        elif e.code == 429:
            raise TooManyArchivingRequests(e)
        elif e.code == 404:
            raise UrlNotFound(e)
        else:
            raise PageNotSaved(e)

    # Only the headers are needed; release the connection straight away.
    try:
        header = response.headers
    finally:
        response.close()

    if "exclusion.robots.policy" in str(header):
        raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." % (url))

    # .get() works on both the Python 3 and Python 2 header objects and lets
    # a missing header surface as a package exception instead of a
    # TypeError/KeyError.
    archive_id = header.get('Content-Location')
    if not archive_id:
        raise PageNotSaved("No Content-Location header returned for '%s'." % url)
    return "https://web.archive.org" + archive_id
def get(url, encoding=None, UA=default_UA):
    """Fetch ``url`` and return the response body decoded to text.

    Args:
        url: Address to fetch; passed through ``clean_url`` first.
        encoding: Codec name used to decode the body. When ``None`` the
            charset declared in the ``Content-Type`` response header is
            used, falling back to ``"UTF-8"`` when none is declared.
        UA: Value sent in the ``User-Agent`` request header.

    Returns:
        The decoded response body.
    """
    hdr = {'User-Agent': '%s' % UA}
    req = Request(clean_url(url), headers=hdr)
    resp = urlopen(req)
    try:
        if encoding is None:
            encoding = "UTF-8"
            content_type = resp.headers.get('content-type') or ''
            # Only trust the header when it actually carries a charset
            # parameter; a bare "text/html" must not be used as a codec name.
            marker = 'charset='
            pos = content_type.lower().rfind(marker)
            if pos != -1:
                declared = content_type[pos + len(marker):]
                # Drop any following parameters and optional quoting.
                declared = declared.split(';')[0].strip().strip('"\'')
                if declared:
                    encoding = declared
        return resp.read().decode(encoding)
    finally:
        resp.close()
def wayback_timestamp(year, month, day, hour, minute):
    """Concatenate date/time parts into a single timestamp string.

    ``year`` is converted with ``str()`` as-is; ``month``, ``day``, ``hour``
    and ``minute`` are each converted with ``str()`` and left-padded with
    zeros to at least two characters. The pieces are joined in that order.
    """
    padded_fields = [str(field).zfill(2) for field in (month, day, hour, minute)]
    return str(year) + "".join(padded_fields)
2020-05-04 04:35:52 +02:00
def near(
    url,
    year=None,
    month=None,
    day=None,
    hour=None,
    minute=None,
    UA=default_UA,
):
    """Return the URL of the archived snapshot closest to the given time.

    Args:
        url: Address whose archive is looked up; passed through
            ``clean_url`` before being placed in the query string.
        year, month, day, hour, minute: Parts of the target time. Any part
            left as ``None`` is filled from the current UTC time at the
            moment of the call.
        UA: Value sent in the ``User-Agent`` request header.

    Returns:
        The ``url`` field of the ``closest`` entry under
        ``archived_snapshots`` in the JSON response.

    Raises:
        ArchiveNotFound: the response contains no snapshot for ``url``.
    """
    # Resolve "now" per call. Putting datetime.utcnow() in the signature
    # would evaluate it once at import time and freeze the defaults.
    now = datetime.utcnow()
    if year is None:
        year = now.strftime('%Y')
    if month is None:
        month = now.strftime('%m')
    if day is None:
        day = now.strftime('%d')
    if hour is None:
        hour = now.strftime('%H')
    if minute is None:
        minute = now.strftime('%M')

    timestamp = wayback_timestamp(year, month, day, hour, minute)
    request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (clean_url(url), str(timestamp))
    hdr = {'User-Agent': '%s' % UA}
    req = Request(request_url, headers=hdr)
    response = urlopen(req)  #nosec
    try:
        data = json.loads(response.read().decode("UTF-8"))
    finally:
        response.close()

    # An empty "archived_snapshots" object means nothing is archived; a
    # missing key or missing "closest" entry is treated the same way.
    snapshots = data.get("archived_snapshots")
    closest = snapshots.get("closest") if snapshots else None
    if not closest:
        raise ArchiveNotFound("'%s' is not yet archived." % url)
    return closest["url"]
2020-05-04 04:51:42 +02:00
def oldest(url, UA=default_UA, year=1994):
    """Look up the snapshot of ``url`` closest to ``year`` via ``near``.

    Only ``year`` and ``UA`` are forwarded; the remaining date fields are
    left at ``near``'s defaults. Returns whatever ``near`` returns and
    propagates whatever it raises.
    """
    lookup_args = {"year": year, "UA": UA}
    return near(url, **lookup_args)
2020-05-04 04:35:52 +02:00
def newest(url, UA=default_UA):
    """Look up a snapshot of ``url`` via ``near`` using its default date fields.

    Only ``UA`` is forwarded. Returns whatever ``near`` returns and
    propagates whatever it raises.
    """
    snapshot_url = near(url, UA=UA)
    return snapshot_url