* fix: pylint errors were pointed out by codacy * fix: line length * fix: help text * fix: revert https://stackoverflow.com/a/64477857 makes cli unusable * fix: cli error and refactor codes
141 lines
4.5 KiB
Python
141 lines
4.5 KiB
Python
from datetime import datetime, timedelta
|
|
from typing import Generator, Optional
|
|
|
|
from .availability_api import WaybackMachineAvailabilityAPI
|
|
from .cdx_api import WaybackMachineCDXServerAPI
|
|
from .save_api import WaybackMachineSaveAPI
|
|
from .utils import DEFAULT_USER_AGENT
|
|
|
|
|
|
class Url:
    """
    Backwards-compatible wrapper around the three Wayback Machine API classes.

    The Url class is not recommended to be used anymore, instead use:

    - WaybackMachineSaveAPI
    - WaybackMachineAvailabilityAPI
    - WaybackMachineCDXServerAPI

    The reason it is still in the code is backwards compatibility with 2.x.x
    versions.

    If you were using the Url class before the update to version 3.x.x, your
    code should still work fine and there is no hurry to update the interface,
    but it is recommended that you do not use the Url class for new code, as
    it will be removed after 2025. The first 3.x.x version was released in
    January 2022, and three years are more than enough to update code that
    uses the older interface.
    """

    def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
        """
        Store the target URL and user agent and create the availability API.

        :param url: The URL to look up or archive.
        :param user_agent: User agent forwarded to every API wrapper; it is
            coerced to ``str``.
        """
        self.url = url
        self.user_agent = str(user_agent)
        # Both are filled in by save(), near(), oldest() or newest().
        self.archive_url: Optional[str] = None
        self.timestamp: Optional[datetime] = None
        self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
            self.url, user_agent=self.user_agent
        )

    def __str__(self) -> str:
        """
        Return the archive URL as a string.

        If no archive URL has been set yet, ``newest()`` is called first.
        """
        if not self.archive_url:
            self.newest()
        return str(self.archive_url)

    def __len__(self) -> int:
        """
        Return the number of whole days between ``self.timestamp`` and now.

        If ``self.timestamp`` is not a datetime yet, ``oldest()`` is called
        first to populate it.

        :raises TypeError: if the timestamp is still not a datetime after
            calling ``oldest()``.
        """
        if not isinstance(self.timestamp, datetime):
            self.oldest()

        if not isinstance(self.timestamp, datetime):
            raise TypeError("timestamp must be a datetime")

        # NOTE(review): datetime.max looks like a "no archive" sentinel set by
        # the availability API -- confirm there. It maps to the largest day
        # count a timedelta can represent (999999999).
        if self.timestamp == datetime.max:
            return timedelta.max.days

        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; it
        # is kept because self.timestamp appears to be a naive datetime --
        # confirm before switching to a timezone-aware clock.
        age_in_days = (datetime.utcnow() - self.timestamp).days

        # A timestamp slightly ahead of the local clock yields -1 days, and
        # len() raises ValueError for negative results, so clamp at zero.
        return max(age_in_days, 0)

    def save(self) -> "Url":
        """
        Create a WaybackMachineSaveAPI for the URL and copy its results.

        Sets ``archive_url``, ``timestamp`` and ``headers`` on this object
        from the save API instance, which is kept as
        ``self.wayback_machine_save_api``.

        :return: this object, so calls can be chained.
        """
        self.wayback_machine_save_api = WaybackMachineSaveAPI(
            self.url, user_agent=self.user_agent
        )
        # The attributes are read in this fixed order on purpose: the later
        # ones may depend on the first access (assumption -- see the save API).
        self.archive_url = self.wayback_machine_save_api.archive_url
        self.timestamp = self.wayback_machine_save_api.timestamp()
        self.headers = self.wayback_machine_save_api.headers
        return self

    def near(
        self,
        year: Optional[int] = None,
        month: Optional[int] = None,
        day: Optional[int] = None,
        hour: Optional[int] = None,
        minute: Optional[int] = None,
        unix_timestamp: Optional[int] = None,
    ) -> "Url":
        """
        Query the availability API with the given date parts.

        All arguments are forwarded unchanged to
        ``WaybackMachineAvailabilityAPI.near``; afterwards ``archive_url``,
        ``JSON`` and ``timestamp`` are refreshed from the API object.

        :return: this object, so calls can be chained.
        """
        self.wayback_machine_availability_api.near(
            year=year,
            month=month,
            day=day,
            hour=hour,
            minute=minute,
            unix_timestamp=unix_timestamp,
        )
        self.set_availability_api_attrs()
        return self

    def oldest(self) -> "Url":
        """
        Call ``oldest()`` on the availability API and refresh the attributes.

        :return: this object, so calls can be chained.
        """
        self.wayback_machine_availability_api.oldest()
        self.set_availability_api_attrs()
        return self

    def newest(self) -> "Url":
        """
        Call ``newest()`` on the availability API and refresh the attributes.

        :return: this object, so calls can be chained.
        """
        self.wayback_machine_availability_api.newest()
        self.set_availability_api_attrs()
        return self

    def set_availability_api_attrs(self) -> None:
        """
        Copy ``archive_url``, ``JSON`` and ``timestamp()`` from the
        availability API object onto this object.
        """
        self.archive_url = self.wayback_machine_availability_api.archive_url
        self.JSON = self.wayback_machine_availability_api.JSON
        self.timestamp = self.wayback_machine_availability_api.timestamp()

    def total_archives(
        self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
    ) -> int:
        """
        Count the snapshots the CDX server API yields for this URL.

        :param start_timestamp: forwarded to WaybackMachineCDXServerAPI.
        :param end_timestamp: forwarded to WaybackMachineCDXServerAPI.
        :return: the number of items produced by ``cdx.snapshots()``.
        """
        cdx = WaybackMachineCDXServerAPI(
            self.url,
            user_agent=self.user_agent,
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
        )
        # Count lazily so the snapshots are never collected into a list.
        return sum(1 for _ in cdx.snapshots())

    def known_urls(
        self,
        subdomain: bool = False,
        host: bool = False,
        start_timestamp: Optional[str] = None,
        end_timestamp: Optional[str] = None,
        match_type: str = "prefix",
    ) -> Generator[str, None, None]:
        """
        Yield the ``original`` URL of every snapshot the CDX API returns.

        :param subdomain: if true, ``match_type`` is overridden with
            ``"domain"``.
        :param host: if true, ``match_type`` is overridden with ``"host"``;
            this is applied last, so it wins when both flags are set.
        :param start_timestamp: forwarded to WaybackMachineCDXServerAPI.
        :param end_timestamp: forwarded to WaybackMachineCDXServerAPI.
        :param match_type: match type used when neither flag is set.
        """
        if subdomain:
            match_type = "domain"
        if host:
            match_type = "host"

        cdx = WaybackMachineCDXServerAPI(
            self.url,
            user_agent=self.user_agent,
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
            match_type=match_type,
            # Presumably collapses results that share a urlkey so each known
            # URL is reported once -- confirm against the CDX API wrapper.
            collapses=["urlkey"],
        )

        for snapshot in cdx.snapshots():
            yield snapshot.original
|