add: type annotation to waybackpy modules

This commit is contained in:
eggplants
2022-02-04 04:25:01 +09:00
parent c274c474b2
commit 38088fa0d8
9 changed files with 275 additions and 205 deletions

View File

@@ -1,6 +1,7 @@
import json
import time
from datetime import datetime
from typing import Any, Dict, Optional
import requests
@@ -10,37 +11,41 @@ from .exceptions import (
)
from .utils import DEFAULT_USER_AGENT
ResponseJSON = Dict[str, Any]
class WaybackMachineAvailabilityAPI:
class WaybackMachineAvailabilityAPI(object):
"""
Class that interfaces the availability API of the Wayback Machine.
"""
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent
self.headers = {"User-Agent": self.user_agent}
self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
self.payload = {"url": "{url}".format(url=self.url)}
self.endpoint = "https://archive.org/wayback/available"
self.max_tries = max_tries
self.tries = 0
self.last_api_call_unix_time = int(time.time())
self.api_call_time_gap = 5
self.JSON = None
self.JSON: Optional[ResponseJSON] = None
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp: int) -> str:
"""
Converts Unix time to wayback Machine timestamp.
"""
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def __repr__(self):
def __repr__(self) -> str:
"""
Same as string representation, just return the archive URL as a string.
"""
return str(self)
def __str__(self):
def __str__(self) -> str:
"""
String representation of the class. If atleast one API call was successfully
made then return the archive URL as a string. Else returns None.
@@ -54,7 +59,7 @@ class WaybackMachineAvailabilityAPI:
return self.archive_url
def json(self):
def json(self) -> Optional[ResponseJSON]:
"""
Makes the API call to the availability API can set the JSON response
to the JSON attribute of the instance and also returns the JSON attribute.
@@ -79,7 +84,7 @@ class WaybackMachineAvailabilityAPI:
return self.JSON
def timestamp(self):
def timestamp(self) -> datetime:
"""
Converts the timestamp form the JSON response to datetime object.
If JSON attribute of the instance is None it implies that the either
@@ -91,19 +96,29 @@ class WaybackMachineAvailabilityAPI:
If you get an URL as a response form the availability API it is guaranteed
that you can get the datetime object from the timestamp.
"""
if not self.JSON or not self.JSON["archived_snapshots"]:
if self.JSON is None or "archived_snapshots" not in self.JSON:
return datetime.max
elif (
self.JSON is not None
and "archived_snapshots" in self.JSON
and self.JSON["archived_snapshots"] is not None
and "closest" in self.JSON["archived_snapshots"]
and self.JSON["archived_snapshots"]["closest"] is not None
and "timestamp" in self.JSON["archived_snapshots"]["closest"]
):
return datetime.strptime(
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
else:
raise ValueError("Could not get timestamp from result")
@property
def archive_url(self):
def archive_url(self) -> str:
"""
Reads the the JSON response data and tries to get the timestamp and returns
the timestamp if found else returns None.
"""
archive_url = ""
data = self.JSON
# If the user didn't used oldest, newest or near but tries to access the
@@ -138,7 +153,7 @@ class WaybackMachineAvailabilityAPI:
)
return archive_url
def wayback_timestamp(self, **kwargs):
def wayback_timestamp(self, **kwargs: int) -> str:
"""
Prepends zero before the year, month, day, hour and minute so that they
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
@@ -148,7 +163,7 @@ class WaybackMachineAvailabilityAPI:
for key in ["year", "month", "day", "hour", "minute"]
)
def oldest(self):
def oldest(self) -> "WaybackMachineAvailabilityAPI":
"""
Passing the year 1994 should return the oldest archive because
wayback machine was started in May, 1996 and there should be no archive
@@ -156,7 +171,7 @@ class WaybackMachineAvailabilityAPI:
"""
return self.near(year=1994)
def newest(self):
def newest(self) -> "WaybackMachineAvailabilityAPI":
"""
Passing the current UNIX time should be sufficient to get the newest
archive considering the API request-response time delay and also the
@@ -166,13 +181,13 @@ class WaybackMachineAvailabilityAPI:
def near(
self,
year=None,
month=None,
day=None,
hour=None,
minute=None,
unix_timestamp=None,
):
year: Optional[int] = None,
month: Optional[int] = None,
day: Optional[int] = None,
hour: Optional[int] = None,
minute: Optional[int] = None,
unix_timestamp: Optional[int] = None,
) -> "WaybackMachineAvailabilityAPI":
"""
The main method for this Class, oldest and newest methods are dependent on this
method.
@@ -188,11 +203,11 @@ class WaybackMachineAvailabilityAPI:
else:
now = datetime.utcnow().timetuple()
timestamp = self.wayback_timestamp(
year=year if year else now.tm_year,
month=month if month else now.tm_mon,
day=day if day else now.tm_mday,
hour=hour if hour else now.tm_hour,
minute=minute if minute else now.tm_min,
year=now.tm_year if year is None else year,
month=now.tm_mon if month is None else month,
day=now.tm_mday if day is None else day,
hour=now.tm_hour if hour is None else hour,
minute=now.tm_min if minute is None else minute,
)
self.payload["timestamp"] = timestamp

View File

@@ -1,3 +1,5 @@
from typing import Dict, Generator, List, Optional, cast
from .cdx_snapshot import CDXSnapshot
from .cdx_utils import (
check_collapses,
@@ -11,43 +13,48 @@ from .exceptions import WaybackError
from .utils import DEFAULT_USER_AGENT
class WaybackMachineCDXServerAPI:
class WaybackMachineCDXServerAPI(object):
"""
Class that interfaces the CDX server API of the Wayback Machine.
"""
# start_timestamp: from, can not use from as it's a keyword
# end_timestamp: to, not using to as can not use from
def __init__(
self,
url,
user_agent=DEFAULT_USER_AGENT,
start_timestamp=None, # from, can not use from as it's a keyword
end_timestamp=None, # to, not using to as can not use from
filters=[],
match_type=None,
gzip=None,
collapses=[],
limit=None,
max_tries=3,
):
url: str,
user_agent: str = DEFAULT_USER_AGENT,
start_timestamp: Optional[str] = None,
end_timestamp: Optional[str] = None,
filters: List[str] = [],
match_type: Optional[str] = None,
gzip: Optional[str] = None,
collapses: List[str] = [],
limit: Optional[str] = None,
max_tries: int = 3,
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent
self.start_timestamp = str(start_timestamp) if start_timestamp else None
self.end_timestamp = str(end_timestamp) if end_timestamp else None
self.start_timestamp = (
str(start_timestamp) if start_timestamp is not None else None
)
self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
self.filters = filters
check_filters(self.filters)
self.match_type = str(match_type).strip() if match_type else None
self.match_type = str(match_type).strip() if match_type is not None else None
check_match_type(self.match_type, self.url)
self.gzip = gzip if gzip else True
self.gzip = gzip
self.collapses = collapses
check_collapses(self.collapses)
self.limit = limit if limit else 5000
self.limit = limit if limit is not None else 5000
self.max_tries = max_tries
self.last_api_request_url = None
self.last_api_request_url: Optional[str] = None
self.use_page = False
self.endpoint = "https://web.archive.org/cdx/search/cdx"
def cdx_api_manager(self, payload, headers, use_page=False):
def cdx_api_manager(
self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
) -> Generator[str, None, None]:
total_pages = get_total_pages(self.url, self.user_agent)
# If we only have two or less pages of archives then we care for more accuracy
# pagination API is lagged sometimes
@@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI:
url = full_url(self.endpoint, params=payload)
res = get_response(url, headers=headers)
if isinstance(res, Exception):
raise res
self.last_api_request_url = url
text = res.text
@@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI:
yield text
else:
payload["showResumeKey"] = "true"
payload["limit"] = str(self.limit)
resumeKey = None
more = True
while more:
if resumeKey:
payload["resumeKey"] = resumeKey
url = full_url(self.endpoint, params=payload)
res = get_response(url, headers=headers)
if isinstance(res, Exception):
raise res
self.last_api_request_url = url
@@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI:
yield text
def add_payload(self, payload):
def add_payload(self, payload: Dict[str, str]) -> None:
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip is not True:
if self.gzip is None:
payload["gzip"] = "false"
if self.match_type:
@@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI:
# Don't need to return anything as it's dictionary.
payload["url"] = self.url
def snapshots(self):
payload = {}
def snapshots(self) -> Generator[CDXSnapshot, None, None]:
payload: Dict[str, str] = {}
headers = {"User-Agent": self.user_agent}
self.add_payload(payload)
@@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI:
if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
continue
properties = {
properties: Dict[str, Optional[str]] = {
"urlkey": None,
"timestamp": None,
"original": None,
@@ -190,4 +198,4 @@ class WaybackMachineCDXServerAPI:
properties["length"],
) = prop_values
yield CDXSnapshot(properties)
yield CDXSnapshot(cast(Dict[str, str], properties))

View File

@@ -1,7 +1,8 @@
from datetime import datetime
from typing import Dict
class CDXSnapshot:
class CDXSnapshot(object):
"""
Class for the CDX snapshot lines returned by the CDX API,
Each valid line of the CDX API is casted to an CDXSnapshot object
@@ -10,7 +11,7 @@ class CDXSnapshot:
of the CDXSnapshot.
"""
def __init__(self, properties):
def __init__(self, properties: Dict[str, str]) -> None:
self.urlkey = properties["urlkey"]
self.timestamp = properties["timestamp"]
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
@@ -23,7 +24,7 @@ class CDXSnapshot:
"https://web.archive.org/web/" + self.timestamp + "/" + self.original
)
def __str__(self):
def __str__(self) -> str:
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
urlkey=self.urlkey,
timestamp=self.timestamp,

View File

@@ -1,23 +1,30 @@
import re
from typing import Any, Dict, List, Optional, Union
from urllib.parse import quote
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# from urllib3.util.retry import Retry
from requests.packages.urllib3.util.retry import Retry
from .exceptions import WaybackError
from .utils import DEFAULT_USER_AGENT
def get_total_pages(url, user_agent=DEFAULT_USER_AGENT):
def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
endpoint = "https://web.archive.org/cdx/search/cdx?"
payload = {"showNumPages": "true", "url": str(url)}
headers = {"User-Agent": user_agent}
request_url = full_url(endpoint, params=payload)
response = get_response(request_url, headers=headers)
if isinstance(response, requests.Response):
return int(response.text.strip())
else:
raise response
def full_url(endpoint, params):
def full_url(endpoint: str, params: Dict[str, Any]) -> str:
if not params:
return endpoint
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
@@ -26,27 +33,25 @@ def full_url(endpoint, params):
key = "collapse" if key.startswith("collapse") else key
amp = "" if full_url.endswith("?") else "&"
full_url = (
full_url
+ amp
+ "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
full_url + amp + "{key}={val}".format(key=key, val=quote(str(val), safe=""))
)
return full_url
def get_response(
url,
headers=None,
retries=5,
backoff_factor=0.5,
no_raise_on_redirects=False,
):
url: str,
headers: Optional[Dict[str, str]] = None,
retries: int = 5,
backoff_factor: float = 0.5,
# no_raise_on_redirects=False,
) -> Union[requests.Response, Exception]:
session = requests.Session()
retries = Retry(
retries_ = Retry(
total=retries,
backoff_factor=backoff_factor,
status_forcelist=[500, 502, 503, 504],
)
session.mount("https://", HTTPAdapter(max_retries=retries))
session.mount("https://", HTTPAdapter(max_retries=retries_))
try:
response = session.get(url, headers=headers)
@@ -62,23 +67,18 @@ def get_response(
raise exc
def check_filters(filters):
def check_filters(filters: List[str]) -> None:
if not isinstance(filters, list):
raise WaybackError("filters must be a list.")
# [!]field:regex
for _filter in filters:
try:
match = re.search(
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
_filter,
)
match.group(1)
match.group(2)
except Exception:
if match is None or len(match.groups()) != 2:
exc_message = (
"Filter '{_filter}' is not following the cdx filter syntax.".format(
@@ -88,43 +88,38 @@ def check_filters(filters):
raise WaybackError(exc_message)
def check_collapses(collapses):
def check_collapses(collapses: List[str]) -> bool:
if not isinstance(collapses, list):
raise WaybackError("collapses must be a list.")
if len(collapses) == 0:
return
elif len(collapses) == 0:
return True
for collapse in collapses:
try:
match = re.search(
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
collapse,
)
match.group(1)
if 2 == len(match.groups()):
match.group(2)
except Exception:
if match is None or len(match.groups()) != 2:
exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
collapse=collapse
)
raise WaybackError(exc_message)
else:
return True
def check_match_type(match_type, url):
def check_match_type(match_type: Optional[str], url: str) -> bool:
legal_match_type = ["exact", "prefix", "host", "domain"]
if not match_type:
return
if "*" in url:
return True
elif "*" in url:
raise WaybackError(
"Can not use wildcard in the URL along with the match_type arguments."
)
legal_match_type = ["exact", "prefix", "host", "domain"]
if match_type not in legal_match_type:
elif match_type not in legal_match_type:
exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
match_type=match_type
)
raise WaybackError(exc_message)
else:
return True

View File

@@ -3,6 +3,7 @@ import os
import random
import re
import string
from typing import Generator, List, Optional
import click
import requests
@@ -163,34 +164,34 @@ from .wrapper import Url
+ "will be printed.",
)
def main(
url,
user_agent,
version,
license,
newest,
oldest,
json,
near,
year,
month,
day,
hour,
minute,
save,
headers,
known_urls,
subdomain,
file,
cdx,
start_timestamp,
end_timestamp,
filter,
match_type,
gzip,
collapse,
limit,
cdx_print,
):
url: Optional[str],
user_agent: str,
version: bool,
license: bool,
newest: bool,
oldest: bool,
json: bool,
near: bool,
year: Optional[int],
month: Optional[int],
day: Optional[int],
hour: Optional[int],
minute: Optional[int],
save: bool,
headers: bool,
known_urls: bool,
subdomain: bool,
file: bool,
cdx: bool,
start_timestamp: Optional[str],
end_timestamp: Optional[str],
filter: List[str],
match_type: Optional[str],
gzip: Optional[str],
collapse: List[str],
limit: Optional[str],
cdx_print: List[str],
) -> None:
"""\b
_ _
| | | |
@@ -244,7 +245,9 @@ def main(
)
return
def echo_availability_api(availability_api_instance):
def echo_availability_api(
availability_api_instance: WaybackMachineAvailabilityAPI,
) -> None:
click.echo("Archive URL:")
if not availability_api_instance.archive_url:
archive_url = (
@@ -295,13 +298,14 @@ def main(
click.echo(save_api.headers)
return
def save_urls_on_file(url_gen):
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
domain = None
sys_random = random.SystemRandom()
uid = "".join(
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
)
url_count = 0
file_name = None
for url in url_gen:
url_count += 1
@@ -310,7 +314,7 @@ def main(
domain = "domain-unknown"
if match:
if match is not None:
domain = match.group(1)
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
@@ -323,7 +327,7 @@ def main(
click.echo(url)
if url_count > 0:
if url_count > 0 or file_name is not None:
click.echo(
"\n\n'{file_name}' saved in current working directory".format(
file_name=file_name

View File

@@ -14,6 +14,8 @@ class WaybackError(Exception):
All other exceptions are inherited from this class.
"""
pass
class RedirectSaveError(WaybackError):
"""
@@ -21,32 +23,44 @@ class RedirectSaveError(WaybackError):
redirect URL is archived but not the original URL.
"""
pass
class URLError(Exception):
"""
Raised when malformed URLs are passed as arguments.
"""
pass
class MaximumRetriesExceeded(WaybackError):
"""
MaximumRetriesExceeded
"""
pass
class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
"""
MaximumSaveRetriesExceeded
"""
pass
class ArchiveNotInAvailabilityAPIResponse(WaybackError):
"""
Could not parse the archive in the JSON response of the availability API.
"""
pass
class InvalidJSONInAvailabilityAPIResponse(WaybackError):
"""
availability api returned invalid JSON
"""
pass

View File

@@ -1,38 +1,42 @@
import re
import time
from datetime import datetime
from typing import Dict, Optional
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# from urllib3.util.retry import Retry
from requests.packages.urllib3.util.retry import Retry
from .exceptions import MaximumSaveRetriesExceeded
from .utils import DEFAULT_USER_AGENT
class WaybackMachineSaveAPI:
class WaybackMachineSaveAPI(object):
"""
WaybackMachineSaveAPI class provides an interface for saving URLs on the
Wayback Machine.
"""
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.request_url = "https://web.archive.org/save/" + self.url
self.user_agent = user_agent
self.request_headers = {"User-Agent": self.user_agent}
self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
if max_tries < 1:
raise ValueError("max_tries should be positive")
self.max_tries = max_tries
self.total_save_retries = 5
self.backoff_factor = 0.5
self.status_forcelist = [500, 502, 503, 504]
self._archive_url = None
self._archive_url: Optional[str] = None
self.instance_birth_time = datetime.utcnow()
@property
def archive_url(self):
def archive_url(self) -> str:
"""
Returns the archive URL is already cached by _archive_url
else invoke the save method to save the archive which returns the
@@ -44,7 +48,7 @@ class WaybackMachineSaveAPI:
else:
return self.save()
def get_save_request_headers(self):
def get_save_request_headers(self) -> None:
"""
Creates a session and tries 'retries' number of times to
retrieve the archive.
@@ -61,21 +65,21 @@ class WaybackMachineSaveAPI:
the response URL yourself in the browser.
"""
session = requests.Session()
retries = Retry(
retries_ = Retry(
total=self.total_save_retries,
backoff_factor=self.backoff_factor,
status_forcelist=self.status_forcelist,
)
session.mount("https://", HTTPAdapter(max_retries=retries))
session.mount("https://", HTTPAdapter(max_retries=retries_))
self.response = session.get(self.request_url, headers=self.request_headers)
self.headers = (
self.response.headers
) # <class 'requests.structures.CaseInsensitiveDict'>
# requests.response.headers is requests.structures.CaseInsensitiveDict
self.headers = self.response.headers
self.headers_str = str(self.headers)
self.status_code = self.response.status_code
self.response_url = self.response.url
session.close()
def archive_url_parser(self):
def archive_url_parser(self) -> Optional[str]:
"""
Three regexen (like oxen?) are used to search for the
archive URL in the headers and finally look in the response URL
@@ -83,18 +87,18 @@ class WaybackMachineSaveAPI:
"""
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
match = re.search(regex1, str(self.headers))
match = re.search(regex1, self.headers_str)
if match:
return "https://web.archive.org" + match.group(1)
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
match = re.search(regex2, str(self.headers))
if match:
match = re.search(regex2, self.headers_str)
if match is not None and len(match.groups()) == 1:
return "https://" + match.group(1)
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
match = re.search(regex3, str(self.headers))
if match:
match = re.search(regex3, self.headers_str)
if match is not None and len(match.groups()) == 1:
return "https" + match.group(1)
if self.response_url:
@@ -105,7 +109,9 @@ class WaybackMachineSaveAPI:
if match:
return "https://" + match.group(0)
def sleep(self, tries):
return None
def sleep(self, tries: int) -> None:
"""
Ensure that the we wait some time before succesive retries so that we
don't waste the retries before the page is even captured by the Wayback
@@ -120,7 +126,7 @@ class WaybackMachineSaveAPI:
sleep_seconds = 10
time.sleep(sleep_seconds)
def timestamp(self):
def timestamp(self) -> datetime:
"""
Read the timestamp off the archive URL and convert the Wayback Machine
timestamp to datetime object.
@@ -133,9 +139,10 @@ class WaybackMachineSaveAPI:
didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
cached archive if last archive was captured before last 45 minutes.
"""
m = re.search(
r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
)
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
m = re.search(regex, str(self._archive_url))
if m is None or len(m.groups()) != 1:
raise ValueError("Could not find get timestamp")
string_timestamp = m.group(1)
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
@@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
return timestamp
def save(self):
def save(self) -> str:
"""
Calls the SavePageNow API of the Wayback Machine with required parameters
and headers to save the URL.
@@ -169,7 +176,7 @@ class WaybackMachineSaveAPI:
self.get_save_request_headers()
self.saved_archive = self.archive_url_parser()
if self.saved_archive is not None:
if isinstance(self.saved_archive, str):
self._archive_url = self.saved_archive
self.timestamp()
return self.saved_archive
@@ -179,5 +186,5 @@ class WaybackMachineSaveAPI:
raise MaximumSaveRetriesExceeded(
"Tried %s times but failed to save and retrieve the" % str(tries)
+ " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
% (self.url, self.response_url, str(self.headers)),
% (self.url, self.response_url, self.headers_str),
)

View File

@@ -2,22 +2,43 @@ import requests
from . import __version__
DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
DEFAULT_USER_AGENT: str = (
"waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
)
def latest_version_pypi(package_name, user_agent=DEFAULT_USER_AGENT):
def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
request_url = "https://pypi.org/pypi/" + package_name + "/json"
headers = {"User-Agent": user_agent}
response = requests.get(request_url, headers=headers)
data = response.json()
return data["info"]["version"]
if (
data is not None
and "info" in data
and data["info"] is not None
and "version" in data["info"]
and data["info"]["version"] is not None
):
return str(data["info"]["version"])
else:
raise ValueError("Could not get latest pypi version")
def latest_version_github(package_name, user_agent=DEFAULT_USER_AGENT):
def latest_version_github(
package_name: str, user_agent: str = DEFAULT_USER_AGENT
) -> str:
request_url = (
"https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
)
headers = {"User-Agent": user_agent}
response = requests.get(request_url, headers=headers)
data = response.json()
return data[0]["tag_name"]
if (
data is not None
and len(data) > 0
and data[0] is not None
and "tag_name" in data[0]
):
return str(data[0]["tag_name"])
else:
raise ValueError("Could not get latest github version")

View File

@@ -1,4 +1,5 @@
from datetime import datetime, timedelta
from typing import Generator, Optional
from .availability_api import WaybackMachineAvailabilityAPI
from .cdx_api import WaybackMachineCDXServerAPI
@@ -19,35 +20,37 @@ the older interface code.
"""
class Url:
def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
class Url(object):
def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
self.url = url
self.user_agent = str(user_agent)
self.archive_url = None
self.timestamp = None
self.archive_url: Optional[str] = None
self.timestamp: Optional[datetime] = None
self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
self.url, user_agent=self.user_agent
)
def __str__(self):
def __str__(self) -> str:
if not self.archive_url:
self.newest()
return self.archive_url
return str(self.archive_url)
def __len__(self):
def __len__(self) -> int:
td_max = timedelta(
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
)
if not self.timestamp:
if not isinstance(self.timestamp, datetime):
self.oldest()
if self.timestamp == datetime.max:
if not isinstance(self.timestamp, datetime):
raise TypeError("timestamp must be a datetime")
elif self.timestamp == datetime.max:
return td_max.days
else:
return (datetime.utcnow() - self.timestamp).days
def save(self):
def save(self) -> "Url":
self.wayback_machine_save_api = WaybackMachineSaveAPI(
self.url, user_agent=self.user_agent
)
@@ -58,13 +61,13 @@ class Url:
def near(
self,
year=None,
month=None,
day=None,
hour=None,
minute=None,
unix_timestamp=None,
):
year: Optional[int] = None,
month: Optional[int] = None,
day: Optional[int] = None,
hour: Optional[int] = None,
minute: Optional[int] = None,
unix_timestamp: Optional[int] = None,
) -> "Url":
self.wayback_machine_availability_api.near(
year=year,
@@ -77,22 +80,24 @@ class Url:
self.set_availability_api_attrs()
return self
def oldest(self):
def oldest(self) -> "Url":
self.wayback_machine_availability_api.oldest()
self.set_availability_api_attrs()
return self
def newest(self):
def newest(self) -> "Url":
self.wayback_machine_availability_api.newest()
self.set_availability_api_attrs()
return self
def set_availability_api_attrs(self):
def set_availability_api_attrs(self) -> None:
self.archive_url = self.wayback_machine_availability_api.archive_url
self.JSON = self.wayback_machine_availability_api.JSON
self.timestamp = self.wayback_machine_availability_api.timestamp()
def total_archives(self, start_timestamp=None, end_timestamp=None):
def total_archives(
self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
) -> int:
cdx = WaybackMachineCDXServerAPI(
self.url,
user_agent=self.user_agent,
@@ -107,12 +112,12 @@ class Url:
def known_urls(
self,
subdomain=False,
host=False,
start_timestamp=None,
end_timestamp=None,
match_type="prefix",
):
subdomain: bool = False,
host: bool = False,
start_timestamp: Optional[str] = None,
end_timestamp: Optional[str] = None,
match_type: str = "prefix",
) -> Generator[str, None, None]:
if subdomain:
match_type = "domain"
if host: