From 38088fa0d8d13d9e90cbc3fc95c70a0839f55a76 Mon Sep 17 00:00:00 2001 From: eggplants Date: Fri, 4 Feb 2022 04:25:01 +0900 Subject: [PATCH] add: type annotation to waybackpy modules --- waybackpy/availability_api.py | 75 +++++++++++++++++------------ waybackpy/cdx_api.py | 66 ++++++++++++++----------- waybackpy/cdx_snapshot.py | 7 +-- waybackpy/cdx_utils.py | 91 +++++++++++++++++------------------ waybackpy/cli.py | 72 ++++++++++++++------------- waybackpy/exceptions.py | 14 ++++++ waybackpy/save_api.py | 61 ++++++++++++----------- waybackpy/utils.py | 31 ++++++++++-- waybackpy/wrapper.py | 63 +++++++++++++----------- 9 files changed, 275 insertions(+), 205 deletions(-) diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 6e76bb8..bab92f7 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -1,6 +1,7 @@ import json import time from datetime import datetime +from typing import Any, Dict, Optional import requests @@ -10,37 +11,41 @@ from .exceptions import ( ) from .utils import DEFAULT_USER_AGENT +ResponseJSON = Dict[str, Any] -class WaybackMachineAvailabilityAPI: + +class WaybackMachineAvailabilityAPI(object): """ Class that interfaces the availability API of the Wayback Machine. """ - def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3): + def __init__( + self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3 + ) -> None: self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent - self.headers = {"User-Agent": self.user_agent} + self.headers: Dict[str, str] = {"User-Agent": self.user_agent} self.payload = {"url": "{url}".format(url=self.url)} self.endpoint = "https://archive.org/wayback/available" self.max_tries = max_tries self.tries = 0 self.last_api_call_unix_time = int(time.time()) self.api_call_time_gap = 5 - self.JSON = None + self.JSON: Optional[ResponseJSON] = None - def unix_timestamp_to_wayback_timestamp(self, unix_timestamp): + def unix_timestamp_to_wayback_timestamp(self, unix_timestamp: int) -> str: """ Converts Unix time to wayback Machine timestamp. """ return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") - def __repr__(self): + def __repr__(self) -> str: """ Same as string representation, just return the archive URL as a string. """ return str(self) - def __str__(self): + def __str__(self) -> str: """ String representation of the class. If atleast one API call was successfully made then return the archive URL as a string. Else returns None. @@ -54,7 +59,7 @@ class WaybackMachineAvailabilityAPI: return self.archive_url - def json(self): + def json(self) -> Optional[ResponseJSON]: """ Makes the API call to the availability API can set the JSON response to the JSON attribute of the instance and also returns the JSON attribute. @@ -79,7 +84,7 @@ class WaybackMachineAvailabilityAPI: return self.JSON - def timestamp(self): + def timestamp(self) -> datetime: """ Converts the timestamp form the JSON response to datetime object. If JSON attribute of the instance is None it implies that the either @@ -91,19 +96,29 @@ class WaybackMachineAvailabilityAPI: If you get an URL as a response form the availability API it is guaranteed that you can get the datetime object from the timestamp. """ - if not self.JSON or not self.JSON["archived_snapshots"]: + if self.JSON is None or "archived_snapshots" not in self.JSON: return datetime.max - - return datetime.strptime( - self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" - ) + elif ( + self.JSON is not None + and "archived_snapshots" in self.JSON + and self.JSON["archived_snapshots"] is not None + and "closest" in self.JSON["archived_snapshots"] + and self.JSON["archived_snapshots"]["closest"] is not None + and "timestamp" in self.JSON["archived_snapshots"]["closest"] + ): + return datetime.strptime( + self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" + ) + else: + raise ValueError("Could not get timestamp from result") @property - def archive_url(self): + def archive_url(self) -> str: """ Reads the the JSON response data and tries to get the timestamp and returns the timestamp if found else returns None. """ + archive_url = "" data = self.JSON # If the user didn't used oldest, newest or near but tries to access the @@ -138,7 +153,7 @@ class WaybackMachineAvailabilityAPI: ) return archive_url - def wayback_timestamp(self, **kwargs): + def wayback_timestamp(self, **kwargs: int) -> str: """ Prepends zero before the year, month, day, hour and minute so that they are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. @@ -148,7 +163,7 @@ class WaybackMachineAvailabilityAPI: for key in ["year", "month", "day", "hour", "minute"] ) - def oldest(self): + def oldest(self) -> "WaybackMachineAvailabilityAPI": """ Passing the year 1994 should return the oldest archive because wayback machine was started in May, 1996 and there should be no archive @@ -156,7 +171,7 @@ class WaybackMachineAvailabilityAPI: """ return self.near(year=1994) - def newest(self): + def newest(self) -> "WaybackMachineAvailabilityAPI": """ Passing the current UNIX time should be sufficient to get the newest archive considering the API request-response time delay and also the @@ -166,13 +181,13 @@ class WaybackMachineAvailabilityAPI: def near( self, - year=None, - month=None, - day=None, - hour=None, - minute=None, - unix_timestamp=None, - ): + year: Optional[int] = None, + month: Optional[int] = None, + day: Optional[int] = None, + hour: Optional[int] = None, + minute: Optional[int] = None, + unix_timestamp: Optional[int] = None, + ) -> "WaybackMachineAvailabilityAPI": """ The main method for this Class, oldest and newest methods are dependent on this method. @@ -188,11 +203,11 @@ class WaybackMachineAvailabilityAPI: else: now = datetime.utcnow().timetuple() timestamp = self.wayback_timestamp( - year=year if year else now.tm_year, - month=month if month else now.tm_mon, - day=day if day else now.tm_mday, - hour=hour if hour else now.tm_hour, - minute=minute if minute else now.tm_min, + year=now.tm_year if year is None else year, + month=now.tm_mon if month is None else month, + day=now.tm_mday if day is None else day, + hour=now.tm_hour if hour is None else hour, + minute=now.tm_min if minute is None else minute, ) self.payload["timestamp"] = timestamp diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index a04f8af..c39c83d 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -1,3 +1,5 @@ +from typing import Dict, Generator, List, Optional, cast + from .cdx_snapshot import CDXSnapshot from .cdx_utils import ( check_collapses, @@ -11,43 +13,48 @@ from .exceptions import WaybackError from .utils import DEFAULT_USER_AGENT -class WaybackMachineCDXServerAPI: +class WaybackMachineCDXServerAPI(object): """ Class that interfaces the CDX server API of the Wayback Machine. """ + # start_timestamp: from, can not use from as it's a keyword + # end_timestamp: to, not using to as can not use from def __init__( self, - url, - user_agent=DEFAULT_USER_AGENT, - start_timestamp=None, # from, can not use from as it's a keyword - end_timestamp=None, # to, not using to as can not use from - filters=[], - match_type=None, - gzip=None, - collapses=[], - limit=None, - max_tries=3, - ): + url: str, + user_agent: str = DEFAULT_USER_AGENT, + start_timestamp: Optional[str] = None, + end_timestamp: Optional[str] = None, + filters: List[str] = [], + match_type: Optional[str] = None, + gzip: Optional[str] = None, + collapses: List[str] = [], + limit: Optional[str] = None, + max_tries: int = 3, + ) -> None: self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent - self.start_timestamp = str(start_timestamp) if start_timestamp else None - self.end_timestamp = str(end_timestamp) if end_timestamp else None + self.start_timestamp = ( + str(start_timestamp) if start_timestamp is not None else None + ) + self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None self.filters = filters check_filters(self.filters) - self.match_type = str(match_type).strip() if match_type else None + self.match_type = str(match_type).strip() if match_type is not None else None check_match_type(self.match_type, self.url) - self.gzip = gzip if gzip else True + self.gzip = gzip self.collapses = collapses check_collapses(self.collapses) - self.limit = limit if limit else 5000 + self.limit = limit if limit is not None else 5000 self.max_tries = max_tries - self.last_api_request_url = None + self.last_api_request_url: Optional[str] = None self.use_page = False self.endpoint = "https://web.archive.org/cdx/search/cdx" - def cdx_api_manager(self, payload, headers, use_page=False): - + def cdx_api_manager( + self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False + ) -> Generator[str, None, None]: total_pages = get_total_pages(self.url, self.user_agent) # If we only have two or less pages of archives then we care for more accuracy # pagination API is lagged sometimes @@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI: url = full_url(self.endpoint, params=payload) res = get_response(url, headers=headers) + if isinstance(res, Exception): + raise res self.last_api_request_url = url text = res.text @@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI: yield text else: - payload["showResumeKey"] = "true" payload["limit"] = str(self.limit) resumeKey = None - more = True while more: - if resumeKey: payload["resumeKey"] = resumeKey url = full_url(self.endpoint, params=payload) res = get_response(url, headers=headers) + if isinstance(res, Exception): + raise res self.last_api_request_url = url @@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI: yield text - def add_payload(self, payload): + def add_payload(self, payload: Dict[str, str]) -> None: if self.start_timestamp: payload["from"] = self.start_timestamp if self.end_timestamp: payload["to"] = self.end_timestamp - if self.gzip is not True: + if self.gzip is None: payload["gzip"] = "false" if self.match_type: @@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI: # Don't need to return anything as it's dictionary. payload["url"] = self.url - def snapshots(self): - payload = {} + def snapshots(self) -> Generator[CDXSnapshot, None, None]: + payload: Dict[str, str] = {} headers = {"User-Agent": self.user_agent} self.add_payload(payload) @@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI: if len(snapshot) < 46: # 14 + 32 (timestamp+digest) continue - properties = { + properties: Dict[str, Optional[str]] = { "urlkey": None, "timestamp": None, "original": None, @@ -190,4 +198,4 @@ class WaybackMachineCDXServerAPI: properties["length"], ) = prop_values - yield CDXSnapshot(properties) + yield CDXSnapshot(cast(Dict[str, str], properties)) diff --git a/waybackpy/cdx_snapshot.py b/waybackpy/cdx_snapshot.py index 58d4e8b..d8419ea 100644 --- a/waybackpy/cdx_snapshot.py +++ b/waybackpy/cdx_snapshot.py @@ -1,7 +1,8 @@ from datetime import datetime +from typing import Dict -class CDXSnapshot: +class CDXSnapshot(object): """ Class for the CDX snapshot lines returned by the CDX API, Each valid line of the CDX API is casted to an CDXSnapshot object @@ -10,7 +11,7 @@ class CDXSnapshot: of the CDXSnapshot. """ - def __init__(self, properties): + def __init__(self, properties: Dict[str, str]) -> None: self.urlkey = properties["urlkey"] self.timestamp = properties["timestamp"] self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S") @@ -23,7 +24,7 @@ class CDXSnapshot: "https://web.archive.org/web/" + self.timestamp + "/" + self.original ) - def __str__(self): + def __str__(self) -> str: return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format( urlkey=self.urlkey, timestamp=self.timestamp, diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py index 06f043c..b4eff44 100644 --- a/waybackpy/cdx_utils.py +++ b/waybackpy/cdx_utils.py @@ -1,23 +1,30 @@ import re +from typing import Any, Dict, List, Optional, Union +from urllib.parse import quote import requests from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry + +# from urllib3.util.retry import Retry +from requests.packages.urllib3.util.retry import Retry from .exceptions import WaybackError from .utils import DEFAULT_USER_AGENT -def get_total_pages(url, user_agent=DEFAULT_USER_AGENT): +def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int: endpoint = "https://web.archive.org/cdx/search/cdx?" payload = {"showNumPages": "true", "url": str(url)} headers = {"User-Agent": user_agent} request_url = full_url(endpoint, params=payload) response = get_response(request_url, headers=headers) - return int(response.text.strip()) + if isinstance(response, requests.Response): + return int(response.text.strip()) + else: + raise response -def full_url(endpoint, params): +def full_url(endpoint: str, params: Dict[str, Any]) -> str: if not params: return endpoint full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") @@ -26,27 +33,25 @@ def full_url(endpoint, params): key = "collapse" if key.startswith("collapse") else key amp = "" if full_url.endswith("?") else "&" full_url = ( - full_url - + amp - + "{key}={val}".format(key=key, val=requests.utils.quote(str(val))) + full_url + amp + "{key}={val}".format(key=key, val=quote(str(val), safe="")) ) return full_url def get_response( - url, - headers=None, - retries=5, - backoff_factor=0.5, - no_raise_on_redirects=False, -): + url: str, + headers: Optional[Dict[str, str]] = None, + retries: int = 5, + backoff_factor: float = 0.5, + # no_raise_on_redirects=False, +) -> Union[requests.Response, Exception]: session = requests.Session() - retries = Retry( + retries_ = Retry( total=retries, backoff_factor=backoff_factor, status_forcelist=[500, 502, 503, 504], ) - session.mount("https://", HTTPAdapter(max_retries=retries)) + session.mount("https://", HTTPAdapter(max_retries=retries_)) try: response = session.get(url, headers=headers) @@ -62,23 +67,18 @@ def get_response( raise exc -def check_filters(filters): +def check_filters(filters: List[str]) -> None: if not isinstance(filters, list): raise WaybackError("filters must be a list.") # [!]field:regex for _filter in filters: - try: + match = re.search( + r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)", + _filter, + ) - match = re.search( - r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)", - _filter, - ) - - match.group(1) - match.group(2) - - except Exception: + if match is None or len(match.groups()) != 2: exc_message = ( "Filter '{_filter}' is not following the cdx filter syntax.".format( @@ -88,43 +88,38 @@ def check_filters(filters): raise WaybackError(exc_message) -def check_collapses(collapses): - +def check_collapses(collapses: List[str]) -> bool: if not isinstance(collapses, list): raise WaybackError("collapses must be a list.") - - if len(collapses) == 0: - return + elif len(collapses) == 0: + return True for collapse in collapses: - try: - match = re.search( - r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?", - collapse, - ) - match.group(1) - if 2 == len(match.groups()): - match.group(2) - except Exception: + match = re.search( + r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?", + collapse, + ) + if match is None or len(match.groups()) != 2: exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format( collapse=collapse ) raise WaybackError(exc_message) + else: + return True -def check_match_type(match_type, url): +def check_match_type(match_type: Optional[str], url: str) -> bool: + legal_match_type = ["exact", "prefix", "host", "domain"] if not match_type: - return - - if "*" in url: + return True + elif "*" in url: raise WaybackError( "Can not use wildcard in the URL along with the match_type arguments." ) - - legal_match_type = ["exact", "prefix", "host", "domain"] - - if match_type not in legal_match_type: + elif match_type not in legal_match_type: exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format( match_type=match_type ) raise WaybackError(exc_message) + else: + return True diff --git a/waybackpy/cli.py b/waybackpy/cli.py index f1117c2..8fca775 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -3,6 +3,7 @@ import os import random import re import string +from typing import Generator, List, Optional import click import requests @@ -163,34 +164,34 @@ from .wrapper import Url + "will be printed.", ) def main( - url, - user_agent, - version, - license, - newest, - oldest, - json, - near, - year, - month, - day, - hour, - minute, - save, - headers, - known_urls, - subdomain, - file, - cdx, - start_timestamp, - end_timestamp, - filter, - match_type, - gzip, - collapse, - limit, - cdx_print, -): + url: Optional[str], + user_agent: str, + version: bool, + license: bool, + newest: bool, + oldest: bool, + json: bool, + near: bool, + year: Optional[int], + month: Optional[int], + day: Optional[int], + hour: Optional[int], + minute: Optional[int], + save: bool, + headers: bool, + known_urls: bool, + subdomain: bool, + file: bool, + cdx: bool, + start_timestamp: Optional[str], + end_timestamp: Optional[str], + filter: List[str], + match_type: Optional[str], + gzip: Optional[str], + collapse: List[str], + limit: Optional[str], + cdx_print: List[str], +) -> None: """\b _ _ | | | | @@ -244,7 +245,9 @@ def main( ) return - def echo_availability_api(availability_api_instance): + def echo_availability_api( + availability_api_instance: WaybackMachineAvailabilityAPI, + ) -> None: click.echo("Archive URL:") if not availability_api_instance.archive_url: archive_url = ( @@ -295,13 +298,14 @@ def main( click.echo(save_api.headers) return - def save_urls_on_file(url_gen): + def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: domain = None sys_random = random.SystemRandom() uid = "".join( sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6) ) url_count = 0 + file_name = None for url in url_gen: url_count += 1 @@ -310,7 +314,7 @@ def main( domain = "domain-unknown" - if match: + if match is not None: domain = match.group(1) file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid) @@ -318,12 +322,12 @@ def main( if not os.path.isfile(file_path): open(file_path, "w+").close() - with open(file_path, "a") as f: - f.write("{url}\n".format(url=url)) + with open(file_path, "a") as f: + f.write("{url}\n".format(url=url)) click.echo(url) - if url_count > 0: + if url_count > 0 or file_name is not None: click.echo( "\n\n'{file_name}' saved in current working directory".format( file_name=file_name diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 8e75aea..53f00c2 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -14,6 +14,8 @@ class WaybackError(Exception): All other exceptions are inherited from this class. """ + pass + class RedirectSaveError(WaybackError): """ @@ -21,32 +23,44 @@ class RedirectSaveError(WaybackError): redirect URL is archived but not the original URL. """ + pass + class URLError(Exception): """ Raised when malformed URLs are passed as arguments. """ + pass + class MaximumRetriesExceeded(WaybackError): """ MaximumRetriesExceeded """ + pass + class MaximumSaveRetriesExceeded(MaximumRetriesExceeded): """ MaximumSaveRetriesExceeded """ + pass + class ArchiveNotInAvailabilityAPIResponse(WaybackError): """ Could not parse the archive in the JSON response of the availability API. """ + pass + class InvalidJSONInAvailabilityAPIResponse(WaybackError): """ availability api returned invalid JSON """ + + pass diff --git a/waybackpy/save_api.py b/waybackpy/save_api.py index 530e03a..af71dee 100644 --- a/waybackpy/save_api.py +++ b/waybackpy/save_api.py @@ -1,38 +1,42 @@ import re import time from datetime import datetime +from typing import Dict, Optional import requests from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry + +# from urllib3.util.retry import Retry +from requests.packages.urllib3.util.retry import Retry from .exceptions import MaximumSaveRetriesExceeded from .utils import DEFAULT_USER_AGENT -class WaybackMachineSaveAPI: - +class WaybackMachineSaveAPI(object): """ WaybackMachineSaveAPI class provides an interface for saving URLs on the Wayback Machine. """ - def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8): + def __init__( + self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8 + ) -> None: self.url = str(url).strip().replace(" ", "%20") self.request_url = "https://web.archive.org/save/" + self.url self.user_agent = user_agent - self.request_headers = {"User-Agent": self.user_agent} + self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent} if max_tries < 1: raise ValueError("max_tries should be positive") self.max_tries = max_tries self.total_save_retries = 5 self.backoff_factor = 0.5 self.status_forcelist = [500, 502, 503, 504] - self._archive_url = None + self._archive_url: Optional[str] = None self.instance_birth_time = datetime.utcnow() @property - def archive_url(self): + def archive_url(self) -> str: """ Returns the archive URL is already cached by _archive_url else invoke the save method to save the archive which returns the @@ -44,7 +48,7 @@ class WaybackMachineSaveAPI: else: return self.save() - def get_save_request_headers(self): + def get_save_request_headers(self) -> None: """ Creates a session and tries 'retries' number of times to retrieve the archive. @@ -61,21 +65,21 @@ class WaybackMachineSaveAPI: the response URL yourself in the browser. """ session = requests.Session() - retries = Retry( + retries_ = Retry( total=self.total_save_retries, backoff_factor=self.backoff_factor, status_forcelist=self.status_forcelist, ) - session.mount("https://", HTTPAdapter(max_retries=retries)) + session.mount("https://", HTTPAdapter(max_retries=retries_)) self.response = session.get(self.request_url, headers=self.request_headers) - self.headers = ( - self.response.headers - ) # + # requests.response.headers is requests.structures.CaseInsensitiveDict + self.headers = self.response.headers + self.headers_str = str(self.headers) self.status_code = self.response.status_code self.response_url = self.response.url session.close() - def archive_url_parser(self): + def archive_url_parser(self) -> Optional[str]: """ Three regexen (like oxen?) are used to search for the archive URL in the headers and finally look in the response URL @@ -83,18 +87,18 @@ class WaybackMachineSaveAPI: """ regex1 = r"Content-Location: (/web/[0-9]{14}/.*)" - match = re.search(regex1, str(self.headers)) + match = re.search(regex1, self.headers_str) if match: return "https://web.archive.org" + match.group(1) regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>" - match = re.search(regex2, str(self.headers)) - if match: + match = re.search(regex2, self.headers_str) + if match is not None and len(match.groups()) == 1: return "https://" + match.group(1) regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}" - match = re.search(regex3, str(self.headers)) - if match: + match = re.search(regex3, self.headers_str) + if match is not None and len(match.groups()) == 1: return "https" + match.group(1) if self.response_url: @@ -105,7 +109,9 @@ class WaybackMachineSaveAPI: if match: return "https://" + match.group(0) - def sleep(self, tries): + return None + + def sleep(self, tries: int) -> None: """ Ensure that the we wait some time before succesive retries so that we don't waste the retries before the page is even captured by the Wayback @@ -120,7 +126,7 @@ class WaybackMachineSaveAPI: sleep_seconds = 10 time.sleep(sleep_seconds) - def timestamp(self): + def timestamp(self) -> datetime: """ Read the timestamp off the archive URL and convert the Wayback Machine timestamp to datetime object. @@ -133,9 +139,10 @@ class WaybackMachineSaveAPI: didn't serve a Cached URL. It is quite common for the Wayback Machine to serve cached archive if last archive was captured before last 45 minutes. """ - m = re.search( - r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url - ) + regex = r"https?://web\.archive.org/web/([0-9]{14})/http" + m = re.search(regex, str(self._archive_url)) + if m is None or len(m.groups()) != 1: + raise ValueError("Could not find get timestamp") string_timestamp = m.group(1) timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S") @@ -149,7 +156,7 @@ class WaybackMachineSaveAPI: return timestamp - def save(self): + def save(self) -> str: """ Calls the SavePageNow API of the Wayback Machine with required parameters and headers to save the URL. @@ -169,7 +176,7 @@ class WaybackMachineSaveAPI: self.get_save_request_headers() self.saved_archive = self.archive_url_parser() - if self.saved_archive is not None: + if isinstance(self.saved_archive, str): self._archive_url = self.saved_archive self.timestamp() return self.saved_archive @@ -179,5 +186,5 @@ class WaybackMachineSaveAPI: raise MaximumSaveRetriesExceeded( "Tried %s times but failed to save and retrieve the" % str(tries) + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n" - % (self.url, self.response_url, str(self.headers)), + % (self.url, self.response_url, self.headers_str), ) diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 7201403..ac95a4e 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -2,22 +2,43 @@ import requests from . import __version__ -DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__ +DEFAULT_USER_AGENT: str = ( + "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__ +) -def latest_version_pypi(package_name, user_agent=DEFAULT_USER_AGENT): +def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str: request_url = "https://pypi.org/pypi/" + package_name + "/json" headers = {"User-Agent": user_agent} response = requests.get(request_url, headers=headers) data = response.json() - return data["info"]["version"] + if ( + data is not None + and "info" in data + and data["info"] is not None + and "version" in data["info"] + and data["info"]["version"] is not None + ): + return str(data["info"]["version"]) + else: + raise ValueError("Could not get latest pypi version") -def latest_version_github(package_name, user_agent=DEFAULT_USER_AGENT): +def latest_version_github( + package_name: str, user_agent: str = DEFAULT_USER_AGENT +) -> str: request_url = ( "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1" ) headers = {"User-Agent": user_agent} response = requests.get(request_url, headers=headers) data = response.json() - return data[0]["tag_name"] + if ( + data is not None + and len(data) > 0 + and data[0] is not None + and "tag_name" in data[0] + ): + return str(data[0]["tag_name"]) + else: + raise ValueError("Could not get latest github version") diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 3121b77..91d1d92 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta +from typing import Generator, Optional from .availability_api import WaybackMachineAvailabilityAPI from .cdx_api import WaybackMachineCDXServerAPI @@ -19,35 +20,37 @@ the older interface code. """ -class Url: - def __init__(self, url, user_agent=DEFAULT_USER_AGENT): +class Url(object): + def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None: self.url = url self.user_agent = str(user_agent) - self.archive_url = None - self.timestamp = None + self.archive_url: Optional[str] = None + self.timestamp: Optional[datetime] = None self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI( self.url, user_agent=self.user_agent ) - def __str__(self): + def __str__(self) -> str: if not self.archive_url: self.newest() - return self.archive_url + return str(self.archive_url) - def __len__(self): + def __len__(self) -> int: td_max = timedelta( days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999 ) - if not self.timestamp: + if not isinstance(self.timestamp, datetime): self.oldest() - if self.timestamp == datetime.max: + if not isinstance(self.timestamp, datetime): + raise TypeError("timestamp must be a datetime") + elif self.timestamp == datetime.max: return td_max.days + else: + return (datetime.utcnow() - self.timestamp).days - return (datetime.utcnow() - self.timestamp).days - - def save(self): + def save(self) -> "Url": self.wayback_machine_save_api = WaybackMachineSaveAPI( self.url, user_agent=self.user_agent ) @@ -58,13 +61,13 @@ class Url: def near( self, - year=None, - month=None, - day=None, - hour=None, - minute=None, - unix_timestamp=None, - ): + year: Optional[int] = None, + month: Optional[int] = None, + day: Optional[int] = None, + hour: Optional[int] = None, + minute: Optional[int] = None, + unix_timestamp: Optional[int] = None, + ) -> "Url": self.wayback_machine_availability_api.near( year=year, @@ -77,22 +80,24 @@ class Url: self.set_availability_api_attrs() return self - def oldest(self): + def oldest(self) -> "Url": self.wayback_machine_availability_api.oldest() self.set_availability_api_attrs() return self - def newest(self): + def newest(self) -> "Url": self.wayback_machine_availability_api.newest() self.set_availability_api_attrs() return self - def set_availability_api_attrs(self): + def set_availability_api_attrs(self) -> None: self.archive_url = self.wayback_machine_availability_api.archive_url self.JSON = self.wayback_machine_availability_api.JSON self.timestamp = self.wayback_machine_availability_api.timestamp() - def total_archives(self, start_timestamp=None, end_timestamp=None): + def total_archives( + self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None + ) -> int: cdx = WaybackMachineCDXServerAPI( self.url, user_agent=self.user_agent, @@ -107,12 +112,12 @@ class Url: def known_urls( self, - subdomain=False, - host=False, - start_timestamp=None, - end_timestamp=None, - match_type="prefix", - ): + subdomain: bool = False, + host: bool = False, + start_timestamp: Optional[str] = None, + end_timestamp: Optional[str] = None, + match_type: str = "prefix", + ) -> Generator[str, None, None]: if subdomain: match_type = "domain" if host: