diff --git a/tests/test_cdx_utils.py b/tests/test_cdx_utils.py index 8378dd0..ae92147 100644 --- a/tests/test_cdx_utils.py +++ b/tests/test_cdx_utils.py @@ -53,10 +53,6 @@ def test_get_response() -> None: response = get_response(url, headers=headers) assert not isinstance(response, Exception) and response.status_code == 200 - url = "http/wwhfhfvhvjhmom" - with pytest.raises(WaybackError): - get_response(url, headers=headers) - def test_check_filters() -> None: filters: List[str] = [] diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 11a9716..6ef5b53 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -1,9 +1,32 @@ +""" +This module interfaces the Wayback Machine's availability API. + +The interface could be useful for looking up archives and finding archives +that are close to a specific date and time. + +It has a class called WaybackMachineAvailabilityAPI, and the class has +methods such as: + +near() for looking up archives close to a specific date and time. + +oldest() for retrieving the first archive URL of the webpage. + +newest() for retrieving the latest archive of an URL. + +The Wayback Machine Availability response should be a valid JSON and +if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised. + +If the Availability API returned valid JSON but archive URL could not be found +in it then ArchiveNotInAvailabilityAPIResponse is raised. 
+""" + import json import time from datetime import datetime from typing import Any, Dict, Optional import requests +from requests.models import Response from .exceptions import ( ArchiveNotInAvailabilityAPIResponse, @@ -22,38 +45,43 @@ class WaybackMachineAvailabilityAPI(object): def __init__( self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3 ) -> None: + self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent self.headers: Dict[str, str] = {"User-Agent": self.user_agent} - self.payload = {"url": self.url} - self.endpoint = "https://archive.org/wayback/available" - self.max_tries = max_tries - self.tries = 0 - self.last_api_call_unix_time = int(time.time()) - self.api_call_time_gap = 5 + self.payload: Dict[str, str] = {"url": self.url} + self.endpoint: str = "https://archive.org/wayback/available" + self.max_tries: int = max_tries + self.tries: int = 0 + self.last_api_call_unix_time: int = int(time.time()) + self.api_call_time_gap: int = 5 self.JSON: Optional[ResponseJSON] = None @staticmethod def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str: """ - Converts Unix time to wayback Machine timestamp. + Converts Unix time to wayback Machine timestamp and the Wayback Machine + timestamp format is yyyyMMddhhmmss. """ + return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") def __repr__(self) -> str: """ Same as string representation, just return the archive URL as a string. """ + return str(self) def __str__(self) -> str: """ - String representation of the class. If atleast one API call was successfully - made then return the archive URL as a string. Else returns None. + String representation of the class. If at least one API + call was successfully made then return the archive URL + as a string. Else returns "". 
""" - # String must not return anything other than a string object - # So, if some asks for string repr before making the API requests + # String should not return anything other than a string object + # So, if a string repr is asked for before making any API requests # just return "" if not self.JSON: return "" @@ -62,26 +90,36 @@ class WaybackMachineAvailabilityAPI(object): def json(self) -> Optional[ResponseJSON]: """ - Makes the API call to the availability API can set the JSON response - to the JSON attribute of the instance and also returns the JSON attribute. + Makes the API call to the availability API and set the JSON response + to the JSON attribute of the instance and also returns the JSON + attribute. + + time_diff and sleep_time make sure that you are not making too many + requests in a short interval of time, making too many requests is bad + as Wayback Machine may reject them above a certain threshold. + + The end-user can change the api_call_time_gap attribute of the instance + to increase or decrease the default time gap between two successive API + calls, but it is not recommended to decrease it. """ + time_diff = int(time.time()) - self.last_api_call_unix_time sleep_time = self.api_call_time_gap - time_diff if sleep_time > 0: time.sleep(sleep_time) - self.response = requests.get( + self.response: Response = requests.get( self.endpoint, params=self.payload, headers=self.headers ) self.last_api_call_unix_time = int(time.time()) self.tries += 1 try: self.JSON = self.response.json() - except json.decoder.JSONDecodeError: + except json.decoder.JSONDecodeError as json_decode_error: raise InvalidJSONInAvailabilityAPIResponse( f"Response data:\n{self.response.text}" - ) + ) from json_decode_error return self.JSON @@ -91,15 +129,17 @@ class WaybackMachineAvailabilityAPI(object): If JSON attribute of the instance is None it implies that the either the the last API call failed or one was never made. 
- If not JSON or if JSON but no timestamp in the JSON response then returns - the maximum value for datetime object that is possible. + If not JSON or if JSON but no timestamp in the JSON response then + returns the maximum value for datetime object that is possible. - If you get an URL as a response form the availability API it is guaranteed - that you can get the datetime object from the timestamp. + If you get an URL as a response from the availability API it is + guaranteed that you can get the datetime object from the timestamp. """ + if self.JSON is None or "archived_snapshots" not in self.JSON: return datetime.max - elif ( + + if ( self.JSON is not None and "archived_snapshots" in self.JSON and self.JSON["archived_snapshots"] is not None @@ -110,21 +150,23 @@ class WaybackMachineAvailabilityAPI(object): return datetime.strptime( self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" ) - else: - raise ValueError("Could not get timestamp from result") + + raise ValueError("Could not get timestamp from result") @property def archive_url(self) -> str: """ - Reads the the JSON response data and tries to get the timestamp and returns - the timestamp if found else returns None. + Reads the JSON response data and returns + the archive URL if found and if not found raises + ArchiveNotInAvailabilityAPIResponse. """ + archive_url = "" data = self.JSON - # If the user didn't used oldest, newest or near but tries to access the - # archive_url attribute then, we assume they are fine with any archive - # and invoke the oldest archive function. + # If the user didn't invoke oldest, newest or near but tries to access the + # archive_url attribute then assume they are fine with any archive + # and invoke the oldest method. 
if not data: self.oldest() @@ -137,7 +179,7 @@ class WaybackMachineAvailabilityAPI(object): self.json() # It makes a new API call data = self.JSON # json() updated the value of JSON attribute - # Even if after we exhausted teh max_tries, then we give up and + # If we exhausted the max_tries, then we give up and # raise exception. if not data or not data["archived_snapshots"]: @@ -160,6 +202,7 @@ class WaybackMachineAvailabilityAPI(object): Prepends zero before the year, month, day, hour and minute so that they are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. """ + return "".join( str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"] @@ -167,18 +210,21 @@ class WaybackMachineAvailabilityAPI(object): def oldest(self) -> "WaybackMachineAvailabilityAPI": """ - Passing the year 1994 should return the oldest archive because - wayback machine was started in May, 1996 and there should be no archive - before the year 1994. + Passes the date 1994-01-01 to near which should return the oldest archive + because Wayback Machine was started in May, 1996 and it is assumed that + there would be no archive older than January 1, 1994. """ - return self.near(year=1994) + + return self.near(year=1994, month=1, day=1) def newest(self) -> "WaybackMachineAvailabilityAPI": """ - Passing the current UNIX time should be sufficient to get the newest - archive considering the API request-response time delay and also the - database lags on Wayback machine. + Passes the current UNIX time to near() for retrieving the newest archive + from the availability API. + + We assume that wayback machine can not archive the future of a webpage. """ + return self.near(unix_timestamp=int(time.time())) def near( @@ -191,16 +237,18 @@ class WaybackMachineAvailabilityAPI(object): unix_timestamp: Optional[int] = None, ) -> "WaybackMachineAvailabilityAPI": """ - The main method for this Class, oldest and newest methods are dependent on this - method. 
+ The main method for the Class, oldest() and newest() are dependent on it. It generates the timestamp based on the input either by calling the unix_timestamp_to_wayback_timestamp or wayback_timestamp method with appropriate arguments for their respective parameters. + Adds the timestamp to the payload dictionary. + And finally invoking the json method to make the API call then returns the instance. """ + if unix_timestamp: timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) else: diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index 7acad0b..97cc908 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -1,3 +1,14 @@ +""" +This module interfaces the Wayback Machine's CDX server API. + +The module has WaybackMachineCDXServerAPI which should be used by the users of +this module to consume the CDX server API. + +WaybackMachineCDXServerAPI has a snapshots method that yields the snapshots, and +the snapshots are yielded as instances of the CDXSnapshot class. +""" + + from typing import Dict, Generator, List, Optional, cast from .cdx_snapshot import CDXSnapshot @@ -16,6 +27,11 @@ from .utils import DEFAULT_USER_AGENT class WaybackMachineCDXServerAPI(object): """ Class that interfaces the CDX server API of the Wayback Machine. + + snapshots() returns a generator that can be iterated upon by the end-user, + the generator returns the snapshots/entries as instances of CDXSnapshot to + make the usage easy, just use '.' to get any attribute as the attributes are + accessible via a dot ".". """ # start_timestamp: from, can not use from as it's a keyword @@ -53,9 +69,35 @@ class WaybackMachineCDXServerAPI(object): def cdx_api_manager( self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False ) -> Generator[str, None, None]: + """ + Manages the API calls for the instance, it automatically selects the best + parameters by looking at the query of the end-user. 
For bigger queries + automatically use the CDX pagination API and for smaller queries use the + normal API. + + CDX Server API is a complex API and to make it easy for the end user to + consume it the CDX manager(this method) handles the selection of the + API output, whether to use the pagination API or not. + + For doing large/bulk queries, the use of the Pagination API is + recommended by the Wayback Machine authors. And it determines if the + query would be large or not by using the showNumPages=true parameter, + this tells the number of pages of CDX DATA that the pagination API + will return. + + If the number of page is less than 2 we use the normal non-pagination + API as the pagination API is known to lag and for big queries it should + not matter but for queries where the number of pages are less this + method chooses accuracy over the pagination API. + """ + + # number of pages that will returned by the pagination API. + # get_total_pages adds the showNumPages=true param to pagination API + # requests. + # This is a special query that will return a single number indicating + # the number of pages. 
total_pages = get_total_pages(self.url, self.user_agent) - # If we only have two or less pages of archives then we care for more accuracy - # pagination API is lagged sometimes + if use_page is True and total_pages >= 2: blank_pages = 0 for i in range(total_pages): @@ -78,11 +120,11 @@ class WaybackMachineCDXServerAPI(object): else: payload["showResumeKey"] = "true" payload["limit"] = str(self.limit) - resumeKey = None + resume_key = None more = True while more: - if resumeKey: - payload["resumeKey"] = resumeKey + if resume_key: + payload["resumeKey"] = resume_key url = full_url(self.endpoint, params=payload) res = get_response(url, headers=headers) @@ -102,13 +144,16 @@ class WaybackMachineCDXServerAPI(object): if len(second_last_line) == 0: - resumeKey = lines[-1].strip() - text = text.replace(resumeKey, "", 1).strip() + resume_key = lines[-1].strip() + text = text.replace(resume_key, "", 1).strip() more = True yield text def add_payload(self, payload: Dict[str, str]) -> None: + """ + Adds the payload to the payload dictionary. + """ if self.start_timestamp: payload["from"] = self.start_timestamp @@ -122,17 +167,35 @@ class WaybackMachineCDXServerAPI(object): payload["matchType"] = self.match_type if self.filters and len(self.filters) > 0: - for i, f in enumerate(self.filters): - payload["filter" + str(i)] = f + for i, _filter in enumerate(self.filters): + payload["filter" + str(i)] = _filter if self.collapses and len(self.collapses) > 0: - for i, f in enumerate(self.collapses): - payload["collapse" + str(i)] = f + for i, collapse in enumerate(self.collapses): + payload["collapse" + str(i)] = collapse # Don't need to return anything as it's dictionary. payload["url"] = self.url def snapshots(self) -> Generator[CDXSnapshot, None, None]: + """ + This function yields the CDX data lines as snapshots. 
+ + As it is a generator it is exhaustible, the reasons that this is + a generator and not a list are: + + a) CDX server API can return millions of entries for a query and list + is not suitable for such cases. + + b) Preventing memory usage issues, as told before this method may yield + millions of records for some queries and your system may not have enough + memory for such a big list. Also remember this if outputting to Jupyter + Notebooks. + + The objects yielded by this method are instances of the CDXSnapshot class, + you can access the attributes of the entries as the attribute of the instance + itself. + """ payload: Dict[str, str] = {} headers = {"User-Agent": self.user_agent} @@ -144,18 +207,25 @@ class WaybackMachineCDXServerAPI(object): if self.collapses != []: self.use_page = False - texts = self.cdx_api_manager(payload, headers, use_page=self.use_page) + entries = self.cdx_api_manager(payload, headers, use_page=self.use_page) - for text in texts: + for entry in entries: - if text.isspace() or len(text) <= 1 or not text: + if entry.isspace() or len(entry) <= 1 or not entry: continue - snapshot_list = text.split("\n") + # each line is a snapshot aka entry of the CDX server API. + # We are able to split the page by lines because it only + # splits the lines on a single page and not all the entries + # at once, thus there should be no issues of too much memory usage. + snapshot_list = entry.split("\n") for snapshot in snapshot_list: - if len(snapshot) < 46: # 14 + 32 (timestamp+digest) + # 14 + 32 == 46 ( timestamp + digest ), ignore the invalid entries. + # they are invalid if their length is smaller than sum of length + # of a standard wayback_timestamp and standard digest of an entry. 
+ if len(snapshot) < 46: continue properties: Dict[str, Optional[str]] = { @@ -168,16 +238,16 @@ class WaybackMachineCDXServerAPI(object): "length": None, } - prop_values = snapshot.split(" ") + property_value = snapshot.split(" ") - prop_values_len = len(prop_values) - properties_len = len(properties) + total_property_values = len(property_value) + warranted_total_property_values = len(properties) - if prop_values_len != properties_len: + if total_property_values != warranted_total_property_values: raise WaybackError( - f"Snapshot returned by Cdx API has {prop_values_len} " - f"properties instead of expected {properties_len} properties.\n" - f"Problematic Snapshot: {snapshot}" + f"Snapshot returned by CDX API has {total_property_values} prop" + f"erties instead of expected {warranted_total_property_values} " + f"properties.\nProblematic Snapshot: {snapshot}" ) ( @@ -188,6 +258,6 @@ class WaybackMachineCDXServerAPI(object): properties["statuscode"], properties["digest"], properties["length"], - ) = prop_values + ) = property_value yield CDXSnapshot(cast(Dict[str, str], properties)) diff --git a/waybackpy/cdx_snapshot.py b/waybackpy/cdx_snapshot.py index ab96602..9cf9610 100644 --- a/waybackpy/cdx_snapshot.py +++ b/waybackpy/cdx_snapshot.py @@ -1,30 +1,83 @@ +""" +Module that contains the CDXSnapshot class, CDX records are casted +to CDXSnapshot objects for easier access. + +The CDX index format is plain text data. Each line ('record') indicates a +crawled document. And these lines are casted to CDXSnapshot. +""" + + from datetime import datetime from typing import Dict class CDXSnapshot(object): """ - Class for the CDX snapshot lines returned by the CDX API, + Class for the CDX snapshot lines('record') returned by the CDX API, Each valid line of the CDX API is casted to an CDXSnapshot object - by the CDX API interface. + by the CDX API interface, just use "." to access any attribute of the + CDX server API snapshot. 
+ This provides the end-user the ease of using the data as attributes of the CDXSnapshot. + + The string representation of the class is identical to the line returned + by the CDX server API. + + Besides all the attributes of the CDX server API this class also provides + archive_url attribute, yes it is the archive url of the snapshot. + + Attributes of this class and what they represent and are useful for: + + urlkey: The document captured, expressed as a SURT + SURT stands for Sort-friendly URI Reordering Transform, and is a + transformation applied to URIs which makes their left-to-right + representation better match the natural hierarchy of domain names. + A URI <scheme://domain.tld/path?query> has SURT + form <scheme://(tld,domain,)/path?query>. + + timestamp: The timestamp of the archive, format is yyyyMMddhhmmss and type + is string. + + datetime_timestamp: The timestamp as a datetime object. + + original: The original URL of the archive. If archive_url is + https://web.archive.org/web/20220113130051/https://google.com then the + original URL is https://google.com + + mimetype: The document’s file type. e.g. text/html + + statuscode: HTTP response code for the document at the time of its crawling + + digest: Base32-encoded SHA-1 checksum of the document for discriminating + with others + + length: Document’s volume of bytes in the WARC file + + archive_url: The archive url of the snapshot, this is not returned by the + CDX server API but created by this class on init. 
""" def __init__(self, properties: Dict[str, str]) -> None: - self.urlkey = properties["urlkey"] - self.timestamp = properties["timestamp"] - self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S") - self.original = properties["original"] - self.mimetype = properties["mimetype"] - self.statuscode = properties["statuscode"] - self.digest = properties["digest"] - self.length = properties["length"] - self.archive_url = ( + self.urlkey: str = properties["urlkey"] + self.timestamp: str = properties["timestamp"] + self.datetime_timestamp: datetime = datetime.strptime( + self.timestamp, "%Y%m%d%H%M%S" + ) + self.original: str = properties["original"] + self.mimetype: str = properties["mimetype"] + self.statuscode: str = properties["statuscode"] + self.digest: str = properties["digest"] + self.length: str = properties["length"] + self.archive_url: str = ( f"https://web.archive.org/web/{self.timestamp}/{self.original}" ) def __str__(self) -> str: + """ + The string representation is the same as the line returned by the + CDX server API for the snapshot. + """ return ( f"{self.urlkey} {self.timestamp} {self.original} " f"{self.mimetype} {self.statuscode} {self.digest} {self.length}" diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py index fce6acb..3585a2a 100644 --- a/waybackpy/cdx_utils.py +++ b/waybackpy/cdx_utils.py @@ -1,3 +1,10 @@ +""" +Utility functions required for accessing the CDX server API. + +These are here in this module so that we don’t make any module too +big. +""" + import re from typing import Any, Dict, List, Optional, Union from urllib.parse import quote @@ -11,28 +18,44 @@ from .utils import DEFAULT_USER_AGENT def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int: + """ + When using the pagination API, adding showNumPages=true to the request + URL makes the CDX server return an integer which is the number of pages + of CDX data available for us to query using the pagination API. 
+ """ + endpoint = "https://web.archive.org/cdx/search/cdx?" payload = {"showNumPages": "true", "url": str(url)} headers = {"User-Agent": user_agent} request_url = full_url(endpoint, params=payload) response = get_response(request_url, headers=headers) + if isinstance(response, requests.Response): return int(response.text.strip()) - else: - raise response + raise response def full_url(endpoint: str, params: Dict[str, Any]) -> str: + """ + As the function's name already implies that it returns + full URL, but why we need a function for generating full URL? + The CDX server can support multiple arguments for parameters + such as filter and collapse and this function adds them without + overwriting earlier added arguments. + """ + if not params: return endpoint - full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") + _full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") + for key, val in params.items(): key = "filter" if key.startswith("filter") else key key = "collapse" if key.startswith("collapse") else key - amp = "" if full_url.endswith("?") else "&" + amp = "" if _full_url.endswith("?") else "&" val = quote(str(val), safe="") - full_url += f"{amp}{key}={val}" - return full_url + _full_url += f"{amp}{key}={val}" + + return _full_url def get_response( @@ -40,29 +63,31 @@ def get_response( headers: Optional[Dict[str, str]] = None, retries: int = 5, backoff_factor: float = 0.5, - # no_raise_on_redirects=False, ) -> Union[requests.Response, Exception]: + """ + Make get request to the CDX server and return the response. 
+ """ + session = requests.Session() + retries_ = Retry( total=retries, backoff_factor=backoff_factor, status_forcelist=[500, 502, 503, 504], ) - session.mount("https://", HTTPAdapter(max_retries=retries_)) - try: - response = session.get(url, headers=headers) - session.close() - return response - except Exception as e: - reason = str(e) - exc_message = f"Error while retrieving {url}.\n{reason}" - exc = WaybackError(exc_message) - exc.__cause__ = e - raise exc + session.mount("https://", HTTPAdapter(max_retries=retries_)) + response = session.get(url, headers=headers) + session.close() + return response def check_filters(filters: List[str]) -> None: + """ + Check that the filter arguments passed by the end-user are valid. + If not valid then raise WaybackError. + """ + if not isinstance(filters, list): raise WaybackError("filters must be a list.") @@ -81,9 +106,15 @@ def check_filters(filters: List[str]) -> None: def check_collapses(collapses: List[str]) -> bool: + """ + Check that the collapse arguments passed by the end-user are valid. + If not valid then raise WaybackError. + """ + if not isinstance(collapses, list): raise WaybackError("collapses must be a list.") - elif len(collapses) == 0: + + if len(collapses) == 0: return True for collapse in collapses: @@ -103,18 +134,26 @@ def check_collapses(collapses: List[str]) -> bool: def check_match_type(match_type: Optional[str], url: str) -> bool: + """ + Check that the match_type argument passed by the end-user is valid. + If not valid then raise WaybackError. + """ + legal_match_type = ["exact", "prefix", "host", "domain"] + if not match_type: return True - elif "*" in url: + + if "*" in url: raise WaybackError( "Can not use wildcard in the URL along with the match_type arguments." 
) - elif match_type not in legal_match_type: + + if match_type not in legal_match_type: exc_message = ( f"{match_type} is not an allowed match type.\n" "Use one from 'exact', 'prefix', 'host' or 'domain'" ) raise WaybackError(exc_message) - else: - return True + + return True diff --git a/waybackpy/cli.py b/waybackpy/cli.py index d4ca1b2..04daa82 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -1,3 +1,7 @@ +""" +Module that makes waybackpy a CLI tool. +""" + import json as JSON import os import random @@ -19,7 +23,10 @@ from .wrapper import Url def echo_availability_api( availability_api_instance: WaybackMachineAvailabilityAPI, json: bool ) -> None: - click.echo("Archive URL:") + """ + Output availability API depending functions. + Near, oldest and newest output by this method. + """ if not availability_api_instance.archive_url: archive_url = ( "NO ARCHIVE FOUND - The requested URL is probably " @@ -29,6 +36,7 @@ def echo_availability_api( ) else: archive_url = availability_api_instance.archive_url + click.echo("Archive URL:") click.echo(archive_url) if json: click.echo("JSON response:") @@ -36,6 +44,10 @@ def echo_availability_api( def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: + """ + Save output of CDX API on file. + Mainly here because of backwards compatibility. 
+ """ domain = None sys_random = random.SystemRandom() uid = "".join( @@ -51,8 +63,8 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: domain = "domain-unknown" if match is None else match.group(1) file_name = f"{domain}-urls-{uid}.txt" file_path = os.path.join(os.getcwd(), file_name) - with open(file_path, "a") as f: - f.write(f"{url}\n") + with open(file_path, "a") as file: + file.write(f"{url}\n") click.echo(url) @@ -269,6 +281,7 @@ def main( # pylint: disable=no-value-for-parameter """ if version: click.echo(f"waybackpy version {__version__}") + elif show_license: click.echo( requests.get( @@ -277,6 +290,7 @@ def main( # pylint: disable=no-value-for-parameter ) elif url is None: click.echo("No URL detected. Please provide an URL.", err=True) + elif ( not version and not oldest @@ -291,14 +305,17 @@ def main( # pylint: disable=no-value-for-parameter "Use --help flag for help using waybackpy.", err=True, ) + elif oldest: availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) availability_api.oldest() echo_availability_api(availability_api, json) + elif newest: availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) availability_api.newest() echo_availability_api(availability_api, json) + elif near: availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) near_args = {} @@ -309,6 +326,7 @@ def main( # pylint: disable=no-value-for-parameter near_args[key] = arg availability_api.near(**near_args) echo_availability_api(availability_api, json) + elif save: save_api = WaybackMachineSaveAPI(url, user_agent=user_agent) save_api.save() @@ -319,15 +337,17 @@ def main( # pylint: disable=no-value-for-parameter if headers: click.echo("Save API headers:") click.echo(save_api.headers) + elif known_urls: wayback = Url(url, user_agent) url_gen = wayback.known_urls(subdomain=subdomain) if file: return save_urls_on_file(url_gen) - else: - for url in url_gen: - click.echo(url) + + for url in url_gen: 
+ click.echo(url) + elif cdx: filters = list(cdx_filter) collapses = list(collapse) diff --git a/waybackpy/save_api.py b/waybackpy/save_api.py index 29fb2a3..e0d5cef 100644 --- a/waybackpy/save_api.py +++ b/waybackpy/save_api.py @@ -1,3 +1,10 @@ +""" +This module interfaces the Wayback Machine's SavePageNow (SPN) API. + +The module has WaybackMachineSaveAPI class which should be used by the users of +this module to use the SavePageNow API. +""" + import re import time from datetime import datetime @@ -8,7 +15,7 @@ from requests.adapters import HTTPAdapter from requests.structures import CaseInsensitiveDict from urllib3.util.retry import Retry -from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError +from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, WaybackError from .utils import DEFAULT_USER_AGENT @@ -47,8 +54,8 @@ class WaybackMachineSaveAPI(object): if self._archive_url: return self._archive_url - else: - return self.save() + + return self.save() def get_save_request_headers(self) -> None: """ @@ -66,6 +73,7 @@ class WaybackMachineSaveAPI(object): to be very unreliable thus if it fails first check opening the response URL yourself in the browser. """ + session = requests.Session() retries = Retry( total=self.total_save_retries, @@ -79,11 +87,24 @@ class WaybackMachineSaveAPI(object): self.status_code = self.response.status_code self.response_url = self.response.url session.close() + if self.status_code == 429: + # why wait 5 minutes and 429? + # see https://github.com/akamhy/waybackpy/issues/97 raise TooManyRequestsError( - "Seem to be refused to request by the server. " - "Save Page Now receives up to 15 URLs per minutes. " - "Wait a moment and run again." + f"Can not save '{self.url}'. " + f"Save request refused by the server. " + f"Save Page Now limits saving 15 URLs per minutes. " + f"Try waiting for 5 minutes and then try again." + ) + + # why 509? 
+ # see https://github.com/akamhy/waybackpy/pull/99 + # also https://t.co/xww4YJ0Iwc + if self.status_code == 509: + raise WaybackError( + f"Can not save '{self.url}'. You have probably reached the " + f"limit of active sessions." ) def archive_url_parser(self) -> Optional[str]: @@ -146,13 +167,17 @@ class WaybackMachineSaveAPI(object): the Wayback Machine to serve cached archive if last archive was captured before last 45 minutes. """ - regex = r"https?://web\.archive.org/web/([0-9]{14})/http" - m = re.search(regex, str(self._archive_url)) - if m is None or len(m.groups()) != 1: - raise ValueError("Could not get timestamp") - string_timestamp = m.group(1) - timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S") + regex = r"https?://web\.archive.org/web/([0-9]{14})/http" + match = re.search(regex, str(self._archive_url)) + + if match is None or len(match.groups()) != 1: + raise ValueError( + f"Can not parse timestamp from archive URL, '{self._archive_url}'." + ) + + string_timestamp = match.group(1) + timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S") timestamp_unixtime = time.mktime(timestamp.timetuple()) instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple()) diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 413a205..1c3462e 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -1,3 +1,7 @@ +""" +Utility functions and shared variables like DEFAULT_USER_AGENT are here. +""" + import requests from . 
import __version__ @@ -8,6 +12,7 @@ DEFAULT_USER_AGENT: str = ( def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str: + """Latest waybackpy version on PyPi.""" request_url = "https://pypi.org/pypi/" + package_name + "/json" headers = {"User-Agent": user_agent} response = requests.get(request_url, headers=headers) @@ -20,13 +25,14 @@ def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) and data["info"]["version"] is not None ): return str(data["info"]["version"]) - else: - raise ValueError("Could not get latest pypi version") + + raise ValueError("Could not get latest pypi version") def latest_version_github( package_name: str, user_agent: str = DEFAULT_USER_AGENT ) -> str: + """Latest waybackpy version on GitHub.""" request_url = ( "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1" ) @@ -40,5 +46,5 @@ def latest_version_github( and "tag_name" in data[0] ): return str(data[0]["tag_name"]) - else: - raise ValueError("Could not get latest github version") + + raise ValueError("Could not get latest github version") diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index dbe3909..6c7b0ce 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -1,3 +1,9 @@ +""" +This module exists because backwards compatibility matters. +Don't touch this or add any new functionality here and don't use +the Url class. 
+""" + from datetime import datetime, timedelta from typing import Generator, Optional @@ -49,12 +55,14 @@ class Url(object): if not isinstance(self.timestamp, datetime): raise TypeError("timestamp must be a datetime") - elif self.timestamp == datetime.max: + + if self.timestamp == datetime.max: return td_max.days - else: - return (datetime.utcnow() - self.timestamp).days + + return (datetime.utcnow() - self.timestamp).days def save(self) -> "Url": + """Save the URL on wayback machine.""" self.wayback_machine_save_api = WaybackMachineSaveAPI( self.url, user_agent=self.user_agent ) @@ -72,7 +80,7 @@ class Url(object): minute: Optional[int] = None, unix_timestamp: Optional[int] = None, ) -> "Url": - + """Returns the archive of the URL close to a date and time.""" self.wayback_machine_availability_api.near( year=year, month=month, @@ -85,16 +93,19 @@ class Url(object): return self def oldest(self) -> "Url": + """Returns the oldest archive of the URL.""" self.wayback_machine_availability_api.oldest() self.set_availability_api_attrs() return self def newest(self) -> "Url": + """Returns the newest archive of the URL.""" self.wayback_machine_availability_api.newest() self.set_availability_api_attrs() return self def set_availability_api_attrs(self) -> None: + """Set the attributes for total backwards compatibility.""" self.archive_url = self.wayback_machine_availability_api.archive_url self.JSON = self.wayback_machine_availability_api.JSON self.timestamp = self.wayback_machine_availability_api.timestamp() @@ -102,6 +113,10 @@ class Url(object): def total_archives( self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None ) -> int: + """ + Returns an integer which indicates total number of archives for an URL. + Useless in my opinion, only here because of backwards compatibility. 
+ """ cdx = WaybackMachineCDXServerAPI( self.url, user_agent=self.user_agent, @@ -122,6 +137,7 @@ class Url(object): end_timestamp: Optional[str] = None, match_type: str = "prefix", ) -> Generator[str, None, None]: + """Yields known URLs for any URL.""" if subdomain: match_type = "domain" if host: @@ -137,4 +153,4 @@ class Url(object): ) for snapshot in cdx.snapshots(): - yield (snapshot.original) + yield snapshot.original