diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index 73515a9..7acad0b 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -26,27 +26,25 @@ class WaybackMachineCDXServerAPI(object): user_agent: str = DEFAULT_USER_AGENT, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None, - filters: List[str] = [], + filters: Optional[List[str]] = None, match_type: Optional[str] = None, gzip: Optional[str] = None, - collapses: List[str] = [], + collapses: Optional[List[str]] = None, limit: Optional[str] = None, max_tries: int = 3, ) -> None: self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent - self.start_timestamp = ( - str(start_timestamp) if start_timestamp is not None else None - ) - self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None - self.filters = filters + self.start_timestamp = None if start_timestamp is None else str(start_timestamp) + self.end_timestamp = None if end_timestamp is None else str(end_timestamp) + self.filters = [] if filters is None else filters check_filters(self.filters) - self.match_type = str(match_type).strip() if match_type is not None else None + self.match_type = None if match_type is None else str(match_type).strip() check_match_type(self.match_type, self.url) self.gzip = gzip - self.collapses = collapses + self.collapses = [] if collapses is None else collapses check_collapses(self.collapses) - self.limit = limit if limit is not None else 5000 + self.limit = 5000 if limit is None else limit self.max_tries = max_tries self.last_api_request_url: Optional[str] = None self.use_page = False diff --git a/waybackpy/cli.py b/waybackpy/cli.py index 759d87d..c6b4dc2 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -16,6 +16,52 @@ from .utils import DEFAULT_USER_AGENT from .wrapper import Url +def echo_availability_api( + availability_api_instance: WaybackMachineAvailabilityAPI, json: bool +) -> None: + click.echo("Archive URL:") + if not 
availability_api_instance.archive_url: + archive_url = ( + "NO ARCHIVE FOUND - The requested URL is probably " + + "not yet archived or if the URL was recently archived then it is " + + "not yet available via the Wayback Machine's availability API " + + "because of database lag and should be available after some time." + ) + else: + archive_url = availability_api_instance.archive_url + click.echo(archive_url) + if json: + click.echo("JSON response:") + click.echo(JSON.dumps(availability_api_instance.JSON)) + + +def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: + domain = None + sys_random = random.SystemRandom() + uid = "".join( + sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6) + ) + url_count = 0 + file_name = None + + for url in url_gen: + url_count += 1 + if not domain: + match = re.search("https?://([A-Za-z_0-9.-]+).*", url) + domain = "domain-unknown" if match is None else match.group(1) + file_name = f"{domain}-urls-{uid}.txt" + file_path = os.path.join(os.getcwd(), file_name) + with open(file_path, "a") as f: + f.write(f"{url}\n") + + click.echo(url) + + if url_count > 0 or file_name is not None: + click.echo(f"\n\n'{file_name}' saved in current working directory") + else: + click.echo("No known URLs found. Please try a diffrent input!") + + @click.command() @click.option( "-u", "--url", help="URL on which Wayback machine operations are to be performed." 
@@ -30,11 +76,12 @@ from .wrapper import Url @click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.") @click.option( "-l", - "--showlicense", + "--show-license", + "--show_license", "--license", is_flag=True, default=False, - help="show license of Waybackpy.", + help="Show license of Waybackpy.", ) @click.option( "-n", @@ -129,7 +176,8 @@ from .wrapper import Url ) @click.option( "-f", - "--cdxfilter", + "--cdx-filter", + "--cdx_filter", "--filter", multiple=True, help="Filter on a specific field or all the CDX fields.", @@ -169,11 +217,11 @@ from .wrapper import Url + "if this parameter is not used then the plain text response of the CDX API " + "will be printed.", ) -def main( +def main( # pylint: disable=no-value-for-parameter url: Optional[str], user_agent: str, version: bool, - showlicense: bool, + show_license: bool, newest: bool, oldest: bool, json: bool, @@ -191,7 +239,7 @@ def main( cdx: bool, start_timestamp: Optional[str], end_timestamp: Optional[str], - cdxfilter: List[str], + cdx_filter: List[str], match_type: Optional[str], gzip: Optional[str], collapse: List[str], @@ -219,26 +267,18 @@ def main( Released under the MIT License. Use the flag --license for license. """ - if version: click.echo(f"waybackpy version {__version__}") - return - - if showlicense: + elif show_license: click.echo( requests.get( url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE" ).text ) - return - - if not url: + elif url is None: click.echo("No URL detected. Please provide an URL.") - return - - if ( - url - and not version + elif ( + not version and not oldest and not newest and not near @@ -250,39 +290,16 @@ def main( "Only URL passed, but did not specify what to do with the URL. " "Use --help flag for help using waybackpy." 
) - return - - def echo_availability_api( - availability_api_instance: WaybackMachineAvailabilityAPI, - ) -> None: - click.echo("Archive URL:") - if not availability_api_instance.archive_url: - archive_url = ( - "NO ARCHIVE FOUND - The requested URL is probably " - + "not yet archived or if the URL was recently archived then it is " - + "not yet available via the Wayback Machine's availability API " - + "because of database lag and should be available after some time." - ) - else: - archive_url = availability_api_instance.archive_url - click.echo(archive_url) - if json: - click.echo("JSON response:") - click.echo(JSON.dumps(availability_api_instance.JSON)) - - availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) - - if oldest: + elif oldest: + availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) availability_api.oldest() - echo_availability_api(availability_api) - return - - if newest: + echo_availability_api(availability_api, json) + elif newest: + availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) availability_api.newest() - echo_availability_api(availability_api) - return - - if near: + echo_availability_api(availability_api, json) + elif near: + availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent) near_args = {} keys = ["year", "month", "day", "hour", "minute"] args_arr = [year, month, day, hour, minute] @@ -290,10 +307,8 @@ def main( if arg: near_args[key] = arg availability_api.near(**near_args) - echo_availability_api(availability_api) - return - - if save: + echo_availability_api(availability_api, json) + elif save: save_api = WaybackMachineSaveAPI(url, user_agent=user_agent) save_api.save() click.echo("Archive URL:") @@ -303,43 +318,7 @@ def main( if headers: click.echo("Save API headers:") click.echo(save_api.headers) - return - - def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: - domain = None - sys_random = random.SystemRandom() - uid = 
"".join( - sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6) - ) - url_count = 0 - file_name = None - - for url in url_gen: - url_count += 1 - if not domain: - match = re.search("https?://([A-Za-z_0-9.-]+).*", url) - - domain = "domain-unknown" - - if match is not None: - domain = match.group(1) - - file_name = f"{domain}-urls-{uid}.txt" - file_path = os.path.join(os.getcwd(), file_name) - if not os.path.isfile(file_path): - open(file_path, "w+").close() - - with open(file_path, "a") as f: - f.write(f"{url}\n") - - click.echo(url) - - if url_count > 0 or file_name is not None: - click.echo(f"\n\n'{file_name}' saved in current working directory") - else: - click.echo("No known URLs found. Please try a diffrent input!") - - if known_urls: + elif known_urls: wayback = Url(url, user_agent) url_gen = wayback.known_urls(subdomain=subdomain) @@ -348,9 +327,8 @@ def main( else: for url in url_gen: click.echo(url) - - if cdx: - filters = list(cdxfilter) + elif cdx: + filters = list(cdx_filter) collapses = list(collapse) cdx_print = list(cdx_print) @@ -372,35 +350,36 @@ def main( if len(cdx_print) == 0: click.echo(snapshot) else: - output_string = "" + output_string = [] if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]): - output_string = output_string + snapshot.urlkey + " " + output_string.append(snapshot.urlkey) if any( val in cdx_print for val in ["timestamp", "time-stamp", "time_stamp"] ): - output_string = output_string + snapshot.timestamp + " " + output_string.append(snapshot.timestamp) if "original" in cdx_print: - output_string = output_string + snapshot.original + " " + output_string.append(snapshot.original) if any( val in cdx_print for val in ["mimetype", "mime-type", "mime_type"] ): - output_string = output_string + snapshot.mimetype + " " + output_string.append(snapshot.mimetype) if any( val in cdx_print for val in ["statuscode", "status-code", "status_code"] ): - output_string = output_string + snapshot.statuscode 
+ " " + output_string.append(snapshot.statuscode) if "digest" in cdx_print: - output_string = output_string + snapshot.digest + " " + output_string.append(snapshot.digest) if "length" in cdx_print: - output_string = output_string + snapshot.length + " " + output_string.append(snapshot.length) if any( val in cdx_print for val in ["archiveurl", "archive-url", "archive_url"] ): - output_string = output_string + snapshot.archive_url + " " - click.echo(output_string) + output_string.append(snapshot.archive_url) + + click.echo(" ".join(output_string)) if __name__ == "__main__": diff --git a/waybackpy/save_api.py b/waybackpy/save_api.py index f511ac8..c0ca55c 100644 --- a/waybackpy/save_api.py +++ b/waybackpy/save_api.py @@ -19,7 +19,10 @@ class WaybackMachineSaveAPI(object): """ def __init__( - self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8 + self, + url: str, + user_agent: str = DEFAULT_USER_AGENT, + max_tries: int = 8, ) -> None: self.url = str(url).strip().replace(" ", "%20") self.request_url = "https://web.archive.org/save/" + self.url @@ -169,17 +172,16 @@ class WaybackMachineSaveAPI(object): tries = 0 while True: - if self.saved_archive is None: - if tries >= 1: - self.sleep(tries) + if tries >= 1: + self.sleep(tries) - self.get_save_request_headers() - self.saved_archive = self.archive_url_parser() + self.get_save_request_headers() + self.saved_archive = self.archive_url_parser() - if isinstance(self.saved_archive, str): - self._archive_url = self.saved_archive - self.timestamp() - return self.saved_archive + if isinstance(self.saved_archive, str): + self._archive_url = self.saved_archive + self.timestamp() + return self.saved_archive tries += 1 if tries >= self.max_tries: diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 38dd1b6..dbe3909 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -6,21 +6,25 @@ from .cdx_api import WaybackMachineCDXServerAPI from .save_api import WaybackMachineSaveAPI from .utils import 
DEFAULT_USER_AGENT -""" -The Url class is not recommended to be used anymore, instead use the -WaybackMachineSaveAPI, WaybackMachineAvailabilityAPI and WaybackMachineCDXServerAPI. - -The reason it is still in the code is backwards compatibility with 2.x.x versions. - -If were are using the Url before the update to version 3.x.x, your code should still be -working fine and there is no hurry to update the interface but is recommended that you -do not use the Url class for new code as it would be removed after 2025 also the first -3.x.x versions was released in January 2022 and three years are more than enough to -update the older interface code. -""" - class Url(object): + """ + The Url class is not recommended to be used anymore, instead use: + + - WaybackMachineSaveAPI + - WaybackMachineAvailabilityAPI + - WaybackMachineCDXServerAPI + + The reason it is still in the code is backwards compatibility with 2.x.x + versions. + + If you were using the Url class before the update to version 3.x.x, your code + should still work fine and there is no hurry to update the interface, but it is + recommended that you do not use the Url class for new code, as it will be + removed after 2025. The first 3.x.x version was released in January 2022, + and three years are more than enough to update the older interface code. + """ + def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None: self.url = url self.user_agent = str(user_agent)