Fix Pylint errors pointed out by Codacy (#133)
* fix: pylint errors pointed out by codacy * fix: line length * fix: help text * fix: revert https://stackoverflow.com/a/64477857, which makes the cli unusable * fix: cli error and refactor code
This commit is contained in:
		@@ -26,27 +26,25 @@ class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
        user_agent: str = DEFAULT_USER_AGENT,
 | 
			
		||||
        start_timestamp: Optional[str] = None,
 | 
			
		||||
        end_timestamp: Optional[str] = None,
 | 
			
		||||
        filters: List[str] = [],
 | 
			
		||||
        filters: Optional[List[str]] = None,
 | 
			
		||||
        match_type: Optional[str] = None,
 | 
			
		||||
        gzip: Optional[str] = None,
 | 
			
		||||
        collapses: List[str] = [],
 | 
			
		||||
        collapses: Optional[List[str]] = None,
 | 
			
		||||
        limit: Optional[str] = None,
 | 
			
		||||
        max_tries: int = 3,
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        self.url = str(url).strip().replace(" ", "%20")
 | 
			
		||||
        self.user_agent = user_agent
 | 
			
		||||
        self.start_timestamp = (
 | 
			
		||||
            str(start_timestamp) if start_timestamp is not None else None
 | 
			
		||||
        )
 | 
			
		||||
        self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
 | 
			
		||||
        self.filters = filters
 | 
			
		||||
        self.start_timestamp = None if start_timestamp is None else str(start_timestamp)
 | 
			
		||||
        self.end_timestamp = None if end_timestamp is None else str(end_timestamp)
 | 
			
		||||
        self.filters = [] if filters is None else filters
 | 
			
		||||
        check_filters(self.filters)
 | 
			
		||||
        self.match_type = str(match_type).strip() if match_type is not None else None
 | 
			
		||||
        self.match_type = None if match_type is None else str(match_type).strip()
 | 
			
		||||
        check_match_type(self.match_type, self.url)
 | 
			
		||||
        self.gzip = gzip
 | 
			
		||||
        self.collapses = collapses
 | 
			
		||||
        self.collapses = [] if collapses is None else collapses
 | 
			
		||||
        check_collapses(self.collapses)
 | 
			
		||||
        self.limit = limit if limit is not None else 5000
 | 
			
		||||
        self.limit = 5000 if limit is None else limit
 | 
			
		||||
        self.max_tries = max_tries
 | 
			
		||||
        self.last_api_request_url: Optional[str] = None
 | 
			
		||||
        self.use_page = False
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										185
									
								
								waybackpy/cli.py
									
									
									
									
									
								
							
							
						
						
									
										185
									
								
								waybackpy/cli.py
									
									
									
									
									
								
							@@ -16,6 +16,52 @@ from .utils import DEFAULT_USER_AGENT
 | 
			
		||||
from .wrapper import Url
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def echo_availability_api(
    availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
) -> None:
    """
    Echo the archive URL of an availability API instance to the terminal.

    An "Archive URL:" header is printed first. If the instance's
    ``archive_url`` is falsy, an explanatory "NO ARCHIVE FOUND" notice is
    printed in its place. When ``json`` is true, the instance's ``JSON``
    attribute is additionally serialized and printed under a
    "JSON response:" header.
    """
    click.echo("Archive URL:")

    if availability_api_instance.archive_url:
        message = availability_api_instance.archive_url
    else:
        # Adjacent literals are joined by the parser; the resulting text is
        # the same single-line notice.
        message = (
            "NO ARCHIVE FOUND - The requested URL is probably "
            "not yet archived or if the URL was recently archived then it is "
            "not yet available via the Wayback Machine's availability API "
            "because of database lag and should be available after some time."
        )
    click.echo(message)

    if not json:
        return

    click.echo("JSON response:")
    # `JSON` is a module-level name from outside this function (presumably the
    # stdlib json module under an alias, since the `json` parameter shadows
    # the usual name) - confirm against the file's imports.
    click.echo(JSON.dumps(availability_api_instance.JSON))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
    """
    Echo every URL yielded by ``url_gen`` and append each one to a text file
    in the current working directory.

    The file is named ``<domain>-urls-<uid>.txt``. ``<domain>`` is extracted
    from the first URL yielded (``"domain-unknown"`` when it does not look
    like an http(s) URL) and ``<uid>`` is a random six character suffix of
    lowercase letters and digits.

    If the generator yields nothing, no file is created and a notice is
    echoed instead of the "saved" message.
    """
    sys_random = random.SystemRandom()
    uid = "".join(
        sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )
    file_name = None
    file_path = None

    for url in url_gen:
        # The output file is chosen once, from the first URL seen.
        if file_path is None:
            match = re.search("https?://([A-Za-z_0-9.-]+).*", url)
            domain = "domain-unknown" if match is None else match.group(1)
            file_name = f"{domain}-urls-{uid}.txt"
            file_path = os.path.join(os.getcwd(), file_name)

        # Write EVERY url, not only the first one: the append used to live
        # inside the first-URL branch above, so the file ended up with a
        # single line. Appending per URL keeps what was already fetched on
        # disk if the generator fails midway.
        with open(file_path, "a", encoding="utf-8") as url_file:
            url_file.write(f"{url}\n")

        click.echo(url)

    if file_name is not None:
        click.echo(f"\n\n'{file_name}' saved in current working directory")
    else:
        click.echo("No known URLs found. Please try a diffrent input!")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@click.command()
 | 
			
		||||
@click.option(
 | 
			
		||||
    "-u", "--url", help="URL on which Wayback machine operations are to be performed."
 | 
			
		||||
@@ -30,11 +76,12 @@ from .wrapper import Url
 | 
			
		||||
@click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.")
 | 
			
		||||
@click.option(
 | 
			
		||||
    "-l",
 | 
			
		||||
    "--showlicense",
 | 
			
		||||
    "--show-license",
 | 
			
		||||
    "--show_license",
 | 
			
		||||
    "--license",
 | 
			
		||||
    is_flag=True,
 | 
			
		||||
    default=False,
 | 
			
		||||
    help="show license of Waybackpy.",
 | 
			
		||||
    help="Show license of Waybackpy.",
 | 
			
		||||
)
 | 
			
		||||
@click.option(
 | 
			
		||||
    "-n",
 | 
			
		||||
@@ -129,7 +176,8 @@ from .wrapper import Url
 | 
			
		||||
)
 | 
			
		||||
@click.option(
 | 
			
		||||
    "-f",
 | 
			
		||||
    "--cdxfilter",
 | 
			
		||||
    "--cdx-filter",
 | 
			
		||||
    "--cdx_filter",
 | 
			
		||||
    "--filter",
 | 
			
		||||
    multiple=True,
 | 
			
		||||
    help="Filter on a specific field or all the CDX fields.",
 | 
			
		||||
@@ -169,11 +217,11 @@ from .wrapper import Url
 | 
			
		||||
    + "if this parameter is not used then the plain text response of the CDX API "
 | 
			
		||||
    + "will be printed.",
 | 
			
		||||
)
 | 
			
		||||
def main(
 | 
			
		||||
def main(  # pylint: disable=no-value-for-parameter
 | 
			
		||||
    url: Optional[str],
 | 
			
		||||
    user_agent: str,
 | 
			
		||||
    version: bool,
 | 
			
		||||
    showlicense: bool,
 | 
			
		||||
    show_license: bool,
 | 
			
		||||
    newest: bool,
 | 
			
		||||
    oldest: bool,
 | 
			
		||||
    json: bool,
 | 
			
		||||
@@ -191,7 +239,7 @@ def main(
 | 
			
		||||
    cdx: bool,
 | 
			
		||||
    start_timestamp: Optional[str],
 | 
			
		||||
    end_timestamp: Optional[str],
 | 
			
		||||
    cdxfilter: List[str],
 | 
			
		||||
    cdx_filter: List[str],
 | 
			
		||||
    match_type: Optional[str],
 | 
			
		||||
    gzip: Optional[str],
 | 
			
		||||
    collapse: List[str],
 | 
			
		||||
@@ -219,26 +267,18 @@ def main(
 | 
			
		||||
    Released under the MIT License. Use the flag --license for license.
 | 
			
		||||
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if version:
 | 
			
		||||
        click.echo(f"waybackpy version {__version__}")
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    if showlicense:
 | 
			
		||||
    elif show_license:
 | 
			
		||||
        click.echo(
 | 
			
		||||
            requests.get(
 | 
			
		||||
                url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE"
 | 
			
		||||
            ).text
 | 
			
		||||
        )
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    if not url:
 | 
			
		||||
    elif url is None:
 | 
			
		||||
        click.echo("No URL detected. Please provide an URL.")
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    if (
 | 
			
		||||
        url
 | 
			
		||||
        and not version
 | 
			
		||||
    elif (
 | 
			
		||||
        not version
 | 
			
		||||
        and not oldest
 | 
			
		||||
        and not newest
 | 
			
		||||
        and not near
 | 
			
		||||
@@ -250,39 +290,16 @@ def main(
 | 
			
		||||
            "Only URL passed, but did not specify what to do with the URL. "
 | 
			
		||||
            "Use --help flag for help using waybackpy."
 | 
			
		||||
        )
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    def echo_availability_api(
 | 
			
		||||
        availability_api_instance: WaybackMachineAvailabilityAPI,
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        click.echo("Archive URL:")
 | 
			
		||||
        if not availability_api_instance.archive_url:
 | 
			
		||||
            archive_url = (
 | 
			
		||||
                "NO ARCHIVE FOUND - The requested URL is probably "
 | 
			
		||||
                + "not yet archived or if the URL was recently archived then it is "
 | 
			
		||||
                + "not yet available via the Wayback Machine's availability API "
 | 
			
		||||
                + "because of database lag and should be available after some time."
 | 
			
		||||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            archive_url = availability_api_instance.archive_url
 | 
			
		||||
        click.echo(archive_url)
 | 
			
		||||
        if json:
 | 
			
		||||
            click.echo("JSON response:")
 | 
			
		||||
            click.echo(JSON.dumps(availability_api_instance.JSON))
 | 
			
		||||
 | 
			
		||||
    availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
 | 
			
		||||
 | 
			
		||||
    if oldest:
 | 
			
		||||
    elif oldest:
 | 
			
		||||
        availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
 | 
			
		||||
        availability_api.oldest()
 | 
			
		||||
        echo_availability_api(availability_api)
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    if newest:
 | 
			
		||||
        echo_availability_api(availability_api, json)
 | 
			
		||||
    elif newest:
 | 
			
		||||
        availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
 | 
			
		||||
        availability_api.newest()
 | 
			
		||||
        echo_availability_api(availability_api)
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    if near:
 | 
			
		||||
        echo_availability_api(availability_api, json)
 | 
			
		||||
    elif near:
 | 
			
		||||
        availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
 | 
			
		||||
        near_args = {}
 | 
			
		||||
        keys = ["year", "month", "day", "hour", "minute"]
 | 
			
		||||
        args_arr = [year, month, day, hour, minute]
 | 
			
		||||
@@ -290,10 +307,8 @@ def main(
 | 
			
		||||
            if arg:
 | 
			
		||||
                near_args[key] = arg
 | 
			
		||||
        availability_api.near(**near_args)
 | 
			
		||||
        echo_availability_api(availability_api)
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    if save:
 | 
			
		||||
        echo_availability_api(availability_api, json)
 | 
			
		||||
    elif save:
 | 
			
		||||
        save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
 | 
			
		||||
        save_api.save()
 | 
			
		||||
        click.echo("Archive URL:")
 | 
			
		||||
@@ -303,43 +318,7 @@ def main(
 | 
			
		||||
        if headers:
 | 
			
		||||
            click.echo("Save API headers:")
 | 
			
		||||
            click.echo(save_api.headers)
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
    def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
 | 
			
		||||
        domain = None
 | 
			
		||||
        sys_random = random.SystemRandom()
 | 
			
		||||
        uid = "".join(
 | 
			
		||||
            sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
 | 
			
		||||
        )
 | 
			
		||||
        url_count = 0
 | 
			
		||||
        file_name = None
 | 
			
		||||
 | 
			
		||||
        for url in url_gen:
 | 
			
		||||
            url_count += 1
 | 
			
		||||
            if not domain:
 | 
			
		||||
                match = re.search("https?://([A-Za-z_0-9.-]+).*", url)
 | 
			
		||||
 | 
			
		||||
                domain = "domain-unknown"
 | 
			
		||||
 | 
			
		||||
                if match is not None:
 | 
			
		||||
                    domain = match.group(1)
 | 
			
		||||
 | 
			
		||||
                file_name = f"{domain}-urls-{uid}.txt"
 | 
			
		||||
                file_path = os.path.join(os.getcwd(), file_name)
 | 
			
		||||
                if not os.path.isfile(file_path):
 | 
			
		||||
                    open(file_path, "w+").close()
 | 
			
		||||
 | 
			
		||||
                with open(file_path, "a") as f:
 | 
			
		||||
                    f.write(f"{url}\n")
 | 
			
		||||
 | 
			
		||||
            click.echo(url)
 | 
			
		||||
 | 
			
		||||
        if url_count > 0 or file_name is not None:
 | 
			
		||||
            click.echo(f"\n\n'{file_name}' saved in current working directory")
 | 
			
		||||
        else:
 | 
			
		||||
            click.echo("No known URLs found. Please try a diffrent input!")
 | 
			
		||||
 | 
			
		||||
    if known_urls:
 | 
			
		||||
    elif known_urls:
 | 
			
		||||
        wayback = Url(url, user_agent)
 | 
			
		||||
        url_gen = wayback.known_urls(subdomain=subdomain)
 | 
			
		||||
 | 
			
		||||
@@ -348,9 +327,8 @@ def main(
 | 
			
		||||
        else:
 | 
			
		||||
            for url in url_gen:
 | 
			
		||||
                click.echo(url)
 | 
			
		||||
 | 
			
		||||
    if cdx:
 | 
			
		||||
        filters = list(cdxfilter)
 | 
			
		||||
    elif cdx:
 | 
			
		||||
        filters = list(cdx_filter)
 | 
			
		||||
        collapses = list(collapse)
 | 
			
		||||
        cdx_print = list(cdx_print)
 | 
			
		||||
 | 
			
		||||
@@ -372,35 +350,36 @@ def main(
 | 
			
		||||
            if len(cdx_print) == 0:
 | 
			
		||||
                click.echo(snapshot)
 | 
			
		||||
            else:
 | 
			
		||||
                output_string = ""
 | 
			
		||||
                output_string = []
 | 
			
		||||
                if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]):
 | 
			
		||||
                    output_string = output_string + snapshot.urlkey + " "
 | 
			
		||||
                    output_string.append(snapshot.urlkey)
 | 
			
		||||
                if any(
 | 
			
		||||
                    val in cdx_print
 | 
			
		||||
                    for val in ["timestamp", "time-stamp", "time_stamp"]
 | 
			
		||||
                ):
 | 
			
		||||
                    output_string = output_string + snapshot.timestamp + " "
 | 
			
		||||
                    output_string.append(snapshot.timestamp)
 | 
			
		||||
                if "original" in cdx_print:
 | 
			
		||||
                    output_string = output_string + snapshot.original + " "
 | 
			
		||||
                    output_string.append(snapshot.original)
 | 
			
		||||
                if any(
 | 
			
		||||
                    val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]
 | 
			
		||||
                ):
 | 
			
		||||
                    output_string = output_string + snapshot.mimetype + " "
 | 
			
		||||
                    output_string.append(snapshot.mimetype)
 | 
			
		||||
                if any(
 | 
			
		||||
                    val in cdx_print
 | 
			
		||||
                    for val in ["statuscode", "status-code", "status_code"]
 | 
			
		||||
                ):
 | 
			
		||||
                    output_string = output_string + snapshot.statuscode + " "
 | 
			
		||||
                    output_string.append(snapshot.statuscode)
 | 
			
		||||
                if "digest" in cdx_print:
 | 
			
		||||
                    output_string = output_string + snapshot.digest + " "
 | 
			
		||||
                    output_string.append(snapshot.digest)
 | 
			
		||||
                if "length" in cdx_print:
 | 
			
		||||
                    output_string = output_string + snapshot.length + " "
 | 
			
		||||
                    output_string.append(snapshot.length)
 | 
			
		||||
                if any(
 | 
			
		||||
                    val in cdx_print
 | 
			
		||||
                    for val in ["archiveurl", "archive-url", "archive_url"]
 | 
			
		||||
                ):
 | 
			
		||||
                    output_string = output_string + snapshot.archive_url + " "
 | 
			
		||||
                click.echo(output_string)
 | 
			
		||||
                    output_string.append(snapshot.archive_url)
 | 
			
		||||
 | 
			
		||||
                click.echo(" ".join(output_string))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
 
 | 
			
		||||
@@ -19,7 +19,10 @@ class WaybackMachineSaveAPI(object):
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
 | 
			
		||||
        self,
 | 
			
		||||
        url: str,
 | 
			
		||||
        user_agent: str = DEFAULT_USER_AGENT,
 | 
			
		||||
        max_tries: int = 8,
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        self.url = str(url).strip().replace(" ", "%20")
 | 
			
		||||
        self.request_url = "https://web.archive.org/save/" + self.url
 | 
			
		||||
@@ -169,17 +172,16 @@ class WaybackMachineSaveAPI(object):
 | 
			
		||||
        tries = 0
 | 
			
		||||
 | 
			
		||||
        while True:
 | 
			
		||||
            if self.saved_archive is None:
 | 
			
		||||
                if tries >= 1:
 | 
			
		||||
                    self.sleep(tries)
 | 
			
		||||
            if tries >= 1:
 | 
			
		||||
                self.sleep(tries)
 | 
			
		||||
 | 
			
		||||
                self.get_save_request_headers()
 | 
			
		||||
                self.saved_archive = self.archive_url_parser()
 | 
			
		||||
            self.get_save_request_headers()
 | 
			
		||||
            self.saved_archive = self.archive_url_parser()
 | 
			
		||||
 | 
			
		||||
                if isinstance(self.saved_archive, str):
 | 
			
		||||
                    self._archive_url = self.saved_archive
 | 
			
		||||
                    self.timestamp()
 | 
			
		||||
                    return self.saved_archive
 | 
			
		||||
            if isinstance(self.saved_archive, str):
 | 
			
		||||
                self._archive_url = self.saved_archive
 | 
			
		||||
                self.timestamp()
 | 
			
		||||
                return self.saved_archive
 | 
			
		||||
 | 
			
		||||
            tries += 1
 | 
			
		||||
            if tries >= self.max_tries:
 | 
			
		||||
 
 | 
			
		||||
@@ -6,21 +6,25 @@ from .cdx_api import WaybackMachineCDXServerAPI
 | 
			
		||||
from .save_api import WaybackMachineSaveAPI
 | 
			
		||||
from .utils import DEFAULT_USER_AGENT
 | 
			
		||||
 | 
			
		||||
"""
 | 
			
		||||
The Url class is not recommended to be used anymore, instead use the
 | 
			
		||||
WaybackMachineSaveAPI, WaybackMachineAvailabilityAPI and WaybackMachineCDXServerAPI.
 | 
			
		||||
 | 
			
		||||
The reason it is still in the code is backwards compatibility with 2.x.x versions.
 | 
			
		||||
 | 
			
		||||
If you were using the Url before the update to version 3.x.x, your code should still be
 | 
			
		||||
working fine and there is no hurry to update the interface but is recommended that you
 | 
			
		||||
do not use the Url class for new code as it would be removed after 2025 also the first
 | 
			
		||||
3.x.x versions was released in January 2022 and three years are more than enough to
 | 
			
		||||
update the older interface code.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Url(object):
 | 
			
		||||
    """
 | 
			
		||||
    The Url class is not recommended to be used anymore, instead use:
 | 
			
		||||
 | 
			
		||||
    - WaybackMachineSaveAPI
 | 
			
		||||
    - WaybackMachineAvailabilityAPI
 | 
			
		||||
    - WaybackMachineCDXServerAPI
 | 
			
		||||
 | 
			
		||||
    The reason it is still in the code is backwards compatibility with 2.x.x
 | 
			
		||||
    versions.
 | 
			
		||||
 | 
			
		||||
    If you were using the Url before the update to version 3.x.x, your code should
 | 
			
		||||
    still be working fine and there is no hurry to update the interface but is
 | 
			
		||||
    recommended that you do not use the Url class for new code as it would be
 | 
			
		||||
    removed after 2025 also the first 3.x.x versions was released in January 2022
 | 
			
		||||
    and three years are more than enough to update the older interface code.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
 | 
			
		||||
        self.url = url
 | 
			
		||||
        self.user_agent = str(user_agent)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user