Fix Pylint errors pointed out by Codacy (#133)

* fix: Pylint errors pointed out by Codacy

* fix: line length

* fix: help text

* fix: revert

https://stackoverflow.com/a/64477857 makes cli unusable

* fix: CLI error and refactor code
This commit is contained in:
eggplants 2022-02-05 05:25:40 +09:00 committed by GitHub
parent 9d9cc3328b
commit 5f3cd28046
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 119 additions and 136 deletions

View File

@ -26,27 +26,25 @@ class WaybackMachineCDXServerAPI(object):
user_agent: str = DEFAULT_USER_AGENT,
start_timestamp: Optional[str] = None,
end_timestamp: Optional[str] = None,
filters: List[str] = [],
filters: Optional[List[str]] = None,
match_type: Optional[str] = None,
gzip: Optional[str] = None,
collapses: List[str] = [],
collapses: Optional[List[str]] = None,
limit: Optional[str] = None,
max_tries: int = 3,
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent
self.start_timestamp = (
str(start_timestamp) if start_timestamp is not None else None
)
self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
self.filters = filters
self.start_timestamp = None if start_timestamp is None else str(start_timestamp)
self.end_timestamp = None if end_timestamp is None else str(end_timestamp)
self.filters = [] if filters is None else filters
check_filters(self.filters)
self.match_type = str(match_type).strip() if match_type is not None else None
self.match_type = None if match_type is None else str(match_type).strip()
check_match_type(self.match_type, self.url)
self.gzip = gzip
self.collapses = collapses
self.collapses = [] if collapses is None else collapses
check_collapses(self.collapses)
self.limit = limit if limit is not None else 5000
self.limit = 5000 if limit is None else limit
self.max_tries = max_tries
self.last_api_request_url: Optional[str] = None
self.use_page = False

View File

@ -16,6 +16,52 @@ from .utils import DEFAULT_USER_AGENT
from .wrapper import Url
def echo_availability_api(
    availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
) -> None:
    """Echo the archive URL of an availability API instance.

    Prints an "Archive URL:" header followed by the instance's
    ``archive_url``; when that attribute is falsy a fixed
    "NO ARCHIVE FOUND" explanation is printed in its place.  When ``json``
    is true, the instance's ``JSON`` attribute is additionally printed,
    serialised with ``JSON.dumps``, under a "JSON response:" header.
    """
    click.echo("Archive URL:")

    # ``archive_url`` is deliberately read a second time in the "found"
    # branch instead of being cached, so attribute access happens exactly
    # as often as before.
    if availability_api_instance.archive_url:
        message = availability_api_instance.archive_url
    else:
        message = (
            "NO ARCHIVE FOUND - The requested URL is probably "
            "not yet archived or if the URL was recently archived then it is "
            "not yet available via the Wayback Machine's availability API "
            "because of database lag and should be available after some time."
        )
    click.echo(message)

    if not json:
        return

    click.echo("JSON response:")
    click.echo(JSON.dumps(availability_api_instance.JSON))
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
    """Write every URL from ``url_gen`` to a text file and echo it.

    The file is created in the current working directory and is named
    ``<domain>-urls-<uid>.txt``: ``<domain>`` is the host captured from the
    first URL yielded (``domain-unknown`` when the pattern does not match)
    and ``<uid>`` is six random lowercase letters/digits.  Each URL is
    written on its own line and echoed to the terminal.  When the generator
    yields nothing, no file is created and a "no URLs found" message is
    echoed instead.

    The output file is opened once (append mode, UTF-8) and kept open for
    the whole run rather than being reopened for every single URL.
    """
    # Function-scope import: the module's import block is not visible here.
    from itertools import chain

    sys_random = random.SystemRandom()
    uid = "".join(
        sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )

    url_iter = iter(url_gen)
    # URLs are strings, so ``None`` is a safe "generator was empty" sentinel.
    first_url = next(url_iter, None)
    if first_url is None:
        click.echo("No known URLs found. Please try a diffrent input!")
        return

    # The file name is derived from the first URL only.
    match = re.search(r"https?://([A-Za-z_0-9.-]+).*", first_url)
    domain = "domain-unknown" if match is None else match.group(1)
    file_name = f"{domain}-urls-{uid}.txt"
    file_path = os.path.join(os.getcwd(), file_name)

    with open(file_path, "a", encoding="utf-8") as url_file:
        for url in chain((first_url,), url_iter):
            url_file.write(f"{url}\n")
            click.echo(url)

    click.echo(f"\n\n'{file_name}' saved in current working directory")
@click.command()
@click.option(
"-u", "--url", help="URL on which Wayback machine operations are to be performed."
@ -30,11 +76,12 @@ from .wrapper import Url
@click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.")
@click.option(
"-l",
"--showlicense",
"--show-license",
"--show_license",
"--license",
is_flag=True,
default=False,
help="show license of Waybackpy.",
help="Show license of Waybackpy.",
)
@click.option(
"-n",
@ -129,7 +176,8 @@ from .wrapper import Url
)
@click.option(
"-f",
"--cdxfilter",
"--cdx-filter",
"--cdx_filter",
"--filter",
multiple=True,
help="Filter on a specific field or all the CDX fields.",
@ -169,11 +217,11 @@ from .wrapper import Url
+ "if this parameter is not used then the plain text response of the CDX API "
+ "will be printed.",
)
def main(
def main( # pylint: disable=no-value-for-parameter
url: Optional[str],
user_agent: str,
version: bool,
showlicense: bool,
show_license: bool,
newest: bool,
oldest: bool,
json: bool,
@ -191,7 +239,7 @@ def main(
cdx: bool,
start_timestamp: Optional[str],
end_timestamp: Optional[str],
cdxfilter: List[str],
cdx_filter: List[str],
match_type: Optional[str],
gzip: Optional[str],
collapse: List[str],
@ -219,26 +267,18 @@ def main(
Released under the MIT License. Use the flag --license for license.
"""
if version:
click.echo(f"waybackpy version {__version__}")
return
if showlicense:
elif show_license:
click.echo(
requests.get(
url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE"
).text
)
return
if not url:
elif url is None:
click.echo("No URL detected. Please provide an URL.")
return
if (
url
and not version
elif (
not version
and not oldest
and not newest
and not near
@ -250,39 +290,16 @@ def main(
"Only URL passed, but did not specify what to do with the URL. "
"Use --help flag for help using waybackpy."
)
return
def echo_availability_api(
availability_api_instance: WaybackMachineAvailabilityAPI,
) -> None:
click.echo("Archive URL:")
if not availability_api_instance.archive_url:
archive_url = (
"NO ARCHIVE FOUND - The requested URL is probably "
+ "not yet archived or if the URL was recently archived then it is "
+ "not yet available via the Wayback Machine's availability API "
+ "because of database lag and should be available after some time."
)
else:
archive_url = availability_api_instance.archive_url
click.echo(archive_url)
if json:
click.echo("JSON response:")
click.echo(JSON.dumps(availability_api_instance.JSON))
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
if oldest:
elif oldest:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
availability_api.oldest()
echo_availability_api(availability_api)
return
if newest:
echo_availability_api(availability_api, json)
elif newest:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
availability_api.newest()
echo_availability_api(availability_api)
return
if near:
echo_availability_api(availability_api, json)
elif near:
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
near_args = {}
keys = ["year", "month", "day", "hour", "minute"]
args_arr = [year, month, day, hour, minute]
@ -290,10 +307,8 @@ def main(
if arg:
near_args[key] = arg
availability_api.near(**near_args)
echo_availability_api(availability_api)
return
if save:
echo_availability_api(availability_api, json)
elif save:
save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
save_api.save()
click.echo("Archive URL:")
@ -303,43 +318,7 @@ def main(
if headers:
click.echo("Save API headers:")
click.echo(save_api.headers)
return
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
domain = None
sys_random = random.SystemRandom()
uid = "".join(
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
)
url_count = 0
file_name = None
for url in url_gen:
url_count += 1
if not domain:
match = re.search("https?://([A-Za-z_0-9.-]+).*", url)
domain = "domain-unknown"
if match is not None:
domain = match.group(1)
file_name = f"{domain}-urls-{uid}.txt"
file_path = os.path.join(os.getcwd(), file_name)
if not os.path.isfile(file_path):
open(file_path, "w+").close()
with open(file_path, "a") as f:
f.write(f"{url}\n")
click.echo(url)
if url_count > 0 or file_name is not None:
click.echo(f"\n\n'{file_name}' saved in current working directory")
else:
click.echo("No known URLs found. Please try a diffrent input!")
if known_urls:
elif known_urls:
wayback = Url(url, user_agent)
url_gen = wayback.known_urls(subdomain=subdomain)
@ -348,9 +327,8 @@ def main(
else:
for url in url_gen:
click.echo(url)
if cdx:
filters = list(cdxfilter)
elif cdx:
filters = list(cdx_filter)
collapses = list(collapse)
cdx_print = list(cdx_print)
@ -372,35 +350,36 @@ def main(
if len(cdx_print) == 0:
click.echo(snapshot)
else:
output_string = ""
output_string = []
if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]):
output_string = output_string + snapshot.urlkey + " "
output_string.append(snapshot.urlkey)
if any(
val in cdx_print
for val in ["timestamp", "time-stamp", "time_stamp"]
):
output_string = output_string + snapshot.timestamp + " "
output_string.append(snapshot.timestamp)
if "original" in cdx_print:
output_string = output_string + snapshot.original + " "
output_string.append(snapshot.original)
if any(
val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]
):
output_string = output_string + snapshot.mimetype + " "
output_string.append(snapshot.mimetype)
if any(
val in cdx_print
for val in ["statuscode", "status-code", "status_code"]
):
output_string = output_string + snapshot.statuscode + " "
output_string.append(snapshot.statuscode)
if "digest" in cdx_print:
output_string = output_string + snapshot.digest + " "
output_string.append(snapshot.digest)
if "length" in cdx_print:
output_string = output_string + snapshot.length + " "
output_string.append(snapshot.length)
if any(
val in cdx_print
for val in ["archiveurl", "archive-url", "archive_url"]
):
output_string = output_string + snapshot.archive_url + " "
click.echo(output_string)
output_string.append(snapshot.archive_url)
click.echo(" ".join(output_string))
if __name__ == "__main__":

View File

@ -19,7 +19,10 @@ class WaybackMachineSaveAPI(object):
"""
def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
self,
url: str,
user_agent: str = DEFAULT_USER_AGENT,
max_tries: int = 8,
) -> None:
self.url = str(url).strip().replace(" ", "%20")
self.request_url = "https://web.archive.org/save/" + self.url
@ -169,17 +172,16 @@ class WaybackMachineSaveAPI(object):
tries = 0
while True:
if self.saved_archive is None:
if tries >= 1:
self.sleep(tries)
if tries >= 1:
self.sleep(tries)
self.get_save_request_headers()
self.saved_archive = self.archive_url_parser()
self.get_save_request_headers()
self.saved_archive = self.archive_url_parser()
if isinstance(self.saved_archive, str):
self._archive_url = self.saved_archive
self.timestamp()
return self.saved_archive
if isinstance(self.saved_archive, str):
self._archive_url = self.saved_archive
self.timestamp()
return self.saved_archive
tries += 1
if tries >= self.max_tries:

View File

@ -6,21 +6,25 @@ from .cdx_api import WaybackMachineCDXServerAPI
from .save_api import WaybackMachineSaveAPI
from .utils import DEFAULT_USER_AGENT
"""
The Url class is not recommended to be used anymore, instead use the
WaybackMachineSaveAPI, WaybackMachineAvailabilityAPI and WaybackMachineCDXServerAPI.
The reason it is still in the code is backwards compatibility with 2.x.x versions.
If you were using the Url class before the update to version 3.x.x, your code should still be
working fine and there is no hurry to update the interface, but it is recommended that you
do not use the Url class for new code, as it will be removed after 2025. The first
3.x.x version was released in January 2022, and three years are more than enough to
update the older interface code.
"""
class Url(object):
"""
The Url class is not recommended to be used anymore, instead use:
- WaybackMachineSaveAPI
- WaybackMachineAvailabilityAPI
- WaybackMachineCDXServerAPI
The reason it is still in the code is backwards compatibility with 2.x.x
versions.
If you were using the Url class before the update to version 3.x.x, your code
should still be working fine and there is no hurry to update the interface, but
it is recommended that you do not use the Url class for new code, as it will be
removed after 2025. The first 3.x.x version was released in January 2022, and
three years are more than enough to update the older interface code.
"""
def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
self.url = url
self.user_agent = str(user_agent)