Fix Pylint errors that were pointed out by Codacy (#133)
* fix: pylint errors were pointed out by codacy * fix: line length * fix: help text * fix: revert https://stackoverflow.com/a/64477857 makes cli unusable * fix: cli error and refactor codes
This commit is contained in:
parent
9d9cc3328b
commit
5f3cd28046
@ -26,27 +26,25 @@ class WaybackMachineCDXServerAPI(object):
|
||||
user_agent: str = DEFAULT_USER_AGENT,
|
||||
start_timestamp: Optional[str] = None,
|
||||
end_timestamp: Optional[str] = None,
|
||||
filters: List[str] = [],
|
||||
filters: Optional[List[str]] = None,
|
||||
match_type: Optional[str] = None,
|
||||
gzip: Optional[str] = None,
|
||||
collapses: List[str] = [],
|
||||
collapses: Optional[List[str]] = None,
|
||||
limit: Optional[str] = None,
|
||||
max_tries: int = 3,
|
||||
) -> None:
|
||||
self.url = str(url).strip().replace(" ", "%20")
|
||||
self.user_agent = user_agent
|
||||
self.start_timestamp = (
|
||||
str(start_timestamp) if start_timestamp is not None else None
|
||||
)
|
||||
self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
|
||||
self.filters = filters
|
||||
self.start_timestamp = None if start_timestamp is None else str(start_timestamp)
|
||||
self.end_timestamp = None if end_timestamp is None else str(end_timestamp)
|
||||
self.filters = [] if filters is None else filters
|
||||
check_filters(self.filters)
|
||||
self.match_type = str(match_type).strip() if match_type is not None else None
|
||||
self.match_type = None if match_type is None else str(match_type).strip()
|
||||
check_match_type(self.match_type, self.url)
|
||||
self.gzip = gzip
|
||||
self.collapses = collapses
|
||||
self.collapses = [] if collapses is None else collapses
|
||||
check_collapses(self.collapses)
|
||||
self.limit = limit if limit is not None else 5000
|
||||
self.limit = 5000 if limit is None else limit
|
||||
self.max_tries = max_tries
|
||||
self.last_api_request_url: Optional[str] = None
|
||||
self.use_page = False
|
||||
|
185
waybackpy/cli.py
185
waybackpy/cli.py
@ -16,6 +16,52 @@ from .utils import DEFAULT_USER_AGENT
|
||||
from .wrapper import Url
|
||||
|
||||
|
||||
def echo_availability_api(
    availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
) -> None:
    """
    Echo the archive URL of an availability API instance to the terminal.

    An "Archive URL:" header is printed first, followed by the instance's
    ``archive_url`` or, when that attribute is falsy, a notice explaining
    that no archive was found. When ``json`` is true, the instance's
    ``JSON`` attribute is echoed as well, serialised with ``JSON.dumps``.

    :param availability_api_instance: Instance whose ``archive_url`` and
        ``JSON`` attributes are reported.
    :param json: Whether to also echo the JSON response.
    """
    not_found_notice = (
        "NO ARCHIVE FOUND - The requested URL is probably "
        "not yet archived or if the URL was recently archived then it is "
        "not yet available via the Wayback Machine's availability API "
        "because of database lag and should be available after some time."
    )

    click.echo("Archive URL:")
    if availability_api_instance.archive_url:
        # Read the attribute again instead of caching it, so it is accessed
        # exactly as many times as before.
        click.echo(availability_api_instance.archive_url)
    else:
        click.echo(not_found_notice)

    if not json:
        return

    click.echo("JSON response:")
    click.echo(JSON.dumps(availability_api_instance.JSON))
|
||||
|
||||
|
||||
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
    """
    Echo every URL yielded by ``url_gen`` and append it to a text file.

    The file lives in the current working directory and is named
    ``<domain>-urls-<uid>.txt``: ``<domain>`` is the host captured from the
    first URL yielded (``domain-unknown`` when that URL has no http(s) host)
    and ``<uid>`` is six random lowercase letters/digits. After the generator
    is exhausted, a message naming the file is echoed, or a "no known URLs"
    notice when nothing was yielded.

    :param url_gen: Generator yielding URL strings.
    """
    sys_random = random.SystemRandom()
    uid = "".join(
        sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )
    file_name = None
    file_path = None

    for url in url_gen:
        if file_path is None:
            # The output file is named after the first URL only; every later
            # URL is appended to that same file.
            match = re.search(r"https?://([A-Za-z_0-9.-]+).*", url)
            domain = "domain-unknown" if match is None else match.group(1)
            file_name = f"{domain}-urls-{uid}.txt"
            file_path = os.path.join(os.getcwd(), file_name)

        # The file is opened and closed for each URL, so everything written so
        # far is already on disk if iteration stops part-way. The encoding is
        # pinned to UTF-8 so non-ASCII URLs do not depend on the locale's
        # default encoding.
        with open(file_path, "a", encoding="utf-8") as url_file:
            url_file.write(f"{url}\n")

        click.echo(url)

    # file_name is set exactly when at least one URL was yielded.
    if file_name is None:
        click.echo("No known URLs found. Please try a diffrent input!")
    else:
        click.echo(f"\n\n'{file_name}' saved in current working directory")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"-u", "--url", help="URL on which Wayback machine operations are to be performed."
|
||||
@ -30,11 +76,12 @@ from .wrapper import Url
|
||||
@click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.")
|
||||
@click.option(
|
||||
"-l",
|
||||
"--showlicense",
|
||||
"--show-license",
|
||||
"--show_license",
|
||||
"--license",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="show license of Waybackpy.",
|
||||
help="Show license of Waybackpy.",
|
||||
)
|
||||
@click.option(
|
||||
"-n",
|
||||
@ -129,7 +176,8 @@ from .wrapper import Url
|
||||
)
|
||||
@click.option(
|
||||
"-f",
|
||||
"--cdxfilter",
|
||||
"--cdx-filter",
|
||||
"--cdx_filter",
|
||||
"--filter",
|
||||
multiple=True,
|
||||
help="Filter on a specific field or all the CDX fields.",
|
||||
@ -169,11 +217,11 @@ from .wrapper import Url
|
||||
+ "if this parameter is not used then the plain text response of the CDX API "
|
||||
+ "will be printed.",
|
||||
)
|
||||
def main(
|
||||
def main( # pylint: disable=no-value-for-parameter
|
||||
url: Optional[str],
|
||||
user_agent: str,
|
||||
version: bool,
|
||||
showlicense: bool,
|
||||
show_license: bool,
|
||||
newest: bool,
|
||||
oldest: bool,
|
||||
json: bool,
|
||||
@ -191,7 +239,7 @@ def main(
|
||||
cdx: bool,
|
||||
start_timestamp: Optional[str],
|
||||
end_timestamp: Optional[str],
|
||||
cdxfilter: List[str],
|
||||
cdx_filter: List[str],
|
||||
match_type: Optional[str],
|
||||
gzip: Optional[str],
|
||||
collapse: List[str],
|
||||
@ -219,26 +267,18 @@ def main(
|
||||
Released under the MIT License. Use the flag --license for license.
|
||||
|
||||
"""
|
||||
|
||||
if version:
|
||||
click.echo(f"waybackpy version {__version__}")
|
||||
return
|
||||
|
||||
if showlicense:
|
||||
elif show_license:
|
||||
click.echo(
|
||||
requests.get(
|
||||
url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE"
|
||||
).text
|
||||
)
|
||||
return
|
||||
|
||||
if not url:
|
||||
elif url is None:
|
||||
click.echo("No URL detected. Please provide an URL.")
|
||||
return
|
||||
|
||||
if (
|
||||
url
|
||||
and not version
|
||||
elif (
|
||||
not version
|
||||
and not oldest
|
||||
and not newest
|
||||
and not near
|
||||
@ -250,39 +290,16 @@ def main(
|
||||
"Only URL passed, but did not specify what to do with the URL. "
|
||||
"Use --help flag for help using waybackpy."
|
||||
)
|
||||
return
|
||||
|
||||
def echo_availability_api(
|
||||
availability_api_instance: WaybackMachineAvailabilityAPI,
|
||||
) -> None:
|
||||
click.echo("Archive URL:")
|
||||
if not availability_api_instance.archive_url:
|
||||
archive_url = (
|
||||
"NO ARCHIVE FOUND - The requested URL is probably "
|
||||
+ "not yet archived or if the URL was recently archived then it is "
|
||||
+ "not yet available via the Wayback Machine's availability API "
|
||||
+ "because of database lag and should be available after some time."
|
||||
)
|
||||
else:
|
||||
archive_url = availability_api_instance.archive_url
|
||||
click.echo(archive_url)
|
||||
if json:
|
||||
click.echo("JSON response:")
|
||||
click.echo(JSON.dumps(availability_api_instance.JSON))
|
||||
|
||||
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
|
||||
|
||||
if oldest:
|
||||
elif oldest:
|
||||
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
|
||||
availability_api.oldest()
|
||||
echo_availability_api(availability_api)
|
||||
return
|
||||
|
||||
if newest:
|
||||
echo_availability_api(availability_api, json)
|
||||
elif newest:
|
||||
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
|
||||
availability_api.newest()
|
||||
echo_availability_api(availability_api)
|
||||
return
|
||||
|
||||
if near:
|
||||
echo_availability_api(availability_api, json)
|
||||
elif near:
|
||||
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
|
||||
near_args = {}
|
||||
keys = ["year", "month", "day", "hour", "minute"]
|
||||
args_arr = [year, month, day, hour, minute]
|
||||
@ -290,10 +307,8 @@ def main(
|
||||
if arg:
|
||||
near_args[key] = arg
|
||||
availability_api.near(**near_args)
|
||||
echo_availability_api(availability_api)
|
||||
return
|
||||
|
||||
if save:
|
||||
echo_availability_api(availability_api, json)
|
||||
elif save:
|
||||
save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
|
||||
save_api.save()
|
||||
click.echo("Archive URL:")
|
||||
@ -303,43 +318,7 @@ def main(
|
||||
if headers:
|
||||
click.echo("Save API headers:")
|
||||
click.echo(save_api.headers)
|
||||
return
|
||||
|
||||
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
|
||||
domain = None
|
||||
sys_random = random.SystemRandom()
|
||||
uid = "".join(
|
||||
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
||||
)
|
||||
url_count = 0
|
||||
file_name = None
|
||||
|
||||
for url in url_gen:
|
||||
url_count += 1
|
||||
if not domain:
|
||||
match = re.search("https?://([A-Za-z_0-9.-]+).*", url)
|
||||
|
||||
domain = "domain-unknown"
|
||||
|
||||
if match is not None:
|
||||
domain = match.group(1)
|
||||
|
||||
file_name = f"{domain}-urls-{uid}.txt"
|
||||
file_path = os.path.join(os.getcwd(), file_name)
|
||||
if not os.path.isfile(file_path):
|
||||
open(file_path, "w+").close()
|
||||
|
||||
with open(file_path, "a") as f:
|
||||
f.write(f"{url}\n")
|
||||
|
||||
click.echo(url)
|
||||
|
||||
if url_count > 0 or file_name is not None:
|
||||
click.echo(f"\n\n'{file_name}' saved in current working directory")
|
||||
else:
|
||||
click.echo("No known URLs found. Please try a diffrent input!")
|
||||
|
||||
if known_urls:
|
||||
elif known_urls:
|
||||
wayback = Url(url, user_agent)
|
||||
url_gen = wayback.known_urls(subdomain=subdomain)
|
||||
|
||||
@ -348,9 +327,8 @@ def main(
|
||||
else:
|
||||
for url in url_gen:
|
||||
click.echo(url)
|
||||
|
||||
if cdx:
|
||||
filters = list(cdxfilter)
|
||||
elif cdx:
|
||||
filters = list(cdx_filter)
|
||||
collapses = list(collapse)
|
||||
cdx_print = list(cdx_print)
|
||||
|
||||
@ -372,35 +350,36 @@ def main(
|
||||
if len(cdx_print) == 0:
|
||||
click.echo(snapshot)
|
||||
else:
|
||||
output_string = ""
|
||||
output_string = []
|
||||
if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]):
|
||||
output_string = output_string + snapshot.urlkey + " "
|
||||
output_string.append(snapshot.urlkey)
|
||||
if any(
|
||||
val in cdx_print
|
||||
for val in ["timestamp", "time-stamp", "time_stamp"]
|
||||
):
|
||||
output_string = output_string + snapshot.timestamp + " "
|
||||
output_string.append(snapshot.timestamp)
|
||||
if "original" in cdx_print:
|
||||
output_string = output_string + snapshot.original + " "
|
||||
output_string.append(snapshot.original)
|
||||
if any(
|
||||
val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]
|
||||
):
|
||||
output_string = output_string + snapshot.mimetype + " "
|
||||
output_string.append(snapshot.mimetype)
|
||||
if any(
|
||||
val in cdx_print
|
||||
for val in ["statuscode", "status-code", "status_code"]
|
||||
):
|
||||
output_string = output_string + snapshot.statuscode + " "
|
||||
output_string.append(snapshot.statuscode)
|
||||
if "digest" in cdx_print:
|
||||
output_string = output_string + snapshot.digest + " "
|
||||
output_string.append(snapshot.digest)
|
||||
if "length" in cdx_print:
|
||||
output_string = output_string + snapshot.length + " "
|
||||
output_string.append(snapshot.length)
|
||||
if any(
|
||||
val in cdx_print
|
||||
for val in ["archiveurl", "archive-url", "archive_url"]
|
||||
):
|
||||
output_string = output_string + snapshot.archive_url + " "
|
||||
click.echo(output_string)
|
||||
output_string.append(snapshot.archive_url)
|
||||
|
||||
click.echo(" ".join(output_string))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -19,7 +19,10 @@ class WaybackMachineSaveAPI(object):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
|
||||
self,
|
||||
url: str,
|
||||
user_agent: str = DEFAULT_USER_AGENT,
|
||||
max_tries: int = 8,
|
||||
) -> None:
|
||||
self.url = str(url).strip().replace(" ", "%20")
|
||||
self.request_url = "https://web.archive.org/save/" + self.url
|
||||
@ -169,17 +172,16 @@ class WaybackMachineSaveAPI(object):
|
||||
tries = 0
|
||||
|
||||
while True:
|
||||
if self.saved_archive is None:
|
||||
if tries >= 1:
|
||||
self.sleep(tries)
|
||||
if tries >= 1:
|
||||
self.sleep(tries)
|
||||
|
||||
self.get_save_request_headers()
|
||||
self.saved_archive = self.archive_url_parser()
|
||||
self.get_save_request_headers()
|
||||
self.saved_archive = self.archive_url_parser()
|
||||
|
||||
if isinstance(self.saved_archive, str):
|
||||
self._archive_url = self.saved_archive
|
||||
self.timestamp()
|
||||
return self.saved_archive
|
||||
if isinstance(self.saved_archive, str):
|
||||
self._archive_url = self.saved_archive
|
||||
self.timestamp()
|
||||
return self.saved_archive
|
||||
|
||||
tries += 1
|
||||
if tries >= self.max_tries:
|
||||
|
@ -6,21 +6,25 @@ from .cdx_api import WaybackMachineCDXServerAPI
|
||||
from .save_api import WaybackMachineSaveAPI
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
|
||||
"""
|
||||
The Url class is not recommended to be used anymore, instead use the
|
||||
WaybackMachineSaveAPI, WaybackMachineAvailabilityAPI and WaybackMachineCDXServerAPI.
|
||||
|
||||
The reason it is still in the code is backwards compatibility with 2.x.x versions.
|
||||
|
||||
If were are using the Url before the update to version 3.x.x, your code should still be
|
||||
working fine and there is no hurry to update the interface but is recommended that you
|
||||
do not use the Url class for new code as it would be removed after 2025 also the first
|
||||
3.x.x versions was released in January 2022 and three years are more than enough to
|
||||
update the older interface code.
|
||||
"""
|
||||
|
||||
|
||||
class Url(object):
|
||||
"""
|
||||
The Url class is not recommended to be used anymore, instead use:
|
||||
|
||||
- WaybackMachineSaveAPI
|
||||
- WaybackMachineAvailabilityAPI
|
||||
- WaybackMachineCDXServerAPI
|
||||
|
||||
The reason it is still in the code is backwards compatibility with 2.x.x
|
||||
versions.
|
||||
|
||||
If you were using the Url class before the update to version 3.x.x, your code should
|
||||
still be working fine and there is no hurry to update the interface, but it is
|
||||
recommended that you do not use the Url class for new code as it would be
|
||||
removed after 2025; also, the first 3.x.x version was released in January 2022
|
||||
and three years are more than enough to update the older interface code.
|
||||
"""
|
||||
|
||||
def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
|
||||
self.url = url
|
||||
self.user_agent = str(user_agent)
|
||||
|
Loading…
Reference in New Issue
Block a user