From f8bf9c16f9e667c3ac26a426f6c8d7fe1b5eff6c Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Tue, 8 Feb 2022 17:46:59 +0530 Subject: [PATCH] Add tests (#149) * enable codecov * fix save_urls_on_file * increase the limit of CDX to 25000 from 5000. 5X increase. * added test for the CLI module * make flake 8 happy * make mypy happy --- .github/workflows/unit-test.yml | 6 +- tests/test_cli.py | 169 ++++++++++++++++++++++++++++++++ waybackpy/cdx_api.py | 2 +- waybackpy/cli.py | 33 ++++--- 4 files changed, 195 insertions(+), 15 deletions(-) create mode 100644 tests/test_cli.py diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index d514154..5231d20 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -38,6 +38,6 @@ jobs: - name: Test with pytest run: | pytest - # - name: Upload coverage to Codecov - # run: | - # bash <(curl -s https://codecov.io/bash) -t ${{ secrets.CODECOV_TOKEN }} + - name: Upload coverage to Codecov + run: | + bash <(curl -s https://codecov.io/bash) -t ${{ secrets.CODECOV_TOKEN }} diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..8157442 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,169 @@ +import requests +from click.testing import CliRunner + +from waybackpy.cli import main +from waybackpy import __version__ + + +def test_oldest() -> None: + runner = CliRunner() + result = runner.invoke(main, ["--url", " https://github.com ", "--oldest"]) + assert result.exit_code == 0 + assert ( + result.output + == "Archive URL:\nhttps://web.archive.org/web/2008051421\ +0148/http://github.com/\n" + ) + + +def test_near() -> None: + runner = CliRunner() + result = runner.invoke( + main, + [ + "--url", + " https://facebook.com ", + "--near", + "--year", + "2010", + "--month", + "5", + "--day", + "10", + "--hour", + "6", + ], + ) + assert result.exit_code == 0 + assert ( + result.output + == "Archive URL:\nhttps://web.archive.org/web/2010051008\ +2647/http://www.facebook.com/\n" + ) + + +def test_json() -> None: + runner = CliRunner() + result = runner.invoke( + main, + [ + "--url", + " https://apple.com ", + "--near", + "--year", + "2010", + "--month", + "2", + "--day", + "8", + "--hour", + "12", + "--json", + ], + ) + assert result.exit_code == 0 + assert ( + result.output.find( + """Archive URL:\nhttps://web.archive.org/web/2010020812\ +5854/http://www.apple.com/\nJSON respons\ +e:\n{"url": "https://apple.com", "archived_snapshots": {"close\ +st": {"status": "200", "available": true, "url": "http://web.ar\ +chive.org/web/20100208125854/http://www.apple.com/", "timest\ +amp": "20100208125854"}}, "timestamp":""" + ) + != -1 + ) + + +def test_newest() -> None: + runner = CliRunner() + result = runner.invoke(main, ["--url", " https://microsoft.com ", "--newest"]) + assert result.exit_code == 0 + assert ( + result.output.find("microsoft.com") != -1 + and result.output.find("Archive URL:\n") != -1 + ) + + +def test_cdx() -> None: + runner = CliRunner() + result = runner.invoke( + main, + "--url https://twitter.com/jack --cdx --user-agent some-user-agent \ +--start-timestamp 2010 --end-timestamp 2012 --collapse urlkey \ +--match-type prefix --cdx-print archiveurl --cdx-print length \ +--cdx-print digest --cdx-print statuscode --cdx-print mimetype \ +--cdx-print original --cdx-print timestamp --cdx-print urlkey".split( + " " + ), + ) + assert result.exit_code == 0 + assert result.output.count("\n") > 3000 + + +def test_save() -> None: + runner = CliRunner() + result = runner.invoke( + main, + "--url https://news.ycombinator.com --user_agent my-unique-user-agent \ +--save --headers".split( + " " + ), + ) + assert result.exit_code == 0 + assert result.output.find("Archive URL:") != -1 + assert (result.output.find("Cached save:\nTrue") != -1) or ( + result.output.find("Cached save:\nFalse") != -1 + ) + assert result.output.find("Save API headers:\n") != -1 + assert result.output.find("://news.ycombinator.com") != -1 + + +def test_version() -> None: + runner = CliRunner() + result = runner.invoke(main, ["--version"]) + assert result.exit_code == 0 + assert result.output == f"waybackpy version {__version__}\n" + + +def test_license() -> None: + runner = CliRunner() + result = runner.invoke(main, ["--license"]) + assert result.exit_code == 0 + assert ( + result.output + == requests.get( + url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE" + ).text + + "\n" + ) + + +def test_only_url() -> None: + runner = CliRunner() + result = runner.invoke(main, ["--url", "https://google.com"]) + assert result.exit_code == 0 + assert ( + result.output + == "Only URL passed, but did not specify what to do with the URL. Use \ +--help flag for help using waybackpy.\n" + ) + + +def test_known_url() -> None: + # with file generator enabled + runner = CliRunner() + result = runner.invoke( + main, ["--url", "https://akamhy.github.io", "--known-urls", "--file"] + ) + assert result.exit_code == 0 + assert result.output.count("\n") > 40 + assert result.output.count("akamhy.github.io") > 40 + assert result.output.find("in the current working directory.\n") != -1 + + # without file + runner = CliRunner() + result = runner.invoke(main, ["--url", "https://akamhy.github.io", "--known-urls"]) + assert result.exit_code == 0 + assert result.output.count("\n") > 40 + assert result.output.count("akamhy.github.io") > 40 diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index 7f8b2a4..ce46ae4 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -60,7 +60,7 @@ class WaybackMachineCDXServerAPI: self.gzip = gzip self.collapses = [] if collapses is None else collapses check_collapses(self.collapses) - self.limit = 5000 if limit is None else limit + self.limit = 25000 if limit is None else limit self.max_tries = max_tries self.last_api_request_url: Optional[str] = None self.use_page = False diff --git a/waybackpy/cli.py b/waybackpy/cli.py index bf01f25..320efe9 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -59,17 +59,28 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: for url in url_gen: url_count += 1 if not domain: - match = re.search("https?://([A-Za-z_0-9.-]+).*", url) - domain = "domain-unknown" if match is None else match.group(1) - file_name = f"{domain}-urls-{uid}.txt" + m = re.search("https?://([A-Za-z_0-9.-]+).*", url) + + domain = "domain-unknown" + + if m: + domain = m.group(1) + + file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid) file_path = os.path.join(os.getcwd(), file_name) - with open(file_path, "a", encoding="UTF-8") as file: - file.write(f"{url}\n") + if not os.path.isfile(file_path): + open(file_path, "w+").close() + + with open(file_path, "a") as f: + f.write("{url}\n".format(url=url)) click.echo(url) - if url_count > 0 or file_name is not None: - click.echo(f"\n\n'{file_name}' saved in current working directory") + if url_count > 0: + click.echo( + f"\n\n{url_count} URLs saved inside '{file_name}' in the current " + + "working directory." + ) else: click.echo("No known URLs found. Please try a diffrent input!") @@ -343,10 +354,10 @@ def main( # pylint: disable=no-value-for-parameter url_gen = wayback.known_urls(subdomain=subdomain) if file: - return save_urls_on_file(url_gen) - - for url_ in url_gen: - click.echo(url_) + save_urls_on_file(url_gen) + else: + for url_ in url_gen: + click.echo(url_) elif cdx: filters = list(cdx_filter)