diff --git a/setup.py b/setup.py index e006313..fe415f4 100644 --- a/setup.py +++ b/setup.py @@ -1,54 +1,54 @@ import os.path from setuptools import setup -with open(os.path.join(os.path.dirname(__file__), 'README.md')) as f: +with open(os.path.join(os.path.dirname(__file__), "README.md")) as f: long_description = f.read() about = {} -with open(os.path.join(os.path.dirname(__file__), 'waybackpy', '__version__.py')) as f: +with open(os.path.join(os.path.dirname(__file__), "waybackpy", "__version__.py")) as f: exec(f.read(), about) setup( - name=about['__title__'], - packages=['waybackpy'], - version=about['__version__'], - description=about['__description__'], + name=about["__title__"], + packages=["waybackpy"], + version=about["__version__"], + description=about["__description__"], long_description=long_description, - long_description_content_type='text/markdown', - license=about['__license__'], - author=about['__author__'], - author_email=about['__author_email__'], - url=about['__url__'], - download_url='https://github.com/akamhy/waybackpy/archive/2.3.0.tar.gz', - keywords=['Archive It', 'Archive Website', 'Wayback Machine', - 'waybackurls', 'Internet Archive', - ], - install_requires=['requests'], + long_description_content_type="text/markdown", + license=about["__license__"], + author=about["__author__"], + author_email=about["__author_email__"], + url=about["__url__"], + download_url="https://github.com/akamhy/waybackpy/archive/2.3.0.tar.gz", + keywords=[ + "Archive It", + "Archive Website", + "Wayback Machine", + "waybackurls", + "Internet Archive", + ], + install_requires=["requests"], python_requires=">=3.4", classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Natural Language :: English', - 'Topic :: Software Development :: Build Tools', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: Implementation :: CPython', - ], - entry_points={ - 'console_scripts': [ - 'waybackpy = waybackpy.cli:main' - ] - }, + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Natural Language :: English", + "Topic :: Software Development :: Build Tools", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: Implementation :: CPython", + ], + entry_points={"console_scripts": ["waybackpy = waybackpy.cli:main"]}, project_urls={ - 'Documentation': 'https://akamhy.github.io/waybackpy/', - 'Source': 'https://github.com/akamhy/waybackpy', - 'Tracker': 'https://github.com/akamhy/waybackpy/issues', + "Documentation": "https://akamhy.github.io/waybackpy/", + "Source": "https://github.com/akamhy/waybackpy", + "Tracker": "https://github.com/akamhy/waybackpy/issues", }, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 5ae1742..7793759 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,7 @@ import argparse sys.path.append("..") import waybackpy.cli as cli # noqa: E402 -from waybackpy.wrapper import Url # noqa: E402 +from waybackpy.wrapper import Url # noqa: E402 from waybackpy.__version__ import __version__ # Namespace(day=None, get=None, hour=None, minute=None, month=None, near=False, @@ -14,88 +14,284 @@ from waybackpy.__version__ import __version__ def test_save(): - args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=True, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) + args = argparse.Namespace( + user_agent=None, + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=True, + json=False, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get=None, + ) reply = cli.args_handler(args) assert "pypi.org/user/akamhy" in str(reply) + def test_json(): - args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=True, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) + args = argparse.Namespace( + user_agent=None, + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=True, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get=None, + ) reply = cli.args_handler(args) assert "archived_snapshots" in str(reply) + def test_archive_url(): - args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=False, archive_url=True, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) + args = argparse.Namespace( + user_agent=None, + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=True, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get=None, + ) reply = cli.args_handler(args) assert "https://web.archive.org/web/" in str(reply) + def test_oldest(): - args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=True, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) + args = argparse.Namespace( + user_agent=None, + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=True, + save=False, + json=False, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get=None, + ) reply = cli.args_handler(args) assert "pypi.org/user/akamhy" in str(reply) + def test_newest(): - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=True, near=False, alive=False, subdomain=False, known_urls=False, get=None) + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=True, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get=None, + ) reply = cli.args_handler(args) assert "pypi.org/user/akamhy" in str(reply) + def test_total_archives(): - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=True, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://pypi.org/user/akamhy/", + total=True, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get=None, + ) reply = cli.args_handler(args) assert isinstance(reply, int) + def test_known_urls(): - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://akamhy.github.io", total=False, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=True, subdomain=True, known_urls=True, get=None) + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://akamhy.github.io", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=False, + near=False, + alive=True, + subdomain=True, + known_urls=True, + get=None, + ) reply = cli.args_handler(args) assert "github" in str(reply) + def test_near(): - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=False, near=True, alive=False, subdomain=False, known_urls=False, get=None, year=2020, month=7, day=15, hour=1, minute=1) + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=False, + near=True, + alive=False, + subdomain=False, + known_urls=False, + get=None, + year=2020, + month=7, + day=15, + hour=1, + minute=1, + ) reply = cli.args_handler(args) assert "202007" in str(reply) + def test_get(): - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="url") + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get="url", + ) reply = cli.args_handler(args) assert "waybackpy" in str(reply) - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="oldest") + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get="oldest", + ) reply = cli.args_handler(args) assert "waybackpy" in str(reply) - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="newest") + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get="newest", + ) reply = cli.args_handler(args) assert "waybackpy" in str(reply) - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="save") + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get="save", + ) reply = cli.args_handler(args) assert "waybackpy" in str(reply) - args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ - (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, - oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="BullShit") + args = argparse.Namespace( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ + (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", + url="https://pypi.org/user/akamhy/", + total=False, + version=False, + oldest=False, + save=False, + json=False, + archive_url=False, + newest=False, + near=False, + alive=False, + subdomain=False, + known_urls=False, + get="BullShit", + ) reply = cli.args_handler(args) assert "get the source code of the" in str(reply) + def test_args_handler(): args = argparse.Namespace(version=True) reply = cli.args_handler(args) @@ -105,6 +301,7 @@ def test_args_handler(): reply = cli.args_handler(args) assert ("waybackpy %s" % (__version__)) in str(reply) + def test_main(): # This also tests the parse_args method in cli.py - cli.main(['temp.py', '--version']) + cli.main(["temp.py", "--version"]) diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py index fd79f50..7af6d1f 100644 --- a/tests/test_wrapper.py +++ b/tests/test_wrapper.py @@ -3,6 +3,7 @@ import sys import pytest import random import requests + sys.path.append("..") import waybackpy.wrapper as waybackpy # noqa: E402 @@ -18,6 +19,7 @@ def test_clean_url(): test_result = target._clean_url() assert answer == test_result + def test_dunders(): url = "https://en.wikipedia.org/wiki/Network_security" user_agent = "UA" @@ -25,6 +27,7 @@ def test_dunders(): assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target) assert "en.wikipedia.org" in str(target) + def test_archive_url_parser(): endpoint = "https://amazon.com" user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0" @@ -34,6 +37,7 @@ def test_archive_url_parser(): with pytest.raises(Exception): waybackpy._archive_url_parser(header) + def test_url_check(): broken_url = "http://wwwgooglecom/" with pytest.raises(Exception): @@ -61,8 +65,6 @@ def test_save(): archived_url1 = str(target.save()) assert url1 in archived_url1 - - # Test for urls that are incorrect. with pytest.raises(Exception): url2 = "ha ha ha ha" @@ -89,7 +91,6 @@ def test_near(): archive_near_year = target.near(year=2010) assert "2010" in str(archive_near_year) - archive_near_month_year = str(target.near(year=2015, month=2)) assert ( ("201502" in archive_near_month_year) @@ -102,9 +103,9 @@ def test_near(): "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246", ) - archive_near_hour_day_month_year = str(target.near( - year=2008, month=5, day=9, hour=15 - )) + archive_near_hour_day_month_year = str( + target.near(year=2008, month=5, day=9, hour=15) + ) assert ( ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) @@ -119,22 +120,24 @@ def test_near(): target.near(year=2010) - def test_oldest(): url = "github.com/akamhy/waybackpy" target = waybackpy.Url(url, user_agent) assert "20200504141153" in str(target.oldest()) + def test_json(): url = "github.com/akamhy/waybackpy" target = waybackpy.Url(url, user_agent) assert "archived_snapshots" in str(target.JSON) + def test_archive_url(): url = "github.com/akamhy/waybackpy" target = waybackpy.Url(url, user_agent) assert "github.com/akamhy" in str(target.archive_url) + def test_newest(): url = "github.com/akamhy/waybackpy" target = waybackpy.Url(url, user_agent) @@ -146,17 +149,16 @@ def test_get(): assert "Welcome to Google" in target.get(target.oldest()) - def test_wayback_timestamp(): - ts = waybackpy._wayback_timestamp( - year=2020, month=1, day=2, hour=3, minute=4 - ) + ts = waybackpy._wayback_timestamp(year=2020, month=1, day=2, hour=3, minute=4) assert "202001020304" in str(ts) def test_get_response(): endpoint = "https://www.google.com" - user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0" + user_agent = ( + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0" + ) headers = {"User-Agent": "%s" % user_agent} response = waybackpy._get_response(endpoint, params=None, headers=headers) assert response.status_code == 200 @@ -172,6 +174,7 @@ def test_total_archives(): ) assert target.total_archives() == 0 + def test_known_urls(): target = waybackpy.Url("akamhy.github.io", user_agent) diff --git a/waybackpy/cli.py b/waybackpy/cli.py index 4bff3bd..35e96e5 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -10,27 +10,27 @@ from waybackpy.__version__ import __version__ def _save(obj): - return (obj.save()) + return obj.save() def _archive_url(obj): - return (obj.archive_url) + return obj.archive_url def _json(obj): - return (obj.JSON) + return obj.JSON def _oldest(obj): - return (obj.oldest()) + return obj.oldest() def _newest(obj): - return (obj.newest()) + return obj.newest() def _total_archives(obj): - return (obj.total_archives()) + return obj.total_archives() def _near(obj, args): @@ -45,17 +45,19 @@ def _near(obj, args): _near_args["hour"] = args.hour if args.minute: _near_args["minute"] = args.minute - return (obj.near(**_near_args)) + return obj.near(**_near_args) def _save_urls_on_file(input_list, live_url_count): - m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0]) + m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0]) if m: domain = m.group(1) else: domain = "domain-unknown" - uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) + uid = "".join( + random.choice(string.ascii_lowercase + string.digits) for _ in range(6) + ) file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid) file_content = "\n".join(input_list) @@ -89,34 +91,37 @@ def _known_urls(obj, args): def _get(obj, args): if args.get.lower() == "url": - return (obj.get()) + return obj.get() if args.get.lower() == "archive_url": - return (obj.get(obj.archive_url)) + return obj.get(obj.archive_url) if args.get.lower() == "oldest": - return (obj.get(obj.oldest())) + return obj.get(obj.oldest()) if args.get.lower() == "latest" or args.get.lower() == "newest": - return (obj.get(obj.newest())) + return obj.get(obj.newest()) if args.get.lower() == "save": - return (obj.get(obj.save())) + return obj.get(obj.save()) - return ("Use get as \"--get 'source'\", 'source' can be one of the followings: \ + return "Use get as \"--get 'source'\", 'source' can be one of the followings: \ \n1) url - get the source code of the url specified using --url/-u.\ \n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\ \n3) oldest - get the source code of the oldest archive for the supplied url.\ \n4) newest - get the source code of the newest archive for the supplied url.\ - \n5) save - Create a new archive and get the source code of this new archive for the supplied url.") + \n5) save - Create a new archive and get the source code of this new archive for the supplied url." def args_handler(args): if args.version: - return ("waybackpy version %s" % __version__) + return "waybackpy version %s" % __version__ if not args.url: - return ("waybackpy %s \nSee 'waybackpy --help' for help using this tool." % __version__) + return ( + "waybackpy %s \nSee 'waybackpy --help' for help using this tool." + % __version__ + ) if args.user_agent: obj = Url(args.url, args.user_agent) @@ -151,52 +156,93 @@ def args_handler(args): def parse_args(argv): parser = argparse.ArgumentParser() - requiredArgs = parser.add_argument_group('URL argument (required)') - requiredArgs.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur") + requiredArgs = parser.add_argument_group("URL argument (required)") + requiredArgs.add_argument( + "--url", "-u", help="URL on which Wayback machine operations would occur" + ) - userAgentArg = parser.add_argument_group('User Agent') - help_text = "User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"" + userAgentArg = parser.add_argument_group("User Agent") + help_text = 'User agent, default user_agent is "waybackpy python package - https://github.com/akamhy/waybackpy"' userAgentArg.add_argument("--user_agent", "-ua", help=help_text) saveArg = parser.add_argument_group("Create new archive/save URL") - saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine") + saveArg.add_argument( + "--save", "-s", action="store_true", help="Save the URL on the Wayback machine" + ) auArg = parser.add_argument_group("Get the latest Archive") - auArg.add_argument("--archive_url", "-au", action='store_true', help="Get the latest archive URL, alias for --newest") + auArg.add_argument( + "--archive_url", + "-au", + action="store_true", + help="Get the latest archive URL, alias for --newest", + ) jsonArg = parser.add_argument_group("Get the JSON data") - jsonArg.add_argument("--json", "-j", action='store_true', help="JSON data of the availability API request") + jsonArg.add_argument( + "--json", + "-j", + action="store_true", + help="JSON data of the availability API request", + ) oldestArg = parser.add_argument_group("Oldest archive") - oldestArg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL") + oldestArg.add_argument( + "--oldest", + "-o", + action="store_true", + help="Oldest archive for the specified URL", + ) newestArg = parser.add_argument_group("Newest archive") - newestArg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL") + newestArg.add_argument( + "--newest", + "-n", + action="store_true", + help="Newest archive for the specified URL", + ) totalArg = parser.add_argument_group("Total number of archives") - totalArg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL") + totalArg.add_argument( + "--total", + "-t", + action="store_true", + help="Total number of archives for the specified URL", + ) getArg = parser.add_argument_group("Get source code") - getArg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage") + getArg.add_argument( + "--get", + "-g", + help="Prints the source code of the supplied url. Use '--get help' for extended usage", + ) - knownUrlArg = parser.add_argument_group("URLs known and archived to Waybcak Machine for the site.") - knownUrlArg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.") + knownUrlArg = parser.add_argument_group( + "URLs known and archived to Waybcak Machine for the site." + ) + knownUrlArg.add_argument( + "--known_urls", "-ku", action="store_true", help="URLs known for the domain." + ) help_text = "Use with '--known_urls' to include known URLs for subdomains." - knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help=help_text) + knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text) help_text = "Only include live URLs. Will not inlclude dead links." - knownUrlArg.add_argument("--alive", "-a", action='store_true', help=help_text) + knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text) - nearArg = parser.add_argument_group('Archive close to time specified') - nearArg.add_argument("--near", "-N", action='store_true', help="Archive near specified time") + nearArg = parser.add_argument_group("Archive close to time specified") + nearArg.add_argument( + "--near", "-N", action="store_true", help="Archive near specified time" + ) - nearArgs = parser.add_argument_group('Arguments that are used only with --near') + nearArgs = parser.add_argument_group("Arguments that are used only with --near") nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer") nearArgs.add_argument("--month", "-M", type=int, help="Month in integer") nearArgs.add_argument("--day", "-D", type=int, help="Day in integer.") nearArgs.add_argument("--hour", "-H", type=int, help="Hour in intege") nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer") - parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version") + parser.add_argument( + "--version", "-v", action="store_true", help="Waybackpy version" + ) return parser.parse_args(argv[1:]) diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 85337d0..2de2b6f 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -1,10 +1,12 @@ # -*- coding: utf-8 -*- + class WaybackError(Exception): """ Raised when Wayback Machine API Service is unreachable/down. """ + class URLError(Exception): """ Raised when malformed URLs are passed as arguments. diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index c06a9a7..6ea2880 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -14,9 +14,7 @@ default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy" def _archive_url_parser(header): """Parse out the archive from header.""" # Regex1 - arch = re.search( - r"Content-Location: (/web/[0-9]{14}/.*)", str(header) - ) + arch = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header)) if arch: return "web.archive.org" + arch.group(1) # Regex2 @@ -79,11 +77,7 @@ class Url: def __len__(self): td_max = timedelta( - days=999999999, - hours=23, - minutes=59, - seconds=59, - microseconds=999999 + days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999 ) if self.timestamp == datetime.max: return td_max.days @@ -112,9 +106,7 @@ class Url: else: archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = archive_url.replace( - "http://web.archive.org/web/", - "https://web.archive.org/web/", - 1 + "http://web.archive.org/web/", "https://web.archive.org/web/", 1 ) return archive_url @@ -127,10 +119,9 @@ class Url: time = datetime.max else: - time = datetime.strptime(data["archived_snapshots"] - ["closest"] - ["timestamp"], - '%Y%m%d%H%M%S') + time = datetime.strptime( + data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" + ) return time @@ -170,9 +161,9 @@ class Url: return response.content.decode(encoding.replace("text/html", "UTF-8", 1)) def near(self, year=None, month=None, day=None, hour=None, minute=None): - """ Return the closest Wayback Machine archive to the time supplied. - Supported params are year, month, day, hour and minute. - Any non-supplied parameters default to the current time. + """Return the closest Wayback Machine archive to the time supplied. + Supported params are year, month, day, hour and minute. + Any non-supplied parameters default to the current time. """ now = datetime.utcnow().timetuple() @@ -184,10 +175,9 @@ class Url: minute=minute if minute else now.tm_min, ) - endpoint = "https://archive.org/wayback/available" headers = {"User-Agent": "%s" % self.user_agent} - payload = {"url": "%s" % self._clean_url(), "timestamp" : timestamp} + payload = {"url": "%s" % self._clean_url(), "timestamp": timestamp} response = _get_response(endpoint, params=payload, headers=headers) data = response.json() if not data["archived_snapshots"]: @@ -201,7 +191,9 @@ class Url: ) self.archive_url = archive_url - self.timestamp = datetime.strptime(data["archived_snapshots"]["closest"]["timestamp"], '%Y%m%d%H%M%S') + self.timestamp = datetime.strptime( + data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" + ) return self @@ -221,7 +213,11 @@ class Url: """Returns the total number of Wayback Machine archives for this URL.""" endpoint = "https://web.archive.org/cdx/search/cdx" - headers = {"User-Agent": "%s" % self.user_agent, "output" : "json", "fl" : "statuscode"} + headers = { + "User-Agent": "%s" % self.user_agent, + "output": "json", + "fl": "statuscode", + } payload = {"url": "%s" % self._clean_url()} response = _get_response(endpoint, params=payload, headers=headers) @@ -253,11 +249,13 @@ class Url: if subdomain: request_url = ( - "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey" % self._clean_url() + "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey" + % self._clean_url() ) else: request_url = ( - "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey" % self._clean_url() + "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey" + % self._clean_url() ) headers = {"User-Agent": "%s" % self.user_agent}