code formatted with black (#47)

This commit is contained in:
Akash Mahanty 2020-12-14 01:18:04 +05:30 committed by GitHub
parent fde28d57aa
commit d3e68d0e70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 395 additions and 149 deletions

View File

@ -1,54 +1,54 @@
import os.path import os.path
from setuptools import setup from setuptools import setup
with open(os.path.join(os.path.dirname(__file__), 'README.md')) as f: with open(os.path.join(os.path.dirname(__file__), "README.md")) as f:
long_description = f.read() long_description = f.read()
about = {} about = {}
with open(os.path.join(os.path.dirname(__file__), 'waybackpy', '__version__.py')) as f: with open(os.path.join(os.path.dirname(__file__), "waybackpy", "__version__.py")) as f:
exec(f.read(), about) exec(f.read(), about)
setup( setup(
name=about['__title__'], name=about["__title__"],
packages=['waybackpy'], packages=["waybackpy"],
version=about['__version__'], version=about["__version__"],
description=about['__description__'], description=about["__description__"],
long_description=long_description, long_description=long_description,
long_description_content_type='text/markdown', long_description_content_type="text/markdown",
license=about['__license__'], license=about["__license__"],
author=about['__author__'], author=about["__author__"],
author_email=about['__author_email__'], author_email=about["__author_email__"],
url=about['__url__'], url=about["__url__"],
download_url='https://github.com/akamhy/waybackpy/archive/2.3.0.tar.gz', download_url="https://github.com/akamhy/waybackpy/archive/2.3.0.tar.gz",
keywords=['Archive It', 'Archive Website', 'Wayback Machine', keywords=[
'waybackurls', 'Internet Archive', "Archive It",
"Archive Website",
"Wayback Machine",
"waybackurls",
"Internet Archive",
], ],
install_requires=['requests'], install_requires=["requests"],
python_requires=">=3.4", python_requires=">=3.4",
classifiers=[ classifiers=[
'Development Status :: 5 - Production/Stable', "Development Status :: 5 - Production/Stable",
'Intended Audience :: Developers', "Intended Audience :: Developers",
'Natural Language :: English', "Natural Language :: English",
'Topic :: Software Development :: Build Tools', "Topic :: Software Development :: Build Tools",
'License :: OSI Approved :: MIT License', "License :: OSI Approved :: MIT License",
'Programming Language :: Python', "Programming Language :: Python",
'Programming Language :: Python :: 3', "Programming Language :: Python :: 3",
'Programming Language :: Python :: 3.4', "Programming Language :: Python :: 3.4",
'Programming Language :: Python :: 3.5', "Programming Language :: Python :: 3.5",
'Programming Language :: Python :: 3.6', "Programming Language :: Python :: 3.6",
'Programming Language :: Python :: 3.7', "Programming Language :: Python :: 3.7",
'Programming Language :: Python :: 3.8', "Programming Language :: Python :: 3.8",
'Programming Language :: Python :: 3.9', "Programming Language :: Python :: 3.9",
'Programming Language :: Python :: Implementation :: CPython', "Programming Language :: Python :: Implementation :: CPython",
], ],
entry_points={ entry_points={"console_scripts": ["waybackpy = waybackpy.cli:main"]},
'console_scripts': [
'waybackpy = waybackpy.cli:main'
]
},
project_urls={ project_urls={
'Documentation': 'https://akamhy.github.io/waybackpy/', "Documentation": "https://akamhy.github.io/waybackpy/",
'Source': 'https://github.com/akamhy/waybackpy', "Source": "https://github.com/akamhy/waybackpy",
'Tracker': 'https://github.com/akamhy/waybackpy/issues', "Tracker": "https://github.com/akamhy/waybackpy/issues",
}, },
) )

View File

@ -14,88 +14,284 @@ from waybackpy.__version__ import __version__
def test_save(): def test_save():
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False, args = argparse.Namespace(
oldest=False, save=True, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) user_agent=None,
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=True,
json=False,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "pypi.org/user/akamhy" in str(reply) assert "pypi.org/user/akamhy" in str(reply)
def test_json(): def test_json():
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False, args = argparse.Namespace(
oldest=False, save=False, json=True, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) user_agent=None,
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=True,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "archived_snapshots" in str(reply) assert "archived_snapshots" in str(reply)
def test_archive_url(): def test_archive_url():
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False, args = argparse.Namespace(
oldest=False, save=False, json=False, archive_url=True, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) user_agent=None,
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=True,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "https://web.archive.org/web/" in str(reply) assert "https://web.archive.org/web/" in str(reply)
def test_oldest(): def test_oldest():
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False, args = argparse.Namespace(
oldest=True, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) user_agent=None,
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=True,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "pypi.org/user/akamhy" in str(reply) assert "pypi.org/user/akamhy" in str(reply)
def test_newest(): def test_newest():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=True, near=False, alive=False, subdomain=False, known_urls=False, get=None) (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=True,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "pypi.org/user/akamhy" in str(reply) assert "pypi.org/user/akamhy" in str(reply)
def test_total_archives(): def test_total_archives():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=True, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None) (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=True,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get=None,
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert isinstance(reply, int) assert isinstance(reply, int)
def test_known_urls(): def test_known_urls():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://akamhy.github.io", total=False, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=True, subdomain=True, known_urls=True, get=None) (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://akamhy.github.io",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
alive=True,
subdomain=True,
known_urls=True,
get=None,
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "github" in str(reply) assert "github" in str(reply)
def test_near(): def test_near():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=False, near=True, alive=False, subdomain=False, known_urls=False, get=None, year=2020, month=7, day=15, hour=1, minute=1) (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=True,
alive=False,
subdomain=False,
known_urls=False,
get=None,
year=2020,
month=7,
day=15,
hour=1,
minute=1,
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "202007" in str(reply) assert "202007" in str(reply)
def test_get(): def test_get():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="url") (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="url",
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "waybackpy" in str(reply) assert "waybackpy" in str(reply)
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="oldest") (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="oldest",
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "waybackpy" in str(reply) assert "waybackpy" in str(reply)
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="newest") (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="newest",
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "waybackpy" in str(reply) assert "waybackpy" in str(reply)
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="save") (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="save",
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "waybackpy" in str(reply) assert "waybackpy" in str(reply)
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ args = argparse.Namespace(
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False, user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="BullShit") (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
total=False,
version=False,
oldest=False,
save=False,
json=False,
archive_url=False,
newest=False,
near=False,
alive=False,
subdomain=False,
known_urls=False,
get="BullShit",
)
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert "get the source code of the" in str(reply) assert "get the source code of the" in str(reply)
def test_args_handler(): def test_args_handler():
args = argparse.Namespace(version=True) args = argparse.Namespace(version=True)
reply = cli.args_handler(args) reply = cli.args_handler(args)
@ -105,6 +301,7 @@ def test_args_handler():
reply = cli.args_handler(args) reply = cli.args_handler(args)
assert ("waybackpy %s" % (__version__)) in str(reply) assert ("waybackpy %s" % (__version__)) in str(reply)
def test_main(): def test_main():
# This also tests the parse_args method in cli.py # This also tests the parse_args method in cli.py
cli.main(['temp.py', '--version']) cli.main(["temp.py", "--version"])

View File

@ -3,6 +3,7 @@ import sys
import pytest import pytest
import random import random
import requests import requests
sys.path.append("..") sys.path.append("..")
import waybackpy.wrapper as waybackpy # noqa: E402 import waybackpy.wrapper as waybackpy # noqa: E402
@ -18,6 +19,7 @@ def test_clean_url():
test_result = target._clean_url() test_result = target._clean_url()
assert answer == test_result assert answer == test_result
def test_dunders(): def test_dunders():
url = "https://en.wikipedia.org/wiki/Network_security" url = "https://en.wikipedia.org/wiki/Network_security"
user_agent = "UA" user_agent = "UA"
@ -25,6 +27,7 @@ def test_dunders():
assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target) assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target)
assert "en.wikipedia.org" in str(target) assert "en.wikipedia.org" in str(target)
def test_archive_url_parser(): def test_archive_url_parser():
endpoint = "https://amazon.com" endpoint = "https://amazon.com"
user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0" user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
@ -34,6 +37,7 @@ def test_archive_url_parser():
with pytest.raises(Exception): with pytest.raises(Exception):
waybackpy._archive_url_parser(header) waybackpy._archive_url_parser(header)
def test_url_check(): def test_url_check():
broken_url = "http://wwwgooglecom/" broken_url = "http://wwwgooglecom/"
with pytest.raises(Exception): with pytest.raises(Exception):
@ -61,8 +65,6 @@ def test_save():
archived_url1 = str(target.save()) archived_url1 = str(target.save())
assert url1 in archived_url1 assert url1 in archived_url1
# Test for urls that are incorrect. # Test for urls that are incorrect.
with pytest.raises(Exception): with pytest.raises(Exception):
url2 = "ha ha ha ha" url2 = "ha ha ha ha"
@ -89,7 +91,6 @@ def test_near():
archive_near_year = target.near(year=2010) archive_near_year = target.near(year=2010)
assert "2010" in str(archive_near_year) assert "2010" in str(archive_near_year)
archive_near_month_year = str(target.near(year=2015, month=2)) archive_near_month_year = str(target.near(year=2015, month=2))
assert ( assert (
("201502" in archive_near_month_year) ("201502" in archive_near_month_year)
@ -102,9 +103,9 @@ def test_near():
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246", "(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
) )
archive_near_hour_day_month_year = str(target.near( archive_near_hour_day_month_year = str(
year=2008, month=5, day=9, hour=15 target.near(year=2008, month=5, day=9, hour=15)
)) )
assert ( assert (
("2008050915" in archive_near_hour_day_month_year) ("2008050915" in archive_near_hour_day_month_year)
or ("2008050914" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year)
@ -119,22 +120,24 @@ def test_near():
target.near(year=2010) target.near(year=2010)
def test_oldest(): def test_oldest():
url = "github.com/akamhy/waybackpy" url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent) target = waybackpy.Url(url, user_agent)
assert "20200504141153" in str(target.oldest()) assert "20200504141153" in str(target.oldest())
def test_json(): def test_json():
url = "github.com/akamhy/waybackpy" url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent) target = waybackpy.Url(url, user_agent)
assert "archived_snapshots" in str(target.JSON) assert "archived_snapshots" in str(target.JSON)
def test_archive_url(): def test_archive_url():
url = "github.com/akamhy/waybackpy" url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent) target = waybackpy.Url(url, user_agent)
assert "github.com/akamhy" in str(target.archive_url) assert "github.com/akamhy" in str(target.archive_url)
def test_newest(): def test_newest():
url = "github.com/akamhy/waybackpy" url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent) target = waybackpy.Url(url, user_agent)
@ -146,17 +149,16 @@ def test_get():
assert "Welcome to Google" in target.get(target.oldest()) assert "Welcome to Google" in target.get(target.oldest())
def test_wayback_timestamp(): def test_wayback_timestamp():
ts = waybackpy._wayback_timestamp( ts = waybackpy._wayback_timestamp(year=2020, month=1, day=2, hour=3, minute=4)
year=2020, month=1, day=2, hour=3, minute=4
)
assert "202001020304" in str(ts) assert "202001020304" in str(ts)
def test_get_response(): def test_get_response():
endpoint = "https://www.google.com" endpoint = "https://www.google.com"
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0" user_agent = (
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
)
headers = {"User-Agent": "%s" % user_agent} headers = {"User-Agent": "%s" % user_agent}
response = waybackpy._get_response(endpoint, params=None, headers=headers) response = waybackpy._get_response(endpoint, params=None, headers=headers)
assert response.status_code == 200 assert response.status_code == 200
@ -172,6 +174,7 @@ def test_total_archives():
) )
assert target.total_archives() == 0 assert target.total_archives() == 0
def test_known_urls(): def test_known_urls():
target = waybackpy.Url("akamhy.github.io", user_agent) target = waybackpy.Url("akamhy.github.io", user_agent)

View File

@ -10,27 +10,27 @@ from waybackpy.__version__ import __version__
def _save(obj): def _save(obj):
return (obj.save()) return obj.save()
def _archive_url(obj): def _archive_url(obj):
return (obj.archive_url) return obj.archive_url
def _json(obj): def _json(obj):
return (obj.JSON) return obj.JSON
def _oldest(obj): def _oldest(obj):
return (obj.oldest()) return obj.oldest()
def _newest(obj): def _newest(obj):
return (obj.newest()) return obj.newest()
def _total_archives(obj): def _total_archives(obj):
return (obj.total_archives()) return obj.total_archives()
def _near(obj, args): def _near(obj, args):
@ -45,17 +45,19 @@ def _near(obj, args):
_near_args["hour"] = args.hour _near_args["hour"] = args.hour
if args.minute: if args.minute:
_near_args["minute"] = args.minute _near_args["minute"] = args.minute
return (obj.near(**_near_args)) return obj.near(**_near_args)
def _save_urls_on_file(input_list, live_url_count): def _save_urls_on_file(input_list, live_url_count):
m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0]) m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
if m: if m:
domain = m.group(1) domain = m.group(1)
else: else:
domain = "domain-unknown" domain = "domain-unknown"
uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) uid = "".join(
random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
)
file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid) file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid)
file_content = "\n".join(input_list) file_content = "\n".join(input_list)
@ -89,34 +91,37 @@ def _known_urls(obj, args):
def _get(obj, args): def _get(obj, args):
if args.get.lower() == "url": if args.get.lower() == "url":
return (obj.get()) return obj.get()
if args.get.lower() == "archive_url": if args.get.lower() == "archive_url":
return (obj.get(obj.archive_url)) return obj.get(obj.archive_url)
if args.get.lower() == "oldest": if args.get.lower() == "oldest":
return (obj.get(obj.oldest())) return obj.get(obj.oldest())
if args.get.lower() == "latest" or args.get.lower() == "newest": if args.get.lower() == "latest" or args.get.lower() == "newest":
return (obj.get(obj.newest())) return obj.get(obj.newest())
if args.get.lower() == "save": if args.get.lower() == "save":
return (obj.get(obj.save())) return obj.get(obj.save())
return ("Use get as \"--get 'source'\", 'source' can be one of the followings: \ return "Use get as \"--get 'source'\", 'source' can be one of the followings: \
\n1) url - get the source code of the url specified using --url/-u.\ \n1) url - get the source code of the url specified using --url/-u.\
\n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\ \n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\
\n3) oldest - get the source code of the oldest archive for the supplied url.\ \n3) oldest - get the source code of the oldest archive for the supplied url.\
\n4) newest - get the source code of the newest archive for the supplied url.\ \n4) newest - get the source code of the newest archive for the supplied url.\
\n5) save - Create a new archive and get the source code of this new archive for the supplied url.") \n5) save - Create a new archive and get the source code of this new archive for the supplied url."
def args_handler(args): def args_handler(args):
if args.version: if args.version:
return ("waybackpy version %s" % __version__) return "waybackpy version %s" % __version__
if not args.url: if not args.url:
return ("waybackpy %s \nSee 'waybackpy --help' for help using this tool." % __version__) return (
"waybackpy %s \nSee 'waybackpy --help' for help using this tool."
% __version__
)
if args.user_agent: if args.user_agent:
obj = Url(args.url, args.user_agent) obj = Url(args.url, args.user_agent)
@ -151,52 +156,93 @@ def args_handler(args):
def parse_args(argv): def parse_args(argv):
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
requiredArgs = parser.add_argument_group('URL argument (required)') requiredArgs = parser.add_argument_group("URL argument (required)")
requiredArgs.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur") requiredArgs.add_argument(
"--url", "-u", help="URL on which Wayback machine operations would occur"
)
userAgentArg = parser.add_argument_group('User Agent') userAgentArg = parser.add_argument_group("User Agent")
help_text = "User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"" help_text = 'User agent, default user_agent is "waybackpy python package - https://github.com/akamhy/waybackpy"'
userAgentArg.add_argument("--user_agent", "-ua", help=help_text) userAgentArg.add_argument("--user_agent", "-ua", help=help_text)
saveArg = parser.add_argument_group("Create new archive/save URL") saveArg = parser.add_argument_group("Create new archive/save URL")
saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine") saveArg.add_argument(
"--save", "-s", action="store_true", help="Save the URL on the Wayback machine"
)
auArg = parser.add_argument_group("Get the latest Archive") auArg = parser.add_argument_group("Get the latest Archive")
auArg.add_argument("--archive_url", "-au", action='store_true', help="Get the latest archive URL, alias for --newest") auArg.add_argument(
"--archive_url",
"-au",
action="store_true",
help="Get the latest archive URL, alias for --newest",
)
jsonArg = parser.add_argument_group("Get the JSON data") jsonArg = parser.add_argument_group("Get the JSON data")
jsonArg.add_argument("--json", "-j", action='store_true', help="JSON data of the availability API request") jsonArg.add_argument(
"--json",
"-j",
action="store_true",
help="JSON data of the availability API request",
)
oldestArg = parser.add_argument_group("Oldest archive") oldestArg = parser.add_argument_group("Oldest archive")
oldestArg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL") oldestArg.add_argument(
"--oldest",
"-o",
action="store_true",
help="Oldest archive for the specified URL",
)
newestArg = parser.add_argument_group("Newest archive") newestArg = parser.add_argument_group("Newest archive")
newestArg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL") newestArg.add_argument(
"--newest",
"-n",
action="store_true",
help="Newest archive for the specified URL",
)
totalArg = parser.add_argument_group("Total number of archives") totalArg = parser.add_argument_group("Total number of archives")
totalArg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL") totalArg.add_argument(
"--total",
"-t",
action="store_true",
help="Total number of archives for the specified URL",
)
getArg = parser.add_argument_group("Get source code") getArg = parser.add_argument_group("Get source code")
getArg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage") getArg.add_argument(
"--get",
"-g",
help="Prints the source code of the supplied url. Use '--get help' for extended usage",
)
knownUrlArg = parser.add_argument_group("URLs known and archived to Waybcak Machine for the site.") knownUrlArg = parser.add_argument_group(
knownUrlArg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.") "URLs known and archived to Waybcak Machine for the site."
)
knownUrlArg.add_argument(
"--known_urls", "-ku", action="store_true", help="URLs known for the domain."
)
help_text = "Use with '--known_urls' to include known URLs for subdomains." help_text = "Use with '--known_urls' to include known URLs for subdomains."
knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help=help_text) knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
help_text = "Only include live URLs. Will not inlclude dead links." help_text = "Only include live URLs. Will not inlclude dead links."
knownUrlArg.add_argument("--alive", "-a", action='store_true', help=help_text) knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)
nearArg = parser.add_argument_group('Archive close to time specified') nearArg = parser.add_argument_group("Archive close to time specified")
nearArg.add_argument("--near", "-N", action='store_true', help="Archive near specified time") nearArg.add_argument(
"--near", "-N", action="store_true", help="Archive near specified time"
)
nearArgs = parser.add_argument_group('Arguments that are used only with --near') nearArgs = parser.add_argument_group("Arguments that are used only with --near")
nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer") nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer")
nearArgs.add_argument("--month", "-M", type=int, help="Month in integer") nearArgs.add_argument("--month", "-M", type=int, help="Month in integer")
nearArgs.add_argument("--day", "-D", type=int, help="Day in integer.") nearArgs.add_argument("--day", "-D", type=int, help="Day in integer.")
nearArgs.add_argument("--hour", "-H", type=int, help="Hour in intege") nearArgs.add_argument("--hour", "-H", type=int, help="Hour in intege")
nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer") nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version") parser.add_argument(
"--version", "-v", action="store_true", help="Waybackpy version"
)
return parser.parse_args(argv[1:]) return parser.parse_args(argv[1:])

View File

@ -1,10 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
class WaybackError(Exception): class WaybackError(Exception):
""" """
Raised when Wayback Machine API Service is unreachable/down. Raised when Wayback Machine API Service is unreachable/down.
""" """
class URLError(Exception): class URLError(Exception):
""" """
Raised when malformed URLs are passed as arguments. Raised when malformed URLs are passed as arguments.

View File

@ -14,9 +14,7 @@ default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _archive_url_parser(header): def _archive_url_parser(header):
"""Parse out the archive from header.""" """Parse out the archive from header."""
# Regex1 # Regex1
arch = re.search( arch = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
r"Content-Location: (/web/[0-9]{14}/.*)", str(header)
)
if arch: if arch:
return "web.archive.org" + arch.group(1) return "web.archive.org" + arch.group(1)
# Regex2 # Regex2
@ -79,11 +77,7 @@ class Url:
def __len__(self): def __len__(self):
td_max = timedelta( td_max = timedelta(
days=999999999, days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
hours=23,
minutes=59,
seconds=59,
microseconds=999999
) )
if self.timestamp == datetime.max: if self.timestamp == datetime.max:
return td_max.days return td_max.days
@ -112,9 +106,7 @@ class Url:
else: else:
archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = data["archived_snapshots"]["closest"]["url"]
archive_url = archive_url.replace( archive_url = archive_url.replace(
"http://web.archive.org/web/", "http://web.archive.org/web/", "https://web.archive.org/web/", 1
"https://web.archive.org/web/",
1
) )
return archive_url return archive_url
@ -127,10 +119,9 @@ class Url:
time = datetime.max time = datetime.max
else: else:
time = datetime.strptime(data["archived_snapshots"] time = datetime.strptime(
["closest"] data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
["timestamp"], )
'%Y%m%d%H%M%S')
return time return time
@ -170,7 +161,7 @@ class Url:
return response.content.decode(encoding.replace("text/html", "UTF-8", 1)) return response.content.decode(encoding.replace("text/html", "UTF-8", 1))
def near(self, year=None, month=None, day=None, hour=None, minute=None): def near(self, year=None, month=None, day=None, hour=None, minute=None):
""" Return the closest Wayback Machine archive to the time supplied. """Return the closest Wayback Machine archive to the time supplied.
Supported params are year, month, day, hour and minute. Supported params are year, month, day, hour and minute.
Any non-supplied parameters default to the current time. Any non-supplied parameters default to the current time.
@ -184,10 +175,9 @@ class Url:
minute=minute if minute else now.tm_min, minute=minute if minute else now.tm_min,
) )
endpoint = "https://archive.org/wayback/available" endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": "%s" % self.user_agent} headers = {"User-Agent": "%s" % self.user_agent}
payload = {"url": "%s" % self._clean_url(), "timestamp" : timestamp} payload = {"url": "%s" % self._clean_url(), "timestamp": timestamp}
response = _get_response(endpoint, params=payload, headers=headers) response = _get_response(endpoint, params=payload, headers=headers)
data = response.json() data = response.json()
if not data["archived_snapshots"]: if not data["archived_snapshots"]:
@ -201,7 +191,9 @@ class Url:
) )
self.archive_url = archive_url self.archive_url = archive_url
self.timestamp = datetime.strptime(data["archived_snapshots"]["closest"]["timestamp"], '%Y%m%d%H%M%S') self.timestamp = datetime.strptime(
data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
return self return self
@ -221,7 +213,11 @@ class Url:
"""Returns the total number of Wayback Machine archives for this URL.""" """Returns the total number of Wayback Machine archives for this URL."""
endpoint = "https://web.archive.org/cdx/search/cdx" endpoint = "https://web.archive.org/cdx/search/cdx"
headers = {"User-Agent": "%s" % self.user_agent, "output" : "json", "fl" : "statuscode"} headers = {
"User-Agent": "%s" % self.user_agent,
"output": "json",
"fl": "statuscode",
}
payload = {"url": "%s" % self._clean_url()} payload = {"url": "%s" % self._clean_url()}
response = _get_response(endpoint, params=payload, headers=headers) response = _get_response(endpoint, params=payload, headers=headers)
@ -253,11 +249,13 @@ class Url:
if subdomain: if subdomain:
request_url = ( request_url = (
"https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey" % self._clean_url() "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
% self._clean_url()
) )
else: else:
request_url = ( request_url = (
"http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey" % self._clean_url() "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
% self._clean_url()
) )
headers = {"User-Agent": "%s" % self.user_agent} headers = {"User-Agent": "%s" % self.user_agent}