diff --git a/.github/workflows/build_test.yml b/.github/workflows/build-test.yml similarity index 100% rename from .github/workflows/build_test.yml rename to .github/workflows/build-test.yml diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit-test.yml similarity index 88% rename from .github/workflows/unit_test.yml rename to .github/workflows/unit-test.yml index 8fd5b95..d514154 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit-test.yml @@ -28,14 +28,13 @@ jobs: pip install '.[dev]' - name: Lint with flake8 run: | - # stop the build if there are Python syntax errors or undefined names flake8 . --count --show-source --statistics - name: Lint with black run: | black . --check --diff - # - name: Static type test with mypy - # run: | - # mypy + - name: Static type test with mypy + run: | + mypy -p waybackpy -p tests - name: Test with pytest run: | pytest diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 9112cc3..0000000 --- a/pytest.ini +++ /dev/null @@ -1,11 +0,0 @@ -[pytest] -addopts = - # show summary of all tests that did not pass - -ra - # enable all warnings - -Wd - # coverage and html report - --cov=waybackpy - --cov-report=html -testpaths = - tests diff --git a/requirements-dev.txt b/requirements-dev.txt index 288ffd3..c694b4c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,8 @@ click codecov flake8 mypy -setuptools>=46.4.0 pytest pytest-cov requests +setuptools>=46.4.0 +types-requests diff --git a/requirements.txt b/requirements.txt index 0d8c96e..0557d22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ click requests +urllib3 diff --git a/setup.cfg b/setup.cfg index 197b071..a084b14 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,7 @@ python_requires = >= 3.7 install_requires = click requests + urllib3 [options.extras_require] dev = @@ -52,7 +53,7 @@ dev = pytest pytest-cov setuptools>=46.4.0 - + types-requests [options.entry_points] console_scripts = 
@@ -64,4 +65,26 @@ profile = black [flake8] indent-size = 4 max-line-length = 88 -extend-ignore = E203,W503,E501,W605 +extend-ignore = W605 + +[mypy] +python_version = 3.9 +show_error_codes = True +pretty = True +strict = True + +[tool:pytest] +addopts = + # show summary of all tests that did not pass + -ra + # enable all warnings + -Wd + # coverage and html report + --cov=waybackpy + --cov-report=html +testpaths = + tests + +[pycodestyle] +# for `license` and `filter in `waybackpy.cli.main` +ignore = W0622 diff --git a/tests/test_availability_api.py b/tests/test_availability_api.py index 3b2b812..42803df 100644 --- a/tests/test_availability_api.py +++ b/tests/test_availability_api.py @@ -12,33 +12,42 @@ from waybackpy.exceptions import ( now = datetime.utcnow() url = "https://example.com/" -user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" +user_agent = ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" +) -def rndstr(n): +def rndstr(n: int) -> str: return "".join( random.choice(string.ascii_uppercase + string.digits) for _ in range(n) ) -def test_oldest(): +def test_oldest() -> None: """ Test the oldest archive of Google.com and also checks the attributes. 
""" url = "https://example.com/" - user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" + user_agent = ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" + ) availability_api = WaybackMachineAvailabilityAPI(url, user_agent) oldest = availability_api.oldest() oldest_archive_url = oldest.archive_url assert "2002" in oldest_archive_url oldest_timestamp = oldest.timestamp() assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years - assert availability_api.JSON["archived_snapshots"]["closest"]["available"] is True + assert ( + availability_api.JSON is not None + and availability_api.JSON["archived_snapshots"]["closest"]["available"] is True + ) assert repr(oldest).find("example.com") != -1 assert "2002" in str(oldest) -def test_newest(): +def test_newest() -> None: """ Assuming that the recent most Google Archive was made no more earlier than last one day which is 86400 seconds. @@ -54,16 +63,17 @@ def test_newest(): assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3) -def test_invalid_json(): +def test_invalid_json() -> None: """ - When the API is malfunctioning or we don't pass a URL it may return invalid JSON data. + When the API is malfunctioning or we don't pass a URL, + it may return invalid JSON data. 
""" with pytest.raises(InvalidJSONInAvailabilityAPIResponse): availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent) _ = availability_api.archive_url -def test_no_archive(): +def test_no_archive() -> None: """ ArchiveNotInAvailabilityAPIResponse may be raised if Wayback Machine did not replied with the archive despite the fact that we know the site has million @@ -74,12 +84,12 @@ def test_no_archive(): """ with pytest.raises(ArchiveNotInAvailabilityAPIResponse): availability_api = WaybackMachineAvailabilityAPI( - url="https://%s.cn" % rndstr(30), user_agent=user_agent + url=f"https://{rndstr(30)}.cn", user_agent=user_agent ) _ = availability_api.archive_url -def test_no_api_call_str_repr(): +def test_no_api_call_str_repr() -> None: """ Some entitled users maybe want to see what is the string representation if they don’t make any API requests. @@ -87,17 +97,17 @@ def test_no_api_call_str_repr(): str() must not return None so we return "" """ availability_api = WaybackMachineAvailabilityAPI( - url="https://%s.gov" % rndstr(30), user_agent=user_agent + url=f"https://{rndstr(30)}.gov", user_agent=user_agent ) assert "" == str(availability_api) -def test_no_call_timestamp(): +def test_no_call_timestamp() -> None: """ If no API requests were made the bound timestamp() method returns the datetime.max as a default value. 
""" availability_api = WaybackMachineAvailabilityAPI( - url="https://%s.in" % rndstr(30), user_agent=user_agent + url=f"https://{rndstr(30)}.in", user_agent=user_agent ) assert datetime.max == availability_api.timestamp() diff --git a/tests/test_cdx_api.py b/tests/test_cdx_api.py index d574954..410a318 100644 --- a/tests/test_cdx_api.py +++ b/tests/test_cdx_api.py @@ -1,8 +1,11 @@ from waybackpy.cdx_api import WaybackMachineCDXServerAPI -def test_a(): - user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" +def test_a() -> None: + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) url = "https://twitter.com/jack" wayback = WaybackMachineCDXServerAPI( @@ -21,8 +24,11 @@ def test_a(): assert snapshot.timestamp.startswith("2010") -def test_b(): - user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" +def test_b() -> None: + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) url = "https://www.google.com" wayback = WaybackMachineCDXServerAPI( diff --git a/tests/test_cdx_snapshot.py b/tests/test_cdx_snapshot.py index baea40e..a99977e 100644 --- a/tests/test_cdx_snapshot.py +++ b/tests/test_cdx_snapshot.py @@ -3,8 +3,11 @@ from datetime import datetime from waybackpy.cdx_snapshot import CDXSnapshot -def test_CDXSnapshot(): - sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415" +def test_CDXSnapshot() -> None: + sample_input = ( + "org,archive)/ 20080126045828 http://github.com " + "text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415" + ) prop_values = sample_input.split(" ") properties = {} ( diff --git a/tests/test_cdx_utils.py b/tests/test_cdx_utils.py index 406e7e6..8378dd0 100644 --- 
a/tests/test_cdx_utils.py +++ b/tests/test_cdx_utils.py @@ -1,3 +1,5 @@ +from typing import Any, Dict, List + import pytest from waybackpy.cdx_utils import ( @@ -11,15 +13,18 @@ from waybackpy.cdx_utils import ( from waybackpy.exceptions import WaybackError -def test_get_total_pages(): +def test_get_total_pages() -> None: url = "twitter.com" - user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15" + user_agent = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15" + ) assert get_total_pages(url=url, user_agent=user_agent) >= 56 -def test_full_url(): - params = {} +def test_full_url() -> None: endpoint = "https://web.archive.org/cdx/search/cdx" + params: Dict[str, Any] = {} assert endpoint == full_url(endpoint, params) params = {"a": "1"} @@ -39,36 +44,36 @@ def test_full_url(): ) -def test_get_response(): +def test_get_response() -> None: url = "https://github.com" user_agent = ( "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0" ) - headers = {"User-Agent": "%s" % user_agent} + headers = {"User-Agent": str(user_agent)} response = get_response(url, headers=headers) - assert response.status_code == 200 + assert not isinstance(response, Exception) and response.status_code == 200 url = "http/wwhfhfvhvjhmom" with pytest.raises(WaybackError): get_response(url, headers=headers) -def test_check_filters(): - filters = [] +def test_check_filters() -> None: + filters: List[str] = [] check_filters(filters) filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"] check_filters(filters) with pytest.raises(WaybackError): - check_filters("not-list") + check_filters("not-list") # type: ignore[arg-type] with pytest.raises(WaybackError): check_filters(["invalid"]) -def test_check_collapses(): - collapses = [] +def test_check_collapses() -> None: + collapses: List[str] = [] 
check_collapses(collapses) collapses = ["timestamp:10"] @@ -77,7 +82,7 @@ def test_check_collapses(): collapses = ["urlkey"] check_collapses(collapses) - collapses = "urlkey" # NOT LIST + collapses = "urlkey" # type: ignore[assignment] with pytest.raises(WaybackError): check_collapses(collapses) @@ -86,11 +91,11 @@ def test_check_collapses(): check_collapses(collapses) -def test_check_match_type(): - assert check_match_type(None, "url") is None +def test_check_match_type() -> None: + assert check_match_type(None, "url") match_type = "exact" url = "test_url" - assert check_match_type(match_type, url) is None + assert check_match_type(match_type, url) url = "has * in it" with pytest.raises(WaybackError): diff --git a/tests/test_save_api.py b/tests/test_save_api.py index 7cd725d..ab2e4fc 100644 --- a/tests/test_save_api.py +++ b/tests/test_save_api.py @@ -2,22 +2,27 @@ import random import string import time from datetime import datetime +from typing import cast import pytest +from requests.structures import CaseInsensitiveDict from waybackpy.exceptions import MaximumSaveRetriesExceeded from waybackpy.save_api import WaybackMachineSaveAPI -def rndstr(n): +def rndstr(n: int) -> str: return "".join( random.choice(string.ascii_uppercase + string.digits) for _ in range(n) ) -def test_save(): +def test_save() -> None: url = "https://github.com/akamhy/waybackpy" - user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) save_api = WaybackMachineSaveAPI(url, user_agent) save_api.save() archive_url = save_api.archive_url @@ -31,15 +36,18 @@ def test_save(): assert isinstance(save_api.timestamp(), datetime) -def test_max_redirect_exceeded(): +def test_max_redirect_exceeded() -> None: with pytest.raises(MaximumSaveRetriesExceeded): - url = "https://%s.gov" % rndstr - 
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" + url = f"https://{rndstr(30)}.gov" + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3) save_api.save() -def test_sleep(): +def test_sleep() -> None: """ sleeping is actually very important for SaveAPI interface stability. @@ -47,7 +55,10 @@ is as intended. """ url = "https://example.com" - user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) save_api = WaybackMachineSaveAPI(url, user_agent) s_time = int(time.time()) save_api.sleep(6) # multiple of 3 sleep for 10 seconds @@ -60,76 +71,150 @@ assert (e_time - s_time) >= 5 -def test_timestamp(): +def test_timestamp() -> None: url = "https://example.com" - user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" - save_api = WaybackMachineSaveAPI(url, user_agent) - now = datetime.utcnow() - save_api._archive_url = ( - "https://web.archive.org/web/%s/" % now.strftime("%Y%m%d%H%M%S") + url + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" ) + save_api = WaybackMachineSaveAPI(url, user_agent) + now = datetime.utcnow().strftime("%Y%m%d%H%M%S") + save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/" save_api.timestamp() assert save_api.cached_save is False - save_api._archive_url = "https://web.archive.org/web/%s/" % "20100124063622" + url + now = "20100124063622" + save_api._archive_url = 
f"https://web.archive.org/web/{now}/{url}/" save_api.timestamp() assert save_api.cached_save is True -def test_archive_url_parser(): +def test_archive_url_parser() -> None: """ Testing three regex for matches and also tests the response URL. """ url = "https://example.com" - user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) save_api = WaybackMachineSaveAPI(url, user_agent) - save_api.headers = """ - START - Content-Location: /web/20201126185327/https://www.scribbr.com/citing-sources/et-al - END - """ - - assert ( - save_api.archive_url_parser() - == "https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al" + h = ( + "\nSTART\nContent-Location: " + "/web/20201126185327/https://www.scribbr.com/citing-sources/et-al" + "\nEND\n" ) + save_api.headers = h # type: ignore[assignment] - save_api.headers = """ - {'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="timegate", ; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", ; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", ; rel="memento"; datetime="Sat, 02 
Jan 2021 09:40:09 GMT", ; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'} - """ - - assert ( - save_api.archive_url_parser() - == "https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/" + expected_url = ( + "https://web.archive.org/web/20201126185327/" + "https://www.scribbr.com/citing-sources/et-al" ) + assert save_api.archive_url_parser() == expected_url - save_api.headers = """ - START - X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US - END - """ + headers = { + "Server": "nginx/1.15.8", + "Date": "Sat, 02 Jan 2021 09:40:25 GMT", + "Content-Type": "text/html; charset=UTF-8", + "Transfer-Encoding": "chunked", + "Connection": "keep-alive", + "X-Archive-Orig-Server": "nginx", + "X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT", + "X-Archive-Orig-Transfer-Encoding": "chunked", + "X-Archive-Orig-Connection": "keep-alive", + "X-Archive-Orig-Vary": "Accept-Encoding", + "X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT", + "X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;", + "X-Archive-Guessed-Content-Type": "text/html", + 
"X-Archive-Guessed-Charset": "utf-8", + "Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT", + "Link": ( + '; rel="original", ' + "; rel="timemap"; type="application/link-format", ' + "; rel="timegate", ; rel="first memento"; ' + 'datetime="Mon, 01 Jun 2020 08:29:11 GMT", ; " + 'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", ' + "; rel="memento"; datetime="Sat, 02 Jan 2021 ' + '09:40:09 GMT", ; " + 'rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"' + ), + "Content-Security-Policy": ( + "default-src 'self' 'unsafe-eval' 'unsafe-inline' " + "data: blob: archive.org web.archive.org analytics.archive.org " + "pragma.archivelab.org", + ), + "X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz", + "Server-Timing": ( + "captures_list;dur=112.646325, exclusion.robots;dur=0.172010, " + "exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, " + "esindex;dur=0.014647, LoadShardBlock;dur=82.205012, " + "PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, " + "load_resource;dur=26.520179" + ), + "X-App-Server": "wwwb-app200", + "X-ts": "200", + "X-location": "All", + "X-Cache-Key": ( + "httpsweb.archive.org/web/20210102094009/" + "https://www.scribbr.com/citing-sources/et-al/IN", + ), + "X-RL": "0", + "X-Page-Cache": "MISS", + "X-Archive-Screenname": "0", + "Content-Encoding": "gzip", + } - assert ( - save_api.archive_url_parser() - == "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/" + save_api.headers = cast(CaseInsensitiveDict[str], headers) + + expected_url2 = ( + "https://web.archive.org/web/20210102094009/" + "https://www.scribbr.com/citing-sources/et-al/" ) + assert save_api.archive_url_parser() == expected_url2 - save_api.headers = "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING" - save_api.response_url = "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al" - assert ( - save_api.archive_url_parser() - 
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al" + expected_url_3 = ( + "https://web.archive.org/web/20171128185327/" + "https://www.scribbr.com/citing-sources/et-al/US" ) + h = f"START\nX-Cache-Key: {expected_url_3}\nEND\n" + save_api.headers = h # type: ignore[assignment] + + expected_url4 = ( + "https://web.archive.org/web/20171128185327/" + "https://www.scribbr.com/citing-sources/et-al/" + ) + assert save_api.archive_url_parser() == expected_url4 + + h = "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING" + save_api.headers = h # type: ignore[assignment] + save_api.response_url = ( + "https://web.archive.org/web/20171128185327/" + "https://www.scribbr.com/citing-sources/et-al" + ) + expected_url5 = ( + "https://web.archive.org/web/20171128185327/" + "https://www.scribbr.com/citing-sources/et-al" + ) + assert save_api.archive_url_parser() == expected_url5 -def test_archive_url(): +def test_archive_url() -> None: """ Checks the attribute archive_url's value when the save method was not explicitly invoked by the end-user but the save method was invoked implicitly by the archive_url method which is an attribute due to @property. 
""" url = "https://example.com" - user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" + user_agent = ( + "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" + ) save_api = WaybackMachineSaveAPI(url, user_agent) save_api.saved_archive = ( "https://web.archive.org/web/20220124063056/https://example.com/" diff --git a/tests/test_utils.py b/tests/test_utils.py index 6f6d509..c6467f6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,13 +6,13 @@ from waybackpy.utils import ( ) -def test_default_user_agent(): +def test_default_user_agent() -> None: assert ( DEFAULT_USER_AGENT - == "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__ + == f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" ) -def test_latest_version(): +def test_latest_version() -> None: package_name = "waybackpy" assert latest_version_github(package_name) == latest_version_pypi(package_name) diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py index 602ca00..e73471b 100644 --- a/waybackpy/__init__.py +++ b/waybackpy/__init__.py @@ -5,11 +5,7 @@ __description__ = ( ) __url__ = "https://akamhy.github.io/waybackpy/" __version__ = "3.0.2" -__download_url__ = ( - "https://github.com/akamhy/waybackpy/archive/{version}.tar.gz".format( - version=__version__ - ) -) +__download_url__ = f"https://github.com/akamhy/waybackpy/archive/{__version__}.tar.gz" __author__ = "Akash Mahanty" __author_email__ = "akamhy@yahoo.com" __license__ = "MIT" diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 6e76bb8..11a9716 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -1,6 +1,7 @@ import json import time from datetime import datetime +from typing import Any, Dict, Optional import requests @@ -10,37 +11,42 @@ from .exceptions import ( ) from .utils import DEFAULT_USER_AGENT 
+ResponseJSON = Dict[str, Any] -class WaybackMachineAvailabilityAPI: + +class WaybackMachineAvailabilityAPI(object): """ Class that interfaces the availability API of the Wayback Machine. """ - def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3): + def __init__( + self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3 + ) -> None: self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent - self.headers = {"User-Agent": self.user_agent} - self.payload = {"url": "{url}".format(url=self.url)} + self.headers: Dict[str, str] = {"User-Agent": self.user_agent} + self.payload = {"url": self.url} self.endpoint = "https://archive.org/wayback/available" self.max_tries = max_tries self.tries = 0 self.last_api_call_unix_time = int(time.time()) self.api_call_time_gap = 5 - self.JSON = None + self.JSON: Optional[ResponseJSON] = None - def unix_timestamp_to_wayback_timestamp(self, unix_timestamp): + @staticmethod + def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str: """ Converts Unix time to wayback Machine timestamp. """ return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") - def __repr__(self): + def __repr__(self) -> str: """ Same as string representation, just return the archive URL as a string. """ return str(self) - def __str__(self): + def __str__(self) -> str: """ String representation of the class. If atleast one API call was successfully made then return the archive URL as a string. Else returns None. @@ -54,7 +60,7 @@ class WaybackMachineAvailabilityAPI: return self.archive_url - def json(self): + def json(self) -> Optional[ResponseJSON]: """ Makes the API call to the availability API can set the JSON response to the JSON attribute of the instance and also returns the JSON attribute. 
@@ -74,12 +80,12 @@ class WaybackMachineAvailabilityAPI: self.JSON = self.response.json() except json.decoder.JSONDecodeError: raise InvalidJSONInAvailabilityAPIResponse( - "Response data:\n{text}".format(text=self.response.text) + f"Response data:\n{self.response.text}" ) return self.JSON - def timestamp(self): + def timestamp(self) -> datetime: """ Converts the timestamp form the JSON response to datetime object. If JSON attribute of the instance is None it implies that the either @@ -91,19 +97,29 @@ class WaybackMachineAvailabilityAPI: If you get an URL as a response form the availability API it is guaranteed that you can get the datetime object from the timestamp. """ - if not self.JSON or not self.JSON["archived_snapshots"]: + if self.JSON is None or "archived_snapshots" not in self.JSON: return datetime.max - - return datetime.strptime( - self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" - ) + elif ( + self.JSON is not None + and "archived_snapshots" in self.JSON + and self.JSON["archived_snapshots"] is not None + and "closest" in self.JSON["archived_snapshots"] + and self.JSON["archived_snapshots"]["closest"] is not None + and "timestamp" in self.JSON["archived_snapshots"]["closest"] + ): + return datetime.strptime( + self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" + ) + else: + raise ValueError("Could not get timestamp from result") @property - def archive_url(self): + def archive_url(self) -> str: """ Reads the the JSON response data and tries to get the timestamp and returns the timestamp if found else returns None. """ + archive_url = "" data = self.JSON # If the user didn't used oldest, newest or near but tries to access the @@ -127,9 +143,9 @@ class WaybackMachineAvailabilityAPI: if not data or not data["archived_snapshots"]: raise ArchiveNotInAvailabilityAPIResponse( "Archive not found in the availability " - + "API response, the URL you requested may not have any " - + "archives yet. 
You may retry after some time or archive the webpage now." - + "\nResponse data:\n{response}".format(response=self.response.text) + "API response, the URL you requested may not have any archives " + "yet. You may retry after some time or archive the webpage now.\n" + f"Response data:\n{self.response.text}" ) else: archive_url = data["archived_snapshots"]["closest"]["url"] @@ -138,7 +154,8 @@ class WaybackMachineAvailabilityAPI: ) return archive_url - def wayback_timestamp(self, **kwargs): + @staticmethod + def wayback_timestamp(**kwargs: int) -> str: """ Prepends zero before the year, month, day, hour and minute so that they are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. @@ -148,7 +165,7 @@ class WaybackMachineAvailabilityAPI: for key in ["year", "month", "day", "hour", "minute"] ) - def oldest(self): + def oldest(self) -> "WaybackMachineAvailabilityAPI": """ Passing the year 1994 should return the oldest archive because wayback machine was started in May, 1996 and there should be no archive @@ -156,7 +173,7 @@ class WaybackMachineAvailabilityAPI: """ return self.near(year=1994) - def newest(self): + def newest(self) -> "WaybackMachineAvailabilityAPI": """ Passing the current UNIX time should be sufficient to get the newest archive considering the API request-response time delay and also the @@ -166,13 +183,13 @@ class WaybackMachineAvailabilityAPI: def near( self, - year=None, - month=None, - day=None, - hour=None, - minute=None, - unix_timestamp=None, - ): + year: Optional[int] = None, + month: Optional[int] = None, + day: Optional[int] = None, + hour: Optional[int] = None, + minute: Optional[int] = None, + unix_timestamp: Optional[int] = None, + ) -> "WaybackMachineAvailabilityAPI": """ The main method for this Class, oldest and newest methods are dependent on this method. 
@@ -181,18 +198,19 @@ class WaybackMachineAvailabilityAPI: unix_timestamp_to_wayback_timestamp or wayback_timestamp method with appropriate arguments for their respective parameters. Adds the timestamp to the payload dictionary. - And finally invoking the json method to make the API call then returns the instance. + And finally invoking the json method to make the API call then returns + the instance. """ if unix_timestamp: timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) else: now = datetime.utcnow().timetuple() timestamp = self.wayback_timestamp( - year=year if year else now.tm_year, - month=month if month else now.tm_mon, - day=day if day else now.tm_mday, - hour=hour if hour else now.tm_hour, - minute=minute if minute else now.tm_min, + year=now.tm_year if year is None else year, + month=now.tm_mon if month is None else month, + day=now.tm_mday if day is None else day, + hour=now.tm_hour if hour is None else hour, + minute=now.tm_min if minute is None else minute, ) self.payload["timestamp"] = timestamp diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index a04f8af..73515a9 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -1,3 +1,5 @@ +from typing import Dict, Generator, List, Optional, cast + from .cdx_snapshot import CDXSnapshot from .cdx_utils import ( check_collapses, @@ -11,43 +13,48 @@ from .exceptions import WaybackError from .utils import DEFAULT_USER_AGENT -class WaybackMachineCDXServerAPI: +class WaybackMachineCDXServerAPI(object): """ Class that interfaces the CDX server API of the Wayback Machine. 
""" + # start_timestamp: from, can not use from as it's a keyword + # end_timestamp: to, not using to as can not use from def __init__( self, - url, - user_agent=DEFAULT_USER_AGENT, - start_timestamp=None, # from, can not use from as it's a keyword - end_timestamp=None, # to, not using to as can not use from - filters=[], - match_type=None, - gzip=None, - collapses=[], - limit=None, - max_tries=3, - ): + url: str, + user_agent: str = DEFAULT_USER_AGENT, + start_timestamp: Optional[str] = None, + end_timestamp: Optional[str] = None, + filters: List[str] = [], + match_type: Optional[str] = None, + gzip: Optional[str] = None, + collapses: List[str] = [], + limit: Optional[str] = None, + max_tries: int = 3, + ) -> None: self.url = str(url).strip().replace(" ", "%20") self.user_agent = user_agent - self.start_timestamp = str(start_timestamp) if start_timestamp else None - self.end_timestamp = str(end_timestamp) if end_timestamp else None + self.start_timestamp = ( + str(start_timestamp) if start_timestamp is not None else None + ) + self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None self.filters = filters check_filters(self.filters) - self.match_type = str(match_type).strip() if match_type else None + self.match_type = str(match_type).strip() if match_type is not None else None check_match_type(self.match_type, self.url) - self.gzip = gzip if gzip else True + self.gzip = gzip self.collapses = collapses check_collapses(self.collapses) - self.limit = limit if limit else 5000 + self.limit = limit if limit is not None else 5000 self.max_tries = max_tries - self.last_api_request_url = None + self.last_api_request_url: Optional[str] = None self.use_page = False self.endpoint = "https://web.archive.org/cdx/search/cdx" - def cdx_api_manager(self, payload, headers, use_page=False): - + def cdx_api_manager( + self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False + ) -> Generator[str, None, None]: total_pages = 
get_total_pages(self.url, self.user_agent) # If we only have two or less pages of archives then we care for more accuracy # pagination API is lagged sometimes @@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI: url = full_url(self.endpoint, params=payload) res = get_response(url, headers=headers) + if isinstance(res, Exception): + raise res self.last_api_request_url = url text = res.text @@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI: yield text else: - payload["showResumeKey"] = "true" payload["limit"] = str(self.limit) resumeKey = None - more = True while more: - if resumeKey: payload["resumeKey"] = resumeKey url = full_url(self.endpoint, params=payload) res = get_response(url, headers=headers) + if isinstance(res, Exception): + raise res self.last_api_request_url = url @@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI: yield text - def add_payload(self, payload): + def add_payload(self, payload: Dict[str, str]) -> None: if self.start_timestamp: payload["from"] = self.start_timestamp if self.end_timestamp: payload["to"] = self.end_timestamp - if self.gzip is not True: + if self.gzip is None: payload["gzip"] = "false" if self.match_type: @@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI: # Don't need to return anything as it's dictionary. 
payload["url"] = self.url - def snapshots(self): - payload = {} + def snapshots(self) -> Generator[CDXSnapshot, None, None]: + payload: Dict[str, str] = {} headers = {"User-Agent": self.user_agent} self.add_payload(payload) @@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI: if len(snapshot) < 46: # 14 + 32 (timestamp+digest) continue - properties = { + properties: Dict[str, Optional[str]] = { "urlkey": None, "timestamp": None, "original": None, @@ -169,15 +177,9 @@ class WaybackMachineCDXServerAPI: if prop_values_len != properties_len: raise WaybackError( - "Snapshot returned by Cdx API has {prop_values_len} properties".format( - prop_values_len=prop_values_len - ) - + " instead of expected {properties_len} ".format( - properties_len=properties_len - ) - + "properties.\nProblematic Snapshot : {snapshot}".format( - snapshot=snapshot - ) + f"Snapshot returned by Cdx API has {prop_values_len} " + f"properties instead of expected {properties_len} properties.\n" + f"Problematic Snapshot: {snapshot}" ) ( @@ -190,4 +192,4 @@ class WaybackMachineCDXServerAPI: properties["length"], ) = prop_values - yield CDXSnapshot(properties) + yield CDXSnapshot(cast(Dict[str, str], properties)) diff --git a/waybackpy/cdx_snapshot.py b/waybackpy/cdx_snapshot.py index 58d4e8b..ab96602 100644 --- a/waybackpy/cdx_snapshot.py +++ b/waybackpy/cdx_snapshot.py @@ -1,7 +1,8 @@ from datetime import datetime +from typing import Dict -class CDXSnapshot: +class CDXSnapshot(object): """ Class for the CDX snapshot lines returned by the CDX API, Each valid line of the CDX API is casted to an CDXSnapshot object @@ -10,7 +11,7 @@ class CDXSnapshot: of the CDXSnapshot. 
""" - def __init__(self, properties): + def __init__(self, properties: Dict[str, str]) -> None: self.urlkey = properties["urlkey"] self.timestamp = properties["timestamp"] self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S") @@ -20,16 +21,11 @@ class CDXSnapshot: self.digest = properties["digest"] self.length = properties["length"] self.archive_url = ( - "https://web.archive.org/web/" + self.timestamp + "/" + self.original + f"https://web.archive.org/web/{self.timestamp}/{self.original}" ) - def __str__(self): - return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format( - urlkey=self.urlkey, - timestamp=self.timestamp, - original=self.original, - mimetype=self.mimetype, - statuscode=self.statuscode, - digest=self.digest, - length=self.length, + def __str__(self) -> str: + return ( + f"{self.urlkey} {self.timestamp} {self.original} " + f"{self.mimetype} {self.statuscode} {self.digest} {self.length}" ) diff --git a/waybackpy/cdx_utils.py b/waybackpy/cdx_utils.py index 06f043c..fce6acb 100644 --- a/waybackpy/cdx_utils.py +++ b/waybackpy/cdx_utils.py @@ -1,4 +1,6 @@ import re +from typing import Any, Dict, List, Optional, Union +from urllib.parse import quote import requests from requests.adapters import HTTPAdapter @@ -8,16 +10,19 @@ from .exceptions import WaybackError from .utils import DEFAULT_USER_AGENT -def get_total_pages(url, user_agent=DEFAULT_USER_AGENT): +def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int: endpoint = "https://web.archive.org/cdx/search/cdx?" 
payload = {"showNumPages": "true", "url": str(url)} headers = {"User-Agent": user_agent} request_url = full_url(endpoint, params=payload) response = get_response(request_url, headers=headers) - return int(response.text.strip()) + if isinstance(response, requests.Response): + return int(response.text.strip()) + else: + raise response -def full_url(endpoint, params): +def full_url(endpoint: str, params: Dict[str, Any]) -> str: if not params: return endpoint full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") @@ -25,28 +30,25 @@ def full_url(endpoint, params): key = "filter" if key.startswith("filter") else key key = "collapse" if key.startswith("collapse") else key amp = "" if full_url.endswith("?") else "&" - full_url = ( - full_url - + amp - + "{key}={val}".format(key=key, val=requests.utils.quote(str(val))) - ) + val = quote(str(val), safe="") + full_url += f"{amp}{key}={val}" return full_url def get_response( - url, - headers=None, - retries=5, - backoff_factor=0.5, - no_raise_on_redirects=False, -): + url: str, + headers: Optional[Dict[str, str]] = None, + retries: int = 5, + backoff_factor: float = 0.5, + # no_raise_on_redirects=False, +) -> Union[requests.Response, Exception]: session = requests.Session() - retries = Retry( + retries_ = Retry( total=retries, backoff_factor=backoff_factor, status_forcelist=[500, 502, 503, 504], ) - session.mount("https://", HTTPAdapter(max_retries=retries)) + session.mount("https://", HTTPAdapter(max_retries=retries_)) try: response = session.get(url, headers=headers) @@ -54,77 +56,65 @@ def get_response( return response except Exception as e: reason = str(e) - exc_message = "Error while retrieving {url}.\n{reason}".format( - url=url, reason=reason - ) + exc_message = f"Error while retrieving {url}.\n{reason}" exc = WaybackError(exc_message) exc.__cause__ = e raise exc -def check_filters(filters): +def check_filters(filters: List[str]) -> None: if not isinstance(filters, list): raise WaybackError("filters must 
be a list.") # [!]field:regex for _filter in filters: - try: + match = re.search( + r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):" + r"(.*)", + _filter, + ) - match = re.search( - r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)", - _filter, - ) + if match is None or len(match.groups()) != 2: - match.group(1) - match.group(2) - - except Exception: - - exc_message = ( - "Filter '{_filter}' is not following the cdx filter syntax.".format( - _filter=_filter - ) - ) + exc_message = f"Filter '{_filter}' is not following the cdx filter syntax." raise WaybackError(exc_message) -def check_collapses(collapses): - +def check_collapses(collapses: List[str]) -> bool: if not isinstance(collapses, list): raise WaybackError("collapses must be a list.") - - if len(collapses) == 0: - return + elif len(collapses) == 0: + return True for collapse in collapses: - try: - match = re.search( - r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?", - collapse, - ) - match.group(1) - if 2 == len(match.groups()): - match.group(2) - except Exception: - exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format( - collapse=collapse + match = re.search( + r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)" + r"(:?[0-9]{1,99})?", + collapse, + ) + if match is None or len(match.groups()) != 2: + exc_message = ( + f"collapse argument '{collapse}' " + "is not following the cdx collapse syntax." ) raise WaybackError(exc_message) + return True -def check_match_type(match_type, url): + +def check_match_type(match_type: Optional[str], url: str) -> bool: + legal_match_type = ["exact", "prefix", "host", "domain"] if not match_type: - return - - if "*" in url: + return True + elif "*" in url: raise WaybackError( "Can not use wildcard in the URL along with the match_type arguments." 
) - - legal_match_type = ["exact", "prefix", "host", "domain"] - - if match_type not in legal_match_type: - exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format( - match_type=match_type + elif match_type not in legal_match_type: + exc_message = ( + f"{match_type} is not an allowed match type.\n" + "Use one from 'exact', 'prefix', 'host' or 'domain'" ) raise WaybackError(exc_message) + else: + return True diff --git a/waybackpy/cli.py b/waybackpy/cli.py index f1117c2..adf41d0 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -3,6 +3,7 @@ import os import random import re import string +from typing import Generator, List, Optional import click import requests @@ -24,7 +25,7 @@ from .wrapper import Url "--user-agent", "--user_agent", default=DEFAULT_USER_AGENT, - help="User agent, default value is '%s'." % DEFAULT_USER_AGENT, + help=f"User agent, default value is '{DEFAULT_USER_AGENT}'.", ) @click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.") @click.option( @@ -163,34 +164,34 @@ from .wrapper import Url + "will be printed.", ) def main( - url, - user_agent, - version, - license, - newest, - oldest, - json, - near, - year, - month, - day, - hour, - minute, - save, - headers, - known_urls, - subdomain, - file, - cdx, - start_timestamp, - end_timestamp, - filter, - match_type, - gzip, - collapse, - limit, - cdx_print, -): + url: Optional[str], + user_agent: str, + version: bool, + license: bool, + newest: bool, + oldest: bool, + json: bool, + near: bool, + year: Optional[int], + month: Optional[int], + day: Optional[int], + hour: Optional[int], + minute: Optional[int], + save: bool, + headers: bool, + known_urls: bool, + subdomain: bool, + file: bool, + cdx: bool, + start_timestamp: Optional[str], + end_timestamp: Optional[str], + filter: List[str], + match_type: Optional[str], + gzip: Optional[str], + collapse: List[str], + limit: Optional[str], + cdx_print: 
List[str], +) -> None: """\b _ _ | | | | @@ -214,7 +215,7 @@ def main( """ if version: - click.echo("waybackpy version %s" % __version__) + click.echo(f"waybackpy version {__version__}") return if license: @@ -240,11 +241,14 @@ def main( and not cdx ): click.echo( - "Only URL passed, but did not specify what to do with the URL. Use --help flag for help using waybackpy." + "Only URL passed, but did not specify what to do with the URL. " + "Use --help flag for help using waybackpy." ) return - def echo_availability_api(availability_api_instance): + def echo_availability_api( + availability_api_instance: WaybackMachineAvailabilityAPI, + ) -> None: click.echo("Archive URL:") if not availability_api_instance.archive_url: archive_url = ( @@ -295,13 +299,14 @@ def main( click.echo(save_api.headers) return - def save_urls_on_file(url_gen): + def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: domain = None sys_random = random.SystemRandom() uid = "".join( sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6) ) url_count = 0 + file_name = None for url in url_gen: url_count += 1 @@ -310,25 +315,21 @@ def main( domain = "domain-unknown" - if match: + if match is not None: domain = match.group(1) - file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid) + file_name = f"{domain}-urls-{uid}.txt" file_path = os.path.join(os.getcwd(), file_name) if not os.path.isfile(file_path): open(file_path, "w+").close() - with open(file_path, "a") as f: - f.write("{url}\n".format(url=url)) + with open(file_path, "a") as f: + f.write(f"{url}\n") click.echo(url) - if url_count > 0: - click.echo( - "\n\n'{file_name}' saved in current working directory".format( - file_name=file_name - ) - ) + if url_count > 0 or file_name is not None: + click.echo(f"\n\n'{file_name}' saved in current working directory") else: click.echo("No known URLs found. 
Please try a diffrent input!") diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index 8e75aea..53f00c2 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -14,6 +14,8 @@ class WaybackError(Exception): All other exceptions are inherited from this class. """ + pass + class RedirectSaveError(WaybackError): """ @@ -21,32 +23,44 @@ class RedirectSaveError(WaybackError): redirect URL is archived but not the original URL. """ + pass + class URLError(Exception): """ Raised when malformed URLs are passed as arguments. """ + pass + class MaximumRetriesExceeded(WaybackError): """ MaximumRetriesExceeded """ + pass + class MaximumSaveRetriesExceeded(MaximumRetriesExceeded): """ MaximumSaveRetriesExceeded """ + pass + class ArchiveNotInAvailabilityAPIResponse(WaybackError): """ Could not parse the archive in the JSON response of the availability API. """ + pass + class InvalidJSONInAvailabilityAPIResponse(WaybackError): """ availability api returned invalid JSON """ + + pass diff --git a/waybackpy/save_api.py b/waybackpy/save_api.py index 530e03a..f511ac8 100644 --- a/waybackpy/save_api.py +++ b/waybackpy/save_api.py @@ -1,38 +1,41 @@ import re import time from datetime import datetime +from typing import Dict, Optional import requests from requests.adapters import HTTPAdapter +from requests.structures import CaseInsensitiveDict from urllib3.util.retry import Retry from .exceptions import MaximumSaveRetriesExceeded from .utils import DEFAULT_USER_AGENT -class WaybackMachineSaveAPI: - +class WaybackMachineSaveAPI(object): """ WaybackMachineSaveAPI class provides an interface for saving URLs on the Wayback Machine. 
""" - def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8): + def __init__( + self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8 + ) -> None: self.url = str(url).strip().replace(" ", "%20") self.request_url = "https://web.archive.org/save/" + self.url self.user_agent = user_agent - self.request_headers = {"User-Agent": self.user_agent} + self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent} if max_tries < 1: raise ValueError("max_tries should be positive") self.max_tries = max_tries self.total_save_retries = 5 self.backoff_factor = 0.5 self.status_forcelist = [500, 502, 503, 504] - self._archive_url = None + self._archive_url: Optional[str] = None self.instance_birth_time = datetime.utcnow() @property - def archive_url(self): + def archive_url(self) -> str: """ Returns the archive URL is already cached by _archive_url else invoke the save method to save the archive which returns the @@ -44,7 +47,7 @@ class WaybackMachineSaveAPI: else: return self.save() - def get_save_request_headers(self): + def get_save_request_headers(self) -> None: """ Creates a session and tries 'retries' number of times to retrieve the archive. @@ -68,14 +71,13 @@ class WaybackMachineSaveAPI: ) session.mount("https://", HTTPAdapter(max_retries=retries)) self.response = session.get(self.request_url, headers=self.request_headers) - self.headers = ( - self.response.headers - ) # + # requests.response.headers is requests.structures.CaseInsensitiveDict + self.headers: CaseInsensitiveDict[str] = self.response.headers self.status_code = self.response.status_code self.response_url = self.response.url session.close() - def archive_url_parser(self): + def archive_url_parser(self) -> Optional[str]: """ Three regexen (like oxen?) 
are used to search for the archive URL in the headers and finally look in the response URL @@ -89,12 +91,12 @@ class WaybackMachineSaveAPI: regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>" match = re.search(regex2, str(self.headers)) - if match: + if match is not None and len(match.groups()) == 1: return "https://" + match.group(1) regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}" match = re.search(regex3, str(self.headers)) - if match: + if match is not None and len(match.groups()) == 1: return "https" + match.group(1) if self.response_url: @@ -105,7 +107,10 @@ class WaybackMachineSaveAPI: if match: return "https://" + match.group(0) - def sleep(self, tries): + return None + + @staticmethod + def sleep(tries: int) -> None: """ Ensure that the we wait some time before succesive retries so that we don't waste the retries before the page is even captured by the Wayback @@ -120,7 +125,7 @@ class WaybackMachineSaveAPI: sleep_seconds = 10 time.sleep(sleep_seconds) - def timestamp(self): + def timestamp(self) -> datetime: """ Read the timestamp off the archive URL and convert the Wayback Machine timestamp to datetime object. @@ -128,14 +133,16 @@ class WaybackMachineSaveAPI: Also check if the time on archive is URL and compare it to instance birth time. - If time on the archive is older than the instance creation time set the cached_save - to True else set it to False. The flag can be used to check if the Wayback Machine - didn't serve a Cached URL. It is quite common for the Wayback Machine to serve - cached archive if last archive was captured before last 45 minutes. + If time on the archive is older than the instance creation time set the + cached_save to True else set it to False. The flag can be used to check + if the Wayback Machine didn't serve a Cached URL. It is quite common for + the Wayback Machine to serve cached archive if last archive was captured + before last 45 minutes. 
""" - m = re.search( - r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url - ) + regex = r"https?://web\.archive.org/web/([0-9]{14})/http" + m = re.search(regex, str(self._archive_url)) + if m is None or len(m.groups()) != 1: + raise ValueError("Could not get timestamp") string_timestamp = m.group(1) timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S") @@ -149,7 +156,7 @@ class WaybackMachineSaveAPI: return timestamp - def save(self): + def save(self) -> str: """ Calls the SavePageNow API of the Wayback Machine with required parameters and headers to save the URL. @@ -162,14 +169,14 @@ class WaybackMachineSaveAPI: tries = 0 while True: - if not self.saved_archive: + if self.saved_archive is None: if tries >= 1: self.sleep(tries) self.get_save_request_headers() self.saved_archive = self.archive_url_parser() - if self.saved_archive is not None: + if isinstance(self.saved_archive, str): self._archive_url = self.saved_archive self.timestamp() return self.saved_archive @@ -177,7 +184,8 @@ class WaybackMachineSaveAPI: tries += 1 if tries >= self.max_tries: raise MaximumSaveRetriesExceeded( - "Tried %s times but failed to save and retrieve the" % str(tries) - + " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n" - % (self.url, self.response_url, str(self.headers)), + f"Tried {tries} times but failed to save " + f"and retrieve the archive for {self.url}.\n" + f"Response URL:\n{self.response_url}\n" + f"Response Header:\n{self.headers}" ) diff --git a/waybackpy/utils.py b/waybackpy/utils.py index 7201403..413a205 100644 --- a/waybackpy/utils.py +++ b/waybackpy/utils.py @@ -2,22 +2,43 @@ import requests from . 
import __version__ -DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__ +DEFAULT_USER_AGENT: str = ( + f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" +) -def latest_version_pypi(package_name, user_agent=DEFAULT_USER_AGENT): +def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str: request_url = "https://pypi.org/pypi/" + package_name + "/json" headers = {"User-Agent": user_agent} response = requests.get(request_url, headers=headers) data = response.json() - return data["info"]["version"] + if ( + data is not None + and "info" in data + and data["info"] is not None + and "version" in data["info"] + and data["info"]["version"] is not None + ): + return str(data["info"]["version"]) + else: + raise ValueError("Could not get latest pypi version") -def latest_version_github(package_name, user_agent=DEFAULT_USER_AGENT): +def latest_version_github( + package_name: str, user_agent: str = DEFAULT_USER_AGENT +) -> str: request_url = ( "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1" ) headers = {"User-Agent": user_agent} response = requests.get(request_url, headers=headers) data = response.json() - return data[0]["tag_name"] + if ( + data is not None + and len(data) > 0 + and data[0] is not None + and "tag_name" in data[0] + ): + return str(data[0]["tag_name"]) + else: + raise ValueError("Could not get latest github version") diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 3121b77..38dd1b6 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta +from typing import Generator, Optional from .availability_api import WaybackMachineAvailabilityAPI from .cdx_api import WaybackMachineCDXServerAPI @@ -14,40 +15,42 @@ The reason it is still in the code is backwards compatibility with 2.x.x version If were are using the Url before the update to version 3.x.x, your code should still be working 
fine and there is no hurry to update the interface but is recommended that you do not use the Url class for new code as it would be removed after 2025 also the first -3.x.x versions was released in January 2022 and three years are more than enough to update -the older interface code. +3.x.x versions was released in January 2022 and three years are more than enough to +update the older interface code. """ -class Url: - def __init__(self, url, user_agent=DEFAULT_USER_AGENT): +class Url(object): + def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None: self.url = url self.user_agent = str(user_agent) - self.archive_url = None - self.timestamp = None + self.archive_url: Optional[str] = None + self.timestamp: Optional[datetime] = None self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI( self.url, user_agent=self.user_agent ) - def __str__(self): + def __str__(self) -> str: if not self.archive_url: self.newest() - return self.archive_url + return str(self.archive_url) - def __len__(self): + def __len__(self) -> int: td_max = timedelta( days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999 ) - if not self.timestamp: + if not isinstance(self.timestamp, datetime): self.oldest() - if self.timestamp == datetime.max: + if not isinstance(self.timestamp, datetime): + raise TypeError("timestamp must be a datetime") + elif self.timestamp == datetime.max: return td_max.days + else: + return (datetime.utcnow() - self.timestamp).days - return (datetime.utcnow() - self.timestamp).days - - def save(self): + def save(self) -> "Url": self.wayback_machine_save_api = WaybackMachineSaveAPI( self.url, user_agent=self.user_agent ) @@ -58,13 +61,13 @@ class Url: def near( self, - year=None, - month=None, - day=None, - hour=None, - minute=None, - unix_timestamp=None, - ): + year: Optional[int] = None, + month: Optional[int] = None, + day: Optional[int] = None, + hour: Optional[int] = None, + minute: Optional[int] = None, + unix_timestamp: 
Optional[int] = None, + ) -> "Url": self.wayback_machine_availability_api.near( year=year, @@ -77,22 +80,24 @@ class Url: self.set_availability_api_attrs() return self - def oldest(self): + def oldest(self) -> "Url": self.wayback_machine_availability_api.oldest() self.set_availability_api_attrs() return self - def newest(self): + def newest(self) -> "Url": self.wayback_machine_availability_api.newest() self.set_availability_api_attrs() return self - def set_availability_api_attrs(self): + def set_availability_api_attrs(self) -> None: self.archive_url = self.wayback_machine_availability_api.archive_url self.JSON = self.wayback_machine_availability_api.JSON self.timestamp = self.wayback_machine_availability_api.timestamp() - def total_archives(self, start_timestamp=None, end_timestamp=None): + def total_archives( + self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None + ) -> int: cdx = WaybackMachineCDXServerAPI( self.url, user_agent=self.user_agent, @@ -107,12 +112,12 @@ class Url: def known_urls( self, - subdomain=False, - host=False, - start_timestamp=None, - end_timestamp=None, - match_type="prefix", - ): + subdomain: bool = False, + host: bool = False, + start_timestamp: Optional[str] = None, + end_timestamp: Optional[str] = None, + match_type: str = "prefix", + ) -> Generator[str, None, None]: if subdomain: match_type = "domain" if host: