* fix: CI yml name

* add: mypy configuration

* add: type annotation to waybackpy modules

* add: type annotation to test modules

* fix: mypy command

* add: types-requests to dev deps

* fix: disable max-line-length

* fix: move pytest.ini into setup.cfg

* add: urllib3 to deps

* fix: Retry (ref: https://github.com/python/typeshed/issues/6893)

* fix: f-string

* fix: shorten long lines

* add: staticmethod decorator to no-self-use methods

* fix: str(headers)->headers_str

* fix: error message

* fix: revert "str(headers)->headers_str" and ignore assignment CaseInsensitiveDict with str

* fix: mypy error
This commit is contained in:
eggplants 2022-02-05 03:23:36 +09:00 committed by GitHub
parent 320ef30371
commit d8cabdfdb5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 537 additions and 364 deletions

View File

@ -28,14 +28,13 @@ jobs:
pip install '.[dev]' pip install '.[dev]'
- name: Lint with flake8 - name: Lint with flake8
run: | run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --show-source --statistics flake8 . --count --show-source --statistics
- name: Lint with black - name: Lint with black
run: | run: |
black . --check --diff black . --check --diff
# - name: Static type test with mypy - name: Static type test with mypy
# run: | run: |
# mypy mypy -p waybackpy -p tests
- name: Test with pytest - name: Test with pytest
run: | run: |
pytest pytest

View File

@ -1,11 +0,0 @@
[pytest]
addopts =
# show summary of all tests that did not pass
-ra
# enable all warnings
-Wd
# coverage and html report
--cov=waybackpy
--cov-report=html
testpaths =
tests

View File

@ -3,7 +3,8 @@ click
codecov codecov
flake8 flake8
mypy mypy
setuptools>=46.4.0
pytest pytest
pytest-cov pytest-cov
requests requests
setuptools>=46.4.0
types-requests

View File

@ -1,2 +1,3 @@
click click
requests requests
urllib3

View File

@ -42,6 +42,7 @@ python_requires = >= 3.7
install_requires = install_requires =
click click
requests requests
urllib3
[options.extras_require] [options.extras_require]
dev = dev =
@ -52,7 +53,7 @@ dev =
pytest pytest
pytest-cov pytest-cov
setuptools>=46.4.0 setuptools>=46.4.0
types-requests
[options.entry_points] [options.entry_points]
console_scripts = console_scripts =
@ -64,4 +65,26 @@ profile = black
[flake8] [flake8]
indent-size = 4 indent-size = 4
max-line-length = 88 max-line-length = 88
extend-ignore = E203,W503,E501,W605 extend-ignore = W605
[mypy]
python_version = 3.9
show_error_codes = True
pretty = True
strict = True
[tool:pytest]
addopts =
# show summary of all tests that did not pass
-ra
# enable all warnings
-Wd
# coverage and html report
--cov=waybackpy
--cov-report=html
testpaths =
tests
[pycodestyle]
# for `license` and `filter in `waybackpy.cli.main`
ignore = W0622

View File

@ -12,33 +12,42 @@ from waybackpy.exceptions import (
now = datetime.utcnow() now = datetime.utcnow()
url = "https://example.com/" url = "https://example.com/"
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" user_agent = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
)
def rndstr(n): def rndstr(n: int) -> str:
return "".join( return "".join(
random.choice(string.ascii_uppercase + string.digits) for _ in range(n) random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
) )
def test_oldest(): def test_oldest() -> None:
""" """
Test the oldest archive of Google.com and also checks the attributes. Test the oldest archive of Google.com and also checks the attributes.
""" """
url = "https://example.com/" url = "https://example.com/"
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" user_agent = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
)
availability_api = WaybackMachineAvailabilityAPI(url, user_agent) availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
oldest = availability_api.oldest() oldest = availability_api.oldest()
oldest_archive_url = oldest.archive_url oldest_archive_url = oldest.archive_url
assert "2002" in oldest_archive_url assert "2002" in oldest_archive_url
oldest_timestamp = oldest.timestamp() oldest_timestamp = oldest.timestamp()
assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years
assert availability_api.JSON["archived_snapshots"]["closest"]["available"] is True assert (
availability_api.JSON is not None
and availability_api.JSON["archived_snapshots"]["closest"]["available"] is True
)
assert repr(oldest).find("example.com") != -1 assert repr(oldest).find("example.com") != -1
assert "2002" in str(oldest) assert "2002" in str(oldest)
def test_newest(): def test_newest() -> None:
""" """
Assuming that the recent most Google Archive was made no more earlier than Assuming that the recent most Google Archive was made no more earlier than
last one day which is 86400 seconds. last one day which is 86400 seconds.
@ -54,16 +63,17 @@ def test_newest():
assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3) assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3)
def test_invalid_json(): def test_invalid_json() -> None:
""" """
When the API is malfunctioning or we don't pass a URL it may return invalid JSON data. When the API is malfunctioning or we don't pass a URL,
it may return invalid JSON data.
""" """
with pytest.raises(InvalidJSONInAvailabilityAPIResponse): with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent) availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)
_ = availability_api.archive_url _ = availability_api.archive_url
def test_no_archive(): def test_no_archive() -> None:
""" """
ArchiveNotInAvailabilityAPIResponse may be raised if Wayback Machine did not ArchiveNotInAvailabilityAPIResponse may be raised if Wayback Machine did not
replied with the archive despite the fact that we know the site has million replied with the archive despite the fact that we know the site has million
@ -74,12 +84,12 @@ def test_no_archive():
""" """
with pytest.raises(ArchiveNotInAvailabilityAPIResponse): with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
availability_api = WaybackMachineAvailabilityAPI( availability_api = WaybackMachineAvailabilityAPI(
url="https://%s.cn" % rndstr(30), user_agent=user_agent url=f"https://{rndstr(30)}.cn", user_agent=user_agent
) )
_ = availability_api.archive_url _ = availability_api.archive_url
def test_no_api_call_str_repr(): def test_no_api_call_str_repr() -> None:
""" """
Some entitled users maybe want to see what is the string representation Some entitled users maybe want to see what is the string representation
if they dont make any API requests. if they dont make any API requests.
@ -87,17 +97,17 @@ def test_no_api_call_str_repr():
str() must not return None so we return "" str() must not return None so we return ""
""" """
availability_api = WaybackMachineAvailabilityAPI( availability_api = WaybackMachineAvailabilityAPI(
url="https://%s.gov" % rndstr(30), user_agent=user_agent url=f"https://{rndstr(30)}.gov", user_agent=user_agent
) )
assert "" == str(availability_api) assert "" == str(availability_api)
def test_no_call_timestamp(): def test_no_call_timestamp() -> None:
""" """
If no API requests were made the bound timestamp() method returns If no API requests were made the bound timestamp() method returns
the datetime.max as a default value. the datetime.max as a default value.
""" """
availability_api = WaybackMachineAvailabilityAPI( availability_api = WaybackMachineAvailabilityAPI(
url="https://%s.in" % rndstr(30), user_agent=user_agent url=f"https://{rndstr(30)}.in", user_agent=user_agent
) )
assert datetime.max == availability_api.timestamp() assert datetime.max == availability_api.timestamp()

View File

@ -1,8 +1,11 @@
from waybackpy.cdx_api import WaybackMachineCDXServerAPI from waybackpy.cdx_api import WaybackMachineCDXServerAPI
def test_a(): def test_a() -> None:
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
url = "https://twitter.com/jack" url = "https://twitter.com/jack"
wayback = WaybackMachineCDXServerAPI( wayback = WaybackMachineCDXServerAPI(
@ -21,8 +24,11 @@ def test_a():
assert snapshot.timestamp.startswith("2010") assert snapshot.timestamp.startswith("2010")
def test_b(): def test_b() -> None:
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
url = "https://www.google.com" url = "https://www.google.com"
wayback = WaybackMachineCDXServerAPI( wayback = WaybackMachineCDXServerAPI(

View File

@ -3,8 +3,11 @@ from datetime import datetime
from waybackpy.cdx_snapshot import CDXSnapshot from waybackpy.cdx_snapshot import CDXSnapshot
def test_CDXSnapshot(): def test_CDXSnapshot() -> None:
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415" sample_input = (
"org,archive)/ 20080126045828 http://github.com "
"text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
)
prop_values = sample_input.split(" ") prop_values = sample_input.split(" ")
properties = {} properties = {}
( (

View File

@ -1,3 +1,5 @@
from typing import Any, Dict, List
import pytest import pytest
from waybackpy.cdx_utils import ( from waybackpy.cdx_utils import (
@ -11,15 +13,18 @@ from waybackpy.cdx_utils import (
from waybackpy.exceptions import WaybackError from waybackpy.exceptions import WaybackError
def test_get_total_pages(): def test_get_total_pages() -> None:
url = "twitter.com" url = "twitter.com"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15" user_agent = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
)
assert get_total_pages(url=url, user_agent=user_agent) >= 56 assert get_total_pages(url=url, user_agent=user_agent) >= 56
def test_full_url(): def test_full_url() -> None:
params = {}
endpoint = "https://web.archive.org/cdx/search/cdx" endpoint = "https://web.archive.org/cdx/search/cdx"
params: Dict[str, Any] = {}
assert endpoint == full_url(endpoint, params) assert endpoint == full_url(endpoint, params)
params = {"a": "1"} params = {"a": "1"}
@ -39,36 +44,36 @@ def test_full_url():
) )
def test_get_response(): def test_get_response() -> None:
url = "https://github.com" url = "https://github.com"
user_agent = ( user_agent = (
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
) )
headers = {"User-Agent": "%s" % user_agent} headers = {"User-Agent": str(user_agent)}
response = get_response(url, headers=headers) response = get_response(url, headers=headers)
assert response.status_code == 200 assert not isinstance(response, Exception) and response.status_code == 200
url = "http/wwhfhfvhvjhmom" url = "http/wwhfhfvhvjhmom"
with pytest.raises(WaybackError): with pytest.raises(WaybackError):
get_response(url, headers=headers) get_response(url, headers=headers)
def test_check_filters(): def test_check_filters() -> None:
filters = [] filters: List[str] = []
check_filters(filters) check_filters(filters)
filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"] filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"]
check_filters(filters) check_filters(filters)
with pytest.raises(WaybackError): with pytest.raises(WaybackError):
check_filters("not-list") check_filters("not-list") # type: ignore[arg-type]
with pytest.raises(WaybackError): with pytest.raises(WaybackError):
check_filters(["invalid"]) check_filters(["invalid"])
def test_check_collapses(): def test_check_collapses() -> None:
collapses = [] collapses: List[str] = []
check_collapses(collapses) check_collapses(collapses)
collapses = ["timestamp:10"] collapses = ["timestamp:10"]
@ -77,7 +82,7 @@ def test_check_collapses():
collapses = ["urlkey"] collapses = ["urlkey"]
check_collapses(collapses) check_collapses(collapses)
collapses = "urlkey" # NOT LIST collapses = "urlkey" # type: ignore[assignment]
with pytest.raises(WaybackError): with pytest.raises(WaybackError):
check_collapses(collapses) check_collapses(collapses)
@ -86,11 +91,11 @@ def test_check_collapses():
check_collapses(collapses) check_collapses(collapses)
def test_check_match_type(): def test_check_match_type() -> None:
assert check_match_type(None, "url") is None assert check_match_type(None, "url")
match_type = "exact" match_type = "exact"
url = "test_url" url = "test_url"
assert check_match_type(match_type, url) is None assert check_match_type(match_type, url)
url = "has * in it" url = "has * in it"
with pytest.raises(WaybackError): with pytest.raises(WaybackError):

View File

@ -2,22 +2,27 @@ import random
import string import string
import time import time
from datetime import datetime from datetime import datetime
from typing import cast
import pytest import pytest
from requests.structures import CaseInsensitiveDict
from waybackpy.exceptions import MaximumSaveRetriesExceeded from waybackpy.exceptions import MaximumSaveRetriesExceeded
from waybackpy.save_api import WaybackMachineSaveAPI from waybackpy.save_api import WaybackMachineSaveAPI
def rndstr(n): def rndstr(n: int) -> str:
return "".join( return "".join(
random.choice(string.ascii_uppercase + string.digits) for _ in range(n) random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
) )
def test_save(): def test_save() -> None:
url = "https://github.com/akamhy/waybackpy" url = "https://github.com/akamhy/waybackpy"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent) save_api = WaybackMachineSaveAPI(url, user_agent)
save_api.save() save_api.save()
archive_url = save_api.archive_url archive_url = save_api.archive_url
@ -31,15 +36,18 @@ def test_save():
assert isinstance(save_api.timestamp(), datetime) assert isinstance(save_api.timestamp(), datetime)
def test_max_redirect_exceeded(): def test_max_redirect_exceeded() -> None:
with pytest.raises(MaximumSaveRetriesExceeded): with pytest.raises(MaximumSaveRetriesExceeded):
url = "https://%s.gov" % rndstr url = f"https://{rndstr}.gov"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3) save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3)
save_api.save() save_api.save()
def test_sleep(): def test_sleep() -> None:
""" """
sleeping is actually very important for SaveAPI sleeping is actually very important for SaveAPI
interface stability. interface stability.
@ -47,7 +55,10 @@ def test_sleep():
is as intended. is as intended.
""" """
url = "https://example.com" url = "https://example.com"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent) save_api = WaybackMachineSaveAPI(url, user_agent)
s_time = int(time.time()) s_time = int(time.time())
save_api.sleep(6) # multiple of 3 sleep for 10 seconds save_api.sleep(6) # multiple of 3 sleep for 10 seconds
@ -60,76 +71,150 @@ def test_sleep():
assert (e_time - s_time) >= 5 assert (e_time - s_time) >= 5
def test_timestamp(): def test_timestamp() -> None:
url = "https://example.com" url = "https://example.com"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" user_agent = (
save_api = WaybackMachineSaveAPI(url, user_agent) "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
now = datetime.utcnow() "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
save_api._archive_url = (
"https://web.archive.org/web/%s/" % now.strftime("%Y%m%d%H%M%S") + url
) )
save_api = WaybackMachineSaveAPI(url, user_agent)
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
save_api.timestamp() save_api.timestamp()
assert save_api.cached_save is False assert save_api.cached_save is False
save_api._archive_url = "https://web.archive.org/web/%s/" % "20100124063622" + url now = "20100124063622"
save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
save_api.timestamp() save_api.timestamp()
assert save_api.cached_save is True assert save_api.cached_save is True
def test_archive_url_parser(): def test_archive_url_parser() -> None:
""" """
Testing three regex for matches and also tests the response URL. Testing three regex for matches and also tests the response URL.
""" """
url = "https://example.com" url = "https://example.com"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent) save_api = WaybackMachineSaveAPI(url, user_agent)
save_api.headers = """ h = (
START "\nSTART\nContent-Location: "
Content-Location: /web/20201126185327/https://www.scribbr.com/citing-sources/et-al "/web/20201126185327/https://www.scribbr.com/citing-sources/et-al"
END "\nEND\n"
"""
assert (
save_api.archive_url_parser()
== "https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al"
) )
save_api.headers = h # type: ignore[assignment]
save_api.headers = """ expected_url = (
{'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, 
LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'} "https://web.archive.org/web/20201126185327/"
""" "https://www.scribbr.com/citing-sources/et-al"
assert (
save_api.archive_url_parser()
== "https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/"
) )
assert save_api.archive_url_parser() == expected_url
save_api.headers = """ headers = {
START "Server": "nginx/1.15.8",
X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US "Date": "Sat, 02 Jan 2021 09:40:25 GMT",
END "Content-Type": "text/html; charset=UTF-8",
""" "Transfer-Encoding": "chunked",
"Connection": "keep-alive",
"X-Archive-Orig-Server": "nginx",
"X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT",
"X-Archive-Orig-Transfer-Encoding": "chunked",
"X-Archive-Orig-Connection": "keep-alive",
"X-Archive-Orig-Vary": "Accept-Encoding",
"X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT",
"X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;",
"X-Archive-Guessed-Content-Type": "text/html",
"X-Archive-Guessed-Charset": "utf-8",
"Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT",
"Link": (
'<https://www.scribbr.com/citing-sources/et-al/>; rel="original", '
"<https://web.archive.org/web/timemap/link/https://www.scribbr.com/"
'citing-sources/et-al/>; rel="timemap"; type="application/link-format", '
"<https://web.archive.org/web/https://www.scribbr.com/citing-sources/"
'et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/'
'https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; '
'datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/'
"20201126185327/https://www.scribbr.com/citing-sources/et-al/>; "
'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", '
"<https://web.archive.org/web/20210102094009/https://www.scribbr.com/"
'citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 '
'09:40:09 GMT", <https://web.archive.org/web/20210102094009/'
"https://www.scribbr.com/citing-sources/et-al/>; "
'rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"'
),
"Content-Security-Policy": (
"default-src 'self' 'unsafe-eval' 'unsafe-inline' "
"data: blob: archive.org web.archive.org analytics.archive.org "
"pragma.archivelab.org",
),
"X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz",
"Server-Timing": (
"captures_list;dur=112.646325, exclusion.robots;dur=0.172010, "
"exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, "
"esindex;dur=0.014647, LoadShardBlock;dur=82.205012, "
"PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, "
"load_resource;dur=26.520179"
),
"X-App-Server": "wwwb-app200",
"X-ts": "200",
"X-location": "All",
"X-Cache-Key": (
"httpsweb.archive.org/web/20210102094009/"
"https://www.scribbr.com/citing-sources/et-al/IN",
),
"X-RL": "0",
"X-Page-Cache": "MISS",
"X-Archive-Screenname": "0",
"Content-Encoding": "gzip",
}
assert ( save_api.headers = cast(CaseInsensitiveDict[str], headers)
save_api.archive_url_parser()
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/" expected_url2 = (
"https://web.archive.org/web/20210102094009/"
"https://www.scribbr.com/citing-sources/et-al/"
) )
assert save_api.archive_url_parser() == expected_url2
save_api.headers = "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING" expected_url_3 = (
save_api.response_url = "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al" "https://web.archive.org/web/20171128185327/"
assert ( "https://www.scribbr.com/citing-sources/et-al/US"
save_api.archive_url_parser()
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
) )
h = f"START\nX-Cache-Key: {expected_url_3}\nEND\n"
save_api.headers = h # type: ignore[assignment]
expected_url4 = (
"https://web.archive.org/web/20171128185327/"
"https://www.scribbr.com/citing-sources/et-al/"
)
assert save_api.archive_url_parser() == expected_url4
h = "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING"
save_api.headers = h # type: ignore[assignment]
save_api.response_url = (
"https://web.archive.org/web/20171128185327/"
"https://www.scribbr.com/citing-sources/et-al"
)
expected_url5 = (
"https://web.archive.org/web/20171128185327/"
"https://www.scribbr.com/citing-sources/et-al"
)
assert save_api.archive_url_parser() == expected_url5
def test_archive_url(): def test_archive_url() -> None:
""" """
Checks the attribute archive_url's value when the save method was not Checks the attribute archive_url's value when the save method was not
explicitly invoked by the end-user but the save method was invoked implicitly explicitly invoked by the end-user but the save method was invoked implicitly
by the archive_url method which is an attribute due to @property. by the archive_url method which is an attribute due to @property.
""" """
url = "https://example.com" url = "https://example.com"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent) save_api = WaybackMachineSaveAPI(url, user_agent)
save_api.saved_archive = ( save_api.saved_archive = (
"https://web.archive.org/web/20220124063056/https://example.com/" "https://web.archive.org/web/20220124063056/https://example.com/"

View File

@ -6,13 +6,13 @@ from waybackpy.utils import (
) )
def test_default_user_agent(): def test_default_user_agent() -> None:
assert ( assert (
DEFAULT_USER_AGENT DEFAULT_USER_AGENT
== "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__ == f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
) )
def test_latest_version(): def test_latest_version() -> None:
package_name = "waybackpy" package_name = "waybackpy"
assert latest_version_github(package_name) == latest_version_pypi(package_name) assert latest_version_github(package_name) == latest_version_pypi(package_name)

View File

@ -5,11 +5,7 @@ __description__ = (
) )
__url__ = "https://akamhy.github.io/waybackpy/" __url__ = "https://akamhy.github.io/waybackpy/"
__version__ = "3.0.2" __version__ = "3.0.2"
__download_url__ = ( __download_url__ = f"https://github.com/akamhy/waybackpy/archive/{__version__}.tar.gz"
"https://github.com/akamhy/waybackpy/archive/{version}.tar.gz".format(
version=__version__
)
)
__author__ = "Akash Mahanty" __author__ = "Akash Mahanty"
__author_email__ = "akamhy@yahoo.com" __author_email__ = "akamhy@yahoo.com"
__license__ = "MIT" __license__ = "MIT"

View File

@ -1,6 +1,7 @@
import json import json
import time import time
from datetime import datetime from datetime import datetime
from typing import Any, Dict, Optional
import requests import requests
@ -10,37 +11,42 @@ from .exceptions import (
) )
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
ResponseJSON = Dict[str, Any]
class WaybackMachineAvailabilityAPI:
class WaybackMachineAvailabilityAPI(object):
""" """
Class that interfaces the availability API of the Wayback Machine. Class that interfaces the availability API of the Wayback Machine.
""" """
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3): def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
) -> None:
self.url = str(url).strip().replace(" ", "%20") self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent self.user_agent = user_agent
self.headers = {"User-Agent": self.user_agent} self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
self.payload = {"url": "{url}".format(url=self.url)} self.payload = {"url": self.url}
self.endpoint = "https://archive.org/wayback/available" self.endpoint = "https://archive.org/wayback/available"
self.max_tries = max_tries self.max_tries = max_tries
self.tries = 0 self.tries = 0
self.last_api_call_unix_time = int(time.time()) self.last_api_call_unix_time = int(time.time())
self.api_call_time_gap = 5 self.api_call_time_gap = 5
self.JSON = None self.JSON: Optional[ResponseJSON] = None
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp): @staticmethod
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
""" """
Converts Unix time to wayback Machine timestamp. Converts Unix time to wayback Machine timestamp.
""" """
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S") return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
def __repr__(self): def __repr__(self) -> str:
""" """
Same as string representation, just return the archive URL as a string. Same as string representation, just return the archive URL as a string.
""" """
return str(self) return str(self)
def __str__(self): def __str__(self) -> str:
""" """
String representation of the class. If atleast one API call was successfully String representation of the class. If atleast one API call was successfully
made then return the archive URL as a string. Else returns None. made then return the archive URL as a string. Else returns None.
@ -54,7 +60,7 @@ class WaybackMachineAvailabilityAPI:
return self.archive_url return self.archive_url
def json(self): def json(self) -> Optional[ResponseJSON]:
""" """
Makes the API call to the availability API can set the JSON response Makes the API call to the availability API can set the JSON response
to the JSON attribute of the instance and also returns the JSON attribute. to the JSON attribute of the instance and also returns the JSON attribute.
@ -74,12 +80,12 @@ class WaybackMachineAvailabilityAPI:
self.JSON = self.response.json() self.JSON = self.response.json()
except json.decoder.JSONDecodeError: except json.decoder.JSONDecodeError:
raise InvalidJSONInAvailabilityAPIResponse( raise InvalidJSONInAvailabilityAPIResponse(
"Response data:\n{text}".format(text=self.response.text) f"Response data:\n{self.response.text}"
) )
return self.JSON return self.JSON
def timestamp(self): def timestamp(self) -> datetime:
""" """
Converts the timestamp form the JSON response to datetime object. Converts the timestamp form the JSON response to datetime object.
If JSON attribute of the instance is None it implies that the either If JSON attribute of the instance is None it implies that the either
@ -91,19 +97,29 @@ class WaybackMachineAvailabilityAPI:
If you get an URL as a response form the availability API it is guaranteed If you get an URL as a response form the availability API it is guaranteed
that you can get the datetime object from the timestamp. that you can get the datetime object from the timestamp.
""" """
if not self.JSON or not self.JSON["archived_snapshots"]: if self.JSON is None or "archived_snapshots" not in self.JSON:
return datetime.max return datetime.max
elif (
return datetime.strptime( self.JSON is not None
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" and "archived_snapshots" in self.JSON
) and self.JSON["archived_snapshots"] is not None
and "closest" in self.JSON["archived_snapshots"]
and self.JSON["archived_snapshots"]["closest"] is not None
and "timestamp" in self.JSON["archived_snapshots"]["closest"]
):
return datetime.strptime(
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
else:
raise ValueError("Could not get timestamp from result")
@property @property
def archive_url(self): def archive_url(self) -> str:
""" """
Reads the the JSON response data and tries to get the timestamp and returns Reads the the JSON response data and tries to get the timestamp and returns
the timestamp if found else returns None. the timestamp if found else returns None.
""" """
archive_url = ""
data = self.JSON data = self.JSON
# If the user didn't used oldest, newest or near but tries to access the # If the user didn't used oldest, newest or near but tries to access the
@ -127,9 +143,9 @@ class WaybackMachineAvailabilityAPI:
if not data or not data["archived_snapshots"]: if not data or not data["archived_snapshots"]:
raise ArchiveNotInAvailabilityAPIResponse( raise ArchiveNotInAvailabilityAPIResponse(
"Archive not found in the availability " "Archive not found in the availability "
+ "API response, the URL you requested may not have any " "API response, the URL you requested may not have any archives "
+ "archives yet. You may retry after some time or archive the webpage now." "yet. You may retry after some time or archive the webpage now.\n"
+ "\nResponse data:\n{response}".format(response=self.response.text) f"Response data:\n{self.response.text}"
) )
else: else:
archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = data["archived_snapshots"]["closest"]["url"]
@ -138,7 +154,8 @@ class WaybackMachineAvailabilityAPI:
) )
return archive_url return archive_url
def wayback_timestamp(self, **kwargs): @staticmethod
def wayback_timestamp(**kwargs: int) -> str:
""" """
Prepends zero before the year, month, day, hour and minute so that they Prepends zero before the year, month, day, hour and minute so that they
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format. are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
@ -148,7 +165,7 @@ class WaybackMachineAvailabilityAPI:
for key in ["year", "month", "day", "hour", "minute"] for key in ["year", "month", "day", "hour", "minute"]
) )
def oldest(self): def oldest(self) -> "WaybackMachineAvailabilityAPI":
""" """
Passing the year 1994 should return the oldest archive because Passing the year 1994 should return the oldest archive because
wayback machine was started in May, 1996 and there should be no archive wayback machine was started in May, 1996 and there should be no archive
@ -156,7 +173,7 @@ class WaybackMachineAvailabilityAPI:
""" """
return self.near(year=1994) return self.near(year=1994)
def newest(self): def newest(self) -> "WaybackMachineAvailabilityAPI":
""" """
Passing the current UNIX time should be sufficient to get the newest Passing the current UNIX time should be sufficient to get the newest
archive considering the API request-response time delay and also the archive considering the API request-response time delay and also the
@ -166,13 +183,13 @@ class WaybackMachineAvailabilityAPI:
def near( def near(
self, self,
year=None, year: Optional[int] = None,
month=None, month: Optional[int] = None,
day=None, day: Optional[int] = None,
hour=None, hour: Optional[int] = None,
minute=None, minute: Optional[int] = None,
unix_timestamp=None, unix_timestamp: Optional[int] = None,
): ) -> "WaybackMachineAvailabilityAPI":
""" """
The main method for this Class, oldest and newest methods are dependent on this The main method for this Class, oldest and newest methods are dependent on this
method. method.
@ -181,18 +198,19 @@ class WaybackMachineAvailabilityAPI:
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
appropriate arguments for their respective parameters. appropriate arguments for their respective parameters.
Adds the timestamp to the payload dictionary. Adds the timestamp to the payload dictionary.
And finally invoking the json method to make the API call then returns the instance. And finally invoking the json method to make the API call then returns
the instance.
""" """
if unix_timestamp: if unix_timestamp:
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
else: else:
now = datetime.utcnow().timetuple() now = datetime.utcnow().timetuple()
timestamp = self.wayback_timestamp( timestamp = self.wayback_timestamp(
year=year if year else now.tm_year, year=now.tm_year if year is None else year,
month=month if month else now.tm_mon, month=now.tm_mon if month is None else month,
day=day if day else now.tm_mday, day=now.tm_mday if day is None else day,
hour=hour if hour else now.tm_hour, hour=now.tm_hour if hour is None else hour,
minute=minute if minute else now.tm_min, minute=now.tm_min if minute is None else minute,
) )
self.payload["timestamp"] = timestamp self.payload["timestamp"] = timestamp

View File

@ -1,3 +1,5 @@
from typing import Dict, Generator, List, Optional, cast
from .cdx_snapshot import CDXSnapshot from .cdx_snapshot import CDXSnapshot
from .cdx_utils import ( from .cdx_utils import (
check_collapses, check_collapses,
@ -11,43 +13,48 @@ from .exceptions import WaybackError
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
class WaybackMachineCDXServerAPI: class WaybackMachineCDXServerAPI(object):
""" """
Class that interfaces the CDX server API of the Wayback Machine. Class that interfaces the CDX server API of the Wayback Machine.
""" """
# start_timestamp: from, can not use from as it's a keyword
# end_timestamp: to, not using to as can not use from
def __init__( def __init__(
self, self,
url, url: str,
user_agent=DEFAULT_USER_AGENT, user_agent: str = DEFAULT_USER_AGENT,
start_timestamp=None, # from, can not use from as it's a keyword start_timestamp: Optional[str] = None,
end_timestamp=None, # to, not using to as can not use from end_timestamp: Optional[str] = None,
filters=[], filters: List[str] = [],
match_type=None, match_type: Optional[str] = None,
gzip=None, gzip: Optional[str] = None,
collapses=[], collapses: List[str] = [],
limit=None, limit: Optional[str] = None,
max_tries=3, max_tries: int = 3,
): ) -> None:
self.url = str(url).strip().replace(" ", "%20") self.url = str(url).strip().replace(" ", "%20")
self.user_agent = user_agent self.user_agent = user_agent
self.start_timestamp = str(start_timestamp) if start_timestamp else None self.start_timestamp = (
self.end_timestamp = str(end_timestamp) if end_timestamp else None str(start_timestamp) if start_timestamp is not None else None
)
self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
self.filters = filters self.filters = filters
check_filters(self.filters) check_filters(self.filters)
self.match_type = str(match_type).strip() if match_type else None self.match_type = str(match_type).strip() if match_type is not None else None
check_match_type(self.match_type, self.url) check_match_type(self.match_type, self.url)
self.gzip = gzip if gzip else True self.gzip = gzip
self.collapses = collapses self.collapses = collapses
check_collapses(self.collapses) check_collapses(self.collapses)
self.limit = limit if limit else 5000 self.limit = limit if limit is not None else 5000
self.max_tries = max_tries self.max_tries = max_tries
self.last_api_request_url = None self.last_api_request_url: Optional[str] = None
self.use_page = False self.use_page = False
self.endpoint = "https://web.archive.org/cdx/search/cdx" self.endpoint = "https://web.archive.org/cdx/search/cdx"
def cdx_api_manager(self, payload, headers, use_page=False): def cdx_api_manager(
self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
) -> Generator[str, None, None]:
total_pages = get_total_pages(self.url, self.user_agent) total_pages = get_total_pages(self.url, self.user_agent)
# If we only have two or less pages of archives then we care for more accuracy # If we only have two or less pages of archives then we care for more accuracy
# pagination API is lagged sometimes # pagination API is lagged sometimes
@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI:
url = full_url(self.endpoint, params=payload) url = full_url(self.endpoint, params=payload)
res = get_response(url, headers=headers) res = get_response(url, headers=headers)
if isinstance(res, Exception):
raise res
self.last_api_request_url = url self.last_api_request_url = url
text = res.text text = res.text
@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI:
yield text yield text
else: else:
payload["showResumeKey"] = "true" payload["showResumeKey"] = "true"
payload["limit"] = str(self.limit) payload["limit"] = str(self.limit)
resumeKey = None resumeKey = None
more = True more = True
while more: while more:
if resumeKey: if resumeKey:
payload["resumeKey"] = resumeKey payload["resumeKey"] = resumeKey
url = full_url(self.endpoint, params=payload) url = full_url(self.endpoint, params=payload)
res = get_response(url, headers=headers) res = get_response(url, headers=headers)
if isinstance(res, Exception):
raise res
self.last_api_request_url = url self.last_api_request_url = url
@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI:
yield text yield text
def add_payload(self, payload): def add_payload(self, payload: Dict[str, str]) -> None:
if self.start_timestamp: if self.start_timestamp:
payload["from"] = self.start_timestamp payload["from"] = self.start_timestamp
if self.end_timestamp: if self.end_timestamp:
payload["to"] = self.end_timestamp payload["to"] = self.end_timestamp
if self.gzip is not True: if self.gzip is None:
payload["gzip"] = "false" payload["gzip"] = "false"
if self.match_type: if self.match_type:
@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI:
# Don't need to return anything as it's dictionary. # Don't need to return anything as it's dictionary.
payload["url"] = self.url payload["url"] = self.url
def snapshots(self): def snapshots(self) -> Generator[CDXSnapshot, None, None]:
payload = {} payload: Dict[str, str] = {}
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
self.add_payload(payload) self.add_payload(payload)
@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI:
if len(snapshot) < 46: # 14 + 32 (timestamp+digest) if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
continue continue
properties = { properties: Dict[str, Optional[str]] = {
"urlkey": None, "urlkey": None,
"timestamp": None, "timestamp": None,
"original": None, "original": None,
@ -169,15 +177,9 @@ class WaybackMachineCDXServerAPI:
if prop_values_len != properties_len: if prop_values_len != properties_len:
raise WaybackError( raise WaybackError(
"Snapshot returned by Cdx API has {prop_values_len} properties".format( f"Snapshot returned by Cdx API has {prop_values_len} "
prop_values_len=prop_values_len f"properties instead of expected {properties_len} properties.\n"
) f"Problematic Snapshot: {snapshot}"
+ " instead of expected {properties_len} ".format(
properties_len=properties_len
)
+ "properties.\nProblematic Snapshot : {snapshot}".format(
snapshot=snapshot
)
) )
( (
@ -190,4 +192,4 @@ class WaybackMachineCDXServerAPI:
properties["length"], properties["length"],
) = prop_values ) = prop_values
yield CDXSnapshot(properties) yield CDXSnapshot(cast(Dict[str, str], properties))

View File

@ -1,7 +1,8 @@
from datetime import datetime from datetime import datetime
from typing import Dict
class CDXSnapshot: class CDXSnapshot(object):
""" """
Class for the CDX snapshot lines returned by the CDX API, Class for the CDX snapshot lines returned by the CDX API,
Each valid line of the CDX API is casted to an CDXSnapshot object Each valid line of the CDX API is casted to an CDXSnapshot object
@ -10,7 +11,7 @@ class CDXSnapshot:
of the CDXSnapshot. of the CDXSnapshot.
""" """
def __init__(self, properties): def __init__(self, properties: Dict[str, str]) -> None:
self.urlkey = properties["urlkey"] self.urlkey = properties["urlkey"]
self.timestamp = properties["timestamp"] self.timestamp = properties["timestamp"]
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S") self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
@ -20,16 +21,11 @@ class CDXSnapshot:
self.digest = properties["digest"] self.digest = properties["digest"]
self.length = properties["length"] self.length = properties["length"]
self.archive_url = ( self.archive_url = (
"https://web.archive.org/web/" + self.timestamp + "/" + self.original f"https://web.archive.org/web/{self.timestamp}/{self.original}"
) )
def __str__(self): def __str__(self) -> str:
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format( return (
urlkey=self.urlkey, f"{self.urlkey} {self.timestamp} {self.original} "
timestamp=self.timestamp, f"{self.mimetype} {self.statuscode} {self.digest} {self.length}"
original=self.original,
mimetype=self.mimetype,
statuscode=self.statuscode,
digest=self.digest,
length=self.length,
) )

View File

@ -1,4 +1,6 @@
import re import re
from typing import Any, Dict, List, Optional, Union
from urllib.parse import quote
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
@ -8,16 +10,19 @@ from .exceptions import WaybackError
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
def get_total_pages(url, user_agent=DEFAULT_USER_AGENT): def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
endpoint = "https://web.archive.org/cdx/search/cdx?" endpoint = "https://web.archive.org/cdx/search/cdx?"
payload = {"showNumPages": "true", "url": str(url)} payload = {"showNumPages": "true", "url": str(url)}
headers = {"User-Agent": user_agent} headers = {"User-Agent": user_agent}
request_url = full_url(endpoint, params=payload) request_url = full_url(endpoint, params=payload)
response = get_response(request_url, headers=headers) response = get_response(request_url, headers=headers)
return int(response.text.strip()) if isinstance(response, requests.Response):
return int(response.text.strip())
else:
raise response
def full_url(endpoint, params): def full_url(endpoint: str, params: Dict[str, Any]) -> str:
if not params: if not params:
return endpoint return endpoint
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
@ -25,28 +30,25 @@ def full_url(endpoint, params):
key = "filter" if key.startswith("filter") else key key = "filter" if key.startswith("filter") else key
key = "collapse" if key.startswith("collapse") else key key = "collapse" if key.startswith("collapse") else key
amp = "" if full_url.endswith("?") else "&" amp = "" if full_url.endswith("?") else "&"
full_url = ( val = quote(str(val), safe="")
full_url full_url += f"{amp}{key}={val}"
+ amp
+ "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
)
return full_url return full_url
def get_response( def get_response(
url, url: str,
headers=None, headers: Optional[Dict[str, str]] = None,
retries=5, retries: int = 5,
backoff_factor=0.5, backoff_factor: float = 0.5,
no_raise_on_redirects=False, # no_raise_on_redirects=False,
): ) -> Union[requests.Response, Exception]:
session = requests.Session() session = requests.Session()
retries = Retry( retries_ = Retry(
total=retries, total=retries,
backoff_factor=backoff_factor, backoff_factor=backoff_factor,
status_forcelist=[500, 502, 503, 504], status_forcelist=[500, 502, 503, 504],
) )
session.mount("https://", HTTPAdapter(max_retries=retries)) session.mount("https://", HTTPAdapter(max_retries=retries_))
try: try:
response = session.get(url, headers=headers) response = session.get(url, headers=headers)
@ -54,77 +56,65 @@ def get_response(
return response return response
except Exception as e: except Exception as e:
reason = str(e) reason = str(e)
exc_message = "Error while retrieving {url}.\n{reason}".format( exc_message = f"Error while retrieving {url}.\n{reason}"
url=url, reason=reason
)
exc = WaybackError(exc_message) exc = WaybackError(exc_message)
exc.__cause__ = e exc.__cause__ = e
raise exc raise exc
def check_filters(filters): def check_filters(filters: List[str]) -> None:
if not isinstance(filters, list): if not isinstance(filters, list):
raise WaybackError("filters must be a list.") raise WaybackError("filters must be a list.")
# [!]field:regex # [!]field:regex
for _filter in filters: for _filter in filters:
try: match = re.search(
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):"
r"(.*)",
_filter,
)
match = re.search( if match is None or len(match.groups()) != 2:
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
_filter,
)
match.group(1) exc_message = f"Filter '{_filter}' is not following the cdx filter syntax."
match.group(2)
except Exception:
exc_message = (
"Filter '{_filter}' is not following the cdx filter syntax.".format(
_filter=_filter
)
)
raise WaybackError(exc_message) raise WaybackError(exc_message)
def check_collapses(collapses): def check_collapses(collapses: List[str]) -> bool:
if not isinstance(collapses, list): if not isinstance(collapses, list):
raise WaybackError("collapses must be a list.") raise WaybackError("collapses must be a list.")
elif len(collapses) == 0:
if len(collapses) == 0: return True
return
for collapse in collapses: for collapse in collapses:
try: match = re.search(
match = re.search( r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)"
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?", r"(:?[0-9]{1,99})?",
collapse, collapse,
) )
match.group(1) if match is None or len(match.groups()) != 2:
if 2 == len(match.groups()): exc_message = (
match.group(2) f"collapse argument '{collapse}' "
except Exception: "is not following the cdx collapse syntax."
exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
collapse=collapse
) )
raise WaybackError(exc_message) raise WaybackError(exc_message)
return True
def check_match_type(match_type, url):
def check_match_type(match_type: Optional[str], url: str) -> bool:
legal_match_type = ["exact", "prefix", "host", "domain"]
if not match_type: if not match_type:
return return True
elif "*" in url:
if "*" in url:
raise WaybackError( raise WaybackError(
"Can not use wildcard in the URL along with the match_type arguments." "Can not use wildcard in the URL along with the match_type arguments."
) )
elif match_type not in legal_match_type:
legal_match_type = ["exact", "prefix", "host", "domain"] exc_message = (
f"{match_type} is not an allowed match type.\n"
if match_type not in legal_match_type: "Use one from 'exact', 'prefix', 'host' or 'domain'"
exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
match_type=match_type
) )
raise WaybackError(exc_message) raise WaybackError(exc_message)
else:
return True

View File

@ -3,6 +3,7 @@ import os
import random import random
import re import re
import string import string
from typing import Generator, List, Optional
import click import click
import requests import requests
@ -24,7 +25,7 @@ from .wrapper import Url
"--user-agent", "--user-agent",
"--user_agent", "--user_agent",
default=DEFAULT_USER_AGENT, default=DEFAULT_USER_AGENT,
help="User agent, default value is '%s'." % DEFAULT_USER_AGENT, help=f"User agent, default value is '{DEFAULT_USER_AGENT}'.",
) )
@click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.") @click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.")
@click.option( @click.option(
@ -163,34 +164,34 @@ from .wrapper import Url
+ "will be printed.", + "will be printed.",
) )
def main( def main(
url, url: Optional[str],
user_agent, user_agent: str,
version, version: bool,
license, license: bool,
newest, newest: bool,
oldest, oldest: bool,
json, json: bool,
near, near: bool,
year, year: Optional[int],
month, month: Optional[int],
day, day: Optional[int],
hour, hour: Optional[int],
minute, minute: Optional[int],
save, save: bool,
headers, headers: bool,
known_urls, known_urls: bool,
subdomain, subdomain: bool,
file, file: bool,
cdx, cdx: bool,
start_timestamp, start_timestamp: Optional[str],
end_timestamp, end_timestamp: Optional[str],
filter, filter: List[str],
match_type, match_type: Optional[str],
gzip, gzip: Optional[str],
collapse, collapse: List[str],
limit, limit: Optional[str],
cdx_print, cdx_print: List[str],
): ) -> None:
"""\b """\b
_ _ _ _
| | | | | | | |
@ -214,7 +215,7 @@ def main(
""" """
if version: if version:
click.echo("waybackpy version %s" % __version__) click.echo(f"waybackpy version {__version__}")
return return
if license: if license:
@ -240,11 +241,14 @@ def main(
and not cdx and not cdx
): ):
click.echo( click.echo(
"Only URL passed, but did not specify what to do with the URL. Use --help flag for help using waybackpy." "Only URL passed, but did not specify what to do with the URL. "
"Use --help flag for help using waybackpy."
) )
return return
def echo_availability_api(availability_api_instance): def echo_availability_api(
availability_api_instance: WaybackMachineAvailabilityAPI,
) -> None:
click.echo("Archive URL:") click.echo("Archive URL:")
if not availability_api_instance.archive_url: if not availability_api_instance.archive_url:
archive_url = ( archive_url = (
@ -295,13 +299,14 @@ def main(
click.echo(save_api.headers) click.echo(save_api.headers)
return return
def save_urls_on_file(url_gen): def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
domain = None domain = None
sys_random = random.SystemRandom() sys_random = random.SystemRandom()
uid = "".join( uid = "".join(
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6) sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
) )
url_count = 0 url_count = 0
file_name = None
for url in url_gen: for url in url_gen:
url_count += 1 url_count += 1
@ -310,25 +315,21 @@ def main(
domain = "domain-unknown" domain = "domain-unknown"
if match: if match is not None:
domain = match.group(1) domain = match.group(1)
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid) file_name = f"{domain}-urls-{uid}.txt"
file_path = os.path.join(os.getcwd(), file_name) file_path = os.path.join(os.getcwd(), file_name)
if not os.path.isfile(file_path): if not os.path.isfile(file_path):
open(file_path, "w+").close() open(file_path, "w+").close()
with open(file_path, "a") as f: with open(file_path, "a") as f:
f.write("{url}\n".format(url=url)) f.write(f"{url}\n")
click.echo(url) click.echo(url)
if url_count > 0: if url_count > 0 or file_name is not None:
click.echo( click.echo(f"\n\n'{file_name}' saved in current working directory")
"\n\n'{file_name}' saved in current working directory".format(
file_name=file_name
)
)
else: else:
click.echo("No known URLs found. Please try a diffrent input!") click.echo("No known URLs found. Please try a diffrent input!")

View File

@ -14,6 +14,8 @@ class WaybackError(Exception):
All other exceptions are inherited from this class. All other exceptions are inherited from this class.
""" """
pass
class RedirectSaveError(WaybackError): class RedirectSaveError(WaybackError):
""" """
@ -21,32 +23,44 @@ class RedirectSaveError(WaybackError):
redirect URL is archived but not the original URL. redirect URL is archived but not the original URL.
""" """
pass
class URLError(Exception): class URLError(Exception):
""" """
Raised when malformed URLs are passed as arguments. Raised when malformed URLs are passed as arguments.
""" """
pass
class MaximumRetriesExceeded(WaybackError): class MaximumRetriesExceeded(WaybackError):
""" """
MaximumRetriesExceeded MaximumRetriesExceeded
""" """
pass
class MaximumSaveRetriesExceeded(MaximumRetriesExceeded): class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
""" """
MaximumSaveRetriesExceeded MaximumSaveRetriesExceeded
""" """
pass
class ArchiveNotInAvailabilityAPIResponse(WaybackError): class ArchiveNotInAvailabilityAPIResponse(WaybackError):
""" """
Could not parse the archive in the JSON response of the availability API. Could not parse the archive in the JSON response of the availability API.
""" """
pass
class InvalidJSONInAvailabilityAPIResponse(WaybackError): class InvalidJSONInAvailabilityAPIResponse(WaybackError):
""" """
availability api returned invalid JSON availability api returned invalid JSON
""" """
pass

View File

@ -1,38 +1,41 @@
import re import re
import time import time
from datetime import datetime from datetime import datetime
from typing import Dict, Optional
import requests import requests
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from requests.structures import CaseInsensitiveDict
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
from .exceptions import MaximumSaveRetriesExceeded from .exceptions import MaximumSaveRetriesExceeded
from .utils import DEFAULT_USER_AGENT from .utils import DEFAULT_USER_AGENT
class WaybackMachineSaveAPI: class WaybackMachineSaveAPI(object):
""" """
WaybackMachineSaveAPI class provides an interface for saving URLs on the WaybackMachineSaveAPI class provides an interface for saving URLs on the
Wayback Machine. Wayback Machine.
""" """
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8): def __init__(
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
) -> None:
self.url = str(url).strip().replace(" ", "%20") self.url = str(url).strip().replace(" ", "%20")
self.request_url = "https://web.archive.org/save/" + self.url self.request_url = "https://web.archive.org/save/" + self.url
self.user_agent = user_agent self.user_agent = user_agent
self.request_headers = {"User-Agent": self.user_agent} self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
if max_tries < 1: if max_tries < 1:
raise ValueError("max_tries should be positive") raise ValueError("max_tries should be positive")
self.max_tries = max_tries self.max_tries = max_tries
self.total_save_retries = 5 self.total_save_retries = 5
self.backoff_factor = 0.5 self.backoff_factor = 0.5
self.status_forcelist = [500, 502, 503, 504] self.status_forcelist = [500, 502, 503, 504]
self._archive_url = None self._archive_url: Optional[str] = None
self.instance_birth_time = datetime.utcnow() self.instance_birth_time = datetime.utcnow()
@property @property
def archive_url(self): def archive_url(self) -> str:
""" """
Returns the archive URL is already cached by _archive_url Returns the archive URL is already cached by _archive_url
else invoke the save method to save the archive which returns the else invoke the save method to save the archive which returns the
@ -44,7 +47,7 @@ class WaybackMachineSaveAPI:
else: else:
return self.save() return self.save()
def get_save_request_headers(self): def get_save_request_headers(self) -> None:
""" """
Creates a session and tries 'retries' number of times to Creates a session and tries 'retries' number of times to
retrieve the archive. retrieve the archive.
@ -68,14 +71,13 @@ class WaybackMachineSaveAPI:
) )
session.mount("https://", HTTPAdapter(max_retries=retries)) session.mount("https://", HTTPAdapter(max_retries=retries))
self.response = session.get(self.request_url, headers=self.request_headers) self.response = session.get(self.request_url, headers=self.request_headers)
self.headers = ( # requests.response.headers is requests.structures.CaseInsensitiveDict
self.response.headers self.headers: CaseInsensitiveDict[str] = self.response.headers
) # <class 'requests.structures.CaseInsensitiveDict'>
self.status_code = self.response.status_code self.status_code = self.response.status_code
self.response_url = self.response.url self.response_url = self.response.url
session.close() session.close()
def archive_url_parser(self): def archive_url_parser(self) -> Optional[str]:
""" """
Three regexen (like oxen?) are used to search for the Three regexen (like oxen?) are used to search for the
archive URL in the headers and finally look in the response URL archive URL in the headers and finally look in the response URL
@ -89,12 +91,12 @@ class WaybackMachineSaveAPI:
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>" regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
match = re.search(regex2, str(self.headers)) match = re.search(regex2, str(self.headers))
if match: if match is not None and len(match.groups()) == 1:
return "https://" + match.group(1) return "https://" + match.group(1)
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}" regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
match = re.search(regex3, str(self.headers)) match = re.search(regex3, str(self.headers))
if match: if match is not None and len(match.groups()) == 1:
return "https" + match.group(1) return "https" + match.group(1)
if self.response_url: if self.response_url:
@ -105,7 +107,10 @@ class WaybackMachineSaveAPI:
if match: if match:
return "https://" + match.group(0) return "https://" + match.group(0)
def sleep(self, tries): return None
@staticmethod
def sleep(tries: int) -> None:
""" """
Ensure that the we wait some time before succesive retries so that we Ensure that the we wait some time before succesive retries so that we
don't waste the retries before the page is even captured by the Wayback don't waste the retries before the page is even captured by the Wayback
@ -120,7 +125,7 @@ class WaybackMachineSaveAPI:
sleep_seconds = 10 sleep_seconds = 10
time.sleep(sleep_seconds) time.sleep(sleep_seconds)
def timestamp(self): def timestamp(self) -> datetime:
""" """
Read the timestamp off the archive URL and convert the Wayback Machine Read the timestamp off the archive URL and convert the Wayback Machine
timestamp to datetime object. timestamp to datetime object.
@ -128,14 +133,16 @@ class WaybackMachineSaveAPI:
Also check if the time on archive is URL and compare it to instance birth Also check if the time on archive is URL and compare it to instance birth
time. time.
If time on the archive is older than the instance creation time set the cached_save If time on the archive is older than the instance creation time set the
to True else set it to False. The flag can be used to check if the Wayback Machine cached_save to True else set it to False. The flag can be used to check
didn't serve a Cached URL. It is quite common for the Wayback Machine to serve if the Wayback Machine didn't serve a Cached URL. It is quite common for
cached archive if last archive was captured before last 45 minutes. the Wayback Machine to serve cached archive if last archive was captured
before last 45 minutes.
""" """
m = re.search( regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url m = re.search(regex, str(self._archive_url))
) if m is None or len(m.groups()) != 1:
raise ValueError("Could not get timestamp")
string_timestamp = m.group(1) string_timestamp = m.group(1)
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S") timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
return timestamp return timestamp
def save(self): def save(self) -> str:
""" """
Calls the SavePageNow API of the Wayback Machine with required parameters Calls the SavePageNow API of the Wayback Machine with required parameters
and headers to save the URL. and headers to save the URL.
@ -162,14 +169,14 @@ class WaybackMachineSaveAPI:
tries = 0 tries = 0
while True: while True:
if not self.saved_archive: if self.saved_archive is None:
if tries >= 1: if tries >= 1:
self.sleep(tries) self.sleep(tries)
self.get_save_request_headers() self.get_save_request_headers()
self.saved_archive = self.archive_url_parser() self.saved_archive = self.archive_url_parser()
if self.saved_archive is not None: if isinstance(self.saved_archive, str):
self._archive_url = self.saved_archive self._archive_url = self.saved_archive
self.timestamp() self.timestamp()
return self.saved_archive return self.saved_archive
@ -177,7 +184,8 @@ class WaybackMachineSaveAPI:
tries += 1 tries += 1
if tries >= self.max_tries: if tries >= self.max_tries:
raise MaximumSaveRetriesExceeded( raise MaximumSaveRetriesExceeded(
"Tried %s times but failed to save and retrieve the" % str(tries) f"Tried {tries} times but failed to save "
+ " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n" f"and retrieve the archive for {self.url}.\n"
% (self.url, self.response_url, str(self.headers)), f"Response URL:\n{self.response_url}\n"
f"Response Header:\n{self.headers}"
) )

View File

@ -2,22 +2,43 @@ import requests
from . import __version__ from . import __version__
DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__ DEFAULT_USER_AGENT: str = (
f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
)
def latest_version_pypi(package_name, user_agent=DEFAULT_USER_AGENT): def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
request_url = "https://pypi.org/pypi/" + package_name + "/json" request_url = "https://pypi.org/pypi/" + package_name + "/json"
headers = {"User-Agent": user_agent} headers = {"User-Agent": user_agent}
response = requests.get(request_url, headers=headers) response = requests.get(request_url, headers=headers)
data = response.json() data = response.json()
return data["info"]["version"] if (
data is not None
and "info" in data
and data["info"] is not None
and "version" in data["info"]
and data["info"]["version"] is not None
):
return str(data["info"]["version"])
else:
raise ValueError("Could not get latest pypi version")
def latest_version_github(package_name, user_agent=DEFAULT_USER_AGENT): def latest_version_github(
package_name: str, user_agent: str = DEFAULT_USER_AGENT
) -> str:
request_url = ( request_url = (
"https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1" "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
) )
headers = {"User-Agent": user_agent} headers = {"User-Agent": user_agent}
response = requests.get(request_url, headers=headers) response = requests.get(request_url, headers=headers)
data = response.json() data = response.json()
return data[0]["tag_name"] if (
data is not None
and len(data) > 0
and data[0] is not None
and "tag_name" in data[0]
):
return str(data[0]["tag_name"])
else:
raise ValueError("Could not get latest github version")

View File

@ -1,4 +1,5 @@
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Generator, Optional
from .availability_api import WaybackMachineAvailabilityAPI from .availability_api import WaybackMachineAvailabilityAPI
from .cdx_api import WaybackMachineCDXServerAPI from .cdx_api import WaybackMachineCDXServerAPI
@ -14,40 +15,42 @@ The reason it is still in the code is backwards compatibility with 2.x.x version
If were are using the Url before the update to version 3.x.x, your code should still be If were are using the Url before the update to version 3.x.x, your code should still be
working fine and there is no hurry to update the interface but is recommended that you working fine and there is no hurry to update the interface but is recommended that you
do not use the Url class for new code as it would be removed after 2025 also the first do not use the Url class for new code as it would be removed after 2025 also the first
3.x.x versions was released in January 2022 and three years are more than enough to update 3.x.x versions was released in January 2022 and three years are more than enough to
the older interface code. update the older interface code.
""" """
class Url: class Url(object):
def __init__(self, url, user_agent=DEFAULT_USER_AGENT): def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
self.url = url self.url = url
self.user_agent = str(user_agent) self.user_agent = str(user_agent)
self.archive_url = None self.archive_url: Optional[str] = None
self.timestamp = None self.timestamp: Optional[datetime] = None
self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI( self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
self.url, user_agent=self.user_agent self.url, user_agent=self.user_agent
) )
def __str__(self): def __str__(self) -> str:
if not self.archive_url: if not self.archive_url:
self.newest() self.newest()
return self.archive_url return str(self.archive_url)
def __len__(self): def __len__(self) -> int:
td_max = timedelta( td_max = timedelta(
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999 days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
) )
if not self.timestamp: if not isinstance(self.timestamp, datetime):
self.oldest() self.oldest()
if self.timestamp == datetime.max: if not isinstance(self.timestamp, datetime):
raise TypeError("timestamp must be a datetime")
elif self.timestamp == datetime.max:
return td_max.days return td_max.days
else:
return (datetime.utcnow() - self.timestamp).days
return (datetime.utcnow() - self.timestamp).days def save(self) -> "Url":
def save(self):
self.wayback_machine_save_api = WaybackMachineSaveAPI( self.wayback_machine_save_api = WaybackMachineSaveAPI(
self.url, user_agent=self.user_agent self.url, user_agent=self.user_agent
) )
@ -58,13 +61,13 @@ class Url:
def near( def near(
self, self,
year=None, year: Optional[int] = None,
month=None, month: Optional[int] = None,
day=None, day: Optional[int] = None,
hour=None, hour: Optional[int] = None,
minute=None, minute: Optional[int] = None,
unix_timestamp=None, unix_timestamp: Optional[int] = None,
): ) -> "Url":
self.wayback_machine_availability_api.near( self.wayback_machine_availability_api.near(
year=year, year=year,
@ -77,22 +80,24 @@ class Url:
self.set_availability_api_attrs() self.set_availability_api_attrs()
return self return self
def oldest(self): def oldest(self) -> "Url":
self.wayback_machine_availability_api.oldest() self.wayback_machine_availability_api.oldest()
self.set_availability_api_attrs() self.set_availability_api_attrs()
return self return self
def newest(self): def newest(self) -> "Url":
self.wayback_machine_availability_api.newest() self.wayback_machine_availability_api.newest()
self.set_availability_api_attrs() self.set_availability_api_attrs()
return self return self
def set_availability_api_attrs(self): def set_availability_api_attrs(self) -> None:
self.archive_url = self.wayback_machine_availability_api.archive_url self.archive_url = self.wayback_machine_availability_api.archive_url
self.JSON = self.wayback_machine_availability_api.JSON self.JSON = self.wayback_machine_availability_api.JSON
self.timestamp = self.wayback_machine_availability_api.timestamp() self.timestamp = self.wayback_machine_availability_api.timestamp()
def total_archives(self, start_timestamp=None, end_timestamp=None): def total_archives(
self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
) -> int:
cdx = WaybackMachineCDXServerAPI( cdx = WaybackMachineCDXServerAPI(
self.url, self.url,
user_agent=self.user_agent, user_agent=self.user_agent,
@ -107,12 +112,12 @@ class Url:
def known_urls( def known_urls(
self, self,
subdomain=False, subdomain: bool = False,
host=False, host: bool = False,
start_timestamp=None, start_timestamp: Optional[str] = None,
end_timestamp=None, end_timestamp: Optional[str] = None,
match_type="prefix", match_type: str = "prefix",
): ) -> Generator[str, None, None]:
if subdomain: if subdomain:
match_type = "domain" match_type = "domain"
if host: if host: