Typing (#128)
* fix: CI yml name * add: mypy configuration * add: type annotation to waybackpy modules * add: type annotation to test modules * fix: mypy command * add: types-requests to dev deps * fix: disable max-line-length * fix: move pytest.ini into setup.cfg * add: urllib3 to deps * fix: Retry (ref: https://github.com/python/typeshed/issues/6893) * fix: f-string * fix: shorten long lines * add: staticmethod decorator to no-self-use methods * fix: str(headers)->headers_str * fix: error message * fix: revert "str(headers)->headers_str" and ignore assignment CaseInsensitiveDict with str * fix: mypy error
This commit is contained in:
parent
320ef30371
commit
d8cabdfdb5
@ -28,14 +28,13 @@ jobs:
|
|||||||
pip install '.[dev]'
|
pip install '.[dev]'
|
||||||
- name: Lint with flake8
|
- name: Lint with flake8
|
||||||
run: |
|
run: |
|
||||||
# stop the build if there are Python syntax errors or undefined names
|
|
||||||
flake8 . --count --show-source --statistics
|
flake8 . --count --show-source --statistics
|
||||||
- name: Lint with black
|
- name: Lint with black
|
||||||
run: |
|
run: |
|
||||||
black . --check --diff
|
black . --check --diff
|
||||||
# - name: Static type test with mypy
|
- name: Static type test with mypy
|
||||||
# run: |
|
run: |
|
||||||
# mypy
|
mypy -p waybackpy -p tests
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
run: |
|
run: |
|
||||||
pytest
|
pytest
|
11
pytest.ini
11
pytest.ini
@ -1,11 +0,0 @@
|
|||||||
[pytest]
|
|
||||||
addopts =
|
|
||||||
# show summary of all tests that did not pass
|
|
||||||
-ra
|
|
||||||
# enable all warnings
|
|
||||||
-Wd
|
|
||||||
# coverage and html report
|
|
||||||
--cov=waybackpy
|
|
||||||
--cov-report=html
|
|
||||||
testpaths =
|
|
||||||
tests
|
|
@ -3,7 +3,8 @@ click
|
|||||||
codecov
|
codecov
|
||||||
flake8
|
flake8
|
||||||
mypy
|
mypy
|
||||||
setuptools>=46.4.0
|
|
||||||
pytest
|
pytest
|
||||||
pytest-cov
|
pytest-cov
|
||||||
requests
|
requests
|
||||||
|
setuptools>=46.4.0
|
||||||
|
types-requests
|
||||||
|
@ -1,2 +1,3 @@
|
|||||||
click
|
click
|
||||||
requests
|
requests
|
||||||
|
urllib3
|
||||||
|
27
setup.cfg
27
setup.cfg
@ -42,6 +42,7 @@ python_requires = >= 3.7
|
|||||||
install_requires =
|
install_requires =
|
||||||
click
|
click
|
||||||
requests
|
requests
|
||||||
|
urllib3
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
dev =
|
dev =
|
||||||
@ -52,7 +53,7 @@ dev =
|
|||||||
pytest
|
pytest
|
||||||
pytest-cov
|
pytest-cov
|
||||||
setuptools>=46.4.0
|
setuptools>=46.4.0
|
||||||
|
types-requests
|
||||||
|
|
||||||
[options.entry_points]
|
[options.entry_points]
|
||||||
console_scripts =
|
console_scripts =
|
||||||
@ -64,4 +65,26 @@ profile = black
|
|||||||
[flake8]
|
[flake8]
|
||||||
indent-size = 4
|
indent-size = 4
|
||||||
max-line-length = 88
|
max-line-length = 88
|
||||||
extend-ignore = E203,W503,E501,W605
|
extend-ignore = W605
|
||||||
|
|
||||||
|
[mypy]
|
||||||
|
python_version = 3.9
|
||||||
|
show_error_codes = True
|
||||||
|
pretty = True
|
||||||
|
strict = True
|
||||||
|
|
||||||
|
[tool:pytest]
|
||||||
|
addopts =
|
||||||
|
# show summary of all tests that did not pass
|
||||||
|
-ra
|
||||||
|
# enable all warnings
|
||||||
|
-Wd
|
||||||
|
# coverage and html report
|
||||||
|
--cov=waybackpy
|
||||||
|
--cov-report=html
|
||||||
|
testpaths =
|
||||||
|
tests
|
||||||
|
|
||||||
|
[pycodestyle]
|
||||||
|
# for `license` and `filter` in `waybackpy.cli.main`
|
||||||
|
ignore = W0622
|
||||||
|
@ -12,33 +12,42 @@ from waybackpy.exceptions import (
|
|||||||
|
|
||||||
now = datetime.utcnow()
|
now = datetime.utcnow()
|
||||||
url = "https://example.com/"
|
url = "https://example.com/"
|
||||||
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def rndstr(n):
|
def rndstr(n: int) -> str:
|
||||||
return "".join(
|
return "".join(
|
||||||
random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
|
random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_oldest():
|
def test_oldest() -> None:
|
||||||
"""
|
"""
|
||||||
Test the oldest archive of Google.com and also checks the attributes.
|
Test the oldest archive of Google.com and also checks the attributes.
|
||||||
"""
|
"""
|
||||||
url = "https://example.com/"
|
url = "https://example.com/"
|
||||||
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
||||||
|
)
|
||||||
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
|
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
|
||||||
oldest = availability_api.oldest()
|
oldest = availability_api.oldest()
|
||||||
oldest_archive_url = oldest.archive_url
|
oldest_archive_url = oldest.archive_url
|
||||||
assert "2002" in oldest_archive_url
|
assert "2002" in oldest_archive_url
|
||||||
oldest_timestamp = oldest.timestamp()
|
oldest_timestamp = oldest.timestamp()
|
||||||
assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years
|
assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years
|
||||||
assert availability_api.JSON["archived_snapshots"]["closest"]["available"] is True
|
assert (
|
||||||
|
availability_api.JSON is not None
|
||||||
|
and availability_api.JSON["archived_snapshots"]["closest"]["available"] is True
|
||||||
|
)
|
||||||
assert repr(oldest).find("example.com") != -1
|
assert repr(oldest).find("example.com") != -1
|
||||||
assert "2002" in str(oldest)
|
assert "2002" in str(oldest)
|
||||||
|
|
||||||
|
|
||||||
def test_newest():
|
def test_newest() -> None:
|
||||||
"""
|
"""
|
||||||
Assuming that the recent most Google Archive was made no more earlier than
|
Assuming that the recent most Google Archive was made no more earlier than
|
||||||
last one day which is 86400 seconds.
|
last one day which is 86400 seconds.
|
||||||
@ -54,16 +63,17 @@ def test_newest():
|
|||||||
assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3)
|
assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3)
|
||||||
|
|
||||||
|
|
||||||
def test_invalid_json():
|
def test_invalid_json() -> None:
|
||||||
"""
|
"""
|
||||||
When the API is malfunctioning or we don't pass a URL it may return invalid JSON data.
|
When the API is malfunctioning or we don't pass a URL,
|
||||||
|
it may return invalid JSON data.
|
||||||
"""
|
"""
|
||||||
with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
|
with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
|
||||||
availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)
|
availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)
|
||||||
_ = availability_api.archive_url
|
_ = availability_api.archive_url
|
||||||
|
|
||||||
|
|
||||||
def test_no_archive():
|
def test_no_archive() -> None:
|
||||||
"""
|
"""
|
||||||
ArchiveNotInAvailabilityAPIResponse may be raised if Wayback Machine did not
|
ArchiveNotInAvailabilityAPIResponse may be raised if Wayback Machine did not
|
||||||
replied with the archive despite the fact that we know the site has million
|
replied with the archive despite the fact that we know the site has million
|
||||||
@ -74,12 +84,12 @@ def test_no_archive():
|
|||||||
"""
|
"""
|
||||||
with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
|
with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
|
||||||
availability_api = WaybackMachineAvailabilityAPI(
|
availability_api = WaybackMachineAvailabilityAPI(
|
||||||
url="https://%s.cn" % rndstr(30), user_agent=user_agent
|
url=f"https://{rndstr(30)}.cn", user_agent=user_agent
|
||||||
)
|
)
|
||||||
_ = availability_api.archive_url
|
_ = availability_api.archive_url
|
||||||
|
|
||||||
|
|
||||||
def test_no_api_call_str_repr():
|
def test_no_api_call_str_repr() -> None:
|
||||||
"""
|
"""
|
||||||
Some entitled users maybe want to see what is the string representation
|
Some entitled users maybe want to see what is the string representation
|
||||||
if they don’t make any API requests.
|
if they don’t make any API requests.
|
||||||
@ -87,17 +97,17 @@ def test_no_api_call_str_repr():
|
|||||||
str() must not return None so we return ""
|
str() must not return None so we return ""
|
||||||
"""
|
"""
|
||||||
availability_api = WaybackMachineAvailabilityAPI(
|
availability_api = WaybackMachineAvailabilityAPI(
|
||||||
url="https://%s.gov" % rndstr(30), user_agent=user_agent
|
url=f"https://{rndstr(30)}.gov", user_agent=user_agent
|
||||||
)
|
)
|
||||||
assert "" == str(availability_api)
|
assert "" == str(availability_api)
|
||||||
|
|
||||||
|
|
||||||
def test_no_call_timestamp():
|
def test_no_call_timestamp() -> None:
|
||||||
"""
|
"""
|
||||||
If no API requests were made the bound timestamp() method returns
|
If no API requests were made the bound timestamp() method returns
|
||||||
the datetime.max as a default value.
|
the datetime.max as a default value.
|
||||||
"""
|
"""
|
||||||
availability_api = WaybackMachineAvailabilityAPI(
|
availability_api = WaybackMachineAvailabilityAPI(
|
||||||
url="https://%s.in" % rndstr(30), user_agent=user_agent
|
url=f"https://{rndstr(30)}.in", user_agent=user_agent
|
||||||
)
|
)
|
||||||
assert datetime.max == availability_api.timestamp()
|
assert datetime.max == availability_api.timestamp()
|
||||||
|
@ -1,8 +1,11 @@
|
|||||||
from waybackpy.cdx_api import WaybackMachineCDXServerAPI
|
from waybackpy.cdx_api import WaybackMachineCDXServerAPI
|
||||||
|
|
||||||
|
|
||||||
def test_a():
|
def test_a() -> None:
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
url = "https://twitter.com/jack"
|
url = "https://twitter.com/jack"
|
||||||
|
|
||||||
wayback = WaybackMachineCDXServerAPI(
|
wayback = WaybackMachineCDXServerAPI(
|
||||||
@ -21,8 +24,11 @@ def test_a():
|
|||||||
assert snapshot.timestamp.startswith("2010")
|
assert snapshot.timestamp.startswith("2010")
|
||||||
|
|
||||||
|
|
||||||
def test_b():
|
def test_b() -> None:
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
|
||||||
|
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
url = "https://www.google.com"
|
url = "https://www.google.com"
|
||||||
|
|
||||||
wayback = WaybackMachineCDXServerAPI(
|
wayback = WaybackMachineCDXServerAPI(
|
||||||
|
@ -3,8 +3,11 @@ from datetime import datetime
|
|||||||
from waybackpy.cdx_snapshot import CDXSnapshot
|
from waybackpy.cdx_snapshot import CDXSnapshot
|
||||||
|
|
||||||
|
|
||||||
def test_CDXSnapshot():
|
def test_CDXSnapshot() -> None:
|
||||||
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
|
sample_input = (
|
||||||
|
"org,archive)/ 20080126045828 http://github.com "
|
||||||
|
"text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
|
||||||
|
)
|
||||||
prop_values = sample_input.split(" ")
|
prop_values = sample_input.split(" ")
|
||||||
properties = {}
|
properties = {}
|
||||||
(
|
(
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from waybackpy.cdx_utils import (
|
from waybackpy.cdx_utils import (
|
||||||
@ -11,15 +13,18 @@ from waybackpy.cdx_utils import (
|
|||||||
from waybackpy.exceptions import WaybackError
|
from waybackpy.exceptions import WaybackError
|
||||||
|
|
||||||
|
|
||||||
def test_get_total_pages():
|
def test_get_total_pages() -> None:
|
||||||
url = "twitter.com"
|
url = "twitter.com"
|
||||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
|
||||||
|
)
|
||||||
assert get_total_pages(url=url, user_agent=user_agent) >= 56
|
assert get_total_pages(url=url, user_agent=user_agent) >= 56
|
||||||
|
|
||||||
|
|
||||||
def test_full_url():
|
def test_full_url() -> None:
|
||||||
params = {}
|
|
||||||
endpoint = "https://web.archive.org/cdx/search/cdx"
|
endpoint = "https://web.archive.org/cdx/search/cdx"
|
||||||
|
params: Dict[str, Any] = {}
|
||||||
assert endpoint == full_url(endpoint, params)
|
assert endpoint == full_url(endpoint, params)
|
||||||
|
|
||||||
params = {"a": "1"}
|
params = {"a": "1"}
|
||||||
@ -39,36 +44,36 @@ def test_full_url():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_get_response():
|
def test_get_response() -> None:
|
||||||
url = "https://github.com"
|
url = "https://github.com"
|
||||||
user_agent = (
|
user_agent = (
|
||||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
|
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
|
||||||
)
|
)
|
||||||
headers = {"User-Agent": "%s" % user_agent}
|
headers = {"User-Agent": str(user_agent)}
|
||||||
response = get_response(url, headers=headers)
|
response = get_response(url, headers=headers)
|
||||||
assert response.status_code == 200
|
assert not isinstance(response, Exception) and response.status_code == 200
|
||||||
|
|
||||||
url = "http/wwhfhfvhvjhmom"
|
url = "http/wwhfhfvhvjhmom"
|
||||||
with pytest.raises(WaybackError):
|
with pytest.raises(WaybackError):
|
||||||
get_response(url, headers=headers)
|
get_response(url, headers=headers)
|
||||||
|
|
||||||
|
|
||||||
def test_check_filters():
|
def test_check_filters() -> None:
|
||||||
filters = []
|
filters: List[str] = []
|
||||||
check_filters(filters)
|
check_filters(filters)
|
||||||
|
|
||||||
filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"]
|
filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"]
|
||||||
check_filters(filters)
|
check_filters(filters)
|
||||||
|
|
||||||
with pytest.raises(WaybackError):
|
with pytest.raises(WaybackError):
|
||||||
check_filters("not-list")
|
check_filters("not-list") # type: ignore[arg-type]
|
||||||
|
|
||||||
with pytest.raises(WaybackError):
|
with pytest.raises(WaybackError):
|
||||||
check_filters(["invalid"])
|
check_filters(["invalid"])
|
||||||
|
|
||||||
|
|
||||||
def test_check_collapses():
|
def test_check_collapses() -> None:
|
||||||
collapses = []
|
collapses: List[str] = []
|
||||||
check_collapses(collapses)
|
check_collapses(collapses)
|
||||||
|
|
||||||
collapses = ["timestamp:10"]
|
collapses = ["timestamp:10"]
|
||||||
@ -77,7 +82,7 @@ def test_check_collapses():
|
|||||||
collapses = ["urlkey"]
|
collapses = ["urlkey"]
|
||||||
check_collapses(collapses)
|
check_collapses(collapses)
|
||||||
|
|
||||||
collapses = "urlkey" # NOT LIST
|
collapses = "urlkey" # type: ignore[assignment]
|
||||||
with pytest.raises(WaybackError):
|
with pytest.raises(WaybackError):
|
||||||
check_collapses(collapses)
|
check_collapses(collapses)
|
||||||
|
|
||||||
@ -86,11 +91,11 @@ def test_check_collapses():
|
|||||||
check_collapses(collapses)
|
check_collapses(collapses)
|
||||||
|
|
||||||
|
|
||||||
def test_check_match_type():
|
def test_check_match_type() -> None:
|
||||||
assert check_match_type(None, "url") is None
|
assert check_match_type(None, "url")
|
||||||
match_type = "exact"
|
match_type = "exact"
|
||||||
url = "test_url"
|
url = "test_url"
|
||||||
assert check_match_type(match_type, url) is None
|
assert check_match_type(match_type, url)
|
||||||
|
|
||||||
url = "has * in it"
|
url = "has * in it"
|
||||||
with pytest.raises(WaybackError):
|
with pytest.raises(WaybackError):
|
||||||
|
@ -2,22 +2,27 @@ import random
|
|||||||
import string
|
import string
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import cast
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
from requests.structures import CaseInsensitiveDict
|
||||||
|
|
||||||
from waybackpy.exceptions import MaximumSaveRetriesExceeded
|
from waybackpy.exceptions import MaximumSaveRetriesExceeded
|
||||||
from waybackpy.save_api import WaybackMachineSaveAPI
|
from waybackpy.save_api import WaybackMachineSaveAPI
|
||||||
|
|
||||||
|
|
||||||
def rndstr(n):
|
def rndstr(n: int) -> str:
|
||||||
return "".join(
|
return "".join(
|
||||||
random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
|
random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_save():
|
def test_save() -> None:
|
||||||
url = "https://github.com/akamhy/waybackpy"
|
url = "https://github.com/akamhy/waybackpy"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
save_api.save()
|
save_api.save()
|
||||||
archive_url = save_api.archive_url
|
archive_url = save_api.archive_url
|
||||||
@ -31,15 +36,18 @@ def test_save():
|
|||||||
assert isinstance(save_api.timestamp(), datetime)
|
assert isinstance(save_api.timestamp(), datetime)
|
||||||
|
|
||||||
|
|
||||||
def test_max_redirect_exceeded():
|
def test_max_redirect_exceeded() -> None:
|
||||||
with pytest.raises(MaximumSaveRetriesExceeded):
|
with pytest.raises(MaximumSaveRetriesExceeded):
|
||||||
url = "https://%s.gov" % rndstr
|
url = f"https://{rndstr}.gov"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3)
|
save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3)
|
||||||
save_api.save()
|
save_api.save()
|
||||||
|
|
||||||
|
|
||||||
def test_sleep():
|
def test_sleep() -> None:
|
||||||
"""
|
"""
|
||||||
sleeping is actually very important for SaveAPI
|
sleeping is actually very important for SaveAPI
|
||||||
interface stability.
|
interface stability.
|
||||||
@ -47,7 +55,10 @@ def test_sleep():
|
|||||||
is as intended.
|
is as intended.
|
||||||
"""
|
"""
|
||||||
url = "https://example.com"
|
url = "https://example.com"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
s_time = int(time.time())
|
s_time = int(time.time())
|
||||||
save_api.sleep(6) # multiple of 3 sleep for 10 seconds
|
save_api.sleep(6) # multiple of 3 sleep for 10 seconds
|
||||||
@ -60,76 +71,150 @@ def test_sleep():
|
|||||||
assert (e_time - s_time) >= 5
|
assert (e_time - s_time) >= 5
|
||||||
|
|
||||||
|
|
||||||
def test_timestamp():
|
def test_timestamp() -> None:
|
||||||
url = "https://example.com"
|
url = "https://example.com"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
now = datetime.utcnow()
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
save_api._archive_url = (
|
|
||||||
"https://web.archive.org/web/%s/" % now.strftime("%Y%m%d%H%M%S") + url
|
|
||||||
)
|
)
|
||||||
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
|
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
|
||||||
|
save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
|
||||||
save_api.timestamp()
|
save_api.timestamp()
|
||||||
assert save_api.cached_save is False
|
assert save_api.cached_save is False
|
||||||
save_api._archive_url = "https://web.archive.org/web/%s/" % "20100124063622" + url
|
now = "20100124063622"
|
||||||
|
save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
|
||||||
save_api.timestamp()
|
save_api.timestamp()
|
||||||
assert save_api.cached_save is True
|
assert save_api.cached_save is True
|
||||||
|
|
||||||
|
|
||||||
def test_archive_url_parser():
|
def test_archive_url_parser() -> None:
|
||||||
"""
|
"""
|
||||||
Testing three regex for matches and also tests the response URL.
|
Testing three regex for matches and also tests the response URL.
|
||||||
"""
|
"""
|
||||||
url = "https://example.com"
|
url = "https://example.com"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
|
|
||||||
save_api.headers = """
|
h = (
|
||||||
START
|
"\nSTART\nContent-Location: "
|
||||||
Content-Location: /web/20201126185327/https://www.scribbr.com/citing-sources/et-al
|
"/web/20201126185327/https://www.scribbr.com/citing-sources/et-al"
|
||||||
END
|
"\nEND\n"
|
||||||
"""
|
|
||||||
|
|
||||||
assert (
|
|
||||||
save_api.archive_url_parser()
|
|
||||||
== "https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al"
|
|
||||||
)
|
)
|
||||||
|
save_api.headers = h # type: ignore[assignment]
|
||||||
|
|
||||||
save_api.headers = """
|
expected_url = (
|
||||||
{'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, 
LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
|
"https://web.archive.org/web/20201126185327/"
|
||||||
"""
|
"https://www.scribbr.com/citing-sources/et-al"
|
||||||
|
|
||||||
assert (
|
|
||||||
save_api.archive_url_parser()
|
|
||||||
== "https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/"
|
|
||||||
)
|
)
|
||||||
|
assert save_api.archive_url_parser() == expected_url
|
||||||
|
|
||||||
save_api.headers = """
|
headers = {
|
||||||
START
|
"Server": "nginx/1.15.8",
|
||||||
X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US
|
"Date": "Sat, 02 Jan 2021 09:40:25 GMT",
|
||||||
END
|
"Content-Type": "text/html; charset=UTF-8",
|
||||||
"""
|
"Transfer-Encoding": "chunked",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"X-Archive-Orig-Server": "nginx",
|
||||||
|
"X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT",
|
||||||
|
"X-Archive-Orig-Transfer-Encoding": "chunked",
|
||||||
|
"X-Archive-Orig-Connection": "keep-alive",
|
||||||
|
"X-Archive-Orig-Vary": "Accept-Encoding",
|
||||||
|
"X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT",
|
||||||
|
"X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;",
|
||||||
|
"X-Archive-Guessed-Content-Type": "text/html",
|
||||||
|
"X-Archive-Guessed-Charset": "utf-8",
|
||||||
|
"Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT",
|
||||||
|
"Link": (
|
||||||
|
'<https://www.scribbr.com/citing-sources/et-al/>; rel="original", '
|
||||||
|
"<https://web.archive.org/web/timemap/link/https://www.scribbr.com/"
|
||||||
|
'citing-sources/et-al/>; rel="timemap"; type="application/link-format", '
|
||||||
|
"<https://web.archive.org/web/https://www.scribbr.com/citing-sources/"
|
||||||
|
'et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/'
|
||||||
|
'https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; '
|
||||||
|
'datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/'
|
||||||
|
"20201126185327/https://www.scribbr.com/citing-sources/et-al/>; "
|
||||||
|
'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", '
|
||||||
|
"<https://web.archive.org/web/20210102094009/https://www.scribbr.com/"
|
||||||
|
'citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 '
|
||||||
|
'09:40:09 GMT", <https://web.archive.org/web/20210102094009/'
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al/>; "
|
||||||
|
'rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"'
|
||||||
|
),
|
||||||
|
"Content-Security-Policy": (
|
||||||
|
"default-src 'self' 'unsafe-eval' 'unsafe-inline' "
|
||||||
|
"data: blob: archive.org web.archive.org analytics.archive.org "
|
||||||
|
"pragma.archivelab.org",
|
||||||
|
),
|
||||||
|
"X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz",
|
||||||
|
"Server-Timing": (
|
||||||
|
"captures_list;dur=112.646325, exclusion.robots;dur=0.172010, "
|
||||||
|
"exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, "
|
||||||
|
"esindex;dur=0.014647, LoadShardBlock;dur=82.205012, "
|
||||||
|
"PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, "
|
||||||
|
"load_resource;dur=26.520179"
|
||||||
|
),
|
||||||
|
"X-App-Server": "wwwb-app200",
|
||||||
|
"X-ts": "200",
|
||||||
|
"X-location": "All",
|
||||||
|
"X-Cache-Key": (
|
||||||
|
"httpsweb.archive.org/web/20210102094009/"
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al/IN",
|
||||||
|
),
|
||||||
|
"X-RL": "0",
|
||||||
|
"X-Page-Cache": "MISS",
|
||||||
|
"X-Archive-Screenname": "0",
|
||||||
|
"Content-Encoding": "gzip",
|
||||||
|
}
|
||||||
|
|
||||||
assert (
|
save_api.headers = cast(CaseInsensitiveDict[str], headers)
|
||||||
save_api.archive_url_parser()
|
|
||||||
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/"
|
expected_url2 = (
|
||||||
|
"https://web.archive.org/web/20210102094009/"
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al/"
|
||||||
)
|
)
|
||||||
|
assert save_api.archive_url_parser() == expected_url2
|
||||||
|
|
||||||
save_api.headers = "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING"
|
expected_url_3 = (
|
||||||
save_api.response_url = "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
|
"https://web.archive.org/web/20171128185327/"
|
||||||
assert (
|
"https://www.scribbr.com/citing-sources/et-al/US"
|
||||||
save_api.archive_url_parser()
|
|
||||||
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
|
|
||||||
)
|
)
|
||||||
|
h = f"START\nX-Cache-Key: {expected_url_3}\nEND\n"
|
||||||
|
save_api.headers = h # type: ignore[assignment]
|
||||||
|
|
||||||
|
expected_url4 = (
|
||||||
|
"https://web.archive.org/web/20171128185327/"
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al/"
|
||||||
|
)
|
||||||
|
assert save_api.archive_url_parser() == expected_url4
|
||||||
|
|
||||||
|
h = "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING"
|
||||||
|
save_api.headers = h # type: ignore[assignment]
|
||||||
|
save_api.response_url = (
|
||||||
|
"https://web.archive.org/web/20171128185327/"
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al"
|
||||||
|
)
|
||||||
|
expected_url5 = (
|
||||||
|
"https://web.archive.org/web/20171128185327/"
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al"
|
||||||
|
)
|
||||||
|
assert save_api.archive_url_parser() == expected_url5
|
||||||
|
|
||||||
|
|
||||||
def test_archive_url():
|
def test_archive_url() -> None:
|
||||||
"""
|
"""
|
||||||
Checks the attribute archive_url's value when the save method was not
|
Checks the attribute archive_url's value when the save method was not
|
||||||
explicitly invoked by the end-user but the save method was invoked implicitly
|
explicitly invoked by the end-user but the save method was invoked implicitly
|
||||||
by the archive_url method which is an attribute due to @property.
|
by the archive_url method which is an attribute due to @property.
|
||||||
"""
|
"""
|
||||||
url = "https://example.com"
|
url = "https://example.com"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
save_api.saved_archive = (
|
save_api.saved_archive = (
|
||||||
"https://web.archive.org/web/20220124063056/https://example.com/"
|
"https://web.archive.org/web/20220124063056/https://example.com/"
|
||||||
|
@ -6,13 +6,13 @@ from waybackpy.utils import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_default_user_agent():
|
def test_default_user_agent() -> None:
|
||||||
assert (
|
assert (
|
||||||
DEFAULT_USER_AGENT
|
DEFAULT_USER_AGENT
|
||||||
== "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
|
== f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_latest_version():
|
def test_latest_version() -> None:
|
||||||
package_name = "waybackpy"
|
package_name = "waybackpy"
|
||||||
assert latest_version_github(package_name) == latest_version_pypi(package_name)
|
assert latest_version_github(package_name) == latest_version_pypi(package_name)
|
||||||
|
@ -5,11 +5,7 @@ __description__ = (
|
|||||||
)
|
)
|
||||||
__url__ = "https://akamhy.github.io/waybackpy/"
|
__url__ = "https://akamhy.github.io/waybackpy/"
|
||||||
__version__ = "3.0.2"
|
__version__ = "3.0.2"
|
||||||
__download_url__ = (
|
__download_url__ = f"https://github.com/akamhy/waybackpy/archive/{__version__}.tar.gz"
|
||||||
"https://github.com/akamhy/waybackpy/archive/{version}.tar.gz".format(
|
|
||||||
version=__version__
|
|
||||||
)
|
|
||||||
)
|
|
||||||
__author__ = "Akash Mahanty"
|
__author__ = "Akash Mahanty"
|
||||||
__author_email__ = "akamhy@yahoo.com"
|
__author_email__ = "akamhy@yahoo.com"
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
@ -10,37 +11,42 @@ from .exceptions import (
|
|||||||
)
|
)
|
||||||
from .utils import DEFAULT_USER_AGENT
|
from .utils import DEFAULT_USER_AGENT
|
||||||
|
|
||||||
|
ResponseJSON = Dict[str, Any]
|
||||||
|
|
||||||
class WaybackMachineAvailabilityAPI:
|
|
||||||
|
class WaybackMachineAvailabilityAPI(object):
|
||||||
"""
|
"""
|
||||||
Class that interfaces the availability API of the Wayback Machine.
|
Class that interfaces the availability API of the Wayback Machine.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=3):
|
def __init__(
|
||||||
|
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
|
||||||
|
) -> None:
|
||||||
self.url = str(url).strip().replace(" ", "%20")
|
self.url = str(url).strip().replace(" ", "%20")
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
self.headers = {"User-Agent": self.user_agent}
|
self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
|
||||||
self.payload = {"url": "{url}".format(url=self.url)}
|
self.payload = {"url": self.url}
|
||||||
self.endpoint = "https://archive.org/wayback/available"
|
self.endpoint = "https://archive.org/wayback/available"
|
||||||
self.max_tries = max_tries
|
self.max_tries = max_tries
|
||||||
self.tries = 0
|
self.tries = 0
|
||||||
self.last_api_call_unix_time = int(time.time())
|
self.last_api_call_unix_time = int(time.time())
|
||||||
self.api_call_time_gap = 5
|
self.api_call_time_gap = 5
|
||||||
self.JSON = None
|
self.JSON: Optional[ResponseJSON] = None
|
||||||
|
|
||||||
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
|
@staticmethod
|
||||||
|
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
|
||||||
"""
|
"""
|
||||||
Converts Unix time to wayback Machine timestamp.
|
Converts Unix time to wayback Machine timestamp.
|
||||||
"""
|
"""
|
||||||
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
|
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self) -> str:
|
||||||
"""
|
"""
|
||||||
Same as string representation, just return the archive URL as a string.
|
Same as string representation, just return the archive URL as a string.
|
||||||
"""
|
"""
|
||||||
return str(self)
|
return str(self)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self) -> str:
|
||||||
"""
|
"""
|
||||||
String representation of the class. If at least one API call was successfully
|
String representation of the class. If at least one API call was successfully
|
||||||
made then return the archive URL as a string. Else returns None.
|
made then return the archive URL as a string. Else returns None.
|
||||||
@ -54,7 +60,7 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
|
|
||||||
return self.archive_url
|
return self.archive_url
|
||||||
|
|
||||||
def json(self):
|
def json(self) -> Optional[ResponseJSON]:
|
||||||
"""
|
"""
|
||||||
Makes the API call to the availability API and sets the JSON response
|
Makes the API call to the availability API and sets the JSON response
|
||||||
to the JSON attribute of the instance and also returns the JSON attribute.
|
to the JSON attribute of the instance and also returns the JSON attribute.
|
||||||
@ -74,12 +80,12 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
self.JSON = self.response.json()
|
self.JSON = self.response.json()
|
||||||
except json.decoder.JSONDecodeError:
|
except json.decoder.JSONDecodeError:
|
||||||
raise InvalidJSONInAvailabilityAPIResponse(
|
raise InvalidJSONInAvailabilityAPIResponse(
|
||||||
"Response data:\n{text}".format(text=self.response.text)
|
f"Response data:\n{self.response.text}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return self.JSON
|
return self.JSON
|
||||||
|
|
||||||
def timestamp(self):
|
def timestamp(self) -> datetime:
|
||||||
"""
|
"""
|
||||||
Converts the timestamp from the JSON response to a datetime object.
|
Converts the timestamp from the JSON response to a datetime object.
|
||||||
If JSON attribute of the instance is None it implies that the either
|
If JSON attribute of the instance is None it implies that the either
|
||||||
@ -91,19 +97,29 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
If you get a URL as a response from the availability API it is guaranteed
|
If you get a URL as a response from the availability API it is guaranteed
|
||||||
that you can get the datetime object from the timestamp.
|
that you can get the datetime object from the timestamp.
|
||||||
"""
|
"""
|
||||||
if not self.JSON or not self.JSON["archived_snapshots"]:
|
if self.JSON is None or "archived_snapshots" not in self.JSON:
|
||||||
return datetime.max
|
return datetime.max
|
||||||
|
elif (
|
||||||
return datetime.strptime(
|
self.JSON is not None
|
||||||
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
|
and "archived_snapshots" in self.JSON
|
||||||
)
|
and self.JSON["archived_snapshots"] is not None
|
||||||
|
and "closest" in self.JSON["archived_snapshots"]
|
||||||
|
and self.JSON["archived_snapshots"]["closest"] is not None
|
||||||
|
and "timestamp" in self.JSON["archived_snapshots"]["closest"]
|
||||||
|
):
|
||||||
|
return datetime.strptime(
|
||||||
|
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError("Could not get timestamp from result")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def archive_url(self):
|
def archive_url(self) -> str:
|
||||||
"""
|
"""
|
||||||
Reads the JSON response data and tries to get the timestamp and returns
|
Reads the JSON response data and tries to get the timestamp and returns
|
||||||
the timestamp if found else returns None.
|
the timestamp if found else returns None.
|
||||||
"""
|
"""
|
||||||
|
archive_url = ""
|
||||||
data = self.JSON
|
data = self.JSON
|
||||||
|
|
||||||
# If the user didn't use oldest, newest or near but tries to access the
|
# If the user didn't use oldest, newest or near but tries to access the
|
||||||
@ -127,9 +143,9 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
if not data or not data["archived_snapshots"]:
|
if not data or not data["archived_snapshots"]:
|
||||||
raise ArchiveNotInAvailabilityAPIResponse(
|
raise ArchiveNotInAvailabilityAPIResponse(
|
||||||
"Archive not found in the availability "
|
"Archive not found in the availability "
|
||||||
+ "API response, the URL you requested may not have any "
|
"API response, the URL you requested may not have any archives "
|
||||||
+ "archives yet. You may retry after some time or archive the webpage now."
|
"yet. You may retry after some time or archive the webpage now.\n"
|
||||||
+ "\nResponse data:\n{response}".format(response=self.response.text)
|
f"Response data:\n{self.response.text}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||||
@ -138,7 +154,8 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
)
|
)
|
||||||
return archive_url
|
return archive_url
|
||||||
|
|
||||||
def wayback_timestamp(self, **kwargs):
|
@staticmethod
|
||||||
|
def wayback_timestamp(**kwargs: int) -> str:
|
||||||
"""
|
"""
|
||||||
Prepends zero before the year, month, day, hour and minute so that they
|
Prepends zero before the year, month, day, hour and minute so that they
|
||||||
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
|
are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
|
||||||
@ -148,7 +165,7 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
for key in ["year", "month", "day", "hour", "minute"]
|
for key in ["year", "month", "day", "hour", "minute"]
|
||||||
)
|
)
|
||||||
|
|
||||||
def oldest(self):
|
def oldest(self) -> "WaybackMachineAvailabilityAPI":
|
||||||
"""
|
"""
|
||||||
Passing the year 1994 should return the oldest archive because
|
Passing the year 1994 should return the oldest archive because
|
||||||
wayback machine was started in May, 1996 and there should be no archive
|
wayback machine was started in May, 1996 and there should be no archive
|
||||||
@ -156,7 +173,7 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
"""
|
"""
|
||||||
return self.near(year=1994)
|
return self.near(year=1994)
|
||||||
|
|
||||||
def newest(self):
|
def newest(self) -> "WaybackMachineAvailabilityAPI":
|
||||||
"""
|
"""
|
||||||
Passing the current UNIX time should be sufficient to get the newest
|
Passing the current UNIX time should be sufficient to get the newest
|
||||||
archive considering the API request-response time delay and also the
|
archive considering the API request-response time delay and also the
|
||||||
@ -166,13 +183,13 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
|
|
||||||
def near(
|
def near(
|
||||||
self,
|
self,
|
||||||
year=None,
|
year: Optional[int] = None,
|
||||||
month=None,
|
month: Optional[int] = None,
|
||||||
day=None,
|
day: Optional[int] = None,
|
||||||
hour=None,
|
hour: Optional[int] = None,
|
||||||
minute=None,
|
minute: Optional[int] = None,
|
||||||
unix_timestamp=None,
|
unix_timestamp: Optional[int] = None,
|
||||||
):
|
) -> "WaybackMachineAvailabilityAPI":
|
||||||
"""
|
"""
|
||||||
The main method for this Class, oldest and newest methods are dependent on this
|
The main method for this Class, oldest and newest methods are dependent on this
|
||||||
method.
|
method.
|
||||||
@ -181,18 +198,19 @@ class WaybackMachineAvailabilityAPI:
|
|||||||
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
|
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
|
||||||
appropriate arguments for their respective parameters.
|
appropriate arguments for their respective parameters.
|
||||||
Adds the timestamp to the payload dictionary.
|
Adds the timestamp to the payload dictionary.
|
||||||
And finally invoking the json method to make the API call then returns the instance.
|
And finally invoking the json method to make the API call then returns
|
||||||
|
the instance.
|
||||||
"""
|
"""
|
||||||
if unix_timestamp:
|
if unix_timestamp:
|
||||||
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
||||||
else:
|
else:
|
||||||
now = datetime.utcnow().timetuple()
|
now = datetime.utcnow().timetuple()
|
||||||
timestamp = self.wayback_timestamp(
|
timestamp = self.wayback_timestamp(
|
||||||
year=year if year else now.tm_year,
|
year=now.tm_year if year is None else year,
|
||||||
month=month if month else now.tm_mon,
|
month=now.tm_mon if month is None else month,
|
||||||
day=day if day else now.tm_mday,
|
day=now.tm_mday if day is None else day,
|
||||||
hour=hour if hour else now.tm_hour,
|
hour=now.tm_hour if hour is None else hour,
|
||||||
minute=minute if minute else now.tm_min,
|
minute=now.tm_min if minute is None else minute,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.payload["timestamp"] = timestamp
|
self.payload["timestamp"] = timestamp
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
from typing import Dict, Generator, List, Optional, cast
|
||||||
|
|
||||||
from .cdx_snapshot import CDXSnapshot
|
from .cdx_snapshot import CDXSnapshot
|
||||||
from .cdx_utils import (
|
from .cdx_utils import (
|
||||||
check_collapses,
|
check_collapses,
|
||||||
@ -11,43 +13,48 @@ from .exceptions import WaybackError
|
|||||||
from .utils import DEFAULT_USER_AGENT
|
from .utils import DEFAULT_USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
class WaybackMachineCDXServerAPI:
|
class WaybackMachineCDXServerAPI(object):
|
||||||
"""
|
"""
|
||||||
Class that interfaces the CDX server API of the Wayback Machine.
|
Class that interfaces the CDX server API of the Wayback Machine.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# start_timestamp: from, can not use from as it's a keyword
|
||||||
|
# end_timestamp: to, not using to as can not use from
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
url,
|
url: str,
|
||||||
user_agent=DEFAULT_USER_AGENT,
|
user_agent: str = DEFAULT_USER_AGENT,
|
||||||
start_timestamp=None, # from, can not use from as it's a keyword
|
start_timestamp: Optional[str] = None,
|
||||||
end_timestamp=None, # to, not using to as can not use from
|
end_timestamp: Optional[str] = None,
|
||||||
filters=[],
|
filters: List[str] = [],
|
||||||
match_type=None,
|
match_type: Optional[str] = None,
|
||||||
gzip=None,
|
gzip: Optional[str] = None,
|
||||||
collapses=[],
|
collapses: List[str] = [],
|
||||||
limit=None,
|
limit: Optional[str] = None,
|
||||||
max_tries=3,
|
max_tries: int = 3,
|
||||||
):
|
) -> None:
|
||||||
self.url = str(url).strip().replace(" ", "%20")
|
self.url = str(url).strip().replace(" ", "%20")
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
self.start_timestamp = str(start_timestamp) if start_timestamp else None
|
self.start_timestamp = (
|
||||||
self.end_timestamp = str(end_timestamp) if end_timestamp else None
|
str(start_timestamp) if start_timestamp is not None else None
|
||||||
|
)
|
||||||
|
self.end_timestamp = str(end_timestamp) if end_timestamp is not None else None
|
||||||
self.filters = filters
|
self.filters = filters
|
||||||
check_filters(self.filters)
|
check_filters(self.filters)
|
||||||
self.match_type = str(match_type).strip() if match_type else None
|
self.match_type = str(match_type).strip() if match_type is not None else None
|
||||||
check_match_type(self.match_type, self.url)
|
check_match_type(self.match_type, self.url)
|
||||||
self.gzip = gzip if gzip else True
|
self.gzip = gzip
|
||||||
self.collapses = collapses
|
self.collapses = collapses
|
||||||
check_collapses(self.collapses)
|
check_collapses(self.collapses)
|
||||||
self.limit = limit if limit else 5000
|
self.limit = limit if limit is not None else 5000
|
||||||
self.max_tries = max_tries
|
self.max_tries = max_tries
|
||||||
self.last_api_request_url = None
|
self.last_api_request_url: Optional[str] = None
|
||||||
self.use_page = False
|
self.use_page = False
|
||||||
self.endpoint = "https://web.archive.org/cdx/search/cdx"
|
self.endpoint = "https://web.archive.org/cdx/search/cdx"
|
||||||
|
|
||||||
def cdx_api_manager(self, payload, headers, use_page=False):
|
def cdx_api_manager(
|
||||||
|
self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
|
||||||
|
) -> Generator[str, None, None]:
|
||||||
total_pages = get_total_pages(self.url, self.user_agent)
|
total_pages = get_total_pages(self.url, self.user_agent)
|
||||||
# If we only have two or less pages of archives then we care for more accuracy
|
# If we only have two or less pages of archives then we care for more accuracy
|
||||||
# pagination API is lagged sometimes
|
# pagination API is lagged sometimes
|
||||||
@ -58,6 +65,8 @@ class WaybackMachineCDXServerAPI:
|
|||||||
|
|
||||||
url = full_url(self.endpoint, params=payload)
|
url = full_url(self.endpoint, params=payload)
|
||||||
res = get_response(url, headers=headers)
|
res = get_response(url, headers=headers)
|
||||||
|
if isinstance(res, Exception):
|
||||||
|
raise res
|
||||||
|
|
||||||
self.last_api_request_url = url
|
self.last_api_request_url = url
|
||||||
text = res.text
|
text = res.text
|
||||||
@ -69,19 +78,18 @@ class WaybackMachineCDXServerAPI:
|
|||||||
|
|
||||||
yield text
|
yield text
|
||||||
else:
|
else:
|
||||||
|
|
||||||
payload["showResumeKey"] = "true"
|
payload["showResumeKey"] = "true"
|
||||||
payload["limit"] = str(self.limit)
|
payload["limit"] = str(self.limit)
|
||||||
resumeKey = None
|
resumeKey = None
|
||||||
|
|
||||||
more = True
|
more = True
|
||||||
while more:
|
while more:
|
||||||
|
|
||||||
if resumeKey:
|
if resumeKey:
|
||||||
payload["resumeKey"] = resumeKey
|
payload["resumeKey"] = resumeKey
|
||||||
|
|
||||||
url = full_url(self.endpoint, params=payload)
|
url = full_url(self.endpoint, params=payload)
|
||||||
res = get_response(url, headers=headers)
|
res = get_response(url, headers=headers)
|
||||||
|
if isinstance(res, Exception):
|
||||||
|
raise res
|
||||||
|
|
||||||
self.last_api_request_url = url
|
self.last_api_request_url = url
|
||||||
|
|
||||||
@ -102,14 +110,14 @@ class WaybackMachineCDXServerAPI:
|
|||||||
|
|
||||||
yield text
|
yield text
|
||||||
|
|
||||||
def add_payload(self, payload):
|
def add_payload(self, payload: Dict[str, str]) -> None:
|
||||||
if self.start_timestamp:
|
if self.start_timestamp:
|
||||||
payload["from"] = self.start_timestamp
|
payload["from"] = self.start_timestamp
|
||||||
|
|
||||||
if self.end_timestamp:
|
if self.end_timestamp:
|
||||||
payload["to"] = self.end_timestamp
|
payload["to"] = self.end_timestamp
|
||||||
|
|
||||||
if self.gzip is not True:
|
if self.gzip is None:
|
||||||
payload["gzip"] = "false"
|
payload["gzip"] = "false"
|
||||||
|
|
||||||
if self.match_type:
|
if self.match_type:
|
||||||
@ -126,8 +134,8 @@ class WaybackMachineCDXServerAPI:
|
|||||||
# Don't need to return anything as it's dictionary.
|
# Don't need to return anything as it's dictionary.
|
||||||
payload["url"] = self.url
|
payload["url"] = self.url
|
||||||
|
|
||||||
def snapshots(self):
|
def snapshots(self) -> Generator[CDXSnapshot, None, None]:
|
||||||
payload = {}
|
payload: Dict[str, str] = {}
|
||||||
headers = {"User-Agent": self.user_agent}
|
headers = {"User-Agent": self.user_agent}
|
||||||
|
|
||||||
self.add_payload(payload)
|
self.add_payload(payload)
|
||||||
@ -152,7 +160,7 @@ class WaybackMachineCDXServerAPI:
|
|||||||
if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
|
if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
properties = {
|
properties: Dict[str, Optional[str]] = {
|
||||||
"urlkey": None,
|
"urlkey": None,
|
||||||
"timestamp": None,
|
"timestamp": None,
|
||||||
"original": None,
|
"original": None,
|
||||||
@ -169,15 +177,9 @@ class WaybackMachineCDXServerAPI:
|
|||||||
|
|
||||||
if prop_values_len != properties_len:
|
if prop_values_len != properties_len:
|
||||||
raise WaybackError(
|
raise WaybackError(
|
||||||
"Snapshot returned by Cdx API has {prop_values_len} properties".format(
|
f"Snapshot returned by Cdx API has {prop_values_len} "
|
||||||
prop_values_len=prop_values_len
|
f"properties instead of expected {properties_len} properties.\n"
|
||||||
)
|
f"Problematic Snapshot: {snapshot}"
|
||||||
+ " instead of expected {properties_len} ".format(
|
|
||||||
properties_len=properties_len
|
|
||||||
)
|
|
||||||
+ "properties.\nProblematic Snapshot : {snapshot}".format(
|
|
||||||
snapshot=snapshot
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
(
|
(
|
||||||
@ -190,4 +192,4 @@ class WaybackMachineCDXServerAPI:
|
|||||||
properties["length"],
|
properties["length"],
|
||||||
) = prop_values
|
) = prop_values
|
||||||
|
|
||||||
yield CDXSnapshot(properties)
|
yield CDXSnapshot(cast(Dict[str, str], properties))
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
|
||||||
class CDXSnapshot:
|
class CDXSnapshot(object):
|
||||||
"""
|
"""
|
||||||
Class for the CDX snapshot lines returned by the CDX API,
|
Class for the CDX snapshot lines returned by the CDX API,
|
||||||
Each valid line of the CDX API is cast to a CDXSnapshot object
|
Each valid line of the CDX API is cast to a CDXSnapshot object
|
||||||
@ -10,7 +11,7 @@ class CDXSnapshot:
|
|||||||
of the CDXSnapshot.
|
of the CDXSnapshot.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, properties):
|
def __init__(self, properties: Dict[str, str]) -> None:
|
||||||
self.urlkey = properties["urlkey"]
|
self.urlkey = properties["urlkey"]
|
||||||
self.timestamp = properties["timestamp"]
|
self.timestamp = properties["timestamp"]
|
||||||
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
|
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
|
||||||
@ -20,16 +21,11 @@ class CDXSnapshot:
|
|||||||
self.digest = properties["digest"]
|
self.digest = properties["digest"]
|
||||||
self.length = properties["length"]
|
self.length = properties["length"]
|
||||||
self.archive_url = (
|
self.archive_url = (
|
||||||
"https://web.archive.org/web/" + self.timestamp + "/" + self.original
|
f"https://web.archive.org/web/{self.timestamp}/{self.original}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self) -> str:
|
||||||
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
|
return (
|
||||||
urlkey=self.urlkey,
|
f"{self.urlkey} {self.timestamp} {self.original} "
|
||||||
timestamp=self.timestamp,
|
f"{self.mimetype} {self.statuscode} {self.digest} {self.length}"
|
||||||
original=self.original,
|
|
||||||
mimetype=self.mimetype,
|
|
||||||
statuscode=self.statuscode,
|
|
||||||
digest=self.digest,
|
|
||||||
length=self.length,
|
|
||||||
)
|
)
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
@ -8,16 +10,19 @@ from .exceptions import WaybackError
|
|||||||
from .utils import DEFAULT_USER_AGENT
|
from .utils import DEFAULT_USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
def get_total_pages(url, user_agent=DEFAULT_USER_AGENT):
|
def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
|
||||||
endpoint = "https://web.archive.org/cdx/search/cdx?"
|
endpoint = "https://web.archive.org/cdx/search/cdx?"
|
||||||
payload = {"showNumPages": "true", "url": str(url)}
|
payload = {"showNumPages": "true", "url": str(url)}
|
||||||
headers = {"User-Agent": user_agent}
|
headers = {"User-Agent": user_agent}
|
||||||
request_url = full_url(endpoint, params=payload)
|
request_url = full_url(endpoint, params=payload)
|
||||||
response = get_response(request_url, headers=headers)
|
response = get_response(request_url, headers=headers)
|
||||||
return int(response.text.strip())
|
if isinstance(response, requests.Response):
|
||||||
|
return int(response.text.strip())
|
||||||
|
else:
|
||||||
|
raise response
|
||||||
|
|
||||||
|
|
||||||
def full_url(endpoint, params):
|
def full_url(endpoint: str, params: Dict[str, Any]) -> str:
|
||||||
if not params:
|
if not params:
|
||||||
return endpoint
|
return endpoint
|
||||||
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
|
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
|
||||||
@ -25,28 +30,25 @@ def full_url(endpoint, params):
|
|||||||
key = "filter" if key.startswith("filter") else key
|
key = "filter" if key.startswith("filter") else key
|
||||||
key = "collapse" if key.startswith("collapse") else key
|
key = "collapse" if key.startswith("collapse") else key
|
||||||
amp = "" if full_url.endswith("?") else "&"
|
amp = "" if full_url.endswith("?") else "&"
|
||||||
full_url = (
|
val = quote(str(val), safe="")
|
||||||
full_url
|
full_url += f"{amp}{key}={val}"
|
||||||
+ amp
|
|
||||||
+ "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
|
|
||||||
)
|
|
||||||
return full_url
|
return full_url
|
||||||
|
|
||||||
|
|
||||||
def get_response(
|
def get_response(
|
||||||
url,
|
url: str,
|
||||||
headers=None,
|
headers: Optional[Dict[str, str]] = None,
|
||||||
retries=5,
|
retries: int = 5,
|
||||||
backoff_factor=0.5,
|
backoff_factor: float = 0.5,
|
||||||
no_raise_on_redirects=False,
|
# no_raise_on_redirects=False,
|
||||||
):
|
) -> Union[requests.Response, Exception]:
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
retries = Retry(
|
retries_ = Retry(
|
||||||
total=retries,
|
total=retries,
|
||||||
backoff_factor=backoff_factor,
|
backoff_factor=backoff_factor,
|
||||||
status_forcelist=[500, 502, 503, 504],
|
status_forcelist=[500, 502, 503, 504],
|
||||||
)
|
)
|
||||||
session.mount("https://", HTTPAdapter(max_retries=retries))
|
session.mount("https://", HTTPAdapter(max_retries=retries_))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = session.get(url, headers=headers)
|
response = session.get(url, headers=headers)
|
||||||
@ -54,77 +56,65 @@ def get_response(
|
|||||||
return response
|
return response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
reason = str(e)
|
reason = str(e)
|
||||||
exc_message = "Error while retrieving {url}.\n{reason}".format(
|
exc_message = f"Error while retrieving {url}.\n{reason}"
|
||||||
url=url, reason=reason
|
|
||||||
)
|
|
||||||
exc = WaybackError(exc_message)
|
exc = WaybackError(exc_message)
|
||||||
exc.__cause__ = e
|
exc.__cause__ = e
|
||||||
raise exc
|
raise exc
|
||||||
|
|
||||||
|
|
||||||
def check_filters(filters):
|
def check_filters(filters: List[str]) -> None:
|
||||||
if not isinstance(filters, list):
|
if not isinstance(filters, list):
|
||||||
raise WaybackError("filters must be a list.")
|
raise WaybackError("filters must be a list.")
|
||||||
|
|
||||||
# [!]field:regex
|
# [!]field:regex
|
||||||
for _filter in filters:
|
for _filter in filters:
|
||||||
try:
|
match = re.search(
|
||||||
|
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):"
|
||||||
|
r"(.*)",
|
||||||
|
_filter,
|
||||||
|
)
|
||||||
|
|
||||||
match = re.search(
|
if match is None or len(match.groups()) != 2:
|
||||||
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
|
|
||||||
_filter,
|
|
||||||
)
|
|
||||||
|
|
||||||
match.group(1)
|
exc_message = f"Filter '{_filter}' is not following the cdx filter syntax."
|
||||||
match.group(2)
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
|
|
||||||
exc_message = (
|
|
||||||
"Filter '{_filter}' is not following the cdx filter syntax.".format(
|
|
||||||
_filter=_filter
|
|
||||||
)
|
|
||||||
)
|
|
||||||
raise WaybackError(exc_message)
|
raise WaybackError(exc_message)
|
||||||
|
|
||||||
|
|
||||||
def check_collapses(collapses):
|
def check_collapses(collapses: List[str]) -> bool:
|
||||||
|
|
||||||
if not isinstance(collapses, list):
|
if not isinstance(collapses, list):
|
||||||
raise WaybackError("collapses must be a list.")
|
raise WaybackError("collapses must be a list.")
|
||||||
|
elif len(collapses) == 0:
|
||||||
if len(collapses) == 0:
|
return True
|
||||||
return
|
|
||||||
|
|
||||||
for collapse in collapses:
|
for collapse in collapses:
|
||||||
try:
|
match = re.search(
|
||||||
match = re.search(
|
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)"
|
||||||
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
|
r"(:?[0-9]{1,99})?",
|
||||||
collapse,
|
collapse,
|
||||||
)
|
)
|
||||||
match.group(1)
|
if match is None or len(match.groups()) != 2:
|
||||||
if 2 == len(match.groups()):
|
exc_message = (
|
||||||
match.group(2)
|
f"collapse argument '{collapse}' "
|
||||||
except Exception:
|
"is not following the cdx collapse syntax."
|
||||||
exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
|
|
||||||
collapse=collapse
|
|
||||||
)
|
)
|
||||||
raise WaybackError(exc_message)
|
raise WaybackError(exc_message)
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
def check_match_type(match_type, url):
|
|
||||||
|
def check_match_type(match_type: Optional[str], url: str) -> bool:
|
||||||
|
legal_match_type = ["exact", "prefix", "host", "domain"]
|
||||||
if not match_type:
|
if not match_type:
|
||||||
return
|
return True
|
||||||
|
elif "*" in url:
|
||||||
if "*" in url:
|
|
||||||
raise WaybackError(
|
raise WaybackError(
|
||||||
"Can not use wildcard in the URL along with the match_type arguments."
|
"Can not use wildcard in the URL along with the match_type arguments."
|
||||||
)
|
)
|
||||||
|
elif match_type not in legal_match_type:
|
||||||
legal_match_type = ["exact", "prefix", "host", "domain"]
|
exc_message = (
|
||||||
|
f"{match_type} is not an allowed match type.\n"
|
||||||
if match_type not in legal_match_type:
|
"Use one from 'exact', 'prefix', 'host' or 'domain'"
|
||||||
exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
|
|
||||||
match_type=match_type
|
|
||||||
)
|
)
|
||||||
raise WaybackError(exc_message)
|
raise WaybackError(exc_message)
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
@ -3,6 +3,7 @@ import os
|
|||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
import string
|
import string
|
||||||
|
from typing import Generator, List, Optional
|
||||||
|
|
||||||
import click
|
import click
|
||||||
import requests
|
import requests
|
||||||
@ -24,7 +25,7 @@ from .wrapper import Url
|
|||||||
"--user-agent",
|
"--user-agent",
|
||||||
"--user_agent",
|
"--user_agent",
|
||||||
default=DEFAULT_USER_AGENT,
|
default=DEFAULT_USER_AGENT,
|
||||||
help="User agent, default value is '%s'." % DEFAULT_USER_AGENT,
|
help=f"User agent, default value is '{DEFAULT_USER_AGENT}'.",
|
||||||
)
|
)
|
||||||
@click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.")
|
@click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.")
|
||||||
@click.option(
|
@click.option(
|
||||||
@ -163,34 +164,34 @@ from .wrapper import Url
|
|||||||
+ "will be printed.",
|
+ "will be printed.",
|
||||||
)
|
)
|
||||||
def main(
|
def main(
|
||||||
url,
|
url: Optional[str],
|
||||||
user_agent,
|
user_agent: str,
|
||||||
version,
|
version: bool,
|
||||||
license,
|
license: bool,
|
||||||
newest,
|
newest: bool,
|
||||||
oldest,
|
oldest: bool,
|
||||||
json,
|
json: bool,
|
||||||
near,
|
near: bool,
|
||||||
year,
|
year: Optional[int],
|
||||||
month,
|
month: Optional[int],
|
||||||
day,
|
day: Optional[int],
|
||||||
hour,
|
hour: Optional[int],
|
||||||
minute,
|
minute: Optional[int],
|
||||||
save,
|
save: bool,
|
||||||
headers,
|
headers: bool,
|
||||||
known_urls,
|
known_urls: bool,
|
||||||
subdomain,
|
subdomain: bool,
|
||||||
file,
|
file: bool,
|
||||||
cdx,
|
cdx: bool,
|
||||||
start_timestamp,
|
start_timestamp: Optional[str],
|
||||||
end_timestamp,
|
end_timestamp: Optional[str],
|
||||||
filter,
|
filter: List[str],
|
||||||
match_type,
|
match_type: Optional[str],
|
||||||
gzip,
|
gzip: Optional[str],
|
||||||
collapse,
|
collapse: List[str],
|
||||||
limit,
|
limit: Optional[str],
|
||||||
cdx_print,
|
cdx_print: List[str],
|
||||||
):
|
) -> None:
|
||||||
"""\b
|
"""\b
|
||||||
_ _
|
_ _
|
||||||
| | | |
|
| | | |
|
||||||
@ -214,7 +215,7 @@ def main(
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
if version:
|
if version:
|
||||||
click.echo("waybackpy version %s" % __version__)
|
click.echo(f"waybackpy version {__version__}")
|
||||||
return
|
return
|
||||||
|
|
||||||
if license:
|
if license:
|
||||||
@ -240,11 +241,14 @@ def main(
|
|||||||
and not cdx
|
and not cdx
|
||||||
):
|
):
|
||||||
click.echo(
|
click.echo(
|
||||||
"Only URL passed, but did not specify what to do with the URL. Use --help flag for help using waybackpy."
|
"Only URL passed, but did not specify what to do with the URL. "
|
||||||
|
"Use --help flag for help using waybackpy."
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def echo_availability_api(availability_api_instance):
|
def echo_availability_api(
|
||||||
|
availability_api_instance: WaybackMachineAvailabilityAPI,
|
||||||
|
) -> None:
|
||||||
click.echo("Archive URL:")
|
click.echo("Archive URL:")
|
||||||
if not availability_api_instance.archive_url:
|
if not availability_api_instance.archive_url:
|
||||||
archive_url = (
|
archive_url = (
|
||||||
@ -295,13 +299,14 @@ def main(
|
|||||||
click.echo(save_api.headers)
|
click.echo(save_api.headers)
|
||||||
return
|
return
|
||||||
|
|
||||||
def save_urls_on_file(url_gen):
|
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
|
||||||
domain = None
|
domain = None
|
||||||
sys_random = random.SystemRandom()
|
sys_random = random.SystemRandom()
|
||||||
uid = "".join(
|
uid = "".join(
|
||||||
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
||||||
)
|
)
|
||||||
url_count = 0
|
url_count = 0
|
||||||
|
file_name = None
|
||||||
|
|
||||||
for url in url_gen:
|
for url in url_gen:
|
||||||
url_count += 1
|
url_count += 1
|
||||||
@ -310,25 +315,21 @@ def main(
|
|||||||
|
|
||||||
domain = "domain-unknown"
|
domain = "domain-unknown"
|
||||||
|
|
||||||
if match:
|
if match is not None:
|
||||||
domain = match.group(1)
|
domain = match.group(1)
|
||||||
|
|
||||||
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
|
file_name = f"{domain}-urls-{uid}.txt"
|
||||||
file_path = os.path.join(os.getcwd(), file_name)
|
file_path = os.path.join(os.getcwd(), file_name)
|
||||||
if not os.path.isfile(file_path):
|
if not os.path.isfile(file_path):
|
||||||
open(file_path, "w+").close()
|
open(file_path, "w+").close()
|
||||||
|
|
||||||
with open(file_path, "a") as f:
|
with open(file_path, "a") as f:
|
||||||
f.write("{url}\n".format(url=url))
|
f.write(f"{url}\n")
|
||||||
|
|
||||||
click.echo(url)
|
click.echo(url)
|
||||||
|
|
||||||
if url_count > 0:
|
if url_count > 0 or file_name is not None:
|
||||||
click.echo(
|
click.echo(f"\n\n'{file_name}' saved in current working directory")
|
||||||
"\n\n'{file_name}' saved in current working directory".format(
|
|
||||||
file_name=file_name
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
click.echo("No known URLs found. Please try a diffrent input!")
|
click.echo("No known URLs found. Please try a diffrent input!")
|
||||||
|
|
||||||
|
@ -14,6 +14,8 @@ class WaybackError(Exception):
|
|||||||
All other exceptions are inherited from this class.
|
All other exceptions are inherited from this class.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class RedirectSaveError(WaybackError):
|
class RedirectSaveError(WaybackError):
|
||||||
"""
|
"""
|
||||||
@ -21,32 +23,44 @@ class RedirectSaveError(WaybackError):
|
|||||||
redirect URL is archived but not the original URL.
|
redirect URL is archived but not the original URL.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class URLError(Exception):
|
class URLError(Exception):
|
||||||
"""
|
"""
|
||||||
Raised when malformed URLs are passed as arguments.
|
Raised when malformed URLs are passed as arguments.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class MaximumRetriesExceeded(WaybackError):
|
class MaximumRetriesExceeded(WaybackError):
|
||||||
"""
|
"""
|
||||||
MaximumRetriesExceeded
|
MaximumRetriesExceeded
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
|
class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
|
||||||
"""
|
"""
|
||||||
MaximumSaveRetriesExceeded
|
MaximumSaveRetriesExceeded
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ArchiveNotInAvailabilityAPIResponse(WaybackError):
|
class ArchiveNotInAvailabilityAPIResponse(WaybackError):
|
||||||
"""
|
"""
|
||||||
Could not parse the archive in the JSON response of the availability API.
|
Could not parse the archive in the JSON response of the availability API.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class InvalidJSONInAvailabilityAPIResponse(WaybackError):
|
class InvalidJSONInAvailabilityAPIResponse(WaybackError):
|
||||||
"""
|
"""
|
||||||
availability api returned invalid JSON
|
availability api returned invalid JSON
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
pass
|
||||||
|
@ -1,38 +1,41 @@
|
|||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
|
from requests.structures import CaseInsensitiveDict
|
||||||
from urllib3.util.retry import Retry
|
from urllib3.util.retry import Retry
|
||||||
|
|
||||||
from .exceptions import MaximumSaveRetriesExceeded
|
from .exceptions import MaximumSaveRetriesExceeded
|
||||||
from .utils import DEFAULT_USER_AGENT
|
from .utils import DEFAULT_USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
class WaybackMachineSaveAPI:
|
class WaybackMachineSaveAPI(object):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
WaybackMachineSaveAPI class provides an interface for saving URLs on the
|
WaybackMachineSaveAPI class provides an interface for saving URLs on the
|
||||||
Wayback Machine.
|
Wayback Machine.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
|
def __init__(
|
||||||
|
self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 8
|
||||||
|
) -> None:
|
||||||
self.url = str(url).strip().replace(" ", "%20")
|
self.url = str(url).strip().replace(" ", "%20")
|
||||||
self.request_url = "https://web.archive.org/save/" + self.url
|
self.request_url = "https://web.archive.org/save/" + self.url
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
self.request_headers = {"User-Agent": self.user_agent}
|
self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
|
||||||
if max_tries < 1:
|
if max_tries < 1:
|
||||||
raise ValueError("max_tries should be positive")
|
raise ValueError("max_tries should be positive")
|
||||||
self.max_tries = max_tries
|
self.max_tries = max_tries
|
||||||
self.total_save_retries = 5
|
self.total_save_retries = 5
|
||||||
self.backoff_factor = 0.5
|
self.backoff_factor = 0.5
|
||||||
self.status_forcelist = [500, 502, 503, 504]
|
self.status_forcelist = [500, 502, 503, 504]
|
||||||
self._archive_url = None
|
self._archive_url: Optional[str] = None
|
||||||
self.instance_birth_time = datetime.utcnow()
|
self.instance_birth_time = datetime.utcnow()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def archive_url(self):
|
def archive_url(self) -> str:
|
||||||
"""
|
"""
|
||||||
Returns the archive URL is already cached by _archive_url
|
Returns the archive URL is already cached by _archive_url
|
||||||
else invoke the save method to save the archive which returns the
|
else invoke the save method to save the archive which returns the
|
||||||
@ -44,7 +47,7 @@ class WaybackMachineSaveAPI:
|
|||||||
else:
|
else:
|
||||||
return self.save()
|
return self.save()
|
||||||
|
|
||||||
def get_save_request_headers(self):
|
def get_save_request_headers(self) -> None:
|
||||||
"""
|
"""
|
||||||
Creates a session and tries 'retries' number of times to
|
Creates a session and tries 'retries' number of times to
|
||||||
retrieve the archive.
|
retrieve the archive.
|
||||||
@ -68,14 +71,13 @@ class WaybackMachineSaveAPI:
|
|||||||
)
|
)
|
||||||
session.mount("https://", HTTPAdapter(max_retries=retries))
|
session.mount("https://", HTTPAdapter(max_retries=retries))
|
||||||
self.response = session.get(self.request_url, headers=self.request_headers)
|
self.response = session.get(self.request_url, headers=self.request_headers)
|
||||||
self.headers = (
|
# requests.response.headers is requests.structures.CaseInsensitiveDict
|
||||||
self.response.headers
|
self.headers: CaseInsensitiveDict[str] = self.response.headers
|
||||||
) # <class 'requests.structures.CaseInsensitiveDict'>
|
|
||||||
self.status_code = self.response.status_code
|
self.status_code = self.response.status_code
|
||||||
self.response_url = self.response.url
|
self.response_url = self.response.url
|
||||||
session.close()
|
session.close()
|
||||||
|
|
||||||
def archive_url_parser(self):
|
def archive_url_parser(self) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Three regexen (like oxen?) are used to search for the
|
Three regexen (like oxen?) are used to search for the
|
||||||
archive URL in the headers and finally look in the response URL
|
archive URL in the headers and finally look in the response URL
|
||||||
@ -89,12 +91,12 @@ class WaybackMachineSaveAPI:
|
|||||||
|
|
||||||
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
|
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
|
||||||
match = re.search(regex2, str(self.headers))
|
match = re.search(regex2, str(self.headers))
|
||||||
if match:
|
if match is not None and len(match.groups()) == 1:
|
||||||
return "https://" + match.group(1)
|
return "https://" + match.group(1)
|
||||||
|
|
||||||
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
|
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
|
||||||
match = re.search(regex3, str(self.headers))
|
match = re.search(regex3, str(self.headers))
|
||||||
if match:
|
if match is not None and len(match.groups()) == 1:
|
||||||
return "https" + match.group(1)
|
return "https" + match.group(1)
|
||||||
|
|
||||||
if self.response_url:
|
if self.response_url:
|
||||||
@ -105,7 +107,10 @@ class WaybackMachineSaveAPI:
|
|||||||
if match:
|
if match:
|
||||||
return "https://" + match.group(0)
|
return "https://" + match.group(0)
|
||||||
|
|
||||||
def sleep(self, tries):
|
return None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def sleep(tries: int) -> None:
|
||||||
"""
|
"""
|
||||||
Ensure that the we wait some time before succesive retries so that we
|
Ensure that the we wait some time before succesive retries so that we
|
||||||
don't waste the retries before the page is even captured by the Wayback
|
don't waste the retries before the page is even captured by the Wayback
|
||||||
@ -120,7 +125,7 @@ class WaybackMachineSaveAPI:
|
|||||||
sleep_seconds = 10
|
sleep_seconds = 10
|
||||||
time.sleep(sleep_seconds)
|
time.sleep(sleep_seconds)
|
||||||
|
|
||||||
def timestamp(self):
|
def timestamp(self) -> datetime:
|
||||||
"""
|
"""
|
||||||
Read the timestamp off the archive URL and convert the Wayback Machine
|
Read the timestamp off the archive URL and convert the Wayback Machine
|
||||||
timestamp to datetime object.
|
timestamp to datetime object.
|
||||||
@ -128,14 +133,16 @@ class WaybackMachineSaveAPI:
|
|||||||
Also check if the time on archive is URL and compare it to instance birth
|
Also check if the time on archive is URL and compare it to instance birth
|
||||||
time.
|
time.
|
||||||
|
|
||||||
If time on the archive is older than the instance creation time set the cached_save
|
If time on the archive is older than the instance creation time set the
|
||||||
to True else set it to False. The flag can be used to check if the Wayback Machine
|
cached_save to True else set it to False. The flag can be used to check
|
||||||
didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
|
if the Wayback Machine didn't serve a Cached URL. It is quite common for
|
||||||
cached archive if last archive was captured before last 45 minutes.
|
the Wayback Machine to serve cached archive if last archive was captured
|
||||||
|
before last 45 minutes.
|
||||||
"""
|
"""
|
||||||
m = re.search(
|
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
|
||||||
r"https?://web\.archive.org/web/([0-9]{14})/http", self._archive_url
|
m = re.search(regex, str(self._archive_url))
|
||||||
)
|
if m is None or len(m.groups()) != 1:
|
||||||
|
raise ValueError("Could not get timestamp")
|
||||||
string_timestamp = m.group(1)
|
string_timestamp = m.group(1)
|
||||||
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
|
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
|
||||||
|
|
||||||
@ -149,7 +156,7 @@ class WaybackMachineSaveAPI:
|
|||||||
|
|
||||||
return timestamp
|
return timestamp
|
||||||
|
|
||||||
def save(self):
|
def save(self) -> str:
|
||||||
"""
|
"""
|
||||||
Calls the SavePageNow API of the Wayback Machine with required parameters
|
Calls the SavePageNow API of the Wayback Machine with required parameters
|
||||||
and headers to save the URL.
|
and headers to save the URL.
|
||||||
@ -162,14 +169,14 @@ class WaybackMachineSaveAPI:
|
|||||||
tries = 0
|
tries = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
if not self.saved_archive:
|
if self.saved_archive is None:
|
||||||
if tries >= 1:
|
if tries >= 1:
|
||||||
self.sleep(tries)
|
self.sleep(tries)
|
||||||
|
|
||||||
self.get_save_request_headers()
|
self.get_save_request_headers()
|
||||||
self.saved_archive = self.archive_url_parser()
|
self.saved_archive = self.archive_url_parser()
|
||||||
|
|
||||||
if self.saved_archive is not None:
|
if isinstance(self.saved_archive, str):
|
||||||
self._archive_url = self.saved_archive
|
self._archive_url = self.saved_archive
|
||||||
self.timestamp()
|
self.timestamp()
|
||||||
return self.saved_archive
|
return self.saved_archive
|
||||||
@ -177,7 +184,8 @@ class WaybackMachineSaveAPI:
|
|||||||
tries += 1
|
tries += 1
|
||||||
if tries >= self.max_tries:
|
if tries >= self.max_tries:
|
||||||
raise MaximumSaveRetriesExceeded(
|
raise MaximumSaveRetriesExceeded(
|
||||||
"Tried %s times but failed to save and retrieve the" % str(tries)
|
f"Tried {tries} times but failed to save "
|
||||||
+ " archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
|
f"and retrieve the archive for {self.url}.\n"
|
||||||
% (self.url, self.response_url, str(self.headers)),
|
f"Response URL:\n{self.response_url}\n"
|
||||||
|
f"Response Header:\n{self.headers}"
|
||||||
)
|
)
|
||||||
|
@ -2,22 +2,43 @@ import requests
|
|||||||
|
|
||||||
from . import __version__
|
from . import __version__
|
||||||
|
|
||||||
DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
|
DEFAULT_USER_AGENT: str = (
|
||||||
|
f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def latest_version_pypi(package_name, user_agent=DEFAULT_USER_AGENT):
|
def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
|
||||||
request_url = "https://pypi.org/pypi/" + package_name + "/json"
|
request_url = "https://pypi.org/pypi/" + package_name + "/json"
|
||||||
headers = {"User-Agent": user_agent}
|
headers = {"User-Agent": user_agent}
|
||||||
response = requests.get(request_url, headers=headers)
|
response = requests.get(request_url, headers=headers)
|
||||||
data = response.json()
|
data = response.json()
|
||||||
return data["info"]["version"]
|
if (
|
||||||
|
data is not None
|
||||||
|
and "info" in data
|
||||||
|
and data["info"] is not None
|
||||||
|
and "version" in data["info"]
|
||||||
|
and data["info"]["version"] is not None
|
||||||
|
):
|
||||||
|
return str(data["info"]["version"])
|
||||||
|
else:
|
||||||
|
raise ValueError("Could not get latest pypi version")
|
||||||
|
|
||||||
|
|
||||||
def latest_version_github(package_name, user_agent=DEFAULT_USER_AGENT):
|
def latest_version_github(
|
||||||
|
package_name: str, user_agent: str = DEFAULT_USER_AGENT
|
||||||
|
) -> str:
|
||||||
request_url = (
|
request_url = (
|
||||||
"https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
|
"https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
|
||||||
)
|
)
|
||||||
headers = {"User-Agent": user_agent}
|
headers = {"User-Agent": user_agent}
|
||||||
response = requests.get(request_url, headers=headers)
|
response = requests.get(request_url, headers=headers)
|
||||||
data = response.json()
|
data = response.json()
|
||||||
return data[0]["tag_name"]
|
if (
|
||||||
|
data is not None
|
||||||
|
and len(data) > 0
|
||||||
|
and data[0] is not None
|
||||||
|
and "tag_name" in data[0]
|
||||||
|
):
|
||||||
|
return str(data[0]["tag_name"])
|
||||||
|
else:
|
||||||
|
raise ValueError("Could not get latest github version")
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Generator, Optional
|
||||||
|
|
||||||
from .availability_api import WaybackMachineAvailabilityAPI
|
from .availability_api import WaybackMachineAvailabilityAPI
|
||||||
from .cdx_api import WaybackMachineCDXServerAPI
|
from .cdx_api import WaybackMachineCDXServerAPI
|
||||||
@ -14,40 +15,42 @@ The reason it is still in the code is backwards compatibility with 2.x.x version
|
|||||||
If were are using the Url before the update to version 3.x.x, your code should still be
|
If were are using the Url before the update to version 3.x.x, your code should still be
|
||||||
working fine and there is no hurry to update the interface but is recommended that you
|
working fine and there is no hurry to update the interface but is recommended that you
|
||||||
do not use the Url class for new code as it would be removed after 2025 also the first
|
do not use the Url class for new code as it would be removed after 2025 also the first
|
||||||
3.x.x versions was released in January 2022 and three years are more than enough to update
|
3.x.x versions was released in January 2022 and three years are more than enough to
|
||||||
the older interface code.
|
update the older interface code.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class Url:
|
class Url(object):
|
||||||
def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
|
def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
|
||||||
self.url = url
|
self.url = url
|
||||||
self.user_agent = str(user_agent)
|
self.user_agent = str(user_agent)
|
||||||
self.archive_url = None
|
self.archive_url: Optional[str] = None
|
||||||
self.timestamp = None
|
self.timestamp: Optional[datetime] = None
|
||||||
self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
|
self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
|
||||||
self.url, user_agent=self.user_agent
|
self.url, user_agent=self.user_agent
|
||||||
)
|
)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self) -> str:
|
||||||
if not self.archive_url:
|
if not self.archive_url:
|
||||||
self.newest()
|
self.newest()
|
||||||
return self.archive_url
|
return str(self.archive_url)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self) -> int:
|
||||||
td_max = timedelta(
|
td_max = timedelta(
|
||||||
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
|
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
|
||||||
)
|
)
|
||||||
|
|
||||||
if not self.timestamp:
|
if not isinstance(self.timestamp, datetime):
|
||||||
self.oldest()
|
self.oldest()
|
||||||
|
|
||||||
if self.timestamp == datetime.max:
|
if not isinstance(self.timestamp, datetime):
|
||||||
|
raise TypeError("timestamp must be a datetime")
|
||||||
|
elif self.timestamp == datetime.max:
|
||||||
return td_max.days
|
return td_max.days
|
||||||
|
else:
|
||||||
|
return (datetime.utcnow() - self.timestamp).days
|
||||||
|
|
||||||
return (datetime.utcnow() - self.timestamp).days
|
def save(self) -> "Url":
|
||||||
|
|
||||||
def save(self):
|
|
||||||
self.wayback_machine_save_api = WaybackMachineSaveAPI(
|
self.wayback_machine_save_api = WaybackMachineSaveAPI(
|
||||||
self.url, user_agent=self.user_agent
|
self.url, user_agent=self.user_agent
|
||||||
)
|
)
|
||||||
@ -58,13 +61,13 @@ class Url:
|
|||||||
|
|
||||||
def near(
|
def near(
|
||||||
self,
|
self,
|
||||||
year=None,
|
year: Optional[int] = None,
|
||||||
month=None,
|
month: Optional[int] = None,
|
||||||
day=None,
|
day: Optional[int] = None,
|
||||||
hour=None,
|
hour: Optional[int] = None,
|
||||||
minute=None,
|
minute: Optional[int] = None,
|
||||||
unix_timestamp=None,
|
unix_timestamp: Optional[int] = None,
|
||||||
):
|
) -> "Url":
|
||||||
|
|
||||||
self.wayback_machine_availability_api.near(
|
self.wayback_machine_availability_api.near(
|
||||||
year=year,
|
year=year,
|
||||||
@ -77,22 +80,24 @@ class Url:
|
|||||||
self.set_availability_api_attrs()
|
self.set_availability_api_attrs()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def oldest(self):
|
def oldest(self) -> "Url":
|
||||||
self.wayback_machine_availability_api.oldest()
|
self.wayback_machine_availability_api.oldest()
|
||||||
self.set_availability_api_attrs()
|
self.set_availability_api_attrs()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def newest(self):
|
def newest(self) -> "Url":
|
||||||
self.wayback_machine_availability_api.newest()
|
self.wayback_machine_availability_api.newest()
|
||||||
self.set_availability_api_attrs()
|
self.set_availability_api_attrs()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def set_availability_api_attrs(self):
|
def set_availability_api_attrs(self) -> None:
|
||||||
self.archive_url = self.wayback_machine_availability_api.archive_url
|
self.archive_url = self.wayback_machine_availability_api.archive_url
|
||||||
self.JSON = self.wayback_machine_availability_api.JSON
|
self.JSON = self.wayback_machine_availability_api.JSON
|
||||||
self.timestamp = self.wayback_machine_availability_api.timestamp()
|
self.timestamp = self.wayback_machine_availability_api.timestamp()
|
||||||
|
|
||||||
def total_archives(self, start_timestamp=None, end_timestamp=None):
|
def total_archives(
|
||||||
|
self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
|
||||||
|
) -> int:
|
||||||
cdx = WaybackMachineCDXServerAPI(
|
cdx = WaybackMachineCDXServerAPI(
|
||||||
self.url,
|
self.url,
|
||||||
user_agent=self.user_agent,
|
user_agent=self.user_agent,
|
||||||
@ -107,12 +112,12 @@ class Url:
|
|||||||
|
|
||||||
def known_urls(
|
def known_urls(
|
||||||
self,
|
self,
|
||||||
subdomain=False,
|
subdomain: bool = False,
|
||||||
host=False,
|
host: bool = False,
|
||||||
start_timestamp=None,
|
start_timestamp: Optional[str] = None,
|
||||||
end_timestamp=None,
|
end_timestamp: Optional[str] = None,
|
||||||
match_type="prefix",
|
match_type: str = "prefix",
|
||||||
):
|
) -> Generator[str, None, None]:
|
||||||
if subdomain:
|
if subdomain:
|
||||||
match_type = "domain"
|
match_type = "domain"
|
||||||
if host:
|
if host:
|
||||||
|
Loading…
Reference in New Issue
Block a user