fix: shorten long lines
This commit is contained in:
@@ -64,8 +64,8 @@ profile = black
|
|||||||
|
|
||||||
[flake8]
|
[flake8]
|
||||||
indent-size = 4
|
indent-size = 4
|
||||||
# max-line-length = 88
|
max-line-length = 88
|
||||||
extend-ignore = E203,W503,E501,W605
|
extend-ignore = W605
|
||||||
|
|
||||||
[mypy]
|
[mypy]
|
||||||
python_version = 3.9
|
python_version = 3.9
|
||||||
|
@@ -12,7 +12,10 @@ from waybackpy.exceptions import (
|
|||||||
|
|
||||||
now = datetime.utcnow()
|
now = datetime.utcnow()
|
||||||
url = "https://example.com/"
|
url = "https://example.com/"
|
||||||
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def rndstr(n: int) -> str:
|
def rndstr(n: int) -> str:
|
||||||
@@ -26,7 +29,10 @@ def test_oldest() -> None:
|
|||||||
Test the oldest archive of Google.com and also checks the attributes.
|
Test the oldest archive of Google.com and also checks the attributes.
|
||||||
"""
|
"""
|
||||||
url = "https://example.com/"
|
url = "https://example.com/"
|
||||||
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
|
||||||
|
)
|
||||||
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
|
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
|
||||||
oldest = availability_api.oldest()
|
oldest = availability_api.oldest()
|
||||||
oldest_archive_url = oldest.archive_url
|
oldest_archive_url = oldest.archive_url
|
||||||
@@ -59,7 +65,8 @@ def test_newest() -> None:
|
|||||||
|
|
||||||
def test_invalid_json() -> None:
|
def test_invalid_json() -> None:
|
||||||
"""
|
"""
|
||||||
When the API is malfunctioning or we don't pass a URL it may return invalid JSON data.
|
When the API is malfunctioning or we don't pass a URL,
|
||||||
|
it may return invalid JSON data.
|
||||||
"""
|
"""
|
||||||
with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
|
with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
|
||||||
availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)
|
availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)
|
||||||
|
@@ -2,7 +2,10 @@ from waybackpy.cdx_api import WaybackMachineCDXServerAPI
|
|||||||
|
|
||||||
|
|
||||||
def test_a() -> None:
|
def test_a() -> None:
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
url = "https://twitter.com/jack"
|
url = "https://twitter.com/jack"
|
||||||
|
|
||||||
wayback = WaybackMachineCDXServerAPI(
|
wayback = WaybackMachineCDXServerAPI(
|
||||||
@@ -22,7 +25,10 @@ def test_a() -> None:
|
|||||||
|
|
||||||
|
|
||||||
def test_b() -> None:
|
def test_b() -> None:
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
|
||||||
|
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
url = "https://www.google.com"
|
url = "https://www.google.com"
|
||||||
|
|
||||||
wayback = WaybackMachineCDXServerAPI(
|
wayback = WaybackMachineCDXServerAPI(
|
||||||
|
@@ -4,7 +4,10 @@ from waybackpy.cdx_snapshot import CDXSnapshot
|
|||||||
|
|
||||||
|
|
||||||
def test_CDXSnapshot() -> None:
|
def test_CDXSnapshot() -> None:
|
||||||
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
|
sample_input = (
|
||||||
|
"org,archive)/ 20080126045828 http://github.com "
|
||||||
|
"text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
|
||||||
|
)
|
||||||
prop_values = sample_input.split(" ")
|
prop_values = sample_input.split(" ")
|
||||||
properties = {}
|
properties = {}
|
||||||
(
|
(
|
||||||
|
@@ -15,7 +15,10 @@ from waybackpy.exceptions import WaybackError
|
|||||||
|
|
||||||
def test_get_total_pages() -> None:
|
def test_get_total_pages() -> None:
|
||||||
url = "twitter.com"
|
url = "twitter.com"
|
||||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
|
||||||
|
)
|
||||||
assert get_total_pages(url=url, user_agent=user_agent) >= 56
|
assert get_total_pages(url=url, user_agent=user_agent) >= 56
|
||||||
|
|
||||||
|
|
||||||
|
@@ -17,7 +17,10 @@ def rndstr(n: int) -> str:
|
|||||||
|
|
||||||
def test_save() -> None:
|
def test_save() -> None:
|
||||||
url = "https://github.com/akamhy/waybackpy"
|
url = "https://github.com/akamhy/waybackpy"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
save_api.save()
|
save_api.save()
|
||||||
archive_url = save_api.archive_url
|
archive_url = save_api.archive_url
|
||||||
@@ -34,7 +37,10 @@ def test_save() -> None:
|
|||||||
def test_max_redirect_exceeded() -> None:
|
def test_max_redirect_exceeded() -> None:
|
||||||
with pytest.raises(MaximumSaveRetriesExceeded):
|
with pytest.raises(MaximumSaveRetriesExceeded):
|
||||||
url = f"https://{rndstr}.gov"
|
url = f"https://{rndstr}.gov"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3)
|
save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3)
|
||||||
save_api.save()
|
save_api.save()
|
||||||
|
|
||||||
@@ -47,7 +53,10 @@ def test_sleep() -> None:
|
|||||||
is as intended.
|
is as intended.
|
||||||
"""
|
"""
|
||||||
url = "https://example.com"
|
url = "https://example.com"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
s_time = int(time.time())
|
s_time = int(time.time())
|
||||||
save_api.sleep(6) # multiple of 3 sleep for 10 seconds
|
save_api.sleep(6) # multiple of 3 sleep for 10 seconds
|
||||||
@@ -62,14 +71,17 @@ def test_sleep() -> None:
|
|||||||
|
|
||||||
def test_timestamp() -> None:
|
def test_timestamp() -> None:
|
||||||
url = "https://example.com"
|
url = "https://example.com"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
|
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
|
||||||
save_api._archive_url = f"https://web.archive.org/web/{now}{url}/"
|
save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
|
||||||
save_api.timestamp()
|
save_api.timestamp()
|
||||||
assert save_api.cached_save is False
|
assert save_api.cached_save is False
|
||||||
now = "20100124063622"
|
now = "20100124063622"
|
||||||
save_api._archive_url = f"https://web.archive.org/web/{now}{url}/"
|
save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
|
||||||
save_api.timestamp()
|
save_api.timestamp()
|
||||||
assert save_api.cached_save is True
|
assert save_api.cached_save is True
|
||||||
|
|
||||||
@@ -79,7 +91,10 @@ def test_archive_url_parser() -> None:
|
|||||||
Testing three regex for matches and also tests the response URL.
|
Testing three regex for matches and also tests the response URL.
|
||||||
"""
|
"""
|
||||||
url = "https://example.com"
|
url = "https://example.com"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
|
|
||||||
save_api.headers_str = """
|
save_api.headers_str = """
|
||||||
@@ -88,39 +103,101 @@ def test_archive_url_parser() -> None:
|
|||||||
END
|
END
|
||||||
"""
|
"""
|
||||||
|
|
||||||
assert (
|
expected_url = (
|
||||||
save_api.archive_url_parser()
|
"https://web.archive.org/web/20201126185327/"
|
||||||
== "https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al"
|
"https://www.scribbr.com/citing-sources/et-al"
|
||||||
)
|
)
|
||||||
|
assert save_api.archive_url_parser() == expected_url
|
||||||
|
|
||||||
save_api.headers_str = """
|
headers = {
|
||||||
{'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
|
"Server": "nginx/1.15.8",
|
||||||
"""
|
"Date": "Sat, 02 Jan 2021 09:40:25 GMT",
|
||||||
|
"Content-Type": "text/html; charset=UTF-8",
|
||||||
|
"Transfer-Encoding": "chunked",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
"X-Archive-Orig-Server": "nginx",
|
||||||
|
"X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT",
|
||||||
|
"X-Archive-Orig-Transfer-Encoding": "chunked",
|
||||||
|
"X-Archive-Orig-Connection": "keep-alive",
|
||||||
|
"X-Archive-Orig-Vary": "Accept-Encoding",
|
||||||
|
"X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT",
|
||||||
|
"X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;",
|
||||||
|
"X-Archive-Guessed-Content-Type": "text/html",
|
||||||
|
"X-Archive-Guessed-Charset": "utf-8",
|
||||||
|
"Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT",
|
||||||
|
"Link": (
|
||||||
|
'<https://www.scribbr.com/citing-sources/et-al/>; rel="original", '
|
||||||
|
"<https://web.archive.org/web/timemap/link/https://www.scribbr.com/"
|
||||||
|
'citing-sources/et-al/>; rel="timemap"; type="application/link-format", '
|
||||||
|
"<https://web.archive.org/web/https://www.scribbr.com/citing-sources/"
|
||||||
|
'et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/'
|
||||||
|
'https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; '
|
||||||
|
'datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/'
|
||||||
|
"20201126185327/https://www.scribbr.com/citing-sources/et-al/>; "
|
||||||
|
'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", '
|
||||||
|
"<https://web.archive.org/web/20210102094009/https://www.scribbr.com/"
|
||||||
|
'citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 '
|
||||||
|
'09:40:09 GMT", <https://web.archive.org/web/20210102094009/'
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al/>; "
|
||||||
|
'rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"'
|
||||||
|
),
|
||||||
|
"Content-Security-Policy": (
|
||||||
|
"default-src 'self' 'unsafe-eval' 'unsafe-inline' "
|
||||||
|
"data: blob: archive.org web.archive.org analytics.archive.org "
|
||||||
|
"pragma.archivelab.org",
|
||||||
|
),
|
||||||
|
"X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz",
|
||||||
|
"Server-Timing": (
|
||||||
|
"captures_list;dur=112.646325, exclusion.robots;dur=0.172010, "
|
||||||
|
"exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, "
|
||||||
|
"esindex;dur=0.014647, LoadShardBlock;dur=82.205012, "
|
||||||
|
"PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, "
|
||||||
|
"load_resource;dur=26.520179"
|
||||||
|
),
|
||||||
|
"X-App-Server": "wwwb-app200",
|
||||||
|
"X-ts": "200",
|
||||||
|
"X-location": "All",
|
||||||
|
"X-Cache-Key": (
|
||||||
|
"httpsweb.archive.org/web/20210102094009/"
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al/IN",
|
||||||
|
),
|
||||||
|
"X-RL": "0",
|
||||||
|
"X-Page-Cache": "MISS",
|
||||||
|
"X-Archive-Screenname": "0",
|
||||||
|
"Content-Encoding": "gzip",
|
||||||
|
}
|
||||||
|
save_api.headers_str = str(headers)
|
||||||
|
|
||||||
assert (
|
expected_url2 = (
|
||||||
save_api.archive_url_parser()
|
"https://web.archive.org/web/20210102094009/"
|
||||||
== "https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/"
|
"https://www.scribbr.com/citing-sources/et-al/"
|
||||||
)
|
)
|
||||||
|
assert save_api.archive_url_parser() == expected_url2
|
||||||
|
|
||||||
save_api.headers_str = """
|
expected_url_3 = (
|
||||||
START
|
"https://web.archive.org/web/20171128185327/"
|
||||||
X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US
|
"https://www.scribbr.com/citing-sources/et-al/US"
|
||||||
END
|
|
||||||
"""
|
|
||||||
|
|
||||||
assert (
|
|
||||||
save_api.archive_url_parser()
|
|
||||||
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/"
|
|
||||||
)
|
)
|
||||||
|
save_api.headers_str = f"START\nX-Cache-Key: {expected_url_3}\nEND\n"
|
||||||
|
|
||||||
|
expected_url4 = (
|
||||||
|
"https://web.archive.org/web/20171128185327/"
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al/"
|
||||||
|
)
|
||||||
|
assert save_api.archive_url_parser() == expected_url4
|
||||||
|
|
||||||
save_api.headers_str = (
|
save_api.headers_str = (
|
||||||
"TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING"
|
"TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING"
|
||||||
)
|
)
|
||||||
save_api.response_url = "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
|
save_api.response_url = (
|
||||||
assert (
|
"https://web.archive.org/web/20171128185327/"
|
||||||
save_api.archive_url_parser()
|
"https://www.scribbr.com/citing-sources/et-al"
|
||||||
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
|
|
||||||
)
|
)
|
||||||
|
expected_url5 = (
|
||||||
|
"https://web.archive.org/web/20171128185327/"
|
||||||
|
"https://www.scribbr.com/citing-sources/et-al"
|
||||||
|
)
|
||||||
|
assert save_api.archive_url_parser() == expected_url5
|
||||||
|
|
||||||
|
|
||||||
def test_archive_url() -> None:
|
def test_archive_url() -> None:
|
||||||
@@ -130,7 +207,10 @@ def test_archive_url() -> None:
|
|||||||
by the archive_url method which is an attribute due to @property.
|
by the archive_url method which is an attribute due to @property.
|
||||||
"""
|
"""
|
||||||
url = "https://example.com"
|
url = "https://example.com"
|
||||||
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
user_agent = (
|
||||||
|
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
|
||||||
|
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
|
||||||
|
)
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent)
|
save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||||
save_api.saved_archive = (
|
save_api.saved_archive = (
|
||||||
"https://web.archive.org/web/20220124063056/https://example.com/"
|
"https://web.archive.org/web/20220124063056/https://example.com/"
|
||||||
|
@@ -142,8 +142,8 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
if not data or not data["archived_snapshots"]:
|
if not data or not data["archived_snapshots"]:
|
||||||
raise ArchiveNotInAvailabilityAPIResponse(
|
raise ArchiveNotInAvailabilityAPIResponse(
|
||||||
"Archive not found in the availability "
|
"Archive not found in the availability "
|
||||||
"API response, the URL you requested may not have any "
|
"API response, the URL you requested may not have any archives "
|
||||||
"archives yet. You may retry after some time or archive the webpage now.\n"
|
"yet. You may retry after some time or archive the webpage now.\n"
|
||||||
f"Response data:\n{self.response.text}"
|
f"Response data:\n{self.response.text}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -196,7 +196,8 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
|
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
|
||||||
appropriate arguments for their respective parameters.
|
appropriate arguments for their respective parameters.
|
||||||
Adds the timestamp to the payload dictionary.
|
Adds the timestamp to the payload dictionary.
|
||||||
And finally invoking the json method to make the API call then returns the instance.
|
And finally invoking the json method to make the API call then returns
|
||||||
|
the instance.
|
||||||
"""
|
"""
|
||||||
if unix_timestamp:
|
if unix_timestamp:
|
||||||
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
||||||
|
@@ -177,8 +177,8 @@ class WaybackMachineCDXServerAPI(object):
|
|||||||
|
|
||||||
if prop_values_len != properties_len:
|
if prop_values_len != properties_len:
|
||||||
raise WaybackError(
|
raise WaybackError(
|
||||||
f"Snapshot returned by Cdx API has {prop_values_len} properties "
|
f"Snapshot returned by Cdx API has {prop_values_len} "
|
||||||
f"instead of expected {properties_len} properties.\n"
|
f"properties instead of expected {properties_len} properties.\n"
|
||||||
f"Problematic Snapshot: {snapshot}"
|
f"Problematic Snapshot: {snapshot}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -69,7 +69,8 @@ def check_filters(filters: List[str]) -> None:
|
|||||||
# [!]field:regex
|
# [!]field:regex
|
||||||
for _filter in filters:
|
for _filter in filters:
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
|
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):"
|
||||||
|
r"(.*)",
|
||||||
_filter,
|
_filter,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -87,11 +88,15 @@ def check_collapses(collapses: List[str]) -> bool:
|
|||||||
|
|
||||||
for collapse in collapses:
|
for collapse in collapses:
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
|
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)"
|
||||||
|
r"(:?[0-9]{1,99})?",
|
||||||
collapse,
|
collapse,
|
||||||
)
|
)
|
||||||
if match is None or len(match.groups()) != 2:
|
if match is None or len(match.groups()) != 2:
|
||||||
exc_message = f"collapse argument '{collapse}' is not following the cdx collapse syntax."
|
exc_message = (
|
||||||
|
f"collapse argument '{collapse}' "
|
||||||
|
"is not following the cdx collapse syntax."
|
||||||
|
)
|
||||||
raise WaybackError(exc_message)
|
raise WaybackError(exc_message)
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
@@ -106,7 +111,10 @@ def check_match_type(match_type: Optional[str], url: str) -> bool:
|
|||||||
"Can not use wildcard in the URL along with the match_type arguments."
|
"Can not use wildcard in the URL along with the match_type arguments."
|
||||||
)
|
)
|
||||||
elif match_type not in legal_match_type:
|
elif match_type not in legal_match_type:
|
||||||
exc_message = f"{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'"
|
exc_message = (
|
||||||
|
f"{match_type} is not an allowed match type.\n"
|
||||||
|
"Use one from 'exact', 'prefix', 'host' or 'domain'"
|
||||||
|
)
|
||||||
raise WaybackError(exc_message)
|
raise WaybackError(exc_message)
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
@@ -241,7 +241,8 @@ def main(
|
|||||||
and not cdx
|
and not cdx
|
||||||
):
|
):
|
||||||
click.echo(
|
click.echo(
|
||||||
"Only URL passed, but did not specify what to do with the URL. Use --help flag for help using waybackpy."
|
"Only URL passed, but did not specify what to do with the URL. "
|
||||||
|
"Use --help flag for help using waybackpy."
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@@ -72,7 +72,6 @@ class WaybackMachineSaveAPI(object):
|
|||||||
self.response = session.get(self.request_url, headers=self.request_headers)
|
self.response = session.get(self.request_url, headers=self.request_headers)
|
||||||
# requests.response.headers is requests.structures.CaseInsensitiveDict
|
# requests.response.headers is requests.structures.CaseInsensitiveDict
|
||||||
self.headers = self.response.headers
|
self.headers = self.response.headers
|
||||||
self.headers_str = str(self.headers)
|
|
||||||
self.status_code = self.response.status_code
|
self.status_code = self.response.status_code
|
||||||
self.response_url = self.response.url
|
self.response_url = self.response.url
|
||||||
session.close()
|
session.close()
|
||||||
@@ -85,17 +84,17 @@ class WaybackMachineSaveAPI(object):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
|
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
|
||||||
match = re.search(regex1, self.headers_str)
|
match = re.search(regex1, str(self.headers))
|
||||||
if match:
|
if match:
|
||||||
return "https://web.archive.org" + match.group(1)
|
return "https://web.archive.org" + match.group(1)
|
||||||
|
|
||||||
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
|
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
|
||||||
match = re.search(regex2, self.headers_str)
|
match = re.search(regex2, str(self.headers))
|
||||||
if match is not None and len(match.groups()) == 1:
|
if match is not None and len(match.groups()) == 1:
|
||||||
return "https://" + match.group(1)
|
return "https://" + match.group(1)
|
||||||
|
|
||||||
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
|
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
|
||||||
match = re.search(regex3, self.headers_str)
|
match = re.search(regex3, str(self.headers))
|
||||||
if match is not None and len(match.groups()) == 1:
|
if match is not None and len(match.groups()) == 1:
|
||||||
return "https" + match.group(1)
|
return "https" + match.group(1)
|
||||||
|
|
||||||
@@ -132,10 +131,11 @@ class WaybackMachineSaveAPI(object):
|
|||||||
Also check if the time on archive is URL and compare it to instance birth
|
Also check if the time on archive is URL and compare it to instance birth
|
||||||
time.
|
time.
|
||||||
|
|
||||||
If time on the archive is older than the instance creation time set the cached_save
|
If time on the archive is older than the instance creation time set the
|
||||||
to True else set it to False. The flag can be used to check if the Wayback Machine
|
cached_save to True else set it to False. The flag can be used to check
|
||||||
didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
|
if the Wayback Machine didn't serve a Cached URL. It is quite common for
|
||||||
cached archive if last archive was captured before last 45 minutes.
|
the Wayback Machine to serve cached archive if last archive was captured
|
||||||
|
before last 45 minutes.
|
||||||
"""
|
"""
|
||||||
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
|
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
|
||||||
m = re.search(regex, str(self._archive_url))
|
m = re.search(regex, str(self._archive_url))
|
||||||
@@ -167,7 +167,7 @@ class WaybackMachineSaveAPI(object):
|
|||||||
tries = 0
|
tries = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
if not self.saved_archive:
|
if self.saved_archive is None:
|
||||||
if tries >= 1:
|
if tries >= 1:
|
||||||
self.sleep(tries)
|
self.sleep(tries)
|
||||||
|
|
||||||
@@ -182,7 +182,8 @@ class WaybackMachineSaveAPI(object):
|
|||||||
tries += 1
|
tries += 1
|
||||||
if tries >= self.max_tries:
|
if tries >= self.max_tries:
|
||||||
raise MaximumSaveRetriesExceeded(
|
raise MaximumSaveRetriesExceeded(
|
||||||
f"Tried {str(tries)} times but failed to save and retrieve the archive for {self.url}.\n"
|
f"Tried {tries} times but failed to save "
|
||||||
|
f"and retrieve the archive for {self.url}.\n"
|
||||||
f"Response URL:\n{self.response_url}\n"
|
f"Response URL:\n{self.response_url}\n"
|
||||||
f"Response Header:\n{self.headers_str}"
|
f"Response Header:\n{self.headers}"
|
||||||
)
|
)
|
||||||
|
@@ -15,8 +15,8 @@ The reason it is still in the code is backwards compatibility with 2.x.x version
|
|||||||
If were are using the Url before the update to version 3.x.x, your code should still be
|
If were are using the Url before the update to version 3.x.x, your code should still be
|
||||||
working fine and there is no hurry to update the interface but is recommended that you
|
working fine and there is no hurry to update the interface but is recommended that you
|
||||||
do not use the Url class for new code as it would be removed after 2025 also the first
|
do not use the Url class for new code as it would be removed after 2025 also the first
|
||||||
3.x.x versions was released in January 2022 and three years are more than enough to update
|
3.x.x versions was released in January 2022 and three years are more than enough to
|
||||||
the older interface code.
|
update the older interface code.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user