fix: shorten long lines

eggplants
2022-02-04 07:20:02 +09:00
parent b496f7008e
commit 5a324b9f61
12 changed files with 172 additions and 62 deletions

View File

@@ -64,8 +64,8 @@ profile = black
 [flake8]
 indent-size = 4
-# max-line-length = 88
-extend-ignore = E203,W503,E501,W605
+max-line-length = 88
+extend-ignore = W605
 [mypy]
 python_version = 3.9

View File

@@ -12,7 +12,10 @@ from waybackpy.exceptions import (
 now = datetime.utcnow()
 url = "https://example.com/"
-user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
+user_agent = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
+)
 def rndstr(n: int) -> str:
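
Every user-agent rewrap in this commit relies on the same mechanism: Python concatenates adjacent string literals at compile time, so the parenthesized multi-line form produces the identical string. A minimal sketch, illustrative only and not part of the diff:

    long_form = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
    wrapped = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
    )
    # Adjacent literals are joined at compile time; only the source layout changes.
    assert long_form == wrapped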
@@ -26,7 +29,10 @@ def test_oldest() -> None:
     Test the oldest archive of Google.com and also checks the attributes.
     """
     url = "https://example.com/"
-    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
+    user_agent = (
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
+    )
     availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
     oldest = availability_api.oldest()
     oldest_archive_url = oldest.archive_url
@@ -59,7 +65,8 @@ def test_newest() -> None:
 def test_invalid_json() -> None:
     """
-    When the API is malfunctioning or we don't pass a URL it may return invalid JSON data.
+    When the API is malfunctioning or we don't pass a URL,
+    it may return invalid JSON data.
     """
     with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
         availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)

View File

@@ -2,7 +2,10 @@ from waybackpy.cdx_api import WaybackMachineCDXServerAPI
 def test_a() -> None:
-    user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
+        "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
     url = "https://twitter.com/jack"
     wayback = WaybackMachineCDXServerAPI(
@@ -22,7 +25,10 @@ def test_a() -> None:
 def test_b() -> None:
-    user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
     url = "https://www.google.com"
     wayback = WaybackMachineCDXServerAPI(

View File

@@ -4,7 +4,10 @@ from waybackpy.cdx_snapshot import CDXSnapshot
 def test_CDXSnapshot() -> None:
-    sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
+    sample_input = (
+        "org,archive)/ 20080126045828 http://github.com "
+        "text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
+    )
     prop_values = sample_input.split(" ")
     properties = {}
     (

View File

@@ -15,7 +15,10 @@ from waybackpy.exceptions import WaybackError
 def test_get_total_pages() -> None:
     url = "twitter.com"
-    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
+    user_agent = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 "
+        "(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
+    )
     assert get_total_pages(url=url, user_agent=user_agent) >= 56

View File

@@ -17,7 +17,10 @@ def rndstr(n: int) -> str:
 def test_save() -> None:
     url = "https://github.com/akamhy/waybackpy"
-    user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
+        "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
     save_api = WaybackMachineSaveAPI(url, user_agent)
     save_api.save()
     archive_url = save_api.archive_url
@@ -34,7 +37,10 @@ def test_save() -> None:
 def test_max_redirect_exceeded() -> None:
     with pytest.raises(MaximumSaveRetriesExceeded):
         url = f"https://{rndstr}.gov"
-        user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+        user_agent = (
+            "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
+            "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+        )
         save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3)
         save_api.save()
@@ -47,7 +53,10 @@ def test_sleep() -> None:
     is as intended.
     """
     url = "https://example.com"
-    user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
+        "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
     save_api = WaybackMachineSaveAPI(url, user_agent)
     s_time = int(time.time())
     save_api.sleep(6)  # multiple of 3 sleep for 10 seconds
@@ -62,14 +71,17 @@ def test_sleep() -> None:
 def test_timestamp() -> None:
     url = "https://example.com"
-    user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
+        "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
     save_api = WaybackMachineSaveAPI(url, user_agent)
     now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
-    save_api._archive_url = f"https://web.archive.org/web/{now}{url}/"
+    save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
     save_api.timestamp()
     assert save_api.cached_save is False
     now = "20100124063622"
-    save_api._archive_url = f"https://web.archive.org/web/{now}{url}/"
+    save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
     save_api.timestamp()
     assert save_api.cached_save is True
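
The two _archive_url lines are a genuine fix rather than rewrapping: without the "/" between the timestamp and the URL, the fourteen-digit pattern that timestamp() matches against (the regex appears in the save_api.py hunk further down) can never fire. A quick check, not part of the commit:

    import re

    regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
    now, url = "20100124063622", "https://example.com"

    # Old form: the digits run straight into "https", so "/http" never follows them.
    assert re.search(regex, f"https://web.archive.org/web/{now}{url}/") is None
    # Fixed form: the separator restores the expected archive-URL shape.
    match = re.search(regex, f"https://web.archive.org/web/{now}/{url}/")
    assert match is not None and match.group(1) == now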
@@ -79,7 +91,10 @@ def test_archive_url_parser() -> None:
     Testing three regex for matches and also tests the response URL.
     """
     url = "https://example.com"
-    user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
+        "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
     save_api = WaybackMachineSaveAPI(url, user_agent)
     save_api.headers_str = """
@@ -88,39 +103,101 @@ def test_archive_url_parser() -> None:
     END
     """
-    assert (
-        save_api.archive_url_parser()
-        == "https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al"
-    )
+    expected_url = (
+        "https://web.archive.org/web/20201126185327/"
+        "https://www.scribbr.com/citing-sources/et-al"
+    )
+    assert save_api.archive_url_parser() == expected_url
-    save_api.headers_str = """
-    {'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
-    """
+    headers = {
+        "Server": "nginx/1.15.8",
+        "Date": "Sat, 02 Jan 2021 09:40:25 GMT",
+        "Content-Type": "text/html; charset=UTF-8",
+        "Transfer-Encoding": "chunked",
+        "Connection": "keep-alive",
+        "X-Archive-Orig-Server": "nginx",
+        "X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT",
+        "X-Archive-Orig-Transfer-Encoding": "chunked",
+        "X-Archive-Orig-Connection": "keep-alive",
+        "X-Archive-Orig-Vary": "Accept-Encoding",
+        "X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT",
+        "X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;",
+        "X-Archive-Guessed-Content-Type": "text/html",
+        "X-Archive-Guessed-Charset": "utf-8",
+        "Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT",
+        "Link": (
+            '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", '
+            "<https://web.archive.org/web/timemap/link/https://www.scribbr.com/"
+            'citing-sources/et-al/>; rel="timemap"; type="application/link-format", '
+            "<https://web.archive.org/web/https://www.scribbr.com/citing-sources/"
+            'et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/'
+            'https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; '
+            'datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/'
+            "20201126185327/https://www.scribbr.com/citing-sources/et-al/>; "
+            'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", '
+            "<https://web.archive.org/web/20210102094009/https://www.scribbr.com/"
+            'citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 '
+            '09:40:09 GMT", <https://web.archive.org/web/20210102094009/'
+            "https://www.scribbr.com/citing-sources/et-al/>; "
+            'rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"'
+        ),
+        "Content-Security-Policy": (
+            "default-src 'self' 'unsafe-eval' 'unsafe-inline' "
+            "data: blob: archive.org web.archive.org analytics.archive.org "
+            "pragma.archivelab.org"
+        ),
+        "X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz",
+        "Server-Timing": (
+            "captures_list;dur=112.646325, exclusion.robots;dur=0.172010, "
+            "exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, "
+            "esindex;dur=0.014647, LoadShardBlock;dur=82.205012, "
+            "PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, "
+            "load_resource;dur=26.520179"
+        ),
+        "X-App-Server": "wwwb-app200",
+        "X-ts": "200",
+        "X-location": "All",
+        "X-Cache-Key": (
+            "httpsweb.archive.org/web/20210102094009/"
+            "https://www.scribbr.com/citing-sources/et-al/IN"
+        ),
+        "X-RL": "0",
+        "X-Page-Cache": "MISS",
+        "X-Archive-Screenname": "0",
+        "Content-Encoding": "gzip",
+    }
+    save_api.headers_str = str(headers)
-    assert (
-        save_api.archive_url_parser()
-        == "https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/"
-    )
+    expected_url2 = (
+        "https://web.archive.org/web/20210102094009/"
+        "https://www.scribbr.com/citing-sources/et-al/"
+    )
+    assert save_api.archive_url_parser() == expected_url2
-    save_api.headers_str = """
-    START
-    X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US
-    END
-    """
-    assert (
-        save_api.archive_url_parser()
-        == "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/"
-    )
+    expected_url_3 = (
+        "https://web.archive.org/web/20171128185327/"
+        "https://www.scribbr.com/citing-sources/et-al/US"
+    )
+    save_api.headers_str = f"START\nX-Cache-Key: {expected_url_3}\nEND\n"
+    expected_url4 = (
+        "https://web.archive.org/web/20171128185327/"
+        "https://www.scribbr.com/citing-sources/et-al/"
+    )
+    assert save_api.archive_url_parser() == expected_url4
     save_api.headers_str = (
         "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING"
     )
-    save_api.response_url = "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
-    assert (
-        save_api.archive_url_parser()
-        == "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
-    )
+    save_api.response_url = (
+        "https://web.archive.org/web/20171128185327/"
+        "https://www.scribbr.com/citing-sources/et-al"
+    )
+    expected_url5 = (
+        "https://web.archive.org/web/20171128185327/"
+        "https://www.scribbr.com/citing-sources/et-al"
+    )
+    assert save_api.archive_url_parser() == expected_url5

 def test_archive_url() -> None:
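
The "US" suffix on expected_url_3 is not a typo: the parser's third regex treats the trailing two capital letters of the X-Cache-Key header as a cache-region code and drops them, which is why expected_url_3 ends in "US" while expected_url4 does not. A sketch of that extraction using regex3 from save_api.py, not part of the commit itself:

    import re

    regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
    headers_str = (
        "START\nX-Cache-Key: https://web.archive.org/web/20171128185327/"
        "https://www.scribbr.com/citing-sources/et-al/US\nEND\n"
    )
    match = re.search(regex3, headers_str)
    assert match is not None
    # [A-Z]{2} consumes the trailing "US"; the parser re-prefixes "https".
    assert "https" + match.group(1) == (
        "https://web.archive.org/web/20171128185327/"
        "https://www.scribbr.com/citing-sources/et-al/"
    )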
@@ -130,7 +207,10 @@ def test_archive_url() -> None:
     by the archive_url method which is an attribute due to @property.
     """
     url = "https://example.com"
-    user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
+        "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
     save_api = WaybackMachineSaveAPI(url, user_agent)
     save_api.saved_archive = (
         "https://web.archive.org/web/20220124063056/https://example.com/"

View File

@@ -142,8 +142,8 @@ class WaybackMachineAvailabilityAPI(object):
         if not data or not data["archived_snapshots"]:
             raise ArchiveNotInAvailabilityAPIResponse(
                 "Archive not found in the availability "
-                "API response, the URL you requested may not have any "
-                "archives yet. You may retry after some time or archive the webpage now.\n"
+                "API response, the URL you requested may not have any archives "
+                "yet. You may retry after some time or archive the webpage now.\n"
                 f"Response data:\n{self.response.text}"
             )
         else:
@@ -196,7 +196,8 @@ class WaybackMachineAvailabilityAPI(object):
         unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
         appropriate arguments for their respective parameters.
         Adds the timestamp to the payload dictionary.
-        And finally invoking the json method to make the API call then returns the instance.
+        And finally invoking the json method to make the API call then returns
+        the instance.
         """
         if unix_timestamp:
             timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)

View File

@@ -177,8 +177,8 @@ class WaybackMachineCDXServerAPI(object):
         if prop_values_len != properties_len:
             raise WaybackError(
-                f"Snapshot returned by Cdx API has {prop_values_len} properties "
-                f"instead of expected {properties_len} properties.\n"
+                f"Snapshot returned by Cdx API has {prop_values_len} "
+                f"properties instead of expected {properties_len} properties.\n"
                 f"Problematic Snapshot: {snapshot}"
             )

View File

@@ -69,7 +69,8 @@ def check_filters(filters: List[str]) -> None:
     # [!]field:regex
     for _filter in filters:
         match = re.search(
-            r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
+            r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):"
+            r"(.*)",
             _filter,
         )
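
Splitting a regex across lines works for the same compile-time-concatenation reason as the user-agent strings, with one caveat: every fragment needs its own r prefix, or its backslashes change meaning. A minimal check, illustrative only:

    import re

    pattern = (
        r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):"
        r"(.*)"
    )
    # The fragments join into the original one-line pattern unchanged.
    assert re.search(pattern, "statuscode:200").groups() == ("statuscode", "200")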
@@ -87,11 +88,15 @@ def check_collapses(collapses: List[str]) -> bool:
     for collapse in collapses:
         match = re.search(
-            r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
+            r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)"
+            r"(:?[0-9]{1,99})?",
             collapse,
         )
         if match is None or len(match.groups()) != 2:
-            exc_message = f"collapse argument '{collapse}' is not following the cdx collapse syntax."
+            exc_message = (
+                f"collapse argument '{collapse}' "
+                "is not following the cdx collapse syntax."
+            )
             raise WaybackError(exc_message)
         else:
             return True
@@ -106,7 +111,10 @@ def check_match_type(match_type: Optional[str], url: str) -> bool:
             "Can not use wildcard in the URL along with the match_type arguments."
         )
     elif match_type not in legal_match_type:
-        exc_message = f"{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'"
+        exc_message = (
+            f"{match_type} is not an allowed match type.\n"
+            "Use one from 'exact', 'prefix', 'host' or 'domain'"
+        )
         raise WaybackError(exc_message)
     else:
         return True

View File

@@ -241,7 +241,8 @@ def main(
         and not cdx
     ):
         click.echo(
-            "Only URL passed, but did not specify what to do with the URL. Use --help flag for help using waybackpy."
+            "Only URL passed, but did not specify what to do with the URL. "
+            "Use --help flag for help using waybackpy."
         )
         return

View File

@@ -72,7 +72,6 @@ class WaybackMachineSaveAPI(object):
         self.response = session.get(self.request_url, headers=self.request_headers)
         # requests.response.headers is requests.structures.CaseInsensitiveDict
         self.headers = self.response.headers
-        self.headers_str = str(self.headers)
         self.status_code = self.response.status_code
         self.response_url = self.response.url
         session.close()
@@ -85,17 +84,17 @@ class WaybackMachineSaveAPI(object):
         """
         regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
-        match = re.search(regex1, self.headers_str)
+        match = re.search(regex1, str(self.headers))
         if match:
             return "https://web.archive.org" + match.group(1)

         regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
-        match = re.search(regex2, self.headers_str)
+        match = re.search(regex2, str(self.headers))
         if match is not None and len(match.groups()) == 1:
             return "https://" + match.group(1)

         regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
-        match = re.search(regex3, self.headers_str)
+        match = re.search(regex3, str(self.headers))
         if match is not None and len(match.groups()) == 1:
             return "https" + match.group(1)
@@ -132,10 +131,11 @@ class WaybackMachineSaveAPI(object):
         Also check if the time on archive is URL and compare it to instance birth
         time.

-        If time on the archive is older than the instance creation time set the cached_save
-        to True else set it to False. The flag can be used to check if the Wayback Machine
-        didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
-        cached archive if last archive was captured before last 45 minutes.
+        If time on the archive is older than the instance creation time set the
+        cached_save to True else set it to False. The flag can be used to check
+        if the Wayback Machine didn't serve a Cached URL. It is quite common for
+        the Wayback Machine to serve cached archive if last archive was captured
+        before last 45 minutes.
         """
         regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
         m = re.search(regex, str(self._archive_url))
@@ -167,7 +167,7 @@ class WaybackMachineSaveAPI(object):
         tries = 0
         while True:
-            if not self.saved_archive:
+            if self.saved_archive is None:
                 if tries >= 1:
                     self.sleep(tries)
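
This retry-loop change is behavioral, not cosmetic: "not x" is true for every falsy value, while "is None" only matches the genuinely unset case. A small illustration, not part of the commit:

    for saved_archive in (None, ""):
        print(not saved_archive, saved_archive is None)
    # None -> True True   (both checks would attempt a save)
    # ""   -> True False  (only the old truthiness check re-enters the save path)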
@@ -182,7 +182,8 @@ class WaybackMachineSaveAPI(object):
                 tries += 1
                 if tries >= self.max_tries:
                     raise MaximumSaveRetriesExceeded(
-                        f"Tried {str(tries)} times but failed to save and retrieve the archive for {self.url}.\n"
+                        f"Tried {tries} times but failed to save "
+                        f"and retrieve the archive for {self.url}.\n"
                         f"Response URL:\n{self.response_url}\n"
-                        f"Response Header:\n{self.headers_str}"
+                        f"Response Header:\n{self.headers}"
                     )

View File

@@ -15,8 +15,8 @@ The reason it is still in the code is backwards compatibility with 2.x.x version
 If were are using the Url before the update to version 3.x.x, your code should still be
 working fine and there is no hurry to update the interface but is recommended that you
 do not use the Url class for new code as it would be removed after 2025 also the first
-3.x.x versions was released in January 2022 and three years are more than enough to update
-the older interface code.
+3.x.x versions was released in January 2022 and three years are more than enough to
+update the older interface code.
 """