fix: shorten long lines

eggplants
2022-02-04 07:20:02 +09:00
parent b496f7008e
commit 5a324b9f61
12 changed files with 172 additions and 62 deletions

View File

@@ -64,8 +64,8 @@ profile = black
[flake8]
indent-size = 4
# max-line-length = 88
extend-ignore = E203,W503,E501,W605
max-line-length = 88
extend-ignore = W605
[mypy]
python_version = 3.9
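Note: with E501 (line too long) dropped from the ignore list, flake8 now enforces the same 88-column limit that black uses, which is what forces every wrap below. The hunks rely on Python's implicit concatenation of adjacent string literals; a minimal sketch of the equivalence:

```python
# Adjacent string literals inside parentheses are joined at compile time,
# so the wrapped form is identical to one long literal.
wrapped = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
)
assert wrapped == (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)"
    " Chrome/97.0.4692.99 Safari/537.36"
)
```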

View File

@@ -12,7 +12,10 @@ from waybackpy.exceptions import (
now = datetime.utcnow()
url = "https://example.com/"
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
user_agent = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
)
def rndstr(n: int) -> str:
@@ -26,7 +29,10 @@ def test_oldest() -> None:
Test the oldest archive of example.com and also check the attributes.
"""
url = "https://example.com/"
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
user_agent = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
)
availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
oldest = availability_api.oldest()
oldest_archive_url = oldest.archive_url
@@ -59,7 +65,8 @@ def test_newest() -> None:
def test_invalid_json() -> None:
"""
When the API is malfunctioning or we don't pass a URL it may return invalid JSON data.
When the API is malfunctioning or we don't pass a URL,
it may return invalid JSON data.
"""
with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)

View File

@@ -2,7 +2,10 @@ from waybackpy.cdx_api import WaybackMachineCDXServerAPI
def test_a() -> None:
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
url = "https://twitter.com/jack"
wayback = WaybackMachineCDXServerAPI(
@@ -22,7 +25,10 @@ def test_a() -> None:
def test_b() -> None:
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
url = "https://www.google.com"
wayback = WaybackMachineCDXServerAPI(

View File

@@ -4,7 +4,10 @@ from waybackpy.cdx_snapshot import CDXSnapshot
def test_CDXSnapshot() -> None:
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
sample_input = (
"org,archive)/ 20080126045828 http://github.com "
"text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
)
prop_values = sample_input.split(" ")
properties = {}
(
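For context, the sample input is a single space-separated CDX record. A small sketch of mapping its seven fields to names, assuming the standard CDX field order (the same field list the filter and collapse checks in this commit use):

```python
sample_input = (
    "org,archive)/ 20080126045828 http://github.com "
    "text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
)
keys = [
    "urlkey", "timestamp", "original", "mimetype",
    "statuscode", "digest", "length",
]
properties = dict(zip(keys, sample_input.split(" ")))
assert properties["statuscode"] == "200"
assert properties["length"] == "1415"
```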

View File

@@ -15,7 +15,10 @@ from waybackpy.exceptions import WaybackError
def test_get_total_pages() -> None:
url = "twitter.com"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
user_agent = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
)
assert get_total_pages(url=url, user_agent=user_agent) >= 56
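For context, get_total_pages asks the CDX server how many pages of results exist for a URL. A minimal sketch of one way to do that, using the documented showNumPages query parameter; this is an illustration, not the implementation being diffed:

```python
import requests

def get_total_pages(url: str, user_agent: str) -> int:
    # With showNumPages=true the CDX server returns a bare page count.
    endpoint = "https://web.archive.org/cdx/search/cdx"
    payload = {"showNumPages": "true", "url": url}
    headers = {"User-Agent": user_agent}
    response = requests.get(endpoint, params=payload, headers=headers)
    return int(response.text.strip())
```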

View File

@@ -17,7 +17,10 @@ def rndstr(n: int) -> str:
def test_save() -> None:
url = "https://github.com/akamhy/waybackpy"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent)
save_api.save()
archive_url = save_api.archive_url
@@ -34,7 +37,10 @@ def test_save() -> None:
def test_max_redirect_exceeded() -> None:
with pytest.raises(MaximumSaveRetriesExceeded):
url = f"https://{rndstr}.gov"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3)
save_api.save()
@@ -47,7 +53,10 @@ def test_sleep() -> None:
is as intended.
"""
url = "https://example.com"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent)
s_time = int(time.time())
save_api.sleep(6) # multiple of 3 sleep for 10 seconds
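The comment above documents the pacing contract. A sketch of sleep logic consistent with it; the body is an assumption inferred from the test, not code from this diff:

```python
import time

def sleep(tries: int) -> None:
    # Assumed pacing: every try that is a multiple of 3 waits 10 seconds,
    # any other try waits 5, giving the Wayback Machine time to catch up.
    sleep_seconds = 5
    if tries % 3 == 0:
        sleep_seconds = 10
    time.sleep(sleep_seconds)
```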
@@ -62,14 +71,17 @@ def test_sleep() -> None:
def test_timestamp() -> None:
url = "https://example.com"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent)
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
save_api._archive_url = f"https://web.archive.org/web/{now}{url}/"
save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
save_api.timestamp()
assert save_api.cached_save is False
now = "20100124063622"
save_api._archive_url = f"https://web.archive.org/web/{now}{url}/"
save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
save_api.timestamp()
assert save_api.cached_save is True
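Both fixture URLs gain a slash between the 14-digit timestamp and the original URL. A short sketch of why that slash matters, using the timestamp pattern that appears in the save API hunk further down:

```python
import re
from datetime import datetime

url = "https://example.com"
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
pattern = r"https?://web\.archive.org/web/([0-9]{14})/http"

good = f"https://web.archive.org/web/{now}/{url}/"  # new fixture, with slash
bad = f"https://web.archive.org/web/{now}{url}/"  # old fixture, no slash
match = re.search(pattern, good)
assert match is not None and match.group(1) == now
assert re.search(pattern, bad) is None  # the timestamp is never extracted
```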
@@ -79,7 +91,10 @@ def test_archive_url_parser() -> None:
Tests three regexes for matches and also tests the response URL.
"""
url = "https://example.com"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent)
save_api.headers_str = """
@@ -88,39 +103,101 @@ def test_archive_url_parser() -> None:
END
"""
assert (
save_api.archive_url_parser()
== "https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al"
expected_url = (
"https://web.archive.org/web/20201126185327/"
"https://www.scribbr.com/citing-sources/et-al"
)
assert save_api.archive_url_parser() == expected_url
save_api.headers_str = """
{'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
"""
headers = {
"Server": "nginx/1.15.8",
"Date": "Sat, 02 Jan 2021 09:40:25 GMT",
"Content-Type": "text/html; charset=UTF-8",
"Transfer-Encoding": "chunked",
"Connection": "keep-alive",
"X-Archive-Orig-Server": "nginx",
"X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT",
"X-Archive-Orig-Transfer-Encoding": "chunked",
"X-Archive-Orig-Connection": "keep-alive",
"X-Archive-Orig-Vary": "Accept-Encoding",
"X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT",
"X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;",
"X-Archive-Guessed-Content-Type": "text/html",
"X-Archive-Guessed-Charset": "utf-8",
"Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT",
"Link": (
'<https://www.scribbr.com/citing-sources/et-al/>; rel="original", '
"<https://web.archive.org/web/timemap/link/https://www.scribbr.com/"
'citing-sources/et-al/>; rel="timemap"; type="application/link-format", '
"<https://web.archive.org/web/https://www.scribbr.com/citing-sources/"
'et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/'
'https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; '
'datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/'
"20201126185327/https://www.scribbr.com/citing-sources/et-al/>; "
'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", '
"<https://web.archive.org/web/20210102094009/https://www.scribbr.com/"
'citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 '
'09:40:09 GMT", <https://web.archive.org/web/20210102094009/'
"https://www.scribbr.com/citing-sources/et-al/>; "
'rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"'
),
"Content-Security-Policy": (
"default-src 'self' 'unsafe-eval' 'unsafe-inline' "
"data: blob: archive.org web.archive.org analytics.archive.org "
"pragma.archivelab.org",
),
"X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz",
"Server-Timing": (
"captures_list;dur=112.646325, exclusion.robots;dur=0.172010, "
"exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, "
"esindex;dur=0.014647, LoadShardBlock;dur=82.205012, "
"PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, "
"load_resource;dur=26.520179"
),
"X-App-Server": "wwwb-app200",
"X-ts": "200",
"X-location": "All",
"X-Cache-Key": (
"httpsweb.archive.org/web/20210102094009/"
"https://www.scribbr.com/citing-sources/et-al/IN",
),
"X-RL": "0",
"X-Page-Cache": "MISS",
"X-Archive-Screenname": "0",
"Content-Encoding": "gzip",
}
save_api.headers_str = str(headers)
assert (
save_api.archive_url_parser()
== "https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/"
expected_url2 = (
"https://web.archive.org/web/20210102094009/"
"https://www.scribbr.com/citing-sources/et-al/"
)
assert save_api.archive_url_parser() == expected_url2
save_api.headers_str = """
START
X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US
END
"""
assert (
save_api.archive_url_parser()
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/"
expected_url3 = (
"https://web.archive.org/web/20171128185327/"
"https://www.scribbr.com/citing-sources/et-al/US"
)
save_api.headers_str = f"START\nX-Cache-Key: {expected_url3}\nEND\n"
expected_url4 = (
"https://web.archive.org/web/20171128185327/"
"https://www.scribbr.com/citing-sources/et-al/"
)
assert save_api.archive_url_parser() == expected_url4
save_api.headers_str = (
"TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING"
)
save_api.response_url = "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
assert (
save_api.archive_url_parser()
== "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al"
save_api.response_url = (
"https://web.archive.org/web/20171128185327/"
"https://www.scribbr.com/citing-sources/et-al"
)
expected_url5 = (
"https://web.archive.org/web/20171128185327/"
"https://www.scribbr.com/citing-sources/et-al"
)
assert save_api.archive_url_parser() == expected_url5
def test_archive_url() -> None:
@@ -130,7 +207,10 @@ def test_archive_url() -> None:
by the archive_url method which is an attribute due to @property.
"""
url = "https://example.com"
user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
user_agent = (
"Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
)
save_api = WaybackMachineSaveAPI(url, user_agent)
save_api.saved_archive = (
"https://web.archive.org/web/20220124063056/https://example.com/"

View File

@@ -142,8 +142,8 @@ class WaybackMachineAvailabilityAPI(object):
if not data or not data["archived_snapshots"]:
raise ArchiveNotInAvailabilityAPIResponse(
"Archive not found in the availability "
"API response, the URL you requested may not have any "
"archives yet. You may retry after some time or archive the webpage now.\n"
"API response, the URL you requested may not have any archives "
"yet. You may retry after some time or archive the webpage now.\n"
f"Response data:\n{self.response.text}"
)
else:
@@ -196,7 +196,8 @@ class WaybackMachineAvailabilityAPI(object):
unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
appropriate arguments for their respective parameters.
Adds the timestamp to the payload dictionary.
And finally invoking the json method to make the API call then returns the instance.
Finally it invokes the json method to make the API call and then returns
the instance.
"""
if unix_timestamp:
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
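For reference, a Wayback Machine timestamp is the capture time rendered as YYYYMMDDhhmmss. A minimal sketch of the conversion the docstring refers to; the helper name comes from the diff, the body is an assumption:

```python
from datetime import datetime, timezone

def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
    # Assumed implementation: format epoch seconds as YYYYMMDDhhmmss in UTC.
    dt = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)
    return dt.strftime("%Y%m%d%H%M%S")

assert unix_timestamp_to_wayback_timestamp(0) == "19700101000000"
```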

View File

@@ -177,8 +177,8 @@ class WaybackMachineCDXServerAPI(object):
if prop_values_len != properties_len:
raise WaybackError(
f"Snapshot returned by Cdx API has {prop_values_len} properties "
f"instead of expected {properties_len} properties.\n"
f"Snapshot returned by Cdx API has {prop_values_len} "
f"properties instead of expected {properties_len} properties.\n"
f"Problematic Snapshot: {snapshot}"
)

View File

@@ -69,7 +69,8 @@ def check_filters(filters: List[str]) -> None:
# [!]field:regex
for _filter in filters:
match = re.search(
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):"
r"(.*)",
_filter,
)
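The filter pattern is unchanged, only re-wrapped. A quick demonstration of what it accepts; the example filters are illustrative:

```python
import re

filter_pattern = (
    r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):"
    r"(.*)"
)
match = re.search(filter_pattern, "statuscode:200")
assert match is not None and match.groups() == ("statuscode", "200")
# A leading "!" inverts a filter and is captured with the field name.
assert re.search(filter_pattern, "!mimetype:text/html").group(1) == "!mimetype"
```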
@@ -87,11 +88,15 @@ def check_collapses(collapses: List[str]) -> bool:
for collapse in collapses:
match = re.search(
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)"
r"(:?[0-9]{1,99})?",
collapse,
)
if match is None or len(match.groups()) != 2:
exc_message = f"collapse argument '{collapse}' is not following the cdx collapse syntax."
exc_message = (
f"collapse argument '{collapse}' "
"is not following the cdx collapse syntax."
)
raise WaybackError(exc_message)
else:
return True
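Likewise for the collapse pattern: the colon-and-digit-count suffix is optional, so a bare field name is also valid. The example collapses are illustrative:

```python
import re

collapse_pattern = (
    r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)"
    r"(:?[0-9]{1,99})?"
)
assert re.search(collapse_pattern, "timestamp:10").groups() == ("timestamp", ":10")
assert re.search(collapse_pattern, "urlkey").groups() == ("urlkey", None)
assert re.search(collapse_pattern, "not-a-field") is None
```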
@@ -106,7 +111,10 @@ def check_match_type(match_type: Optional[str], url: str) -> bool:
"Can not use wildcard in the URL along with the match_type arguments."
)
elif match_type not in legal_match_type:
exc_message = f"{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'"
exc_message = (
f"{match_type} is not an allowed match type.\n"
"Use one from 'exact', 'prefix', 'host' or 'domain'"
)
raise WaybackError(exc_message)
else:
return True

View File

@@ -241,7 +241,8 @@ def main(
and not cdx
):
click.echo(
"Only URL passed, but did not specify what to do with the URL. Use --help flag for help using waybackpy."
"Only URL passed, but did not specify what to do with the URL. "
"Use --help flag for help using waybackpy."
)
return

View File

@@ -72,7 +72,6 @@ class WaybackMachineSaveAPI(object):
self.response = session.get(self.request_url, headers=self.request_headers)
# requests.response.headers is requests.structures.CaseInsensitiveDict
self.headers = self.response.headers
self.headers_str = str(self.headers)
self.status_code = self.response.status_code
self.response_url = self.response.url
session.close()
@@ -85,17 +84,17 @@ class WaybackMachineSaveAPI(object):
"""
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
match = re.search(regex1, self.headers_str)
match = re.search(regex1, str(self.headers))
if match:
return "https://web.archive.org" + match.group(1)
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
match = re.search(regex2, self.headers_str)
match = re.search(regex2, str(self.headers))
if match is not None and len(match.groups()) == 1:
return "https://" + match.group(1)
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
match = re.search(regex3, self.headers_str)
match = re.search(regex3, str(self.headers))
if match is not None and len(match.groups()) == 1:
return "https" + match.group(1)
@@ -132,10 +131,11 @@ class WaybackMachineSaveAPI(object):
Also check the time on the archive URL and compare it to the instance
birth time.
If time on the archive is older than the instance creation time set the cached_save
to True else set it to False. The flag can be used to check if the Wayback Machine
didn't serve a Cached URL. It is quite common for the Wayback Machine to serve
cached archive if last archive was captured before last 45 minutes.
If the time on the archive is older than the instance creation time, set
cached_save to True, else set it to False. The flag can be used to check
whether the Wayback Machine served a cached archive. It is quite common
for the Wayback Machine to serve a cached archive if the last capture was
made within the last 45 minutes.
"""
regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
m = re.search(regex, str(self._archive_url))
@@ -167,7 +167,7 @@ class WaybackMachineSaveAPI(object):
tries = 0
while True:
if not self.saved_archive:
if self.saved_archive is None:
if tries >= 1:
self.sleep(tries)
@@ -182,7 +182,8 @@ class WaybackMachineSaveAPI(object):
tries += 1
if tries >= self.max_tries:
raise MaximumSaveRetriesExceeded(
f"Tried {str(tries)} times but failed to save and retrieve the archive for {self.url}.\n"
f"Tried {tries} times but failed to save "
f"and retrieve the archive for {self.url}.\n"
f"Response URL:\n{self.response_url}\n"
f"Response Header:\n{self.headers_str}"
f"Response Header:\n{self.headers}"
)

View File

@@ -15,8 +15,8 @@ The reason it is still in the code is backwards compatibility with 2.x.x version
If you were using the Url class before the update to version 3.x.x, your code
should still be working fine and there is no hurry to update the interface, but
it is recommended that you do not use the Url class for new code as it will be
removed after 2025; also, the first
3.x.x versions was released in January 2022 and three years are more than enough to update
the older interface code.
3.x.x version was released in January 2022 and three years are more than
enough to update the older interface code.
"""