fix: shorten long lines
This commit is contained in:
		| @@ -64,8 +64,8 @@ profile = black | ||||
|  | ||||
| [flake8] | ||||
| indent-size = 4 | ||||
| # max-line-length = 88 | ||||
| extend-ignore = E203,W503,E501,W605 | ||||
| max-line-length = 88 | ||||
| extend-ignore = W605 | ||||
|  | ||||
| [mypy] | ||||
| python_version = 3.9 | ||||
|   | ||||
| @@ -12,7 +12,10 @@ from waybackpy.exceptions import ( | ||||
|  | ||||
| now = datetime.utcnow() | ||||
| url = "https://example.com/" | ||||
| user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" | ||||
| user_agent = ( | ||||
|     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " | ||||
|     "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" | ||||
| ) | ||||
|  | ||||
|  | ||||
| def rndstr(n: int) -> str: | ||||
| @@ -26,7 +29,10 @@ def test_oldest() -> None: | ||||
|     Test the oldest archive of example.com and also checks the attributes. | ||||
|     """ | ||||
|     url = "https://example.com/" | ||||
|     user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " | ||||
|         "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" | ||||
|     ) | ||||
|     availability_api = WaybackMachineAvailabilityAPI(url, user_agent) | ||||
|     oldest = availability_api.oldest() | ||||
|     oldest_archive_url = oldest.archive_url | ||||
| @@ -59,7 +65,8 @@ def test_newest() -> None: | ||||
|  | ||||
| def test_invalid_json() -> None: | ||||
|     """ | ||||
|     When the API is malfunctioning or we don't pass a URL it may return invalid JSON data. | ||||
|     When the API is malfunctioning or we don't pass a URL, | ||||
|     it may return invalid JSON data. | ||||
|     """ | ||||
|     with pytest.raises(InvalidJSONInAvailabilityAPIResponse): | ||||
|         availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent) | ||||
|   | ||||
| @@ -2,7 +2,10 @@ from waybackpy.cdx_api import WaybackMachineCDXServerAPI | ||||
|  | ||||
|  | ||||
| def test_a() -> None: | ||||
|     user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " | ||||
|         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     ) | ||||
|     url = "https://twitter.com/jack" | ||||
|  | ||||
|     wayback = WaybackMachineCDXServerAPI( | ||||
| @@ -22,7 +25,10 @@ def test_a() -> None: | ||||
|  | ||||
|  | ||||
| def test_b() -> None: | ||||
|     user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " | ||||
|         "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     ) | ||||
|     url = "https://www.google.com" | ||||
|  | ||||
|     wayback = WaybackMachineCDXServerAPI( | ||||
|   | ||||
| @@ -4,7 +4,10 @@ from waybackpy.cdx_snapshot import CDXSnapshot | ||||
|  | ||||
|  | ||||
| def test_CDXSnapshot() -> None: | ||||
|     sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415" | ||||
|     sample_input = ( | ||||
|         "org,archive)/ 20080126045828 http://github.com " | ||||
|         "text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415" | ||||
|     ) | ||||
|     prop_values = sample_input.split(" ") | ||||
|     properties = {} | ||||
|     ( | ||||
|   | ||||
| @@ -15,7 +15,10 @@ from waybackpy.exceptions import WaybackError | ||||
|  | ||||
| def test_get_total_pages() -> None: | ||||
|     url = "twitter.com" | ||||
|     user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.2 Safari/605.1.15" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 " | ||||
|         "(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15" | ||||
|     ) | ||||
|     assert get_total_pages(url=url, user_agent=user_agent) >= 56 | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -17,7 +17,10 @@ def rndstr(n: int) -> str: | ||||
|  | ||||
| def test_save() -> None: | ||||
|     url = "https://github.com/akamhy/waybackpy" | ||||
|     user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " | ||||
|         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     ) | ||||
|     save_api = WaybackMachineSaveAPI(url, user_agent) | ||||
|     save_api.save() | ||||
|     archive_url = save_api.archive_url | ||||
| @@ -34,7 +37,10 @@ def test_save() -> None: | ||||
| def test_max_redirect_exceeded() -> None: | ||||
|     with pytest.raises(MaximumSaveRetriesExceeded): | ||||
|         url = f"https://{rndstr}.gov" | ||||
|         user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|         user_agent = ( | ||||
|             "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " | ||||
|             "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|         ) | ||||
|         save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3) | ||||
|         save_api.save() | ||||
|  | ||||
| @@ -47,7 +53,10 @@ def test_sleep() -> None: | ||||
|     is as intended. | ||||
|     """ | ||||
|     url = "https://example.com" | ||||
|     user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " | ||||
|         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     ) | ||||
|     save_api = WaybackMachineSaveAPI(url, user_agent) | ||||
|     s_time = int(time.time()) | ||||
|     save_api.sleep(6)  # multiple of 3 sleep for 10 seconds | ||||
| @@ -62,14 +71,17 @@ def test_sleep() -> None: | ||||
|  | ||||
| def test_timestamp() -> None: | ||||
|     url = "https://example.com" | ||||
|     user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " | ||||
|         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     ) | ||||
|     save_api = WaybackMachineSaveAPI(url, user_agent) | ||||
|     now = datetime.utcnow().strftime("%Y%m%d%H%M%S") | ||||
|     save_api._archive_url = f"https://web.archive.org/web/{now}{url}/" | ||||
|     save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/" | ||||
|     save_api.timestamp() | ||||
|     assert save_api.cached_save is False | ||||
|     now = "20100124063622" | ||||
|     save_api._archive_url = f"https://web.archive.org/web/{now}{url}/" | ||||
|     save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/" | ||||
|     save_api.timestamp() | ||||
|     assert save_api.cached_save is True | ||||
|  | ||||
| @@ -79,7 +91,10 @@ def test_archive_url_parser() -> None: | ||||
|     Testing three regex for matches and also tests the response URL. | ||||
|     """ | ||||
|     url = "https://example.com" | ||||
|     user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " | ||||
|         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     ) | ||||
|     save_api = WaybackMachineSaveAPI(url, user_agent) | ||||
|  | ||||
|     save_api.headers_str = """ | ||||
| @@ -88,39 +103,101 @@ def test_archive_url_parser() -> None: | ||||
|     END | ||||
|     """ | ||||
|  | ||||
|     assert ( | ||||
|         save_api.archive_url_parser() | ||||
|         == "https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al" | ||||
|     expected_url = ( | ||||
|         "https://web.archive.org/web/20201126185327/" | ||||
|         "https://www.scribbr.com/citing-sources/et-al" | ||||
|     ) | ||||
|     assert save_api.archive_url_parser() == expected_url | ||||
|  | ||||
|     save_api.headers_str = """ | ||||
|     {'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, 
esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'} | ||||
|     """ | ||||
|     headers = { | ||||
|         "Server": "nginx/1.15.8", | ||||
|         "Date": "Sat, 02 Jan 2021 09:40:25 GMT", | ||||
|         "Content-Type": "text/html; charset=UTF-8", | ||||
|         "Transfer-Encoding": "chunked", | ||||
|         "Connection": "keep-alive", | ||||
|         "X-Archive-Orig-Server": "nginx", | ||||
|         "X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT", | ||||
|         "X-Archive-Orig-Transfer-Encoding": "chunked", | ||||
|         "X-Archive-Orig-Connection": "keep-alive", | ||||
|         "X-Archive-Orig-Vary": "Accept-Encoding", | ||||
|         "X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT", | ||||
|         "X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;", | ||||
|         "X-Archive-Guessed-Content-Type": "text/html", | ||||
|         "X-Archive-Guessed-Charset": "utf-8", | ||||
|         "Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT", | ||||
|         "Link": ( | ||||
|             '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", ' | ||||
|             "<https://web.archive.org/web/timemap/link/https://www.scribbr.com/" | ||||
|             'citing-sources/et-al/>; rel="timemap"; type="application/link-format", ' | ||||
|             "<https://web.archive.org/web/https://www.scribbr.com/citing-sources/" | ||||
|             'et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/' | ||||
|             'https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; ' | ||||
|             'datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/' | ||||
|             "20201126185327/https://www.scribbr.com/citing-sources/et-al/>; " | ||||
|             'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", ' | ||||
|             "<https://web.archive.org/web/20210102094009/https://www.scribbr.com/" | ||||
|             'citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 ' | ||||
|             '09:40:09 GMT", <https://web.archive.org/web/20210102094009/' | ||||
|             "https://www.scribbr.com/citing-sources/et-al/>; " | ||||
|             'rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"' | ||||
|         ), | ||||
|         "Content-Security-Policy": ( | ||||
|             "default-src 'self' 'unsafe-eval' 'unsafe-inline' " | ||||
|             "data: blob: archive.org web.archive.org analytics.archive.org " | ||||
|             "pragma.archivelab.org", | ||||
|         ), | ||||
|         "X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz", | ||||
|         "Server-Timing": ( | ||||
|             "captures_list;dur=112.646325, exclusion.robots;dur=0.172010, " | ||||
|             "exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, " | ||||
|             "esindex;dur=0.014647, LoadShardBlock;dur=82.205012, " | ||||
|             "PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, " | ||||
|             "load_resource;dur=26.520179" | ||||
|         ), | ||||
|         "X-App-Server": "wwwb-app200", | ||||
|         "X-ts": "200", | ||||
|         "X-location": "All", | ||||
|         "X-Cache-Key": ( | ||||
|             "httpsweb.archive.org/web/20210102094009/" | ||||
|             "https://www.scribbr.com/citing-sources/et-al/IN", | ||||
|         ), | ||||
|         "X-RL": "0", | ||||
|         "X-Page-Cache": "MISS", | ||||
|         "X-Archive-Screenname": "0", | ||||
|         "Content-Encoding": "gzip", | ||||
|     } | ||||
|     save_api.headers_str = str(headers) | ||||
|  | ||||
|     assert ( | ||||
|         save_api.archive_url_parser() | ||||
|         == "https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/" | ||||
|     expected_url2 = ( | ||||
|         "https://web.archive.org/web/20210102094009/" | ||||
|         "https://www.scribbr.com/citing-sources/et-al/" | ||||
|     ) | ||||
|     assert save_api.archive_url_parser() == expected_url2 | ||||
|  | ||||
|     save_api.headers_str = """ | ||||
|     START | ||||
|     X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US | ||||
|     END | ||||
|     """ | ||||
|  | ||||
|     assert ( | ||||
|         save_api.archive_url_parser() | ||||
|         == "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/" | ||||
|     expected_url_3 = ( | ||||
|         "https://web.archive.org/web/20171128185327/" | ||||
|         "https://www.scribbr.com/citing-sources/et-al/US" | ||||
|     ) | ||||
|     save_api.headers_str = f"START\nX-Cache-Key: {expected_url_3}\nEND\n" | ||||
|  | ||||
|     expected_url4 = ( | ||||
|         "https://web.archive.org/web/20171128185327/" | ||||
|         "https://www.scribbr.com/citing-sources/et-al/" | ||||
|     ) | ||||
|     assert save_api.archive_url_parser() == expected_url4 | ||||
|  | ||||
|     save_api.headers_str = ( | ||||
|         "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING" | ||||
|     ) | ||||
|     save_api.response_url = "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al" | ||||
|     assert ( | ||||
|         save_api.archive_url_parser() | ||||
|         == "https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al" | ||||
|     save_api.response_url = ( | ||||
|         "https://web.archive.org/web/20171128185327/" | ||||
|         "https://www.scribbr.com/citing-sources/et-al" | ||||
|     ) | ||||
|     expected_url5 = ( | ||||
|         "https://web.archive.org/web/20171128185327/" | ||||
|         "https://www.scribbr.com/citing-sources/et-al" | ||||
|     ) | ||||
|     assert save_api.archive_url_parser() == expected_url5 | ||||
|  | ||||
|  | ||||
| def test_archive_url() -> None: | ||||
| @@ -130,7 +207,10 @@ def test_archive_url() -> None: | ||||
|     by the archive_url method which is an attribute due to @property. | ||||
|     """ | ||||
|     url = "https://example.com" | ||||
|     user_agent = "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     user_agent = ( | ||||
|         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " | ||||
|         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" | ||||
|     ) | ||||
|     save_api = WaybackMachineSaveAPI(url, user_agent) | ||||
|     save_api.saved_archive = ( | ||||
|         "https://web.archive.org/web/20220124063056/https://example.com/" | ||||
|   | ||||
| @@ -142,8 +142,8 @@ class WaybackMachineAvailabilityAPI(object): | ||||
|             if not data or not data["archived_snapshots"]: | ||||
|                 raise ArchiveNotInAvailabilityAPIResponse( | ||||
|                     "Archive not found in the availability " | ||||
|                     "API response, the URL you requested may not have any " | ||||
|                     "archives yet. You may retry after some time or archive the webpage now.\n" | ||||
|                     "API response, the URL you requested may not have any archives " | ||||
|                     "yet. You may retry after some time or archive the webpage now.\n" | ||||
|                     f"Response data:\n{self.response.text}" | ||||
|                 ) | ||||
|         else: | ||||
| @@ -196,7 +196,8 @@ class WaybackMachineAvailabilityAPI(object): | ||||
|         unix_timestamp_to_wayback_timestamp or wayback_timestamp method with | ||||
|         appropriate arguments for their respective parameters. | ||||
|         Adds the timestamp to the payload dictionary. | ||||
|         And finally invoking the json method to make the API call then returns the instance. | ||||
|         Finally, invokes the json method to make the API call, then returns | ||||
|         the instance. | ||||
|         """ | ||||
|         if unix_timestamp: | ||||
|             timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp) | ||||
|   | ||||
| @@ -177,8 +177,8 @@ class WaybackMachineCDXServerAPI(object): | ||||
|  | ||||
|                 if prop_values_len != properties_len: | ||||
|                     raise WaybackError( | ||||
|                         f"Snapshot returned by Cdx API has {prop_values_len} properties " | ||||
|                         f"instead of expected {properties_len} properties.\n" | ||||
|                         f"Snapshot returned by Cdx API has {prop_values_len} " | ||||
|                         f"properties instead of expected {properties_len} properties.\n" | ||||
|                         f"Problematic Snapshot: {snapshot}" | ||||
|                     ) | ||||
|  | ||||
|   | ||||
| @@ -69,7 +69,8 @@ def check_filters(filters: List[str]) -> None: | ||||
|     # [!]field:regex | ||||
|     for _filter in filters: | ||||
|         match = re.search( | ||||
|             r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)", | ||||
|             r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):" | ||||
|             r"(.*)", | ||||
|             _filter, | ||||
|         ) | ||||
|  | ||||
| @@ -87,11 +88,15 @@ def check_collapses(collapses: List[str]) -> bool: | ||||
|  | ||||
|     for collapse in collapses: | ||||
|         match = re.search( | ||||
|             r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?", | ||||
|             r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)" | ||||
|             r"(:?[0-9]{1,99})?", | ||||
|             collapse, | ||||
|         ) | ||||
|         if match is None or len(match.groups()) != 2: | ||||
|             exc_message = f"collapse argument '{collapse}' is not following the cdx collapse syntax." | ||||
|             exc_message = ( | ||||
|                 f"collapse argument '{collapse}' " | ||||
|                 "is not following the cdx collapse syntax." | ||||
|             ) | ||||
|             raise WaybackError(exc_message) | ||||
|     else: | ||||
|         return True | ||||
| @@ -106,7 +111,10 @@ def check_match_type(match_type: Optional[str], url: str) -> bool: | ||||
|             "Can not use wildcard in the URL along with the match_type arguments." | ||||
|         ) | ||||
|     elif match_type not in legal_match_type: | ||||
|         exc_message = f"{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'" | ||||
|         exc_message = ( | ||||
|             f"{match_type} is not an allowed match type.\n" | ||||
|             "Use one from 'exact', 'prefix', 'host' or 'domain'" | ||||
|         ) | ||||
|         raise WaybackError(exc_message) | ||||
|     else: | ||||
|         return True | ||||
|   | ||||
| @@ -241,7 +241,8 @@ def main( | ||||
|         and not cdx | ||||
|     ): | ||||
|         click.echo( | ||||
|             "Only URL passed, but did not specify what to do with the URL. Use --help flag for help using waybackpy." | ||||
|             "Only URL passed, but did not specify what to do with the URL. " | ||||
|             "Use --help flag for help using waybackpy." | ||||
|         ) | ||||
|         return | ||||
|  | ||||
|   | ||||
| @@ -72,7 +72,6 @@ class WaybackMachineSaveAPI(object): | ||||
|         self.response = session.get(self.request_url, headers=self.request_headers) | ||||
|         # requests.response.headers is requests.structures.CaseInsensitiveDict | ||||
|         self.headers = self.response.headers | ||||
|         self.headers_str = str(self.headers) | ||||
|         self.status_code = self.response.status_code | ||||
|         self.response_url = self.response.url | ||||
|         session.close() | ||||
| @@ -85,17 +84,17 @@ class WaybackMachineSaveAPI(object): | ||||
|         """ | ||||
|  | ||||
|         regex1 = r"Content-Location: (/web/[0-9]{14}/.*)" | ||||
|         match = re.search(regex1, self.headers_str) | ||||
|         match = re.search(regex1, str(self.headers)) | ||||
|         if match: | ||||
|             return "https://web.archive.org" + match.group(1) | ||||
|  | ||||
|         regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>" | ||||
|         match = re.search(regex2, self.headers_str) | ||||
|         match = re.search(regex2, str(self.headers)) | ||||
|         if match is not None and len(match.groups()) == 1: | ||||
|             return "https://" + match.group(1) | ||||
|  | ||||
|         regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}" | ||||
|         match = re.search(regex3, self.headers_str) | ||||
|         match = re.search(regex3, str(self.headers)) | ||||
|         if match is not None and len(match.groups()) == 1: | ||||
|             return "https" + match.group(1) | ||||
|  | ||||
| @@ -132,10 +131,11 @@ class WaybackMachineSaveAPI(object): | ||||
|         Also check the timestamp in the archive URL and compare it to the instance | ||||
|         birth time. | ||||
|  | ||||
|         If time on the archive is older than the instance creation time set the cached_save | ||||
|         to True else set it to False. The flag can be used to check if the Wayback Machine | ||||
|         didn't serve a Cached URL. It is quite common for the Wayback Machine to serve | ||||
|         cached archive if last archive was captured before last 45 minutes. | ||||
|         If time on the archive is older than the instance creation time set the | ||||
|         cached_save to True else set it to False. The flag can be used to check | ||||
|         if the Wayback Machine didn't serve a Cached URL. It is quite common for | ||||
|         the Wayback Machine to serve cached archive if last archive was captured | ||||
|         before last 45 minutes. | ||||
|         """ | ||||
|         regex = r"https?://web\.archive.org/web/([0-9]{14})/http" | ||||
|         m = re.search(regex, str(self._archive_url)) | ||||
| @@ -167,7 +167,7 @@ class WaybackMachineSaveAPI(object): | ||||
|         tries = 0 | ||||
|  | ||||
|         while True: | ||||
|             if not self.saved_archive: | ||||
|             if self.saved_archive is None: | ||||
|                 if tries >= 1: | ||||
|                     self.sleep(tries) | ||||
|  | ||||
| @@ -182,7 +182,8 @@ class WaybackMachineSaveAPI(object): | ||||
|             tries += 1 | ||||
|             if tries >= self.max_tries: | ||||
|                 raise MaximumSaveRetriesExceeded( | ||||
|                     f"Tried {str(tries)} times but failed to save and retrieve the archive for {self.url}.\n" | ||||
|                     f"Tried {tries} times but failed to save " | ||||
|                     f"and retrieve the archive for {self.url}.\n" | ||||
|                     f"Response URL:\n{self.response_url}\n" | ||||
|                     f"Response Header:\n{self.headers_str}" | ||||
|                     f"Response Header:\n{self.headers}" | ||||
|                 ) | ||||
|   | ||||
| @@ -15,8 +15,8 @@ The reason it is still in the code is backwards compatibility with 2.x.x version | ||||
| If you were using the Url before the update to version 3.x.x, your code should still be | ||||
| working fine and there is no hurry to update the interface but is recommended that you | ||||
| do not use the Url class for new code as it would be removed after 2025 also the first | ||||
| 3.x.x versions was released in January 2022 and three years are more than enough to update | ||||
| the older interface code. | ||||
| 3.x.x version was released in January 2022 and three years are more than enough to | ||||
| update the older interface code. | ||||
| """ | ||||
|  | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user