Improve pylint score (#142)
* fix: errors to improve pylint scores * fix: test * fix * add: flake8 ignore rule to pep8speaks conf * fix * add: test patterns to deepsource conf
This commit is contained in:
parent
d3a8f343f8
commit
0b631592ea
@ -3,6 +3,9 @@ version = 1
|
|||||||
[[analyzers]]
|
[[analyzers]]
|
||||||
name = "python"
|
name = "python"
|
||||||
enabled = true
|
enabled = true
|
||||||
|
test_patterns = [
|
||||||
|
"tests/**",
|
||||||
|
"test_*.py"
|
||||||
|
]
|
||||||
[analyzers.meta]
|
[analyzers.meta]
|
||||||
runtime_version = "3.x.x"
|
runtime_version = "3.x.x"
|
@ -4,3 +4,4 @@ scanner:
|
|||||||
|
|
||||||
flake8:
|
flake8:
|
||||||
max-line-length: 88
|
max-line-length: 88
|
||||||
|
extend-ignore: W503,W605
|
||||||
|
@ -65,7 +65,7 @@ profile = black
|
|||||||
[flake8]
|
[flake8]
|
||||||
indent-size = 4
|
indent-size = 4
|
||||||
max-line-length = 88
|
max-line-length = 88
|
||||||
extend-ignore = W605
|
extend-ignore = W503,W605
|
||||||
|
|
||||||
[mypy]
|
[mypy]
|
||||||
python_version = 3.9
|
python_version = 3.9
|
||||||
@ -84,7 +84,3 @@ addopts =
|
|||||||
--cov-report=html
|
--cov-report=html
|
||||||
testpaths =
|
testpaths =
|
||||||
tests
|
tests
|
||||||
|
|
||||||
[pycodestyle]
|
|
||||||
# for `license` and `filter in `waybackpy.cli.main`
|
|
||||||
ignore = W0622
|
|
||||||
|
@ -40,8 +40,8 @@ def test_oldest() -> None:
|
|||||||
oldest_timestamp = oldest.timestamp()
|
oldest_timestamp = oldest.timestamp()
|
||||||
assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years
|
assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years
|
||||||
assert (
|
assert (
|
||||||
availability_api.JSON is not None
|
availability_api.json is not None
|
||||||
and availability_api.JSON["archived_snapshots"]["closest"]["available"] is True
|
and availability_api.json["archived_snapshots"]["closest"]["available"] is True
|
||||||
)
|
)
|
||||||
assert repr(oldest).find("example.com") != -1
|
assert repr(oldest).find("example.com") != -1
|
||||||
assert "2002" in str(oldest)
|
assert "2002" in str(oldest)
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
"""Module initializer and provider of static infomation."""
|
||||||
|
|
||||||
__title__ = "waybackpy"
|
__title__ = "waybackpy"
|
||||||
__description__ = (
|
__description__ = (
|
||||||
"Python package that interfaces with the Internet Archive's Wayback Machine APIs. "
|
"Python package that interfaces with the Internet Archive's Wayback Machine APIs. "
|
||||||
|
@ -37,7 +37,7 @@ from .utils import DEFAULT_USER_AGENT
|
|||||||
ResponseJSON = Dict[str, Any]
|
ResponseJSON = Dict[str, Any]
|
||||||
|
|
||||||
|
|
||||||
class WaybackMachineAvailabilityAPI(object):
|
class WaybackMachineAvailabilityAPI:
|
||||||
"""
|
"""
|
||||||
Class that interfaces the availability API of the Wayback Machine.
|
Class that interfaces the availability API of the Wayback Machine.
|
||||||
"""
|
"""
|
||||||
@ -55,7 +55,8 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
self.tries: int = 0
|
self.tries: int = 0
|
||||||
self.last_api_call_unix_time: int = int(time.time())
|
self.last_api_call_unix_time: int = int(time.time())
|
||||||
self.api_call_time_gap: int = 5
|
self.api_call_time_gap: int = 5
|
||||||
self.JSON: Optional[ResponseJSON] = None
|
self.json: Optional[ResponseJSON] = None
|
||||||
|
self.response: Optional[Response] = None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
|
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
|
||||||
@ -83,12 +84,12 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
# String should not return anything other than a string object
|
# String should not return anything other than a string object
|
||||||
# So, if a string repr is asked for before making any API requests
|
# So, if a string repr is asked for before making any API requests
|
||||||
# just return ""
|
# just return ""
|
||||||
if not self.JSON:
|
if not self.json:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
return self.archive_url
|
return self.archive_url
|
||||||
|
|
||||||
def json(self) -> Optional[ResponseJSON]:
|
def setup_json(self) -> Optional[ResponseJSON]:
|
||||||
"""
|
"""
|
||||||
Makes the API call to the availability API and set the JSON response
|
Makes the API call to the availability API and set the JSON response
|
||||||
to the JSON attribute of the instance and also returns the JSON
|
to the json attribute of the instance and also returns the JSON
|
||||||
@ -109,19 +110,19 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
if sleep_time > 0:
|
if sleep_time > 0:
|
||||||
time.sleep(sleep_time)
|
time.sleep(sleep_time)
|
||||||
|
|
||||||
self.response: Response = requests.get(
|
self.response = requests.get(
|
||||||
self.endpoint, params=self.payload, headers=self.headers
|
self.endpoint, params=self.payload, headers=self.headers
|
||||||
)
|
)
|
||||||
self.last_api_call_unix_time = int(time.time())
|
self.last_api_call_unix_time = int(time.time())
|
||||||
self.tries += 1
|
self.tries += 1
|
||||||
try:
|
try:
|
||||||
self.JSON = self.response.json()
|
self.json = None if self.response is None else self.response.json()
|
||||||
except json.decoder.JSONDecodeError as json_decode_error:
|
except json.decoder.JSONDecodeError as json_decode_error:
|
||||||
raise InvalidJSONInAvailabilityAPIResponse(
|
raise InvalidJSONInAvailabilityAPIResponse(
|
||||||
f"Response data:\n{self.response.text}"
|
f"Response data:\n{self.response.text}"
|
||||||
) from json_decode_error
|
) from json_decode_error
|
||||||
|
|
||||||
return self.JSON
|
return self.json
|
||||||
|
|
||||||
def timestamp(self) -> datetime:
|
def timestamp(self) -> datetime:
|
||||||
"""
|
"""
|
||||||
@ -136,19 +137,19 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
guaranteed that you can get the datetime object from the timestamp.
|
guaranteed that you can get the datetime object from the timestamp.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.JSON is None or "archived_snapshots" not in self.JSON:
|
if self.json is None or "archived_snapshots" not in self.json:
|
||||||
return datetime.max
|
return datetime.max
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self.JSON is not None
|
self.json is not None
|
||||||
and "archived_snapshots" in self.JSON
|
and "archived_snapshots" in self.json
|
||||||
and self.JSON["archived_snapshots"] is not None
|
and self.json["archived_snapshots"] is not None
|
||||||
and "closest" in self.JSON["archived_snapshots"]
|
and "closest" in self.json["archived_snapshots"]
|
||||||
and self.JSON["archived_snapshots"]["closest"] is not None
|
and self.json["archived_snapshots"]["closest"] is not None
|
||||||
and "timestamp" in self.JSON["archived_snapshots"]["closest"]
|
and "timestamp" in self.json["archived_snapshots"]["closest"]
|
||||||
):
|
):
|
||||||
return datetime.strptime(
|
return datetime.strptime(
|
||||||
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
|
self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
|
||||||
)
|
)
|
||||||
|
|
||||||
raise ValueError("Could not get timestamp from result")
|
raise ValueError("Could not get timestamp from result")
|
||||||
@ -162,7 +163,7 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
archive_url = ""
|
archive_url = ""
|
||||||
data = self.JSON
|
data = self.json
|
||||||
|
|
||||||
# If the user didn't invoke oldest, newest or near but tries to access the
|
# If the user didn't invoke oldest, newest or near but tries to access the
|
||||||
# archive_url attribute then assume they are fine with any archive
|
# archive_url attribute then assume they are fine with any archive
|
||||||
@ -176,8 +177,8 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
while (self.tries < self.max_tries) and (
|
while (self.tries < self.max_tries) and (
|
||||||
not data or not data["archived_snapshots"]
|
not data or not data["archived_snapshots"]
|
||||||
):
|
):
|
||||||
self.json() # It makes a new API call
|
self.setup_json() # It makes a new API call
|
||||||
data = self.JSON # json() updated the value of JSON attribute
|
data = self.json # setup_json() updated the value of the json attribute
|
||||||
|
|
||||||
# If we exhausted the max_tries, then we give up and
|
# If we exhausted the max_tries, then we give up and
|
||||||
# raise exception.
|
# raise exception.
|
||||||
@ -187,7 +188,10 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
"Archive not found in the availability "
|
"Archive not found in the availability "
|
||||||
"API response, the URL you requested may not have any archives "
|
"API response, the URL you requested may not have any archives "
|
||||||
"yet. You may retry after some time or archive the webpage now.\n"
|
"yet. You may retry after some time or archive the webpage now.\n"
|
||||||
f"Response data:\n{self.response.text}"
|
"Response data:\n"
|
||||||
|
""
|
||||||
|
if self.response is None
|
||||||
|
else self.response.text
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||||
@ -262,5 +266,5 @@ class WaybackMachineAvailabilityAPI(object):
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.payload["timestamp"] = timestamp
|
self.payload["timestamp"] = timestamp
|
||||||
self.json()
|
self.setup_json()
|
||||||
return self
|
return self
|
||||||
|
@ -24,7 +24,7 @@ from .exceptions import WaybackError
|
|||||||
from .utils import DEFAULT_USER_AGENT
|
from .utils import DEFAULT_USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
class WaybackMachineCDXServerAPI(object):
|
class WaybackMachineCDXServerAPI:
|
||||||
"""
|
"""
|
||||||
Class that interfaces the CDX server API of the Wayback Machine.
|
Class that interfaces the CDX server API of the Wayback Machine.
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ from datetime import datetime
|
|||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
|
|
||||||
class CDXSnapshot(object):
|
class CDXSnapshot:
|
||||||
"""
|
"""
|
||||||
Class for the CDX snapshot lines('record') returned by the CDX API,
|
Class for the CDX snapshot lines('record') returned by the CDX API,
|
||||||
Each valid line of the CDX API is casted to an CDXSnapshot object
|
Each valid line of the CDX API is casted to an CDXSnapshot object
|
||||||
|
@ -2,11 +2,11 @@
|
|||||||
Module that makes waybackpy a CLI tool.
|
Module that makes waybackpy a CLI tool.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json as JSON
|
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
import string
|
import string
|
||||||
|
from json import dumps
|
||||||
from typing import Generator, List, Optional
|
from typing import Generator, List, Optional
|
||||||
|
|
||||||
import click
|
import click
|
||||||
@ -40,7 +40,7 @@ def echo_availability_api(
|
|||||||
click.echo(archive_url)
|
click.echo(archive_url)
|
||||||
if json:
|
if json:
|
||||||
click.echo("JSON response:")
|
click.echo("JSON response:")
|
||||||
click.echo(JSON.dumps(availability_api_instance.JSON))
|
click.echo(dumps(availability_api_instance.json))
|
||||||
|
|
||||||
|
|
||||||
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
|
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
|
||||||
@ -63,7 +63,7 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
|
|||||||
domain = "domain-unknown" if match is None else match.group(1)
|
domain = "domain-unknown" if match is None else match.group(1)
|
||||||
file_name = f"{domain}-urls-{uid}.txt"
|
file_name = f"{domain}-urls-{uid}.txt"
|
||||||
file_path = os.path.join(os.getcwd(), file_name)
|
file_path = os.path.join(os.getcwd(), file_name)
|
||||||
with open(file_path, "a") as file:
|
with open(file_path, "a", encoding="UTF-8") as file:
|
||||||
file.write(f"{url}\n")
|
file.write(f"{url}\n")
|
||||||
|
|
||||||
click.echo(url)
|
click.echo(url)
|
||||||
@ -345,8 +345,8 @@ def main( # pylint: disable=no-value-for-parameter
|
|||||||
if file:
|
if file:
|
||||||
return save_urls_on_file(url_gen)
|
return save_urls_on_file(url_gen)
|
||||||
|
|
||||||
for url in url_gen:
|
for url_ in url_gen:
|
||||||
click.echo(url)
|
click.echo(url_)
|
||||||
|
|
||||||
elif cdx:
|
elif cdx:
|
||||||
filters = list(cdx_filter)
|
filters = list(cdx_filter)
|
||||||
|
@ -12,6 +12,7 @@ from typing import Dict, Optional
|
|||||||
|
|
||||||
import requests
|
import requests
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
|
from requests.models import Response
|
||||||
from requests.structures import CaseInsensitiveDict
|
from requests.structures import CaseInsensitiveDict
|
||||||
from urllib3.util.retry import Retry
|
from urllib3.util.retry import Retry
|
||||||
|
|
||||||
@ -19,7 +20,7 @@ from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, Waybac
|
|||||||
from .utils import DEFAULT_USER_AGENT
|
from .utils import DEFAULT_USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
class WaybackMachineSaveAPI(object):
|
class WaybackMachineSaveAPI:
|
||||||
"""
|
"""
|
||||||
WaybackMachineSaveAPI class provides an interface for saving URLs on the
|
WaybackMachineSaveAPI class provides an interface for saving URLs on the
|
||||||
Wayback Machine.
|
Wayback Machine.
|
||||||
@ -43,6 +44,12 @@ class WaybackMachineSaveAPI(object):
|
|||||||
self.status_forcelist = [500, 502, 503, 504]
|
self.status_forcelist = [500, 502, 503, 504]
|
||||||
self._archive_url: Optional[str] = None
|
self._archive_url: Optional[str] = None
|
||||||
self.instance_birth_time = datetime.utcnow()
|
self.instance_birth_time = datetime.utcnow()
|
||||||
|
self.response: Optional[Response] = None
|
||||||
|
self.headers: Optional[CaseInsensitiveDict[str]] = None
|
||||||
|
self.status_code: Optional[int] = None
|
||||||
|
self.response_url: Optional[str] = None
|
||||||
|
self.cached_save: Optional[bool] = None
|
||||||
|
self.saved_archive: Optional[str] = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def archive_url(self) -> str:
|
def archive_url(self) -> str:
|
||||||
@ -83,7 +90,7 @@ class WaybackMachineSaveAPI(object):
|
|||||||
session.mount("https://", HTTPAdapter(max_retries=retries))
|
session.mount("https://", HTTPAdapter(max_retries=retries))
|
||||||
self.response = session.get(self.request_url, headers=self.request_headers)
|
self.response = session.get(self.request_url, headers=self.request_headers)
|
||||||
# requests.response.headers is requests.structures.CaseInsensitiveDict
|
# requests.response.headers is requests.structures.CaseInsensitiveDict
|
||||||
self.headers: CaseInsensitiveDict[str] = self.response.headers
|
self.headers = self.response.headers
|
||||||
self.status_code = self.response.status_code
|
self.status_code = self.response.status_code
|
||||||
self.response_url = self.response.url
|
self.response_url = self.response.url
|
||||||
session.close()
|
session.close()
|
||||||
@ -129,7 +136,9 @@ class WaybackMachineSaveAPI(object):
|
|||||||
if match is not None and len(match.groups()) == 1:
|
if match is not None and len(match.groups()) == 1:
|
||||||
return "https" + match.group(1)
|
return "https" + match.group(1)
|
||||||
|
|
||||||
self.response_url = self.response_url.strip()
|
self.response_url = (
|
||||||
|
"" if self.response_url is None else self.response_url.strip()
|
||||||
|
)
|
||||||
regex4 = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$"
|
regex4 = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$"
|
||||||
match = re.search(regex4, self.response_url)
|
match = re.search(regex4, self.response_url)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
|
@ -7,13 +7,15 @@ the Url class.
|
|||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import Generator, Optional
|
from typing import Generator, Optional
|
||||||
|
|
||||||
from .availability_api import WaybackMachineAvailabilityAPI
|
from requests.structures import CaseInsensitiveDict
|
||||||
|
|
||||||
|
from .availability_api import ResponseJSON, WaybackMachineAvailabilityAPI
|
||||||
from .cdx_api import WaybackMachineCDXServerAPI
|
from .cdx_api import WaybackMachineCDXServerAPI
|
||||||
from .save_api import WaybackMachineSaveAPI
|
from .save_api import WaybackMachineSaveAPI
|
||||||
from .utils import DEFAULT_USER_AGENT
|
from .utils import DEFAULT_USER_AGENT
|
||||||
|
|
||||||
|
|
||||||
class Url(object):
|
class Url:
|
||||||
"""
|
"""
|
||||||
The Url class is not recommended to be used anymore, instead use:
|
The Url class is not recommended to be used anymore, instead use:
|
||||||
|
|
||||||
@ -39,6 +41,9 @@ class Url(object):
|
|||||||
self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
|
self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
|
||||||
self.url, user_agent=self.user_agent
|
self.url, user_agent=self.user_agent
|
||||||
)
|
)
|
||||||
|
self.wayback_machine_save_api: Optional[WaybackMachineSaveAPI] = None
|
||||||
|
self.headers: Optional[CaseInsensitiveDict[str]] = None
|
||||||
|
self.json: Optional[ResponseJSON] = None
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
if not self.archive_url:
|
if not self.archive_url:
|
||||||
@ -107,7 +112,7 @@ class Url(object):
|
|||||||
def set_availability_api_attrs(self) -> None:
|
def set_availability_api_attrs(self) -> None:
|
||||||
"""Set the attributes for total backwards compatibility."""
|
"""Set the attributes for total backwards compatibility."""
|
||||||
self.archive_url = self.wayback_machine_availability_api.archive_url
|
self.archive_url = self.wayback_machine_availability_api.archive_url
|
||||||
self.JSON = self.wayback_machine_availability_api.JSON
|
self.json = self.wayback_machine_availability_api.json
|
||||||
self.timestamp = self.wayback_machine_availability_api.timestamp()
|
self.timestamp = self.wayback_machine_availability_api.timestamp()
|
||||||
|
|
||||||
def total_archives(
|
def total_archives(
|
||||||
|
Loading…
Reference in New Issue
Block a user