diff --git a/.deepsource.toml b/.deepsource.toml index aebeb0f..d0c19ff 100644 --- a/.deepsource.toml +++ b/.deepsource.toml @@ -3,6 +3,9 @@ version = 1 [[analyzers]] name = "python" enabled = true - +test_patterns = [ + "tests/**", + "test_*.py" +] [analyzers.meta] - runtime_version = "3.x.x" \ No newline at end of file + runtime_version = "3.x.x" diff --git a/.pep8speaks.yml b/.pep8speaks.yml index 298e3fb..249d4eb 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -4,3 +4,4 @@ scanner: flake8: max-line-length: 88 + extend-ignore: W503,W605 diff --git a/setup.cfg b/setup.cfg index a084b14..7fe3424 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,7 +65,7 @@ profile = black [flake8] indent-size = 4 max-line-length = 88 -extend-ignore = W605 +extend-ignore = W503,W605 [mypy] python_version = 3.9 @@ -84,7 +84,3 @@ addopts = --cov-report=html testpaths = tests - -[pycodestyle] -# for `license` and `filter in `waybackpy.cli.main` -ignore = W0622 diff --git a/tests/test_availability_api.py b/tests/test_availability_api.py index 42803df..e74854b 100644 --- a/tests/test_availability_api.py +++ b/tests/test_availability_api.py @@ -40,8 +40,8 @@ def test_oldest() -> None: oldest_timestamp = oldest.timestamp() assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years assert ( - availability_api.JSON is not None - and availability_api.JSON["archived_snapshots"]["closest"]["available"] is True + availability_api.json is not None + and availability_api.json["archived_snapshots"]["closest"]["available"] is True ) assert repr(oldest).find("example.com") != -1 assert "2002" in str(oldest) diff --git a/waybackpy/__init__.py b/waybackpy/__init__.py index e73471b..4052148 100644 --- a/waybackpy/__init__.py +++ b/waybackpy/__init__.py @@ -1,3 +1,5 @@ +"""Module initializer and provider of static information.""" + __title__ = "waybackpy" __description__ = ( "Python package that interfaces with the Internet Archive's Wayback Machine APIs. 
" diff --git a/waybackpy/availability_api.py b/waybackpy/availability_api.py index 6ef5b53..3864da4 100644 --- a/waybackpy/availability_api.py +++ b/waybackpy/availability_api.py @@ -37,7 +37,7 @@ from .utils import DEFAULT_USER_AGENT ResponseJSON = Dict[str, Any] -class WaybackMachineAvailabilityAPI(object): +class WaybackMachineAvailabilityAPI: """ Class that interfaces the availability API of the Wayback Machine. """ @@ -55,7 +55,8 @@ class WaybackMachineAvailabilityAPI(object): self.tries: int = 0 self.last_api_call_unix_time: int = int(time.time()) self.api_call_time_gap: int = 5 - self.JSON: Optional[ResponseJSON] = None + self.json: Optional[ResponseJSON] = None + self.response: Optional[Response] = None @staticmethod def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str: @@ -83,12 +84,12 @@ class WaybackMachineAvailabilityAPI(object): # String should not return anything other than a string object # So, if a string repr is asked for before making any API requests # just return "" - if not self.JSON: + if not self.json: return "" return self.archive_url - def json(self) -> Optional[ResponseJSON]: + def setup_json(self) -> Optional[ResponseJSON]: """ Makes the API call to the availability API and set the JSON response to the JSON attribute of the instance and also returns the JSON @@ -109,19 +110,19 @@ class WaybackMachineAvailabilityAPI(object): if sleep_time > 0: time.sleep(sleep_time) - self.response: Response = requests.get( + self.response = requests.get( self.endpoint, params=self.payload, headers=self.headers ) self.last_api_call_unix_time = int(time.time()) self.tries += 1 try: - self.JSON = self.response.json() + self.json = None if self.response is None else self.response.json() except json.decoder.JSONDecodeError as json_decode_error: raise InvalidJSONInAvailabilityAPIResponse( f"Response data:\n{self.response.text}" ) from json_decode_error - return self.JSON + return self.json def timestamp(self) -> datetime: """ @@ -136,19 +137,19 
@@ class WaybackMachineAvailabilityAPI(object): guaranteed that you can get the datetime object from the timestamp. """ - if self.JSON is None or "archived_snapshots" not in self.JSON: + if self.json is None or "archived_snapshots" not in self.json: return datetime.max if ( - self.JSON is not None - and "archived_snapshots" in self.JSON - and self.JSON["archived_snapshots"] is not None - and "closest" in self.JSON["archived_snapshots"] - and self.JSON["archived_snapshots"]["closest"] is not None - and "timestamp" in self.JSON["archived_snapshots"]["closest"] + self.json is not None + and "archived_snapshots" in self.json + and self.json["archived_snapshots"] is not None + and "closest" in self.json["archived_snapshots"] + and self.json["archived_snapshots"]["closest"] is not None + and "timestamp" in self.json["archived_snapshots"]["closest"] ): return datetime.strptime( - self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" + self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" ) raise ValueError("Could not get timestamp from result") @@ -162,7 +163,7 @@ class WaybackMachineAvailabilityAPI(object): """ archive_url = "" - data = self.JSON + data = self.json # If the user didn't invoke oldest, newest or near but tries to access the # archive_url attribute then assume they are fine with any archive @@ -176,8 +177,8 @@ class WaybackMachineAvailabilityAPI(object): while (self.tries < self.max_tries) and ( not data or not data["archived_snapshots"] ): - self.json() # It makes a new API call - data = self.JSON # json() updated the value of JSON attribute + self.setup_json() # It makes a new API call + data = self.json # setup_json() updated the value of the json attribute # If we exhausted the max_tries, then we give up and # raise exception. @@ -187,7 +188,10 @@ class WaybackMachineAvailabilityAPI(object): "Archive not found in the availability " "API response, the URL you requested may not have any archives " "yet. 
You may retry after some time or archive the webpage now.\n" - f"Response data:\n{self.response.text}" + "Response data:\n" + + ( + "" if self.response is None else self.response.text + ) ) else: archive_url = data["archived_snapshots"]["closest"]["url"] @@ -262,5 +266,5 @@ class WaybackMachineAvailabilityAPI(object): ) self.payload["timestamp"] = timestamp - self.json() + self.setup_json() return self diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py index 97cc908..bfb474e 100644 --- a/waybackpy/cdx_api.py +++ b/waybackpy/cdx_api.py @@ -24,7 +24,7 @@ from .exceptions import WaybackError from .utils import DEFAULT_USER_AGENT -class WaybackMachineCDXServerAPI(object): +class WaybackMachineCDXServerAPI: """ Class that interfaces the CDX server API of the Wayback Machine. diff --git a/waybackpy/cdx_snapshot.py b/waybackpy/cdx_snapshot.py index 9cf9610..d7a4a16 100644 --- a/waybackpy/cdx_snapshot.py +++ b/waybackpy/cdx_snapshot.py @@ -11,7 +11,7 @@ from datetime import datetime from typing import Dict -class CDXSnapshot(object): +class CDXSnapshot: """ Class for the CDX snapshot lines('record') returned by the CDX API, Each valid line of the CDX API is casted to an CDXSnapshot object diff --git a/waybackpy/cli.py b/waybackpy/cli.py index 04daa82..bf01f25 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -2,11 +2,11 @@ Module that makes waybackpy a CLI tool. 
""" -import json as JSON import os import random import re import string +from json import dumps from typing import Generator, List, Optional import click @@ -40,7 +40,7 @@ def echo_availability_api( click.echo(archive_url) if json: click.echo("JSON response:") - click.echo(JSON.dumps(availability_api_instance.JSON)) + click.echo(dumps(availability_api_instance.json)) def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: @@ -63,7 +63,7 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None: domain = "domain-unknown" if match is None else match.group(1) file_name = f"{domain}-urls-{uid}.txt" file_path = os.path.join(os.getcwd(), file_name) - with open(file_path, "a") as file: + with open(file_path, "a", encoding="UTF-8") as file: file.write(f"{url}\n") click.echo(url) @@ -345,8 +345,8 @@ def main( # pylint: disable=no-value-for-parameter if file: return save_urls_on_file(url_gen) - for url in url_gen: - click.echo(url) + for url_ in url_gen: + click.echo(url_) elif cdx: filters = list(cdx_filter) diff --git a/waybackpy/save_api.py b/waybackpy/save_api.py index e0d5cef..d80f477 100644 --- a/waybackpy/save_api.py +++ b/waybackpy/save_api.py @@ -12,6 +12,7 @@ from typing import Dict, Optional import requests from requests.adapters import HTTPAdapter +from requests.models import Response from requests.structures import CaseInsensitiveDict from urllib3.util.retry import Retry @@ -19,7 +20,7 @@ from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, Waybac from .utils import DEFAULT_USER_AGENT -class WaybackMachineSaveAPI(object): +class WaybackMachineSaveAPI: """ WaybackMachineSaveAPI class provides an interface for saving URLs on the Wayback Machine. 
@@ -43,6 +44,12 @@ class WaybackMachineSaveAPI(object): self.status_forcelist = [500, 502, 503, 504] self._archive_url: Optional[str] = None self.instance_birth_time = datetime.utcnow() + self.response: Optional[Response] = None + self.headers: Optional[CaseInsensitiveDict[str]] = None + self.status_code: Optional[int] = None + self.response_url: Optional[str] = None + self.cached_save: Optional[bool] = None + self.saved_archive: Optional[str] = None @property def archive_url(self) -> str: @@ -83,7 +90,7 @@ class WaybackMachineSaveAPI(object): session.mount("https://", HTTPAdapter(max_retries=retries)) self.response = session.get(self.request_url, headers=self.request_headers) # requests.response.headers is requests.structures.CaseInsensitiveDict - self.headers: CaseInsensitiveDict[str] = self.response.headers + self.headers = self.response.headers self.status_code = self.response.status_code self.response_url = self.response.url session.close() @@ -129,7 +136,9 @@ class WaybackMachineSaveAPI(object): if match is not None and len(match.groups()) == 1: return "https" + match.group(1) - self.response_url = self.response_url.strip() + self.response_url = ( + "" if self.response_url is None else self.response_url.strip() + ) regex4 = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$" match = re.search(regex4, self.response_url) if match is not None: diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 6c7b0ce..21b6782 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -7,13 +7,15 @@ the Url class. 
from datetime import datetime, timedelta from typing import Generator, Optional -from .availability_api import WaybackMachineAvailabilityAPI +from requests.structures import CaseInsensitiveDict + +from .availability_api import ResponseJSON, WaybackMachineAvailabilityAPI from .cdx_api import WaybackMachineCDXServerAPI from .save_api import WaybackMachineSaveAPI from .utils import DEFAULT_USER_AGENT -class Url(object): +class Url: """ The Url class is not recommended to be used anymore, instead use: @@ -39,6 +41,9 @@ class Url(object): self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI( self.url, user_agent=self.user_agent ) + self.wayback_machine_save_api: Optional[WaybackMachineSaveAPI] = None + self.headers: Optional[CaseInsensitiveDict[str]] = None + self.json: Optional[ResponseJSON] = None def __str__(self) -> str: if not self.archive_url: @@ -107,7 +112,7 @@ class Url(object): def set_availability_api_attrs(self) -> None: """Set the attributes for total backwards compatibility.""" self.archive_url = self.wayback_machine_availability_api.archive_url - self.JSON = self.wayback_machine_availability_api.JSON + self.json = self.wayback_machine_availability_api.json self.timestamp = self.wayback_machine_availability_api.timestamp() def total_archives(