added docstrings, added some static type hints and also lint. (#141)
* added docstrings, added some static type hints and also lint. * added doc strings and changed some internal variable names for more clarity. * make flake8 happy * add descriptive docstrings and type hints in waybackpy/cdx_snapshot.py * remove useless code and add docstrings and also lint using pylint. * remove unwarranted test * added docstrings, lint using pylint and add a raise on 509 SC * added docstrings and lint with pylint * lint * add doc strings and lint * add docstrings and lint
This commit is contained in:
		@@ -1,9 +1,32 @@
 | 
			
		||||
"""
 | 
			
		||||
This module interfaces the Wayback Machine's availability API.
 | 
			
		||||
 | 
			
		||||
The interface could be useful for looking up archives and finding archives
 | 
			
		||||
that are close to a specific date and time.
 | 
			
		||||
 | 
			
		||||
It has a class called WaybackMachineAvailabilityAPI, and the class has
 | 
			
		||||
methods such as:
 | 
			
		||||
 | 
			
		||||
near() for looking up archives close to a specific date and time.
 | 
			
		||||
 | 
			
		||||
oldest() for retrieving the first archive URL of the webpage.
 | 
			
		||||
 | 
			
		||||
newest() for retrieving the latest archive of an URL.
 | 
			
		||||
 | 
			
		||||
The Wayback Machine Availability response should be a valid JSON and
 | 
			
		||||
if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised.
 | 
			
		||||
 | 
			
		||||
If the Availability API returned valid JSON but archive URL could not be found
 | 
			
		||||
in it then ArchiveNotInAvailabilityAPIResponse is raised.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import json
 | 
			
		||||
import time
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from typing import Any, Dict, Optional
 | 
			
		||||
 | 
			
		||||
import requests
 | 
			
		||||
from requests.models import Response
 | 
			
		||||
 | 
			
		||||
from .exceptions import (
 | 
			
		||||
    ArchiveNotInAvailabilityAPIResponse,
 | 
			
		||||
@@ -22,38 +45,43 @@ class WaybackMachineAvailabilityAPI(object):
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
 | 
			
		||||
    ) -> None:
 | 
			
		||||
 | 
			
		||||
        self.url = str(url).strip().replace(" ", "%20")
 | 
			
		||||
        self.user_agent = user_agent
 | 
			
		||||
        self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
 | 
			
		||||
        self.payload = {"url": self.url}
 | 
			
		||||
        self.endpoint = "https://archive.org/wayback/available"
 | 
			
		||||
        self.max_tries = max_tries
 | 
			
		||||
        self.tries = 0
 | 
			
		||||
        self.last_api_call_unix_time = int(time.time())
 | 
			
		||||
        self.api_call_time_gap = 5
 | 
			
		||||
        self.payload: Dict[str, str] = {"url": self.url}
 | 
			
		||||
        self.endpoint: str = "https://archive.org/wayback/available"
 | 
			
		||||
        self.max_tries: int = max_tries
 | 
			
		||||
        self.tries: int = 0
 | 
			
		||||
        self.last_api_call_unix_time: int = int(time.time())
 | 
			
		||||
        self.api_call_time_gap: int = 5
 | 
			
		||||
        self.JSON: Optional[ResponseJSON] = None
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
    def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
 | 
			
		||||
        """
 | 
			
		||||
        Converts Unix time to wayback Machine timestamp.
 | 
			
		||||
        Converts Unix time to wayback Machine timestamp and the Wayback Machine
 | 
			
		||||
        timestamp format is yyyyMMddhhmmss.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
 | 
			
		||||
 | 
			
		||||
    def __repr__(self) -> str:
 | 
			
		||||
        """
 | 
			
		||||
        Same as string representation, just return the archive URL as a string.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        return str(self)
 | 
			
		||||
 | 
			
		||||
    def __str__(self) -> str:
 | 
			
		||||
        """
 | 
			
		||||
        String representation of the class. If atleast one API call was successfully
 | 
			
		||||
        made then return the archive URL as a string. Else returns None.
 | 
			
		||||
        String representation of the class. If at least one API
 | 
			
		||||
        call was successfully made then return the archive URL
 | 
			
		||||
        as a string. Else returns "".
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        # String must not return anything other than a string object
 | 
			
		||||
        # So, if some asks for string repr before making the API requests
 | 
			
		||||
        # String should not return anything other than a string object
 | 
			
		||||
        # So, if a string repr is asked for before making any API requests
 | 
			
		||||
        # just return ""
 | 
			
		||||
        if not self.JSON:
 | 
			
		||||
            return ""
 | 
			
		||||
@@ -62,26 +90,36 @@ class WaybackMachineAvailabilityAPI(object):
 | 
			
		||||
 | 
			
		||||
    def json(self) -> Optional[ResponseJSON]:
 | 
			
		||||
        """
 | 
			
		||||
        Makes the API call to the availability API can set the JSON response
 | 
			
		||||
        to the JSON attribute of the instance and also returns the JSON attribute.
 | 
			
		||||
        Makes the API call to the availability API and set the JSON response
 | 
			
		||||
        to the JSON attribute of the instance and also returns the JSON
 | 
			
		||||
        attribute.
 | 
			
		||||
 | 
			
		||||
        time_diff and sleep_time makes sure that you are not making too many
 | 
			
		||||
        requests in a short interval of time; making too many requests is bad
 | 
			
		||||
        as Wayback Machine may reject them above a certain threshold.
 | 
			
		||||
 | 
			
		||||
        The end-user can change the api_call_time_gap attribute of the instance
 | 
			
		||||
        to increase or decrease the default time gap between two successive API
 | 
			
		||||
        calls, but it is not recommended to increase it.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        time_diff = int(time.time()) - self.last_api_call_unix_time
 | 
			
		||||
        sleep_time = self.api_call_time_gap - time_diff
 | 
			
		||||
 | 
			
		||||
        if sleep_time > 0:
 | 
			
		||||
            time.sleep(sleep_time)
 | 
			
		||||
 | 
			
		||||
        self.response = requests.get(
 | 
			
		||||
        self.response: Response = requests.get(
 | 
			
		||||
            self.endpoint, params=self.payload, headers=self.headers
 | 
			
		||||
        )
 | 
			
		||||
        self.last_api_call_unix_time = int(time.time())
 | 
			
		||||
        self.tries += 1
 | 
			
		||||
        try:
 | 
			
		||||
            self.JSON = self.response.json()
 | 
			
		||||
        except json.decoder.JSONDecodeError:
 | 
			
		||||
        except json.decoder.JSONDecodeError as json_decode_error:
 | 
			
		||||
            raise InvalidJSONInAvailabilityAPIResponse(
 | 
			
		||||
                f"Response data:\n{self.response.text}"
 | 
			
		||||
            )
 | 
			
		||||
            ) from json_decode_error
 | 
			
		||||
 | 
			
		||||
        return self.JSON
 | 
			
		||||
 | 
			
		||||
@@ -91,15 +129,17 @@ class WaybackMachineAvailabilityAPI(object):
 | 
			
		||||
        If the JSON attribute of the instance is None it implies that either
 | 
			
		||||
        the last API call failed or one was never made.
 | 
			
		||||
 | 
			
		||||
        If not JSON or if JSON but no timestamp in the JSON response then returns
 | 
			
		||||
        the maximum value for datetime object that is possible.
 | 
			
		||||
        If not JSON or if JSON but no timestamp in the JSON response then
 | 
			
		||||
        returns the maximum value for datetime object that is possible.
 | 
			
		||||
 | 
			
		||||
        If you get an URL as a response form the availability API it is guaranteed
 | 
			
		||||
        that you can get the datetime object from the timestamp.
 | 
			
		||||
        If you get a URL as a response from the availability API it is
 | 
			
		||||
        guaranteed that you can get the datetime object from the timestamp.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if self.JSON is None or "archived_snapshots" not in self.JSON:
 | 
			
		||||
            return datetime.max
 | 
			
		||||
        elif (
 | 
			
		||||
 | 
			
		||||
        if (
 | 
			
		||||
            self.JSON is not None
 | 
			
		||||
            and "archived_snapshots" in self.JSON
 | 
			
		||||
            and self.JSON["archived_snapshots"] is not None
 | 
			
		||||
@@ -110,21 +150,23 @@ class WaybackMachineAvailabilityAPI(object):
 | 
			
		||||
            return datetime.strptime(
 | 
			
		||||
                self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
 | 
			
		||||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            raise ValueError("Could not get timestamp from result")
 | 
			
		||||
 | 
			
		||||
        raise ValueError("Could not get timestamp from result")
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def archive_url(self) -> str:
 | 
			
		||||
        """
 | 
			
		||||
        Reads the the JSON response data and tries to get the timestamp and returns
 | 
			
		||||
        the timestamp if found else returns None.
 | 
			
		||||
        Reads the JSON response data and returns
 | 
			
		||||
        the archive URL if found and if not found raises
 | 
			
		||||
        ArchiveNotInAvailabilityAPIResponse.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        archive_url = ""
 | 
			
		||||
        data = self.JSON
 | 
			
		||||
 | 
			
		||||
        # If the user didn't used oldest, newest or near but tries to access the
 | 
			
		||||
        # archive_url attribute then, we assume they are fine with any archive
 | 
			
		||||
        # and invoke the oldest archive function.
 | 
			
		||||
        # If the user didn't invoke oldest, newest or near but tries to access the
 | 
			
		||||
        # archive_url attribute then assume they are fine with any archive
 | 
			
		||||
        # and invoke the oldest method.
 | 
			
		||||
        if not data:
 | 
			
		||||
            self.oldest()
 | 
			
		||||
 | 
			
		||||
@@ -137,7 +179,7 @@ class WaybackMachineAvailabilityAPI(object):
 | 
			
		||||
                self.json()  # It makes a new API call
 | 
			
		||||
                data = self.JSON  # json() updated the value of JSON attribute
 | 
			
		||||
 | 
			
		||||
            # Even if after we exhausted teh max_tries, then we give up and
 | 
			
		||||
            # If we exhausted the max_tries, then we give up and
 | 
			
		||||
            # raise exception.
 | 
			
		||||
 | 
			
		||||
            if not data or not data["archived_snapshots"]:
 | 
			
		||||
@@ -160,6 +202,7 @@ class WaybackMachineAvailabilityAPI(object):
 | 
			
		||||
        Prepends zero before the year, month, day, hour and minute so that they
 | 
			
		||||
        are conformable with the YYYYMMDDhhmmss wayback machine timestamp format.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        return "".join(
 | 
			
		||||
            str(kwargs[key]).zfill(2)
 | 
			
		||||
            for key in ["year", "month", "day", "hour", "minute"]
 | 
			
		||||
@@ -167,18 +210,21 @@ class WaybackMachineAvailabilityAPI(object):
 | 
			
		||||
 | 
			
		||||
    def oldest(self) -> "WaybackMachineAvailabilityAPI":
 | 
			
		||||
        """
 | 
			
		||||
        Passing the year 1994 should return the oldest archive because
 | 
			
		||||
        wayback machine was started in May, 1996 and there should be no archive
 | 
			
		||||
        before the year 1994.
 | 
			
		||||
        Passes the date 1994-01-01 to near which should return the oldest archive
 | 
			
		||||
        because Wayback Machine was started in May, 1996 and it is assumed that
 | 
			
		||||
        there would be no archive older than January 1, 1994.
 | 
			
		||||
        """
 | 
			
		||||
        return self.near(year=1994)
 | 
			
		||||
 | 
			
		||||
        return self.near(year=1994, month=1, day=1)
 | 
			
		||||
 | 
			
		||||
    def newest(self) -> "WaybackMachineAvailabilityAPI":
 | 
			
		||||
        """
 | 
			
		||||
        Passing the current UNIX time should be sufficient to get the newest
 | 
			
		||||
        archive considering the API request-response time delay and also the
 | 
			
		||||
        database lags on Wayback machine.
 | 
			
		||||
        Passes the current UNIX time to near() for retrieving the newest archive
 | 
			
		||||
        from the availability API.
 | 
			
		||||
 | 
			
		||||
        We assume that wayback machine can not archive the future of a webpage.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        return self.near(unix_timestamp=int(time.time()))
 | 
			
		||||
 | 
			
		||||
    def near(
 | 
			
		||||
@@ -191,16 +237,18 @@ class WaybackMachineAvailabilityAPI(object):
 | 
			
		||||
        unix_timestamp: Optional[int] = None,
 | 
			
		||||
    ) -> "WaybackMachineAvailabilityAPI":
 | 
			
		||||
        """
 | 
			
		||||
        The main method for this Class, oldest and newest methods are dependent on this
 | 
			
		||||
        method.
 | 
			
		||||
        The main method for the Class, oldest() and newest() are dependent on it.
 | 
			
		||||
 | 
			
		||||
        It generates the timestamp based on the input either by calling the
 | 
			
		||||
        unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
 | 
			
		||||
        appropriate arguments for their respective parameters.
 | 
			
		||||
 | 
			
		||||
        Adds the timestamp to the payload dictionary.
 | 
			
		||||
 | 
			
		||||
        And finally invoking the json method to make the API call then returns
 | 
			
		||||
        the instance.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        if unix_timestamp:
 | 
			
		||||
            timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
 | 
			
		||||
        else:
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,14 @@
 | 
			
		||||
"""
 | 
			
		||||
This module interfaces the Wayback Machine's CDX server API.
 | 
			
		||||
 | 
			
		||||
The module has WaybackMachineCDXServerAPI which should be used by the users of
 | 
			
		||||
this module to consume the CDX server API.
 | 
			
		||||
 | 
			
		||||
WaybackMachineCDXServerAPI has a snapshot method that yields the snapshots, and
 | 
			
		||||
the snapshots are yielded as instances of the CDXSnapshot class.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
from typing import Dict, Generator, List, Optional, cast
 | 
			
		||||
 | 
			
		||||
from .cdx_snapshot import CDXSnapshot
 | 
			
		||||
@@ -16,6 +27,11 @@ from .utils import DEFAULT_USER_AGENT
 | 
			
		||||
class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
    """
 | 
			
		||||
    Class that interfaces the CDX server API of the Wayback Machine.
 | 
			
		||||
 | 
			
		||||
    snapshot() returns a generator that can be iterated upon by the end-user,
 | 
			
		||||
    the generator returns the snapshots/entries as instances of CDXSnapshot to
 | 
			
		||||
    make the usage easy, just use '.' to get any attribute as the attributes are
 | 
			
		||||
    accessible via a dot ".".
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    # start_timestamp: from, can not use from as it's a keyword
 | 
			
		||||
@@ -53,9 +69,35 @@ class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
    def cdx_api_manager(
 | 
			
		||||
        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
 | 
			
		||||
    ) -> Generator[str, None, None]:
 | 
			
		||||
        """
 | 
			
		||||
        Manages the API calls for the instance, it automatically selects the best
 | 
			
		||||
        parameters by looking at the query of the end-user. For bigger queries
 | 
			
		||||
        automatically use the CDX pagination API and for smaller queries use the
 | 
			
		||||
        normal API.
 | 
			
		||||
 | 
			
		||||
        CDX Server API is a complex API and to make it easy for the end user to
 | 
			
		||||
        consume it the CDX manager(this method) handles the selection of the
 | 
			
		||||
        API output, whether to use the pagination API or not.
 | 
			
		||||
 | 
			
		||||
        For doing large/bulk queries, the use of the Pagination API is
 | 
			
		||||
        recommended by the Wayback Machine authors. And it determines if the
 | 
			
		||||
        query would be large or not by using the showNumPages=true parameter,
 | 
			
		||||
        this tells the number of pages of CDX DATA that the pagination API
 | 
			
		||||
        will return.
 | 
			
		||||
 | 
			
		||||
        If the number of pages is less than 2 we use the normal non-pagination
 | 
			
		||||
        API as the pagination API is known to lag and for big queries it should
 | 
			
		||||
        not matter but for queries where the number of pages are less this
 | 
			
		||||
        method chooses accuracy over the pagination API.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        # number of pages that will be returned by the pagination API.
 | 
			
		||||
        # get_total_pages adds the showNumPages=true param to pagination API
 | 
			
		||||
        # requests.
 | 
			
		||||
        # This is a special query that will return a single number indicating
 | 
			
		||||
        # the number of pages.
 | 
			
		||||
        total_pages = get_total_pages(self.url, self.user_agent)
 | 
			
		||||
        # If we only have two or less pages of archives then we care for more accuracy
 | 
			
		||||
        # pagination API is lagged sometimes
 | 
			
		||||
 | 
			
		||||
        if use_page is True and total_pages >= 2:
 | 
			
		||||
            blank_pages = 0
 | 
			
		||||
            for i in range(total_pages):
 | 
			
		||||
@@ -78,11 +120,11 @@ class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
        else:
 | 
			
		||||
            payload["showResumeKey"] = "true"
 | 
			
		||||
            payload["limit"] = str(self.limit)
 | 
			
		||||
            resumeKey = None
 | 
			
		||||
            resume_key = None
 | 
			
		||||
            more = True
 | 
			
		||||
            while more:
 | 
			
		||||
                if resumeKey:
 | 
			
		||||
                    payload["resumeKey"] = resumeKey
 | 
			
		||||
                if resume_key:
 | 
			
		||||
                    payload["resumeKey"] = resume_key
 | 
			
		||||
 | 
			
		||||
                url = full_url(self.endpoint, params=payload)
 | 
			
		||||
                res = get_response(url, headers=headers)
 | 
			
		||||
@@ -102,13 +144,16 @@ class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
 | 
			
		||||
                    if len(second_last_line) == 0:
 | 
			
		||||
 | 
			
		||||
                        resumeKey = lines[-1].strip()
 | 
			
		||||
                        text = text.replace(resumeKey, "", 1).strip()
 | 
			
		||||
                        resume_key = lines[-1].strip()
 | 
			
		||||
                        text = text.replace(resume_key, "", 1).strip()
 | 
			
		||||
                        more = True
 | 
			
		||||
 | 
			
		||||
                yield text
 | 
			
		||||
 | 
			
		||||
    def add_payload(self, payload: Dict[str, str]) -> None:
 | 
			
		||||
        """
 | 
			
		||||
        Adds the payload to the payload dictionary.
 | 
			
		||||
        """
 | 
			
		||||
        if self.start_timestamp:
 | 
			
		||||
            payload["from"] = self.start_timestamp
 | 
			
		||||
 | 
			
		||||
@@ -122,17 +167,35 @@ class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
            payload["matchType"] = self.match_type
 | 
			
		||||
 | 
			
		||||
        if self.filters and len(self.filters) > 0:
 | 
			
		||||
            for i, f in enumerate(self.filters):
 | 
			
		||||
                payload["filter" + str(i)] = f
 | 
			
		||||
            for i, _filter in enumerate(self.filters):
 | 
			
		||||
                payload["filter" + str(i)] = _filter
 | 
			
		||||
 | 
			
		||||
        if self.collapses and len(self.collapses) > 0:
 | 
			
		||||
            for i, f in enumerate(self.collapses):
 | 
			
		||||
                payload["collapse" + str(i)] = f
 | 
			
		||||
            for i, collapse in enumerate(self.collapses):
 | 
			
		||||
                payload["collapse" + str(i)] = collapse
 | 
			
		||||
 | 
			
		||||
        # Don't need to return anything as it's dictionary.
 | 
			
		||||
        payload["url"] = self.url
 | 
			
		||||
 | 
			
		||||
    def snapshots(self) -> Generator[CDXSnapshot, None, None]:
 | 
			
		||||
        """
 | 
			
		||||
        This function yields the CDX data lines as snapshots.
 | 
			
		||||
 | 
			
		||||
        As it is a generator it is exhaustible; the reasons that this is
 | 
			
		||||
        a generator and not a list are:
 | 
			
		||||
 | 
			
		||||
        a) CDX server API can return millions of entries for a query and list
 | 
			
		||||
        is not suitable for such cases.
 | 
			
		||||
 | 
			
		||||
        b) Preventing memory usage issues, as told before this method may yield
 | 
			
		||||
        millions of records for some queries and your system may not have enough
 | 
			
		||||
        memory for such a big list. Also remember this if outputting to Jupyter
 | 
			
		||||
        Notebooks.
 | 
			
		||||
 | 
			
		||||
        The objects yielded by this method are instances of the CDXSnapshot class,
 | 
			
		||||
        you can access the attributes of the entries as the attribute of the instance
 | 
			
		||||
        itself.
 | 
			
		||||
        """
 | 
			
		||||
        payload: Dict[str, str] = {}
 | 
			
		||||
        headers = {"User-Agent": self.user_agent}
 | 
			
		||||
 | 
			
		||||
@@ -144,18 +207,25 @@ class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
        if self.collapses != []:
 | 
			
		||||
            self.use_page = False
 | 
			
		||||
 | 
			
		||||
        texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)
 | 
			
		||||
        entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)
 | 
			
		||||
 | 
			
		||||
        for text in texts:
 | 
			
		||||
        for entry in entries:
 | 
			
		||||
 | 
			
		||||
            if text.isspace() or len(text) <= 1 or not text:
 | 
			
		||||
            if entry.isspace() or len(entry) <= 1 or not entry:
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            snapshot_list = text.split("\n")
 | 
			
		||||
            # each line is a snapshot aka entry of the CDX server API.
 | 
			
		||||
            # We are able to split the page by lines because it only
 | 
			
		||||
            # splits the lines on a single page and not all the entries
 | 
			
		||||
            # at once, thus there should be no issues of too much memory usage.
 | 
			
		||||
            snapshot_list = entry.split("\n")
 | 
			
		||||
 | 
			
		||||
            for snapshot in snapshot_list:
 | 
			
		||||
 | 
			
		||||
                if len(snapshot) < 46:  # 14 + 32 (timestamp+digest)
 | 
			
		||||
                # 14 + 32 == 46 ( timestamp + digest ), ignore the invalid entries.
 | 
			
		||||
                # they are invalid if their length is smaller than sum of length
 | 
			
		||||
                # of a standard wayback_timestamp and standard digest of an entry.
 | 
			
		||||
                if len(snapshot) < 46:
 | 
			
		||||
                    continue
 | 
			
		||||
 | 
			
		||||
                properties: Dict[str, Optional[str]] = {
 | 
			
		||||
@@ -168,16 +238,16 @@ class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
                    "length": None,
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                prop_values = snapshot.split(" ")
 | 
			
		||||
                property_value = snapshot.split(" ")
 | 
			
		||||
 | 
			
		||||
                prop_values_len = len(prop_values)
 | 
			
		||||
                properties_len = len(properties)
 | 
			
		||||
                total_property_values = len(property_value)
 | 
			
		||||
                warranted_total_property_values = len(properties)
 | 
			
		||||
 | 
			
		||||
                if prop_values_len != properties_len:
 | 
			
		||||
                if total_property_values != warranted_total_property_values:
 | 
			
		||||
                    raise WaybackError(
 | 
			
		||||
                        f"Snapshot returned by Cdx API has {prop_values_len} "
 | 
			
		||||
                        f"properties instead of expected {properties_len} properties.\n"
 | 
			
		||||
                        f"Problematic Snapshot: {snapshot}"
 | 
			
		||||
                        f"Snapshot returned by CDX API has {total_property_values} prop"
 | 
			
		||||
                        f"erties instead of expected {warranted_total_property_values} "
 | 
			
		||||
                        f"properties.\nProblematic Snapshot: {snapshot}"
 | 
			
		||||
                    )
 | 
			
		||||
 | 
			
		||||
                (
 | 
			
		||||
@@ -188,6 +258,6 @@ class WaybackMachineCDXServerAPI(object):
 | 
			
		||||
                    properties["statuscode"],
 | 
			
		||||
                    properties["digest"],
 | 
			
		||||
                    properties["length"],
 | 
			
		||||
                ) = prop_values
 | 
			
		||||
                ) = property_value
 | 
			
		||||
 | 
			
		||||
                yield CDXSnapshot(cast(Dict[str, str], properties))
 | 
			
		||||
 
 | 
			
		||||
@@ -1,30 +1,83 @@
 | 
			
		||||
"""
 | 
			
		||||
Module that contains the CDXSnapshot class, CDX records are cast
 | 
			
		||||
to CDXSnapshot objects for easier access.
 | 
			
		||||
 | 
			
		||||
The CDX index format is plain text data. Each line ('record') indicates a
 | 
			
		||||
crawled document. And these lines are cast to CDXSnapshot.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from typing import Dict
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class CDXSnapshot(object):
 | 
			
		||||
    """
 | 
			
		||||
    Class for the CDX snapshot lines returned by the CDX API,
 | 
			
		||||
    Class for the CDX snapshot lines('record') returned by the CDX API,
 | 
			
		||||
    Each valid line of the CDX API is cast to a CDXSnapshot object
 | 
			
		||||
    by the CDX API interface.
 | 
			
		||||
    by the CDX API interface, just use "." to access any attribute of the
 | 
			
		||||
    CDX server API snapshot.
 | 
			
		||||
 | 
			
		||||
    This provides the end-user the ease of using the data as attributes
 | 
			
		||||
    of the CDXSnapshot.
 | 
			
		||||
 | 
			
		||||
    The string representation of the class is identical to the line returned
 | 
			
		||||
    by the CDX server API.
 | 
			
		||||
 | 
			
		||||
    Besides all the attributes of the CDX server API this class also provides
 | 
			
		||||
    archive_url attribute, yes it is the archive url of the snapshot.
 | 
			
		||||
 | 
			
		||||
    Attributes of this class, what they represent and what they are useful for:
 | 
			
		||||
 | 
			
		||||
    urlkey: The document captured, expressed as a SURT
 | 
			
		||||
            SURT stands for Sort-friendly URI Reordering Transform, and is a
 | 
			
		||||
            transformation applied to URIs which makes their left-to-right
 | 
			
		||||
            representation better match the natural hierarchy of domain names.
 | 
			
		||||
            A URI <scheme://domain.tld/path?query> has SURT
 | 
			
		||||
            form <scheme://(tld,domain,)/path?query>.
 | 
			
		||||
 | 
			
		||||
    timestamp: The timestamp of the archive, format is yyyyMMddhhmmss and type
 | 
			
		||||
               is string.
 | 
			
		||||
 | 
			
		||||
    datetime_timestamp: The timestamp as a datetime object.
 | 
			
		||||
 | 
			
		||||
    original: The original URL of the archive. If archive_url is
 | 
			
		||||
    https://web.archive.org/web/20220113130051/https://google.com then the
 | 
			
		||||
    original URL is https://google.com
 | 
			
		||||
 | 
			
		||||
    mimetype: The document’s file type. e.g. text/html
 | 
			
		||||
 | 
			
		||||
    statuscode: HTTP response code for the document at the time of its crawling
 | 
			
		||||
 | 
			
		||||
    digest: Base32-encoded SHA-1 checksum of the document for discriminating
 | 
			
		||||
            with others
 | 
			
		||||
 | 
			
		||||
    length: Document’s volume of bytes in the WARC file
 | 
			
		||||
 | 
			
		||||
    archive_url: The archive url of the snapshot, this is not returned by the
 | 
			
		||||
                 CDX server API but created by this class on init.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def __init__(self, properties: Dict[str, str]) -> None:
 | 
			
		||||
        self.urlkey = properties["urlkey"]
 | 
			
		||||
        self.timestamp = properties["timestamp"]
 | 
			
		||||
        self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
 | 
			
		||||
        self.original = properties["original"]
 | 
			
		||||
        self.mimetype = properties["mimetype"]
 | 
			
		||||
        self.statuscode = properties["statuscode"]
 | 
			
		||||
        self.digest = properties["digest"]
 | 
			
		||||
        self.length = properties["length"]
 | 
			
		||||
        self.archive_url = (
 | 
			
		||||
        self.urlkey: str = properties["urlkey"]
 | 
			
		||||
        self.timestamp: str = properties["timestamp"]
 | 
			
		||||
        self.datetime_timestamp: datetime = datetime.strptime(
 | 
			
		||||
            self.timestamp, "%Y%m%d%H%M%S"
 | 
			
		||||
        )
 | 
			
		||||
        self.original: str = properties["original"]
 | 
			
		||||
        self.mimetype: str = properties["mimetype"]
 | 
			
		||||
        self.statuscode: str = properties["statuscode"]
 | 
			
		||||
        self.digest: str = properties["digest"]
 | 
			
		||||
        self.length: str = properties["length"]
 | 
			
		||||
        self.archive_url: str = (
 | 
			
		||||
            f"https://web.archive.org/web/{self.timestamp}/{self.original}"
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def __str__(self) -> str:
 | 
			
		||||
        """
 | 
			
		||||
        The string representation is same as the line returned by the
 | 
			
		||||
        CDX server API for the snapshot.
 | 
			
		||||
        """
 | 
			
		||||
        return (
 | 
			
		||||
            f"{self.urlkey} {self.timestamp} {self.original} "
 | 
			
		||||
            f"{self.mimetype} {self.statuscode} {self.digest} {self.length}"
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,10 @@
 | 
			
		||||
"""
 | 
			
		||||
Utility functions required for accessing the CDX server API.
 | 
			
		||||
 | 
			
		||||
These are here in this module so that we don’t make any module too
 | 
			
		||||
big.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import re
 | 
			
		||||
from typing import Any, Dict, List, Optional, Union
 | 
			
		||||
from urllib.parse import quote
 | 
			
		||||
@@ -11,28 +18,44 @@ from .utils import DEFAULT_USER_AGENT
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
 | 
			
		||||
    """
 | 
			
		||||
    When using pagination, adding showNumPages=true to the request
 | 
			
		||||
    URL makes the CDX server return an integer which is the number of pages
 | 
			
		||||
    of CDX pages available for us to query using the pagination API.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    endpoint = "https://web.archive.org/cdx/search/cdx?"
 | 
			
		||||
    payload = {"showNumPages": "true", "url": str(url)}
 | 
			
		||||
    headers = {"User-Agent": user_agent}
 | 
			
		||||
    request_url = full_url(endpoint, params=payload)
 | 
			
		||||
    response = get_response(request_url, headers=headers)
 | 
			
		||||
 | 
			
		||||
    if isinstance(response, requests.Response):
 | 
			
		||||
        return int(response.text.strip())
 | 
			
		||||
    else:
 | 
			
		||||
        raise response
 | 
			
		||||
    raise response
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def full_url(endpoint: str, params: Dict[str, Any]) -> str:
 | 
			
		||||
    """
 | 
			
		||||
    As the function's name implies, it returns the
 | 
			
		||||
    full URL, but why do we need a function for generating the full URL?
 | 
			
		||||
    The CDX server can support multiple arguments for parameters
 | 
			
		||||
    such as filter and collapse and this function adds them without
 | 
			
		||||
    overwriting earlier added arguments.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if not params:
 | 
			
		||||
        return endpoint
 | 
			
		||||
    full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
 | 
			
		||||
    _full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
 | 
			
		||||
 | 
			
		||||
    for key, val in params.items():
 | 
			
		||||
        key = "filter" if key.startswith("filter") else key
 | 
			
		||||
        key = "collapse" if key.startswith("collapse") else key
 | 
			
		||||
        amp = "" if full_url.endswith("?") else "&"
 | 
			
		||||
        amp = "" if _full_url.endswith("?") else "&"
 | 
			
		||||
        val = quote(str(val), safe="")
 | 
			
		||||
        full_url += f"{amp}{key}={val}"
 | 
			
		||||
    return full_url
 | 
			
		||||
        _full_url += f"{amp}{key}={val}"
 | 
			
		||||
 | 
			
		||||
    return _full_url
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_response(
 | 
			
		||||
@@ -40,29 +63,31 @@ def get_response(
 | 
			
		||||
    headers: Optional[Dict[str, str]] = None,
 | 
			
		||||
    retries: int = 5,
 | 
			
		||||
    backoff_factor: float = 0.5,
 | 
			
		||||
    # no_raise_on_redirects=False,
 | 
			
		||||
) -> Union[requests.Response, Exception]:
 | 
			
		||||
    """
 | 
			
		||||
    Make get request to the CDX server and return the response.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    session = requests.Session()
 | 
			
		||||
 | 
			
		||||
    retries_ = Retry(
 | 
			
		||||
        total=retries,
 | 
			
		||||
        backoff_factor=backoff_factor,
 | 
			
		||||
        status_forcelist=[500, 502, 503, 504],
 | 
			
		||||
    )
 | 
			
		||||
    session.mount("https://", HTTPAdapter(max_retries=retries_))
 | 
			
		||||
 | 
			
		||||
    try:
 | 
			
		||||
        response = session.get(url, headers=headers)
 | 
			
		||||
        session.close()
 | 
			
		||||
        return response
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        reason = str(e)
 | 
			
		||||
        exc_message = f"Error while retrieving {url}.\n{reason}"
 | 
			
		||||
        exc = WaybackError(exc_message)
 | 
			
		||||
        exc.__cause__ = e
 | 
			
		||||
        raise exc
 | 
			
		||||
    session.mount("https://", HTTPAdapter(max_retries=retries_))
 | 
			
		||||
    response = session.get(url, headers=headers)
 | 
			
		||||
    session.close()
 | 
			
		||||
    return response
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def check_filters(filters: List[str]) -> None:
 | 
			
		||||
    """
 | 
			
		||||
    Check that the filter arguments passed by the end-user are valid.
 | 
			
		||||
    If not valid then raise WaybackError.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if not isinstance(filters, list):
 | 
			
		||||
        raise WaybackError("filters must be a list.")
 | 
			
		||||
 | 
			
		||||
@@ -81,9 +106,15 @@ def check_filters(filters: List[str]) -> None:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def check_collapses(collapses: List[str]) -> bool:
 | 
			
		||||
    """
 | 
			
		||||
    Check that the collapse arguments passed by the end-user are valid.
 | 
			
		||||
    If not valid then raise WaybackError.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    if not isinstance(collapses, list):
 | 
			
		||||
        raise WaybackError("collapses must be a list.")
 | 
			
		||||
    elif len(collapses) == 0:
 | 
			
		||||
 | 
			
		||||
    if len(collapses) == 0:
 | 
			
		||||
        return True
 | 
			
		||||
 | 
			
		||||
    for collapse in collapses:
 | 
			
		||||
@@ -103,18 +134,26 @@ def check_collapses(collapses: List[str]) -> bool:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def check_match_type(match_type: Optional[str], url: str) -> bool:
 | 
			
		||||
    """
 | 
			
		||||
    Check that the match_type argument passed by the end-user is valid.
 | 
			
		||||
    If not valid then raise WaybackError.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    legal_match_type = ["exact", "prefix", "host", "domain"]
 | 
			
		||||
 | 
			
		||||
    if not match_type:
 | 
			
		||||
        return True
 | 
			
		||||
    elif "*" in url:
 | 
			
		||||
 | 
			
		||||
    if "*" in url:
 | 
			
		||||
        raise WaybackError(
 | 
			
		||||
            "Can not use wildcard in the URL along with the match_type arguments."
 | 
			
		||||
        )
 | 
			
		||||
    elif match_type not in legal_match_type:
 | 
			
		||||
 | 
			
		||||
    if match_type not in legal_match_type:
 | 
			
		||||
        exc_message = (
 | 
			
		||||
            f"{match_type} is not an allowed match type.\n"
 | 
			
		||||
            "Use one from 'exact', 'prefix', 'host' or 'domain'"
 | 
			
		||||
        )
 | 
			
		||||
        raise WaybackError(exc_message)
 | 
			
		||||
    else:
 | 
			
		||||
        return True
 | 
			
		||||
 | 
			
		||||
    return True
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,7 @@
 | 
			
		||||
"""
 | 
			
		||||
Module that makes waybackpy a CLI tool.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import json as JSON
 | 
			
		||||
import os
 | 
			
		||||
import random
 | 
			
		||||
@@ -19,7 +23,10 @@ from .wrapper import Url
 | 
			
		||||
def echo_availability_api(
 | 
			
		||||
    availability_api_instance: WaybackMachineAvailabilityAPI, json: bool
 | 
			
		||||
) -> None:
 | 
			
		||||
    click.echo("Archive URL:")
 | 
			
		||||
    """
 | 
			
		||||
    Output availability API depending functions.
 | 
			
		||||
    Near, oldest and newest are output by this method.
 | 
			
		||||
    """
 | 
			
		||||
    if not availability_api_instance.archive_url:
 | 
			
		||||
        archive_url = (
 | 
			
		||||
            "NO ARCHIVE FOUND - The requested URL is probably "
 | 
			
		||||
@@ -29,6 +36,7 @@ def echo_availability_api(
 | 
			
		||||
        )
 | 
			
		||||
    else:
 | 
			
		||||
        archive_url = availability_api_instance.archive_url
 | 
			
		||||
    click.echo("Archive URL:")
 | 
			
		||||
    click.echo(archive_url)
 | 
			
		||||
    if json:
 | 
			
		||||
        click.echo("JSON response:")
 | 
			
		||||
@@ -36,6 +44,10 @@ def echo_availability_api(
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
 | 
			
		||||
    """
 | 
			
		||||
    Save the output of the CDX API to a file.
 | 
			
		||||
    Mainly here because of backwards compatibility.
 | 
			
		||||
    """
 | 
			
		||||
    domain = None
 | 
			
		||||
    sys_random = random.SystemRandom()
 | 
			
		||||
    uid = "".join(
 | 
			
		||||
@@ -51,8 +63,8 @@ def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
 | 
			
		||||
            domain = "domain-unknown" if match is None else match.group(1)
 | 
			
		||||
            file_name = f"{domain}-urls-{uid}.txt"
 | 
			
		||||
            file_path = os.path.join(os.getcwd(), file_name)
 | 
			
		||||
            with open(file_path, "a") as f:
 | 
			
		||||
                f.write(f"{url}\n")
 | 
			
		||||
            with open(file_path, "a") as file:
 | 
			
		||||
                file.write(f"{url}\n")
 | 
			
		||||
 | 
			
		||||
        click.echo(url)
 | 
			
		||||
 | 
			
		||||
@@ -269,6 +281,7 @@ def main(  # pylint: disable=no-value-for-parameter
 | 
			
		||||
    """
 | 
			
		||||
    if version:
 | 
			
		||||
        click.echo(f"waybackpy version {__version__}")
 | 
			
		||||
 | 
			
		||||
    elif show_license:
 | 
			
		||||
        click.echo(
 | 
			
		||||
            requests.get(
 | 
			
		||||
@@ -277,6 +290,7 @@ def main(  # pylint: disable=no-value-for-parameter
 | 
			
		||||
        )
 | 
			
		||||
    elif url is None:
 | 
			
		||||
        click.echo("No URL detected. Please provide an URL.", err=True)
 | 
			
		||||
 | 
			
		||||
    elif (
 | 
			
		||||
        not version
 | 
			
		||||
        and not oldest
 | 
			
		||||
@@ -291,14 +305,17 @@ def main(  # pylint: disable=no-value-for-parameter
 | 
			
		||||
            "Use --help flag for help using waybackpy.",
 | 
			
		||||
            err=True,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    elif oldest:
 | 
			
		||||
        availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
 | 
			
		||||
        availability_api.oldest()
 | 
			
		||||
        echo_availability_api(availability_api, json)
 | 
			
		||||
 | 
			
		||||
    elif newest:
 | 
			
		||||
        availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
 | 
			
		||||
        availability_api.newest()
 | 
			
		||||
        echo_availability_api(availability_api, json)
 | 
			
		||||
 | 
			
		||||
    elif near:
 | 
			
		||||
        availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
 | 
			
		||||
        near_args = {}
 | 
			
		||||
@@ -309,6 +326,7 @@ def main(  # pylint: disable=no-value-for-parameter
 | 
			
		||||
                near_args[key] = arg
 | 
			
		||||
        availability_api.near(**near_args)
 | 
			
		||||
        echo_availability_api(availability_api, json)
 | 
			
		||||
 | 
			
		||||
    elif save:
 | 
			
		||||
        save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
 | 
			
		||||
        save_api.save()
 | 
			
		||||
@@ -319,15 +337,17 @@ def main(  # pylint: disable=no-value-for-parameter
 | 
			
		||||
        if headers:
 | 
			
		||||
            click.echo("Save API headers:")
 | 
			
		||||
            click.echo(save_api.headers)
 | 
			
		||||
 | 
			
		||||
    elif known_urls:
 | 
			
		||||
        wayback = Url(url, user_agent)
 | 
			
		||||
        url_gen = wayback.known_urls(subdomain=subdomain)
 | 
			
		||||
 | 
			
		||||
        if file:
 | 
			
		||||
            return save_urls_on_file(url_gen)
 | 
			
		||||
        else:
 | 
			
		||||
            for url in url_gen:
 | 
			
		||||
                click.echo(url)
 | 
			
		||||
 | 
			
		||||
        for url in url_gen:
 | 
			
		||||
            click.echo(url)
 | 
			
		||||
 | 
			
		||||
    elif cdx:
 | 
			
		||||
        filters = list(cdx_filter)
 | 
			
		||||
        collapses = list(collapse)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,10 @@
 | 
			
		||||
"""
 | 
			
		||||
This module interfaces the Wayback Machine's SavePageNow (SPN) API.
 | 
			
		||||
 | 
			
		||||
The module has WaybackMachineSaveAPI class which should be used by the users of
 | 
			
		||||
this module to use the SavePageNow API.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import re
 | 
			
		||||
import time
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
@@ -8,7 +15,7 @@ from requests.adapters import HTTPAdapter
 | 
			
		||||
from requests.structures import CaseInsensitiveDict
 | 
			
		||||
from urllib3.util.retry import Retry
 | 
			
		||||
 | 
			
		||||
from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError
 | 
			
		||||
from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, WaybackError
 | 
			
		||||
from .utils import DEFAULT_USER_AGENT
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -47,8 +54,8 @@ class WaybackMachineSaveAPI(object):
 | 
			
		||||
 | 
			
		||||
        if self._archive_url:
 | 
			
		||||
            return self._archive_url
 | 
			
		||||
        else:
 | 
			
		||||
            return self.save()
 | 
			
		||||
 | 
			
		||||
        return self.save()
 | 
			
		||||
 | 
			
		||||
    def get_save_request_headers(self) -> None:
 | 
			
		||||
        """
 | 
			
		||||
@@ -66,6 +73,7 @@ class WaybackMachineSaveAPI(object):
 | 
			
		||||
        to be very unreliable thus if it fails first check opening
 | 
			
		||||
        the response URL yourself in the browser.
 | 
			
		||||
        """
 | 
			
		||||
 | 
			
		||||
        session = requests.Session()
 | 
			
		||||
        retries = Retry(
 | 
			
		||||
            total=self.total_save_retries,
 | 
			
		||||
@@ -79,11 +87,24 @@ class WaybackMachineSaveAPI(object):
 | 
			
		||||
        self.status_code = self.response.status_code
 | 
			
		||||
        self.response_url = self.response.url
 | 
			
		||||
        session.close()
 | 
			
		||||
 | 
			
		||||
        if self.status_code == 429:
 | 
			
		||||
            # why wait 5 minutes and 429?
 | 
			
		||||
            # see https://github.com/akamhy/waybackpy/issues/97
 | 
			
		||||
            raise TooManyRequestsError(
 | 
			
		||||
                "Seem to be refused to request by the server. "
 | 
			
		||||
                "Save Page Now receives up to 15 URLs per minutes. "
 | 
			
		||||
                "Wait a moment and run again."
 | 
			
		||||
                f"Can not save '{self.url}'. "
 | 
			
		||||
                f"Save request refused by the server. "
 | 
			
		||||
                f"Save Page Now limits saving 15 URLs per minutes. "
 | 
			
		||||
                f"Try waiting for 5 minutes and then try again."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        # why 509?
 | 
			
		||||
        # see https://github.com/akamhy/waybackpy/pull/99
 | 
			
		||||
        # also https://t.co/xww4YJ0Iwc
 | 
			
		||||
        if self.status_code == 509:
 | 
			
		||||
            raise WaybackError(
 | 
			
		||||
                f"Can not save '{self.url}'. You have probably reached the "
 | 
			
		||||
                f"limit of active sessions."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def archive_url_parser(self) -> Optional[str]:
 | 
			
		||||
@@ -146,13 +167,17 @@ class WaybackMachineSaveAPI(object):
 | 
			
		||||
        the Wayback Machine to serve cached archive if last archive was captured
 | 
			
		||||
        before last 45 minutes.
 | 
			
		||||
        """
 | 
			
		||||
        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
 | 
			
		||||
        m = re.search(regex, str(self._archive_url))
 | 
			
		||||
        if m is None or len(m.groups()) != 1:
 | 
			
		||||
            raise ValueError("Could not get timestamp")
 | 
			
		||||
        string_timestamp = m.group(1)
 | 
			
		||||
        timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
 | 
			
		||||
 | 
			
		||||
        regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
 | 
			
		||||
        match = re.search(regex, str(self._archive_url))
 | 
			
		||||
 | 
			
		||||
        if match is None or len(match.groups()) != 1:
 | 
			
		||||
            raise ValueError(
 | 
			
		||||
                f"Can not parse timestamp from archive URL, '{self._archive_url}'."
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        string_timestamp = match.group(1)
 | 
			
		||||
        timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
 | 
			
		||||
        timestamp_unixtime = time.mktime(timestamp.timetuple())
 | 
			
		||||
        instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple())
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,7 @@
 | 
			
		||||
"""
 | 
			
		||||
Utility functions and shared variables like DEFAULT_USER_AGENT are here.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
import requests
 | 
			
		||||
 | 
			
		||||
from . import __version__
 | 
			
		||||
@@ -8,6 +12,7 @@ DEFAULT_USER_AGENT: str = (
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT) -> str:
 | 
			
		||||
    """Latest waybackpy version on PyPi."""
 | 
			
		||||
    request_url = "https://pypi.org/pypi/" + package_name + "/json"
 | 
			
		||||
    headers = {"User-Agent": user_agent}
 | 
			
		||||
    response = requests.get(request_url, headers=headers)
 | 
			
		||||
@@ -20,13 +25,14 @@ def latest_version_pypi(package_name: str, user_agent: str = DEFAULT_USER_AGENT)
 | 
			
		||||
        and data["info"]["version"] is not None
 | 
			
		||||
    ):
 | 
			
		||||
        return str(data["info"]["version"])
 | 
			
		||||
    else:
 | 
			
		||||
        raise ValueError("Could not get latest pypi version")
 | 
			
		||||
 | 
			
		||||
    raise ValueError("Could not get latest pypi version")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def latest_version_github(
 | 
			
		||||
    package_name: str, user_agent: str = DEFAULT_USER_AGENT
 | 
			
		||||
) -> str:
 | 
			
		||||
    """Latest waybackpy version on GitHub."""
 | 
			
		||||
    request_url = (
 | 
			
		||||
        "https://api.github.com/repos/akamhy/" + package_name + "/releases?per_page=1"
 | 
			
		||||
    )
 | 
			
		||||
@@ -40,5 +46,5 @@ def latest_version_github(
 | 
			
		||||
        and "tag_name" in data[0]
 | 
			
		||||
    ):
 | 
			
		||||
        return str(data[0]["tag_name"])
 | 
			
		||||
    else:
 | 
			
		||||
        raise ValueError("Could not get latest github version")
 | 
			
		||||
 | 
			
		||||
    raise ValueError("Could not get latest github version")
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,9 @@
 | 
			
		||||
"""
 | 
			
		||||
This module exists because backwards compatibility matters.
 | 
			
		||||
Don't touch this or add any new functionality here and don't use
 | 
			
		||||
the Url class.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
from datetime import datetime, timedelta
 | 
			
		||||
from typing import Generator, Optional
 | 
			
		||||
 | 
			
		||||
@@ -49,12 +55,14 @@ class Url(object):
 | 
			
		||||
 | 
			
		||||
        if not isinstance(self.timestamp, datetime):
 | 
			
		||||
            raise TypeError("timestamp must be a datetime")
 | 
			
		||||
        elif self.timestamp == datetime.max:
 | 
			
		||||
 | 
			
		||||
        if self.timestamp == datetime.max:
 | 
			
		||||
            return td_max.days
 | 
			
		||||
        else:
 | 
			
		||||
            return (datetime.utcnow() - self.timestamp).days
 | 
			
		||||
 | 
			
		||||
        return (datetime.utcnow() - self.timestamp).days
 | 
			
		||||
 | 
			
		||||
    def save(self) -> "Url":
 | 
			
		||||
        """Save the URL on wayback machine."""
 | 
			
		||||
        self.wayback_machine_save_api = WaybackMachineSaveAPI(
 | 
			
		||||
            self.url, user_agent=self.user_agent
 | 
			
		||||
        )
 | 
			
		||||
@@ -72,7 +80,7 @@ class Url(object):
 | 
			
		||||
        minute: Optional[int] = None,
 | 
			
		||||
        unix_timestamp: Optional[int] = None,
 | 
			
		||||
    ) -> "Url":
 | 
			
		||||
 | 
			
		||||
        """Returns the archive of the URL close to a date and time."""
 | 
			
		||||
        self.wayback_machine_availability_api.near(
 | 
			
		||||
            year=year,
 | 
			
		||||
            month=month,
 | 
			
		||||
@@ -85,16 +93,19 @@ class Url(object):
 | 
			
		||||
        return self
 | 
			
		||||
 | 
			
		||||
    def oldest(self) -> "Url":
 | 
			
		||||
        """Returns the oldest archive of the URL."""
 | 
			
		||||
        self.wayback_machine_availability_api.oldest()
 | 
			
		||||
        self.set_availability_api_attrs()
 | 
			
		||||
        return self
 | 
			
		||||
 | 
			
		||||
    def newest(self) -> "Url":
 | 
			
		||||
        """Returns the newest archive of the URL."""
 | 
			
		||||
        self.wayback_machine_availability_api.newest()
 | 
			
		||||
        self.set_availability_api_attrs()
 | 
			
		||||
        return self
 | 
			
		||||
 | 
			
		||||
    def set_availability_api_attrs(self) -> None:
 | 
			
		||||
        """Set the attributes for total backwards compatibility."""
 | 
			
		||||
        self.archive_url = self.wayback_machine_availability_api.archive_url
 | 
			
		||||
        self.JSON = self.wayback_machine_availability_api.JSON
 | 
			
		||||
        self.timestamp = self.wayback_machine_availability_api.timestamp()
 | 
			
		||||
@@ -102,6 +113,10 @@ class Url(object):
 | 
			
		||||
    def total_archives(
 | 
			
		||||
        self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
 | 
			
		||||
    ) -> int:
 | 
			
		||||
        """
 | 
			
		||||
        Returns an integer which indicates the total number of archives for a URL.
 | 
			
		||||
        Useless in my opinion, only here because of backwards compatibility.
 | 
			
		||||
        """
 | 
			
		||||
        cdx = WaybackMachineCDXServerAPI(
 | 
			
		||||
            self.url,
 | 
			
		||||
            user_agent=self.user_agent,
 | 
			
		||||
@@ -122,6 +137,7 @@ class Url(object):
 | 
			
		||||
        end_timestamp: Optional[str] = None,
 | 
			
		||||
        match_type: str = "prefix",
 | 
			
		||||
    ) -> Generator[str, None, None]:
 | 
			
		||||
        """Yields known URLs for any URL."""
 | 
			
		||||
        if subdomain:
 | 
			
		||||
            match_type = "domain"
 | 
			
		||||
        if host:
 | 
			
		||||
@@ -137,4 +153,4 @@ class Url(object):
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        for snapshot in cdx.snapshots():
 | 
			
		||||
            yield (snapshot.original)
 | 
			
		||||
            yield snapshot.original
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user