added doc strings and changed some internal variable names for more clarity.

2022-02-07 13:02:29 +05:30
parent a2b14fb9a1
commit edec28d67f
1 changed files with 90 additions and 23 deletions
--- a/waybackpy/cdx_api.py
+++ b/waybackpy/cdx_api.py
@@ -1,3 +1,14 @@
+"""
+This module interfaces the Wayback Machine's CDX server API.
+
+The module has WaybackMachineCDXServerAPI which should be used by the users of
+this module to consume the CDX server API.
+
+WaybackMachineCDXServerAPI has a snapshot method that yields the snapshots, and
+the snapshots are yielded as instances of the CDXSnapshot class.
+"""
+
+
 from typing import Dict, Generator, List, Optional, cast

 from .cdx_snapshot import CDXSnapshot
@@ -16,6 +27,11 @@ from .utils import DEFAULT_USER_AGENT
 class WaybackMachineCDXServerAPI(object):
    """
    Class that interfaces the CDX server API of the Wayback Machine.
+
+    snapshot() returns a generator that can be iterated upon by the end-user,
+    the generator returns the snapshots/entries as instance of CDXSnapshot to
+    make the usage easy, just use '.' to get any attribute as the attributes are
+    accessible via a dot ".".
    """

    # start_timestamp: from, can not use from as it's a keyword
@@ -53,9 +69,32 @@ class WaybackMachineCDXServerAPI(object):
    def cdx_api_manager(
        self, payload: Dict[str, str], headers: Dict[str, str], use_page: bool = False
    ) -> Generator[str, None, None]:
+        """
+        Manages the API calls for the instance, it automatically selects the best
+        parameters by looking as the query of the end-user. For bigger queries automatically
+        use the CDX pagination API and for smaller queries use the normal API.
+
+        CDX Server API is a complex API and to make it easy for the end user to consume it
+        the CDX manager(this method) handles the selection of the API output, whether to
+        use the pagination API or not.
+
+        For doing large/bulk queries, the use of the Pagination API is recommended by the
+        Wayback Machine authors. And it determines if the query would be large or not by
+        using the showNumPages=true parameter, this tells the number of pages of CDX DATA
+        that the pagination API will return.
+        If the number of page is less than 2 we use the normal non-pagination API as the
+        pagination API is known to lag and for big queries it should not matter but for
+        queries where the number of pages are less this method chooses accuracy over the
+        pagination API.
+        """
+
+        # number of pages that will returned by the pagination API.
+        # get_total_pages adds the showNumPages=true param to pagination API
+        # requests.
+        # This is a special query that will return a single number indicating
+        # the number of pages.
        total_pages = get_total_pages(self.url, self.user_agent)
-        # If we only have two or less pages of archives then we care for more accuracy
-        # pagination API is lagged sometimes
+
        if use_page is True and total_pages >= 2:
            blank_pages = 0
            for i in range(total_pages):
@@ -78,11 +117,11 @@ class WaybackMachineCDXServerAPI(object):
        else:
            payload["showResumeKey"] = "true"
            payload["limit"] = str(self.limit)
-            resumeKey = None
+            resume_key = None
            more = True
            while more:
-                if resumeKey:
-                    payload["resumeKey"] = resumeKey
+                if resume_key:
+                    payload["resumeKey"] = resume_key

                url = full_url(self.endpoint, params=payload)
                res = get_response(url, headers=headers)
@@ -102,13 +141,16 @@ class WaybackMachineCDXServerAPI(object):

                    if len(second_last_line) == 0:

-                        resumeKey = lines[-1].strip()
-                        text = text.replace(resumeKey, "", 1).strip()
+                        resume_key = lines[-1].strip()
+                        text = text.replace(resume_key, "", 1).strip()
                        more = True

                yield text

    def add_payload(self, payload: Dict[str, str]) -> None:
+        """
+        Adds the payload to the payload dictionary.
+        """
        if self.start_timestamp:
            payload["from"] = self.start_timestamp

@@ -122,17 +164,35 @@ class WaybackMachineCDXServerAPI(object):
            payload["matchType"] = self.match_type

        if self.filters and len(self.filters) > 0:
-            for i, f in enumerate(self.filters):
-                payload["filter" + str(i)] = f
+            for i, _filter in enumerate(self.filters):
+                payload["filter" + str(i)] = _filter

        if self.collapses and len(self.collapses) > 0:
-            for i, f in enumerate(self.collapses):
-                payload["collapse" + str(i)] = f
+            for i, collapse in enumerate(self.collapses):
+                payload["collapse" + str(i)] = collapse

        # Don't need to return anything as it's dictionary.
        payload["url"] = self.url

    def snapshots(self) -> Generator[CDXSnapshot, None, None]:
+        """
+        This function yields the CDX data lines as snapshots.
+
+        As it is a generator it exhaustible, the reason that this is
+        a generator and not a list are:
+
+        a) CDX server API can return millions of entries for a query and list
+        is not suitable for such cases.
+
+        b) Preventing memory usage issues, as told before this method may yield
+        millions of records for some queries and your system may not have enough
+        memory for such a big list. Also Remember this if outputing to Jupyter
+        Notebooks.
+
+        The objects yielded by this method are instance of CDXSnapshot class,
+        you can access the attributes of the entries as the attribute of the instance
+        itself.
+        """
        payload: Dict[str, str] = {}
        headers = {"User-Agent": self.user_agent}

@@ -144,18 +204,25 @@ class WaybackMachineCDXServerAPI(object):
        if self.collapses != []:
            self.use_page = False

-        texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)
+        entries = self.cdx_api_manager(payload, headers, use_page=self.use_page)

-        for text in texts:
+        for entry in entries:

-            if text.isspace() or len(text) <= 1 or not text:
+            if entry.isspace() or len(entry) <= 1 or not entry:
                continue

-            snapshot_list = text.split("\n")
+            # each line is a snapshot aka entry of the CDX server API.
+            # We are able to split the page by lines because it only
+            # splits the lines on a sinlge page and not all the entries
+            # at once, thus there should be no issues of too much memory usage.
+            snapshot_list = entry.split("\n")

            for snapshot in snapshot_list:

-                if len(snapshot) < 46:  # 14 + 32 (timestamp+digest)
+                # 14 + 32 == 46 ( timestamp + digest ), ignore the invalid entries.
+                # they are invalid if their length is smaller than sum of length
+                # of a standard wayback_timestamp and standard digest of an entry.
+                if len(snapshot) < 46:
                    continue

                properties: Dict[str, Optional[str]] = {
@@ -168,15 +235,15 @@ class WaybackMachineCDXServerAPI(object):
                    "length": None,
                }

-                prop_values = snapshot.split(" ")
+                property_value = snapshot.split(" ")

-                prop_values_len = len(prop_values)
-                properties_len = len(properties)
+                total_property_values = len(property_value)
+                warranted_total_property_values = len(properties)

-                if prop_values_len != properties_len:
+                if total_property_values != warranted_total_property_values:
                    raise WaybackError(
-                        f"Snapshot returned by Cdx API has {prop_values_len} "
-                        f"properties instead of expected {properties_len} properties.\n"
+                        f"Snapshot returned by Cdx API has {total_property_values} "
+                        f"properties instead of expected {warranted_total_property_values} properties.\n"
                        f"Problematic Snapshot: {snapshot}"
                    )

@@ -188,6 +255,6 @@ class WaybackMachineCDXServerAPI(object):
                    properties["statuscode"],
                    properties["digest"],
                    properties["length"],
-                ) = prop_values
+                ) = property_value

                yield CDXSnapshot(cast(Dict[str, str], properties))