add descriptive docstrings and type hints in waybackpy/cdx_snapshot.py

2022-02-07 16:06:51 +05:30
parent 13373ae7f3
commit f70ff906a7
1 changed files with 64 additions and 11 deletions
--- a/waybackpy/cdx_snapshot.py
+++ b/waybackpy/cdx_snapshot.py
@@ -1,30 +1,83 @@
+"""
+Module that contains the CDXSnapshot class; CDX records are cast
+to CDXSnapshot objects for easier access.
+
+The CDX index format is plain text data. Each line ('record') indicates a
+crawled document, and these lines are cast to CDXSnapshot objects.
+"""
+
+
 from datetime import datetime
 from typing import Dict


 class CDXSnapshot(object):
    """
-    Class for the CDX snapshot lines returned by the CDX API,
+    Class for the CDX snapshot lines ('records') returned by the CDX API.
    Each valid line of the CDX API is casted to an CDXSnapshot object
-    by the CDX API interface.
+    by the CDX API interface; just use "." to access any attribute of the
+    CDX server API snapshot.
+
    This provides the end-user the ease of using the data as attributes
    of the CDXSnapshot.
+
+    The string representation of the class is identical to the line returned
+    by the CDX server API.
+
+    In addition to the attributes of the CDX server API, this class provides
+    the archive_url attribute, which is the archive URL of the snapshot.
+
+    Attributes of this class, what they represent and what they are useful for:
+
+    urlkey: The document captured, expressed as a SURT
+            SURT stands for Sort-friendly URI Reordering Transform, and is a
+            transformation applied to URIs which makes their left-to-right
+            representation better match the natural hierarchy of domain names.
+            A URI <scheme://domain.tld/path?query> has SURT
+            form <scheme://(tld,domain,)/path?query>.
+
+    timestamp: The timestamp of the archive, format is yyyyMMddhhmmss and type
+               is string.
+
+    datetime_timestamp: The timestamp as a datetime object.
+
+    original: The original URL of the archive. If archive_url is
+    https://web.archive.org/web/20220113130051/https://google.com then the
+    original URL is https://google.com
+
+    mimetype: The document's file type, e.g. text/html
+
+    statuscode: HTTP response code for the document at the time of its crawling
+
+    digest: Base32-encoded SHA-1 checksum of the document, used to distinguish
+            it from other documents
+
+    length: Number of bytes the document occupies in the WARC file
+
+    archive_url: The archive URL of the snapshot; this is not returned by the
+                 CDX server API but created by this class on init.
    """

    def __init__(self, properties: Dict[str, str]) -> None:
-        self.urlkey = properties["urlkey"]
-        self.timestamp = properties["timestamp"]
-        self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
-        self.original = properties["original"]
-        self.mimetype = properties["mimetype"]
-        self.statuscode = properties["statuscode"]
-        self.digest = properties["digest"]
-        self.length = properties["length"]
-        self.archive_url = (
+        self.urlkey: str = properties["urlkey"]
+        self.timestamp: str = properties["timestamp"]
+        self.datetime_timestamp: datetime = datetime.strptime(
+            self.timestamp, "%Y%m%d%H%M%S"
+        )
+        self.original: str = properties["original"]
+        self.mimetype: str = properties["mimetype"]
+        self.statuscode: str = properties["statuscode"]
+        self.digest: str = properties["digest"]
+        self.length: str = properties["length"]
+        self.archive_url: str = (
            f"https://web.archive.org/web/{self.timestamp}/{self.original}"
        )

    def __str__(self) -> str:
+        """
+        The string representation is the same as the line returned by the
+        CDX server API for the snapshot.
+        """
        return (
            f"{self.urlkey} {self.timestamp} {self.original} "
            f"{self.mimetype} {self.statuscode} {self.digest} {self.length}"