From f70ff906a7b5b0def97fa0ea252559a3ccb4acbd Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Mon, 7 Feb 2022 16:06:51 +0530 Subject: [PATCH] add descriptive docstrings and type hints in waybackpy/cdx_snapshot.py --- waybackpy/cdx_snapshot.py | 75 +++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 11 deletions(-) diff --git a/waybackpy/cdx_snapshot.py b/waybackpy/cdx_snapshot.py index ab96602..9cf9610 100644 --- a/waybackpy/cdx_snapshot.py +++ b/waybackpy/cdx_snapshot.py @@ -1,30 +1,83 @@ +""" +Module that contains the CDXSnapshot class; CDX records are cast +to CDXSnapshot objects for easier access. + +The CDX index format is plain text data. Each line ('record') indicates a +crawled document, and these lines are cast to CDXSnapshot objects. +""" + + from datetime import datetime from typing import Dict class CDXSnapshot(object): """ - Class for the CDX snapshot lines returned by the CDX API, + Class for the CDX snapshot lines ('records') returned by the CDX API, Each valid line of the CDX API is casted to an CDXSnapshot object - by the CDX API interface. + by the CDX API interface, just use "." to access any attribute of the + CDX server API snapshot. + This provides the end-user the ease of using the data as attributes of the CDXSnapshot. + + The string representation of the class is identical to the line returned + by the CDX server API. + + Besides all the attributes of the CDX server API, this class also provides + the archive_url attribute, which is the archive URL of the snapshot. + + Attributes of this class and what they represent and are useful for: + + urlkey: The document captured, expressed as a SURT. + SURT stands for Sort-friendly URI Reordering Transform, and is a + transformation applied to URIs which makes their left-to-right + representation better match the natural hierarchy of domain names. + A URI <scheme://domain.tld/path?query> has SURT + form <scheme://(tld,domain,)/path?query>. + + timestamp: The timestamp of the archive, format is yyyyMMddhhmmss and type + is string. 
+ + datetime_timestamp: The timestamp as a datetime object. + + original: The original URL of the archive. If archive_url is + https://web.archive.org/web/20220113130051/https://google.com then the + original URL is https://google.com + + mimetype: The document’s file type, e.g. text/html + + statuscode: HTTP response code for the document at the time of its crawling + + digest: Base32-encoded SHA-1 checksum of the document, used to distinguish it + from other documents + + length: The document’s size in bytes in the WARC file + + archive_url: The archive URL of the snapshot; this is not returned by the + CDX server API but is created by this class on init. """ def __init__(self, properties: Dict[str, str]) -> None: - self.urlkey = properties["urlkey"] - self.timestamp = properties["timestamp"] - self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S") - self.original = properties["original"] - self.mimetype = properties["mimetype"] - self.statuscode = properties["statuscode"] - self.digest = properties["digest"] - self.length = properties["length"] - self.archive_url = ( + self.urlkey: str = properties["urlkey"] + self.timestamp: str = properties["timestamp"] + self.datetime_timestamp: datetime = datetime.strptime( + self.timestamp, "%Y%m%d%H%M%S" + ) + self.original: str = properties["original"] + self.mimetype: str = properties["mimetype"] + self.statuscode: str = properties["statuscode"] + self.digest: str = properties["digest"] + self.length: str = properties["length"] + self.archive_url: str = ( f"https://web.archive.org/web/{self.timestamp}/{self.original}" ) def __str__(self) -> str: + """ + The string representation is the same as the line returned by the + CDX server API for the snapshot. + """ return ( f"{self.urlkey} {self.timestamp} {self.original} " f"{self.mimetype} {self.statuscode} {self.digest} {self.length}"