full cdx api support

Akash Mahanty 2021-01-10 02:23:53 +05:30
parent a2550f17d7
commit a03813315f
10 changed files with 915 additions and 403 deletions

93
tests/test_cdx.py Normal file

@ -0,0 +1,93 @@
import pytest
from waybackpy.cdx import Cdx
from waybackpy.exceptions import WaybackError
def test_all_cdx():
url = "akamhy.github.io"
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, \
like Gecko) Chrome/45.0.2454.85 Safari/537.36"
cdx = Cdx(
url=url,
user_agent=user_agent,
start_timestamp=2017,
end_timestamp=2020,
filters=[
"statuscode:200",
"mimetype:text/html",
"timestamp:20201002182319",
"original:https://akamhy.github.io/",
],
gzip=False,
collapses=["timestamp:10", "digest"],
limit=50,
match_type="prefix",
)
snapshots = cdx.snapshots()
for snapshot in snapshots:
ans = snapshot.archive_url
assert "https://web.archive.org/web/20201002182319/https://akamhy.github.io/" == ans
url = "akahfjgjkmhy.gihthub.ip"
cdx = Cdx(
url=url,
user_agent=user_agent,
start_timestamp=None,
end_timestamp=None,
filters=[],
match_type=None,
gzip=True,
collapses=[],
limit=10,
)
snapshots = cdx.snapshots()
print(snapshots)
i = 0
for _ in snapshots:
i += 1
assert i == 0
url = "https://github.com/akamhy/waybackpy/*"
cdx = Cdx(url=url, user_agent=user_agent, limit=50)
snapshots = cdx.snapshots()
for snapshot in snapshots:
print(snapshot.archive_url)
url = "https://github.com/akamhy/waybackpy"
with pytest.raises(WaybackError):
cdx = Cdx(url=url, user_agent=user_agent, limit=50, filters=["ghddhfhj"])
snapshots = cdx.snapshots()
with pytest.raises(WaybackError):
cdx = Cdx(url=url, user_agent=user_agent, collapses=["timestamp", "ghdd:hfhj"])
snapshots = cdx.snapshots()
url = "https://github.com"
cdx = Cdx(url=url, user_agent=user_agent, limit=50)
snapshots = cdx.snapshots()
c = 0
for snapshot in snapshots:
c += 1
if c > 100:
break
url = "https://github.com/*"
cdx = Cdx(url=url, user_agent=user_agent, collapses=["timestamp"])
snapshots = cdx.snapshots()
c = 0
for snapshot in snapshots:
c += 1
if c > 30_529:  # default limit is 10k
break
url = "https://github.com/*"
cdx = Cdx(url=url, user_agent=user_agent)
c = 0
snapshots = cdx.snapshots()
for snapshot in snapshots:
c += 1
if c > 100_529:
break
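The test above drives the new Cdx class; the query it builds maps directly onto the documented Wayback CDX server parameters. A minimal sketch of the equivalent direct request (not part of the changed files; the User-Agent string is arbitrary):

import requests

params = {
    "url": "akamhy.github.io",
    "from": "2017",
    "to": "2020",
    "matchType": "prefix",
    "limit": "50",
    # list values become repeated keys: filter=...&filter=...
    "filter": ["statuscode:200", "mimetype:text/html"],
    "collapse": ["timestamp:10", "digest"],
}
response = requests.get(
    "https://web.archive.org/cdx/search/cdx",
    params=params,
    headers={"User-Agent": "waybackpy example UA"},
)
for line in response.text.splitlines():
    if not line:
        continue
    # each row is: urlkey timestamp original mimetype statuscode digest length
    print(line.split(" ")[1])  # the 14-digit timestamp field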

32
tests/test_snapshot.py Normal file

@ -0,0 +1,32 @@
import pytest
from waybackpy.snapshot import CdxSnapshot, datetime
def test_CdxSnapshot():
sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
(
urlkey,
timestamp,
original,
mimetype,
statuscode,
digest,
length,
) = sample_input.split(" ")
snapshot = CdxSnapshot(
urlkey, timestamp, original, mimetype, statuscode, digest, length
)
assert urlkey == snapshot.urlkey
assert timestamp == snapshot.timestamp
assert original == snapshot.original
assert mimetype == snapshot.mimetype
assert statuscode == snapshot.statuscode
assert digest == snapshot.digest
assert length == snapshot.length
assert datetime.strptime(timestamp, "%Y%m%d%H%M%S") == snapshot.datetime_timestamp
archive_url = "https://web.archive.org/web/" + timestamp + "/" + original
assert archive_url == snapshot.archive_url
assert archive_url == str(snapshot)

190
tests/test_utils.py Normal file

@ -0,0 +1,190 @@
import pytest
import json
from waybackpy.utils import (
_cleaned_url,
_url_check,
_full_url,
URLError,
WaybackError,
_get_total_pages,
_archive_url_parser,
_wayback_timestamp,
_get_response,
_check_match_type,
_check_collapses,
_check_filters,
_ts,
)
def test_ts():
timestamp = True
data = {}
assert _ts(timestamp, data)
data = """
{"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"}
"""
data = json.loads(data)
assert data["archived_snapshots"]["closest"]["timestamp"] == "20210109155628"
def test_check_filters():
filters = []
_check_filters(filters)
filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"]
_check_filters(filters)
filters = ["statuscode:2768900", "timestamp:123456789", "original:://url.com"]
with pytest.raises(WaybackError):
_check_filters(filters)
with pytest.raises(WaybackError):
_check_filters("not-list")
def test_check_collapses():
collapses = []
_check_collapses(collapses)
collapses = ["timestamp:10"]
_check_collapses(collapses)
collapses = ["urlkey"]
_check_collapses(collapses)
collapses = "urlkey" # NOT LIST
with pytest.raises(WaybackError):
_check_collapses(collapses)
collapses = ["also illegal collapse"]
with pytest.raises(WaybackError):
_check_collapses(collapses)
def test_check_match_type():
assert None == _check_match_type(None, "url")
match_type = "exact"
url = "test_url"
assert None == _check_match_type(match_type, url)
url = "has * in it"
with pytest.raises(WaybackError):
_check_match_type("domain", url)
with pytest.raises(WaybackError):
_check_match_type("not a valid type", "url")
def test_cleaned_url():
test_url = " https://en.wikipedia.org/wiki/Network security "
answer = "https://en.wikipedia.org/wiki/Network%20security"
assert answer == _cleaned_url(test_url)
def test_url_check():
good_url = "https://akamhy.github.io"
assert None == _url_check(good_url)
bad_url = "https://github-com"
with pytest.raises(URLError):
_url_check(bad_url)
def test_full_url():
params = {}
endpoint = "https://web.archive.org/cdx/search/cdx"
assert endpoint == _full_url(endpoint, params)
params = {"a": "1"}
assert "https://web.archive.org/cdx/search/cdx?a=1" == _full_url(endpoint, params)
assert "https://web.archive.org/cdx/search/cdx?a=1" == _full_url(
endpoint + "?", params
)
params["b"] = 2
assert "https://web.archive.org/cdx/search/cdx?a=1&b=2" == _full_url(
endpoint + "?", params
)
params["c"] = "foo bar"
assert "https://web.archive.org/cdx/search/cdx?a=1&b=2&c=foo%20bar" == _full_url(
endpoint + "?", params
)
def test_get_total_pages():
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
url = "github.com*"
assert 212890 <= _get_total_pages(url, user_agent)
url = "https://zenodo.org/record/4416138"
assert 2 >= _get_total_pages(url, user_agent)
def test_archive_url_parser():
perfect_header = """
{'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
"""
archive = _archive_url_parser(
perfect_header, "https://www.scribbr.com/citing-sources/et-al/"
)
assert "web.archive.org/web/20210102094009" in archive
header = """
vhgvkjv
Content-Location: /web/20201126185327/https://www.scribbr.com/citing-sources/et-al
ghvjkbjmmcmhj
"""
archive = _archive_url_parser(
header, "https://www.scribbr.com/citing-sources/et-al/"
)
assert "20201126185327" in archive
header = """
hfjkfjfcjhmghmvjm
X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US
yfu,u,gikgkikik
"""
archive = _archive_url_parser(
header, "https://www.scribbr.com/citing-sources/et-al/"
)
assert "20171128185327" in archive
# The below header should result in Exception
no_archive_header = """
{'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:42:45 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', 'X-App-Server': 'wwwb-app52', 'X-ts': '523', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0'}
"""
with pytest.raises(WaybackError):
_archive_url_parser(
no_archive_header, "https://www.scribbr.com/citing-sources/et-al/"
)
def test_wayback_timestamp():
ts = _wayback_timestamp(year=2020, month=1, day=2, hour=3, minute=4)
assert "202001020304" in str(ts)
def test_get_response():
endpoint = "https://www.google.com"
user_agent = (
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
)
headers = {"User-Agent": "%s" % user_agent}
response = _get_response(endpoint, params=None, headers=headers)
assert response.status_code == 200
endpoint = "http/wwhfhfvhvjhmom"
with pytest.raises(WaybackError):
_get_response(endpoint, params=None, headers=headers)
endpoint = "https://akamhy.github.io"
url, response = _get_response(
endpoint, params=None, headers=headers, return_full_url=True
)
assert endpoint == url
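The filter and collapse syntax exercised above follows the CDX server conventions ([!]field:regex for filters, field[:N] for collapses). A few quick examples of what the validators accept and reject, per the implementations added in waybackpy/utils.py in this commit:

import pytest
from waybackpy.exceptions import WaybackError
from waybackpy.utils import _check_collapses, _check_filters

_check_filters(["!statuscode:200"])      # negated filters are accepted
_check_collapses(["timestamp:6"])        # collapse on the first 6 timestamp digits
with pytest.raises(WaybackError):
    _check_collapses("timestamp:6")      # must be a list, not a bare string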

tests/test_wrapper.py

@ -4,85 +4,17 @@ import random
import requests
from datetime import datetime
-sys.path.append("..")
-import waybackpy.wrapper as waybackpy  # noqa: E402
+from waybackpy.wrapper import Url, Cdx
user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
def test_cleaned_url():
"""No API use"""
test_url = " https://en.wikipedia.org/wiki/Network security "
answer = "https://en.wikipedia.org/wiki/Network_security"
target = waybackpy.Url(test_url, user_agent)
test_result = target._cleaned_url()
assert answer == test_result
def test_ts():
a = waybackpy.Url("https://google.com", user_agent)
ts = a._timestamp
assert str(datetime.utcnow().year) in str(ts)
def test_dunders():
"""No API use"""
url = "https://en.wikipedia.org/wiki/Network_security"
user_agent = "UA"
target = waybackpy.Url(url, user_agent)
assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target)
assert "en.wikipedia.org" in str(target)
def test_url_check():
"""No API Use"""
broken_url = "http://wwwgooglecom/"
with pytest.raises(Exception):
-waybackpy.Url(broken_url, user_agent)
+Url(broken_url, user_agent)
def test_archive_url_parser():
"""No API Use"""
perfect_header = """
{'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
"""
archive = waybackpy._archive_url_parser(
perfect_header, "https://www.scribbr.com/citing-sources/et-al/"
)
assert "web.archive.org/web/20210102094009" in archive
header = """
vhgvkjv
Content-Location: /web/20201126185327/https://www.scribbr.com/citing-sources/et-al
ghvjkbjmmcmhj
"""
archive = waybackpy._archive_url_parser(
header, "https://www.scribbr.com/citing-sources/et-al/"
)
assert "20201126185327" in archive
header = """
hfjkfjfcjhmghmvjm
X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US
yfu,u,gikgkikik
"""
archive = waybackpy._archive_url_parser(
header, "https://www.scribbr.com/citing-sources/et-al/"
)
assert "20171128185327" in archive
# The below header should result in Exception
no_archive_header = """
{'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:42:45 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', 'X-App-Server': 'wwwb-app52', 'X-ts': '523', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0'}
"""
with pytest.raises(Exception):
waybackpy._archive_url_parser(
no_archive_header, "https://www.scribbr.com/citing-sources/et-al/"
)
def test_save():
@ -90,15 +22,14 @@ def test_save():
url_list = [
"en.wikipedia.org",
-"www.wikidata.org",
-"commons.wikimedia.org",
+"akamhy.github.io",
"www.wiktionary.org",
"www.w3schools.com",
-"www.ibm.com",
+"youtube.com",
]
x = random.randint(0, len(url_list) - 1)
url1 = url_list[x]
-target = waybackpy.Url(
+target = Url(
url1,
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
@ -109,11 +40,11 @@ def test_save():
# Test for urls that are incorrect.
with pytest.raises(Exception):
url2 = "ha ha ha ha"
-waybackpy.Url(url2, user_agent)
+Url(url2, user_agent)
url3 = "http://www.archive.is/faq.html"
with pytest.raises(Exception):
-target = waybackpy.Url(
+target = Url(
url3,
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) "
"AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 "
@ -124,7 +55,7 @@ def test_save():
def test_near():
url = "google.com"
-target = waybackpy.Url(
+target = Url(
url,
"Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 "
"(KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
@ -139,7 +70,7 @@ def test_near():
or ("2015-03" in archive_near_month_year) or ("2015-03" in archive_near_month_year)
) )
target = waybackpy.Url( target = Url(
"www.python.org", "www.python.org",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246", "(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
@ -157,13 +88,13 @@ def test_near():
NeverArchivedUrl = (
"https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
)
-target = waybackpy.Url(NeverArchivedUrl, user_agent)
+target = Url(NeverArchivedUrl, user_agent)
target.near(year=2010)
def test_oldest():
url = "github.com/akamhy/waybackpy"
-target = waybackpy.Url(url, user_agent)
+target = Url(url, user_agent)
o = target.oldest()
assert "20200504141153" in str(o)
assert "2020-05-04" in str(o._timestamp)
@ -171,50 +102,35 @@ def test_oldest():
def test_json():
url = "github.com/akamhy/waybackpy"
-target = waybackpy.Url(url, user_agent)
+target = Url(url, user_agent)
assert "archived_snapshots" in str(target.JSON)
def test_archive_url():
url = "github.com/akamhy/waybackpy"
-target = waybackpy.Url(url, user_agent)
+target = Url(url, user_agent)
assert "github.com/akamhy" in str(target.archive_url)
def test_newest():
url = "github.com/akamhy/waybackpy"
-target = waybackpy.Url(url, user_agent)
+target = Url(url, user_agent)
assert url in str(target.newest())
def test_get():
-target = waybackpy.Url("google.com", user_agent)
+target = Url("google.com", user_agent)
assert "Welcome to Google" in target.get(target.oldest())
def test_wayback_timestamp():
ts = waybackpy._wayback_timestamp(year=2020, month=1, day=2, hour=3, minute=4)
assert "202001020304" in str(ts)
def test_get_response():
endpoint = "https://www.google.com"
user_agent = (
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
)
headers = {"User-Agent": "%s" % user_agent}
response = waybackpy._get_response(endpoint, params=None, headers=headers)
assert response.status_code == 200
def test_total_archives():
user_agent = (
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
)
-target = waybackpy.Url(" https://outlook.com ", user_agent)
+target = Url(" https://outlook.com ", user_agent)
assert target.total_archives() > 80000
-target = waybackpy.Url(
+target = Url(
" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent
)
assert target.total_archives() == 0
@ -222,8 +138,8 @@ def test_total_archives():
def test_known_urls():
-target = waybackpy.Url("akamhy.github.io", user_agent)
+target = Url("akamhy.github.io", user_agent)
assert len(target.known_urls(alive=True, subdomain=False)) > 2
-target = waybackpy.Url("akamhy.github.io", user_agent)
+target = Url("akamhy.github.io", user_agent)
assert len(target.known_urls()) > 3
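One behavioural change worth noting between the removed wrapper tests above and the new tests/test_utils.py: URL cleaning now percent-encodes spaces instead of replacing them with underscores. A quick sketch of the difference, using the helper name from this commit:

from waybackpy.utils import _cleaned_url

# old wrapper method produced: https://en.wikipedia.org/wiki/Network_security
print(_cleaned_url(" https://en.wikipedia.org/wiki/Network security "))
# https://en.wikipedia.org/wiki/Network%20security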

234
waybackpy/cdx.py Normal file

@ -0,0 +1,234 @@
from .snapshot import CdxSnapshot
from .exceptions import WaybackError
from .utils import (
_full_url,
_get_total_pages,
_get_response,
default_user_agent,
_check_filters,
_check_collapses,
_check_match_type,
)
# TODO : Threading support for pagination API. It's designed for Threading.
class Cdx:
def __init__(
self,
url,
user_agent=default_user_agent,
start_timestamp=None,
end_timestamp=None,
filters=[],
match_type=None,
gzip=True,
collapses=[],
limit=10000,
):
self.url = str(url).strip()
self.user_agent = str(user_agent)
self.start_timestamp = str(start_timestamp) if start_timestamp else None
self.end_timestamp = str(end_timestamp) if end_timestamp else None
self.filters = filters
_check_filters(self.filters)
self.match_type = str(match_type).strip() if match_type else None
_check_match_type(self.match_type, self.url)
self.gzip = gzip
self.collapses = collapses
_check_collapses(self.collapses)
self.limit = limit
self.last_api_request_url = None
self.use_page = False
def cdx_api_manager(self, payload, headers, use_page=False):
"""
There are two ways to fetch snapshots from the CDX server: the
pagination API and the normal API with a resumption key (sequential
querying of CDX data). This method selects between them. For very
large queries (for example, domain queries) the pagination API is
useful because it lets you estimate the total size of the query and
perform queries in parallel.
Read more about the pagination API at:
https://web.archive.org/web/20201228063237/https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api
If use_page is False we use the normal sequential query API,
else we use the pagination API.
Two mutually exclusive cases are possible:
1) The pagination API is selected:
a) get the total number of pages to read, using _get_total_pages()
b) then loop over all the pages and yield the response text
2) The normal sequential query API is selected:
a) pass showResumeKey=true to ask the API to append a resumption key
at the bottom of the response
b) if the response has fewer than three lines, yield the text as-is
c) if it has at least three lines, check the second last line for zero length
d) if the second last line has length zero, assume the last line contains
the resumption key; set resumeKey and remove it from the text
e) if the second last line has non-zero length, yield the text, as there is no resumption key
f) if a resumption key was found, set the "more" flag to True (it is reset to
False on every iteration); when "more" is not True the loop stops and the generator returns
"""
endpoint = "https://web.archive.org/cdx/search/cdx"
if use_page == True:
total_pages = _get_total_pages(self.url, self.user_agent)
for i in range(total_pages):
payload["page"] = str(i)
url, res = _get_response(
endpoint, params=payload, headers=headers, return_full_url=True
)
self.last_api_request_url = url
yield res.text
else:
payload["showResumeKey"] = "true"
payload["limit"] = str(self.limit)
resumeKey = None
more = True
while more:
if resumeKey:
payload["resumeKey"] = resumeKey
url, res = _get_response(
endpoint, params=payload, headers=headers, return_full_url=True
)
self.last_api_request_url = url
text = res.text.strip()
lines = text.splitlines()
more = False
if len(lines) >= 3:
last_line = lines[-1]
second_last_line = lines[-2]
if len(second_last_line) == 0:
resumeKey = lines[-1].strip()
text = text.replace(resumeKey, "", 1).strip()
more = True
yield text
def snapshots(self):
"""
This generator yields the snapshots encapsulated
in CdxSnapshot objects for easier use.
The GET request parameters are set only when their conditions are met.
If the caller supplies neither start_timestamp nor end_timestamp and
uses no collapses, we use the pagination API, because it returns
archives starting from the first archive, with the most recent
archive on the last page.
"""
payload = {}
headers = {"User-Agent": self.user_agent}
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
if self.gzip != True:
payload["gzip"] = "false"
if self.match_type:
payload["matchType"] = self.match_type
if self.filters and len(self.filters) > 0:
for i, f in enumerate(self.filters):
payload["filter" + str(i)] = f
if self.collapses and len(self.collapses) > 0:
for i, f in enumerate(self.collapses):
payload["collapse" + str(i)] = f
payload["url"] = self.url
if not self.start_timestamp and not self.end_timestamp:
self.use_page = True
if self.collapses != []:
self.use_page = False
texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)
for text in texts:
if text.isspace() or len(text) <= 1 or not text:
continue
snapshot_list = text.split("\n")
for snapshot in snapshot_list:
if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
continue
properties = {
"urlkey": None,
"timestamp": None,
"original": None,
"mimetype": None,
"statuscode": None,
"digest": None,
"length": None,
}
prop_values = snapshot.split(" ")
# Making sure that we get the same number of
# property values as the number of properties
prop_values_len = len(prop_values)
properties_len = len(properties)
if prop_values_len != properties_len:
raise WaybackError(
"Snapshot returned by Cdx API has %s properties instead of expected %s properties.\nInvolved Snapshot : %s"
% (prop_values_len, properties_len, snapshot)
)
(
properties["urlkey"],
properties["timestamp"],
properties["original"],
properties["mimetype"],
properties["statuscode"],
properties["digest"],
properties["length"],
) = prop_values
yield CdxSnapshot(
properties["urlkey"],
properties["timestamp"],
properties["original"],
properties["mimetype"],
properties["statuscode"],
properties["digest"],
properties["length"],
)
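The resumption-key branch that cdx_api_manager implements can be pictured in isolation. A minimal sketch of the same flow against the CDX endpoint with plain requests (not part of this commit; the URL, User-Agent, and limit below are arbitrary):

import requests

def iter_cdx_pages(url, user_agent, limit=500):
    """Yield CDX response pages, following showResumeKey/resumeKey."""
    endpoint = "https://web.archive.org/cdx/search/cdx"
    payload = {"url": url, "limit": str(limit), "showResumeKey": "true"}
    while True:
        text = requests.get(
            endpoint, params=payload, headers={"User-Agent": user_agent}
        ).text.strip()
        lines = text.splitlines()
        # When more data remains, the server appends a blank line followed
        # by the resumption key after the snapshot rows.
        if len(lines) >= 3 and not lines[-2]:
            yield "\n".join(lines[:-2])
            payload["resumeKey"] = lines[-1].strip()
        else:
            yield text
            return

for page in iter_cdx_pages("akamhy.github.io", "waybackpy example UA"):
    print(len(page.splitlines()), "snapshots in this page")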

waybackpy/cli.py

@ -4,9 +4,9 @@ import sys
import random
import string
import argparse
-from waybackpy.wrapper import Url
-from waybackpy.exceptions import WaybackError
-from waybackpy.__version__ import __version__
+from .wrapper import Url
+from .exceptions import WaybackError
+from .__version__ import __version__
def _save(obj):
@ -19,11 +19,11 @@ def _save(obj):
header = m.group(1)
if "No archive URL found in the API response" in e:
return (
-"\n[waybackpy] Can not save/archive your link.\n[waybackpy] This\
-could happen because either your waybackpy (%s) is likely out of\
-date or Wayback Machine is malfunctioning.\n[waybackpy] Visit\
-https://github.com/akamhy/waybackpy for the latest version of \
-waybackpy.\n[waybackpy] API response Header :\n%s"
+"\n[waybackpy] Can not save/archive your link.\n[waybackpy] This "
+"could happen because either your waybackpy (%s) is likely out of "
+"date or Wayback Machine is malfunctioning.\n[waybackpy] Visit "
+"https://github.com/akamhy/waybackpy for the latest version of "
+"waybackpy.\n[waybackpy] API response Header :\n%s"
% (__version__, header)
)
return WaybackError(err)
@ -108,17 +108,16 @@ def _known_urls(obj, args):
""" """
Known urls for a domain. Known urls for a domain.
""" """
# sd = subdomain
sd = False subdomain = False
if args.subdomain: if args.subdomain:
sd = True subdomain = True
# al = alive alive = False
al = False
if args.alive: if args.alive:
al = True alive = True
url_list = obj.known_urls(alive=al, subdomain=sd) url_list = obj.known_urls(alive=alive, subdomain=subdomain)
total_urls = len(url_list) total_urls = len(url_list)
if total_urls > 0: if total_urls > 0:

waybackpy/exceptions.py

@ -4,6 +4,7 @@ waybackpy.exceptions
This module contains the set of Waybackpy's exceptions.
"""
class WaybackError(Exception):
"""
Raised when Wayback Machine API Service is unreachable/down.

26
waybackpy/snapshot.py Normal file

@ -0,0 +1,26 @@
from datetime import datetime
class CdxSnapshot:
"""
This class helps to handle the Cdx Snapshots easily.
What the raw data looks like:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
def __init__(
self, urlkey, timestamp, original, mimetype, statuscode, digest, length
):
self.urlkey = urlkey
self.timestamp = timestamp
self.datetime_timestamp = datetime.strptime(timestamp, "%Y%m%d%H%M%S")
self.original = original
self.mimetype = mimetype
self.statuscode = statuscode
self.digest = digest
self.length = length
self.archive_url = "https://web.archive.org/web/" + timestamp + "/" + original
def __str__(self):
return self.archive_url
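A quick usage sketch of the class above, feeding it the raw CDX line quoted in its docstring:

from waybackpy.snapshot import CdxSnapshot

raw = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
snapshot = CdxSnapshot(*raw.split(" "))
print(snapshot.archive_url)
# https://web.archive.org/web/20080126045828/http://github.com
print(snapshot.datetime_timestamp.year)          # 2008
print(str(snapshot) == snapshot.archive_url)     # True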

280
waybackpy/utils.py Normal file

@ -0,0 +1,280 @@
import re
import requests
from .exceptions import WaybackError, URLError
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from .__version__ import __version__
quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
def _ts(timestamp, data):
"""
Return the timestamp of the last fetched archive.
If no archive has been fetched yet, the passed timestamp is None and
the timestamp is derived from the availability API data instead;
if the URL has no archived snapshots at all, datetime.max is returned.
"""
if timestamp:
return timestamp
if not data["archived_snapshots"]:
return datetime.max
return datetime.strptime(
data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
def _check_match_type(match_type, url):
if not match_type:
return
if "*" in url:
raise WaybackError("Can not use wildcard with match_type argument")
legal_match_type = ["exact", "prefix", "host", "domain"]
if match_type not in legal_match_type:
raise WaybackError(
"%s is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'"
% match_type
)
def _check_collapses(collapses):
if not isinstance(collapses, list):
raise WaybackError("collapses must be a list.")
if len(collapses) == 0:
return
for c in collapses:
try:
match = re.search(
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
c,
)
field = match.group(1)
N = None
if 2 == len(match.groups()):
N = match.group(2)
if N:
assert field + N == c
else:
assert field == c
except Exception:
e = "collapse argument '%s' is not following the cdx collapse syntax." % c
raise WaybackError(e)
def _check_filters(filters):
if not isinstance(filters, list):
raise WaybackError("filters must be a list.")
# [!]field:regex
for f in filters:
try:
match = re.search(
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
f,
)
key = match.group(1)
val = match.group(2)
if "statuscode" in key:
assert len(val) == 3
assert isinstance(int(val), int) == True
if "timestamp" in key:
int_ts = int(val)
assert len(val) == 14 # must be 14 and not less to filter
assert int_ts > 19_950_000_000_000 # year 1995, 14 digit ts
assert isinstance(int_ts, int) == True
if "original" in key:
assert "http" in val
except Exception as e:
e = "Filter '%s' not following the cdx filter syntax." % f
raise WaybackError(e)
def _cleaned_url(url):
"""
Remove leading/trailing whitespace and newlines and
replace " " with "%20".
"""
return str(url).strip().replace(" ", "%20")
def _url_check(url):
"""
Check for common URL problems.
What we are checking:
1) the URL must contain at least one '.'; URLs without a '.' are rejected.
If you know any other checks worth adding, please create a PR on the GitHub repo.
"""
if "." not in url:
raise URLError("'%s' is not a vaild URL." % url)
def _full_url(endpoint, params):
full_url = endpoint
if params:
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
for key, val in params.items():
key = "filter" if key.startswith("filter") else key
key = "collapse" if key.startswith("collapse") else key
amp = "" if full_url.endswith("?") else "&"
full_url = full_url + amp + "%s=%s" % (key, quote(str(val)))
return full_url
def _get_total_pages(url, user_agent):
"""
If showNumPages is passed to the CDX API, it returns
the 'number of archive pages', and each page contains many archives.
This function returns the number of pages of archives (type int).
"""
total_pages_url = (
"https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url
)
headers = {"User-Agent": user_agent}
return int((_get_response(total_pages_url, headers=headers).text).strip())
def _archive_url_parser(header, url):
"""
The Wayback Machine's save API does not
return a JSON response; we are required
to read the headers of the API response
and look for the archive URL.
This method has a few regexes
that search for the archive URL in the header.
It is used when you try to
save a webpage on the Wayback Machine.
Two cases are possible:
1) Either we find the archive url in
the header.
2) Or we didn't find the archive url in
API header.
If we found the archive URL we return it.
And if we couldn't find it, we raise
WaybackError with an error message.
"""
# Regex1
m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
if m:
return "web.archive.org" + m.group(1)
# Regex2
m = re.search(
r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
)
if m:
return m.group(1)
# Regex3
m = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
if m:
return m.group(1)
raise WaybackError(
"No archive URL found in the API response. "
"If '%s' can be accessed via your web browser then either "
"this version of waybackpy (%s) is out of date or WayBack Machine is malfunctioning. Visit "
"'https://github.com/akamhy/waybackpy' for the latest version "
"of waybackpy.\nHeader:\n%s" % (url, __version__, str(header))
)
def _wayback_timestamp(**kwargs):
"""
Wayback Machine archive URLs
have a timestamp in them.
The standard archive URL format is
https://web.archive.org/web/20191214041711/https://www.youtube.com
If we break it down into three parts:
1 ) The start (https://web.archive.org/web/)
2 ) timestamp (20191214041711)
3 ) https://www.youtube.com, the original URL
The near method takes year, month, day, hour and minute
as arguments of type int.
This method takes those integers, converts them to a
Wayback Machine timestamp, and returns it as a string.
"""
return "".join(
str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
)
def _get_response(
endpoint, params=None, headers=None, retries=5, return_full_url=False
):
"""
This function is used to make GET requests.
We use the requests package to make the
requests.
We retry five times, and if it still fails we raise a
WaybackError exception.
You can handle WaybackError by importing:
from waybackpy.exceptions import WaybackError
try:
...
except WaybackError as e:
# handle it
"""
# From https://stackoverflow.com/a/35504626
# By https://stackoverflow.com/users/401467/datashaman
s = requests.Session()
retries = Retry(
total=retries, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504]
)
s.mount("https://", HTTPAdapter(max_retries=retries))
url = _full_url(endpoint, params)
try:
if not return_full_url:
return s.get(url, headers=headers)
return (url, s.get(url, headers=headers))
except Exception as e:
exc = WaybackError("Error while retrieving %s" % url)
exc.__cause__ = e
raise exc
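Two of the helpers above are easy to sanity-check interactively; the outputs shown as comments follow from the implementations in this file (the parameter values are arbitrary):

from waybackpy.utils import _full_url, _wayback_timestamp

params = {"url": "akamhy.github.io", "filter0": "statuscode:200", "limit": 5}
print(_full_url("https://web.archive.org/cdx/search/cdx", params))
# https://web.archive.org/cdx/search/cdx?url=akamhy.github.io&filter=statuscode%3A200&limit=5

print(_wayback_timestamp(year=2020, month=1, day=2, hour=3, minute=4))
# 202001020304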

waybackpy/wrapper.py

@ -1,151 +1,24 @@
-import re
import requests
import concurrent.futures
-from urllib3.util.retry import Retry
from datetime import datetime, timedelta
-from requests.adapters import HTTPAdapter
-from waybackpy.__version__ import __version__
-from waybackpy.exceptions import WaybackError, URLError
-default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
+from .exceptions import WaybackError
+from .cdx import Cdx
+from .utils import (
+_archive_url_parser,
+_wayback_timestamp,
+_get_response,
+default_user_agent,
+_url_check,
+_cleaned_url,
+_ts,
+)
def _get_total_pages(url, user_agent):
"""
If showNumPages is passed in cdx API, it returns
'number of archive pages'and each page has many archives.
This func returns number of pages of archives (type int).
"""
total_pages_url = (
"https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url
)
headers = {"User-Agent": user_agent}
return int((_get_response(total_pages_url, headers=headers).text).strip())
def _archive_url_parser(header, url):
"""
The wayback machine's save API doesn't
return JSON response, we are required
to read the header of the API response
and look for the archive URL.
This method has some regexen (or regexes)
that search for archive url in header.
This method is used when you try to
save a webpage on wayback machine.
Two cases are possible:
1) Either we find the archive url in
the header.
2) Or we didn't find the archive url in
API header.
If we found the archive URL we return it.
And if we couldn't find it, we raise
WaybackError with an error message.
"""
# Regex1
m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
if m:
return "web.archive.org" + m.group(1)
# Regex2
m = re.search(
r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
)
if m:
return m.group(1)
# Regex3
m = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
if m:
return m.group(1)
raise WaybackError(
"No archive URL found in the API response. "
"If '%s' can be accessed via your web browser then either "
"this version of waybackpy (%s) is out of date or WayBack Machine is malfunctioning. Visit "
"'https://github.com/akamhy/waybackpy' for the latest version "
"of waybackpy.\nHeader:\n%s" % (url, __version__, str(header))
)
def _wayback_timestamp(**kwargs):
"""
Wayback Machine archive URLs
have a timestamp in them.
The standard archive URL format is
https://web.archive.org/web/20191214041711/https://www.youtube.com
If we break it down in three parts:
1 ) The start (https://web.archive.org/web/)
2 ) timestamp (20191214041711)
3 ) https://www.youtube.com, the original URL
The near method takes year, month, day, hour and minute
as Arguments, their type is int.
This method takes those integers and converts it to
wayback machine timestamp and returns it.
Return format is string.
"""
return "".join(
str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
)
def _get_response(endpoint, params=None, headers=None, retries=5):
"""
This function is used make get request.
We use the requests package to make the
requests.
We try five times and if it fails it raises
WaybackError exception.
You can handles WaybackError by importing:
from waybackpy.exceptions import WaybackError
try:
...
except WaybackError as e:
# handle it
"""
# From https://stackoverflow.com/a/35504626
# By https://stackoverflow.com/users/401467/datashaman
s = requests.Session()
retries = Retry(total=retries, backoff_factor=0.5, status_forcelist=[ 500, 502, 503, 504 ])
s.mount('https://', HTTPAdapter(max_retries=retries))
try:
return s.get(endpoint, params=params, headers=headers)
except Exception as e:
exc = WaybackError("Error while retrieving %s" % endpoint)
exc.__cause__ = e
raise exc
class Url:
-"""
-waybackpy Url class, Type : <class 'waybackpy.wrapper.Url'>
-"""
def __init__(self, url, user_agent=default_user_agent):
self.url = url
self.user_agent = str(user_agent)
-self._url_check()
+_url_check(self.url)
self._archive_url = None
self.timestamp = None
self._JSON = None
@ -197,18 +70,6 @@ class Url:
return (datetime.utcnow() - self.timestamp).days
def _url_check(self):
"""
Check for common URL problems.
What we are checking:
1) '.' in self.url, no url that ain't '.' in it.
If you known any others, please create a PR on the github repo.
"""
if "." not in self.url:
raise URLError("'%s' is not a vaild URL." % self.url)
@property
def JSON(self):
"""
@ -225,7 +86,7 @@ class Url:
endpoint = "https://archive.org/wayback/available" endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
payload = {"url": "%s" % self._cleaned_url()} payload = {"url": "%s" % _cleaned_url(self.url)}
response = _get_response(endpoint, params=payload, headers=headers) response = _get_response(endpoint, params=payload, headers=headers)
return response.json() return response.json()
@ -256,37 +117,8 @@ class Url:
@property
def _timestamp(self):
+self.timestamp = _ts(self.timestamp, self.JSON)
+return self.timestamp
-"""
-Get timestamp of last fetched archive.
If used before fetching any archive, will
use whatever self.JSON returns.
self.timestamp is None implies that
self.JSON will return any archive's JSON
that wayback machine provides it.
"""
if self.timestamp:
return self.timestamp
data = self.JSON
if not data["archived_snapshots"]:
ts = datetime.max
else:
ts = datetime.strptime(
data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
)
self.timestamp = ts
return ts
def _cleaned_url(self):
"""
Remove EOL
replace " " with "_"
"""
return str(self.url).strip().replace(" ", "_")
def save(self):
"""
@ -302,7 +134,7 @@ class Url:
_archive_url_parser() parses the archive from the header.
"""
-request_url = "https://web.archive.org/save/" + self._cleaned_url()
+request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
headers = {"User-Agent": self.user_agent}
response = _get_response(request_url, params=None, headers=headers)
self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
@ -317,7 +149,7 @@ class Url:
""" """
if not url: if not url:
url = self._cleaned_url() url = _cleaned_url(self.url)
if not user_agent: if not user_agent:
user_agent = self.user_agent user_agent = self.user_agent
@ -366,14 +198,15 @@ class Url:
endpoint = "https://archive.org/wayback/available" endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
payload = {"url": "%s" % self._cleaned_url(), "timestamp": timestamp} payload = {"url": "%s" % _cleaned_url(self.url), "timestamp": timestamp}
response = _get_response(endpoint, params=payload, headers=headers) response = _get_response(endpoint, params=payload, headers=headers)
data = response.json() data = response.json()
if not data["archived_snapshots"]: if not data["archived_snapshots"]:
raise WaybackError( raise WaybackError(
"Can not find archive for '%s' try later or use wayback.Url(url, user_agent).save() " "Can not find archive for '%s' try later or use wayback.Url(url, user_agent).save() "
"to create a new archive." % self._cleaned_url() "to create a new archive.\nAPI response:\n%s"
% (_cleaned_url(self.url), response.text)
) )
archive_url = data["archived_snapshots"]["closest"]["url"] archive_url = data["archived_snapshots"]["closest"]["url"]
archive_url = archive_url.replace( archive_url = archive_url.replace(
@ -423,17 +256,17 @@ class Url:
""" """
cdx = Cdx( cdx = Cdx(
self._cleaned_url(), _cleaned_url(self.url),
user_agent=self.user_agent, user_agent=self.user_agent,
start_timestamp=start_timestamp, start_timestamp=start_timestamp,
end_timestamp=end_timestamp, end_timestamp=end_timestamp,
) )
i = 0 i = 0
for _ in cdx.snapshots(): for _ in cdx.snapshots():
i += 1 i = i + 1
return i return i
def live_urls_picker(self, url): def live_urls_finder(self, url):
""" """
This method is used to check if supplied url This method is used to check if supplied url
is >= 400. is >= 400.
@ -465,9 +298,9 @@ class Url:
url_list = []
if subdomain:
-url = "*.%s/*" % self._cleaned_url()
+url = "*.%s/*" % _cleaned_url(self.url)
else:
-url = "%s/*" % self._cleaned_url()
+url = "%s/*" % _cleaned_url(self.url)
cdx = Cdx(
url,
@ -486,99 +319,7 @@ class Url:
# Remove all deadURLs from url_list if alive=True
if alive:
with concurrent.futures.ThreadPoolExecutor() as executor:
-executor.map(self.live_urls_picker, url_list)
+executor.map(self.live_urls_finder, url_list)
url_list = self._alive_url_list
return url_list
class CdxSnapshot:
"""
This class helps to handle the Cdx Snapshots easily.
What the raw data looks like:
org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
"""
def __init__(
self, urlkey, timestamp, original, mimetype, statuscode, digest, length
):
self.urlkey = urlkey # Useless
self.timestamp = timestamp
self.original = original
self.mimetype = mimetype
self.statuscode = statuscode
self.digest = digest
self.length = length
self.archive_url = "https://web.archive.org/web/%s/%s" % (
self.timestamp,
self.original,
)
def __str__(self):
return self.archive_url
class Cdx:
"""
waybackpy Cdx class, Type : <class 'waybackpy.wrapper.Cdx'>
Cdx keys are :
urlkey
timestamp
original
mimetype
statuscode
digest
length
"""
def __init__(
self,
url,
user_agent=default_user_agent,
start_timestamp=None,
end_timestamp=None,
):
self.url = url
self.user_agent = str(user_agent)
self.start_timestamp = str(start_timestamp) if start_timestamp else None
self.end_timestamp = str(end_timestamp) if end_timestamp else None
def snapshots(self):
"""
This function yeilds snapshots encapsulated
in CdxSnapshot for more usability.
"""
payload = {}
endpoint = "https://web.archive.org/cdx/search/cdx"
total_pages = _get_total_pages(self.url, self.user_agent)
headers = {"User-Agent": self.user_agent}
if self.start_timestamp:
payload["from"] = self.start_timestamp
if self.end_timestamp:
payload["to"] = self.end_timestamp
payload["url"] = self.url
for i in range(total_pages):
payload["page"] = str(i)
res = _get_response(endpoint, params=payload, headers=headers)
text = res.text
if text.isspace() or len(text) <= 1 or not text:
break
snapshot_list = text.split("\n")
for snapshot in snapshot_list:
if len(snapshot) < 15:
continue
(
urlkey,
timestamp,
original,
mimetype,
statuscode,
digest,
length,
) = snapshot.split(" ")
yield CdxSnapshot(
urlkey, timestamp, original, mimetype, statuscode, digest, length
)
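With the in-wrapper CdxSnapshot and Cdx classes above removed (they now live in waybackpy.snapshot and waybackpy.cdx), the Url class delegates to those modules. A minimal usage sketch of the refactored interface, consistent with the tests in this commit (the User-Agent string is arbitrary):

from waybackpy.wrapper import Url, Cdx

user_agent = "waybackpy example UA"
target = Url("akamhy.github.io", user_agent)
print(target.total_archives())     # counted by iterating Cdx(...).snapshots()
print(target.known_urls()[:3])     # built from a prefix CDX query ("<url>/*")

# the same Cdx class is re-exported by the wrapper module
for snapshot in Cdx("akamhy.github.io", user_agent, limit=5).snapshots():
    print(snapshot.archive_url)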