215 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			215 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import random
 | |
| import string
 | |
| 
 | |
| import pytest
 | |
| 
 | |
| from waybackpy.cdx_api import WaybackMachineCDXServerAPI
 | |
| from waybackpy.exceptions import NoCDXRecordFound
 | |
| 
 | |
| 
 | |
def rndstr(n: int) -> str:
    """Return a random string of ``n`` uppercase ASCII letters and digits.

    The value is only used as throwaway test data, so the non-cryptographic
    ``random`` module is sufficient here.

    Args:
        n: Number of characters to generate; ``0`` yields an empty string.

    Returns:
        A string of length ``n`` drawn from ``A-Z`` and ``0-9``.
    """
    # Build the alphabet once instead of re-concatenating it per character.
    alphabet = string.ascii_uppercase + string.digits
    return "".join(random.choice(alphabet) for _ in range(n))
 | |
| 
 | |
| 
 | |
def test_a() -> None:
    """Check a time-bounded, prefix-matched, urlkey-collapsed CDX query.

    The query is limited to ``201001``-``201002``, so every snapshot that is
    yielded must carry a timestamp starting with ``2010``.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    url = "https://twitter.com/jack"

    # Time-bounded prefix matching combined with urlkey-based collapsing.
    wayback = WaybackMachineCDXServerAPI(
        url=url,
        user_agent=user_agent,
        match_type="prefix",
        collapses=["urlkey"],
        start_timestamp="201001",
        end_timestamp="201002",
    )

    # snapshots() hands back a generator; records are checked as they arrive.
    for snapshot in wayback.snapshots():
        assert snapshot.timestamp.startswith("2010"), snapshot.timestamp
 | |
| 
 | |
| 
 | |
def test_b() -> None:
    """Check a time-bounded, urlkey-collapsed CDX query for 2021.

    The query is limited to ``202101``-``202112``, so every snapshot that is
    yielded must carry a timestamp starting with ``2021``.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    url = "https://www.google.com"

    # Time-bounded query with urlkey-based collapsing.  Unlike test_a, no
    # match_type is passed here, so prefix matching is not requested.
    wayback = WaybackMachineCDXServerAPI(
        url=url,
        user_agent=user_agent,
        start_timestamp="202101",
        end_timestamp="202112",
        collapses=["urlkey"],
    )

    # snapshots() hands back a generator; records are checked as they arrive.
    for snapshot in wayback.snapshots():
        assert snapshot.timestamp.startswith("2021"), snapshot.timestamp
 | |
| 
 | |
| 
 | |
def test_c() -> None:
    """Check that a ``sort="closest"`` query with ``limit="1"`` hits 2010-10-10.

    Only the first yielded snapshot is inspected: its archive URL must mention
    ``google.com`` and its timestamp must contain ``20101010``.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    url = "https://www.google.com"

    cdx = WaybackMachineCDXServerAPI(
        url=url,
        user_agent=user_agent,
        closest="201010101010",
        sort="closest",
        limit="1",
    )

    # Take just the first record.  Using a default makes an empty result fail
    # with a clear assertion instead of an unbound-variable error.
    first = next(iter(cdx.snapshots()), None)
    assert first is not None, "CDX query yielded no snapshots"

    # A membership test is required here: str.find() returns -1 (truthy) on a
    # miss and 0 (falsy) on a match at the start, so asserting on it directly
    # checks the wrong thing.
    assert "google.com" in str(first.archive_url)
    assert "20101010" in first.timestamp
 | |
| 
 | |
| 
 | |
def test_d() -> None:
    """Check a paginated, prefix-matched query filtered to status code 200.

    Every yielded snapshot's archive URL must mention ``akamhy.github.io``,
    and more than 50 snapshots must be yielded in total.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="akamhy.github.io",
        user_agent=user_agent,
        match_type="prefix",
        use_pagination=True,
        filters=["statuscode:200"],
    )

    count = 0  # stays 0 when nothing is yielded, so the final check still fails
    for count, snapshot in enumerate(cdx.snapshots(), start=1):
        # Membership test rather than str.find(): find() returns -1 (truthy)
        # when the substring is absent, so asserting on it never fails a miss.
        assert "akamhy.github.io" in str(snapshot.archive_url)
    assert count > 50
 | |
| 
 | |
| 
 | |
def test_oldest() -> None:
    """Check that ``oldest()`` for google.com returns a 1998 google.com record."""
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=user_agent,
        filters=["statuscode:200"],
    )
    oldest = cdx.oldest()

    assert "1998" in oldest.timestamp, oldest.timestamp
    assert "google" in oldest.urlkey
    assert "google.com" in oldest.original
    assert "google.com" in oldest.archive_url
 | |
| 
 | |
| 
 | |
def test_newest() -> None:
    """Check that ``newest()`` for google.com returns a google.com record."""
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=user_agent,
        filters=["statuscode:200"],
    )
    newest = cdx.newest()

    # No timestamp check here: the newest capture changes over time.
    assert "google" in newest.urlkey
    assert "google.com" in newest.original
    assert "google.com" in newest.archive_url
 | |
| 
 | |
| 
 | |
def test_near() -> None:
    """Check ``near()`` with each way of specifying the target time.

    The same moment (2010-10-10 10:10) is passed as date components, as a
    Wayback Machine timestamp string and as a Unix timestamp; each lookup must
    return a google.com record whose timestamp contains ``2010101010``.
    Finally, a lookup for a randomly generated host must raise
    ``NoCDXRecordFound``.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=user_agent,
        filters=["statuscode:200"],
    )

    # Three spellings of the same target time, checked with identical asserts.
    lookups = (
        {"year": 2010, "month": 10, "day": 10, "hour": 10, "minute": 10},
        {"wayback_machine_timestamp": "201010101010"},
        {"unix_timestamp": 1286705410},
    )
    for kwargs in lookups:
        near = cdx.near(**kwargs)
        assert "2010101010" in near.timestamp, (kwargs, near.timestamp)
        assert "google" in near.urlkey
        assert "google.com" in near.original
        assert "google.com" in near.archive_url

    # A random 30-character host is not expected to have any capture.
    dne_url = f"https://{rndstr(30)}.in"
    missing = WaybackMachineCDXServerAPI(
        url=dne_url,
        user_agent=user_agent,
        filters=["statuscode:200"],
    )
    # Keep only the call under test inside the raises block so an unrelated
    # failure during setup cannot satisfy the expectation.
    with pytest.raises(NoCDXRecordFound):
        missing.near(unix_timestamp=1286705410)
 | |
| 
 | |
| 
 | |
def test_before() -> None:
    """Check that ``before()`` returns the capture preceding the given time.

    For ``20160731235949`` the expected google.com capture has the timestamp
    ``20160731233347``.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=user_agent,
        filters=["statuscode:200"],
    )
    before = cdx.before(wayback_machine_timestamp=20160731235949)

    # Include the actual timestamp in the failure output, as test_after does.
    assert "20160731233347" in before.timestamp, before.timestamp
    assert "google" in before.urlkey
    assert "google.com" in before.original
    assert "google.com" in before.archive_url
 | |
| 
 | |
| 
 | |
def test_after() -> None:
    """Check that ``after()`` returns the capture following the given time.

    For ``20160731235949`` the expected google.com capture has the timestamp
    ``20160801000917``.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )

    cdx = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=user_agent,
        filters=["statuscode:200"],
    )
    after = cdx.after(wayback_machine_timestamp=20160731235949)

    # The actual timestamp is shown on failure to ease diagnosis.
    assert "20160801000917" in after.timestamp, after.timestamp
    assert "google" in after.urlkey
    assert "google.com" in after.original
    assert "google.com" in after.archive_url
 |