Compare commits
223 Commits
Author | SHA1 | Date | |
---|---|---|---|
de5a3e1561 | |||
52e46fecc2 | |||
3b6415abc7 | |||
66e16d6d89 | |||
16b9bdd7f9 | |||
7adc01bff2 | |||
9bbd056268 | |||
2ab44391cf | |||
cc3628ae18 | |||
1d751b942b | |||
261a867a21 | |||
2e487e88d3 | |||
c8d0ad493a | |||
ce869177fd | |||
58616fb986 | |||
4e68cd5743 | |||
a7b805292d | |||
6dc6124dc4 | |||
5a7fc7d568 | |||
5a9c861cad | |||
dd1917c77e | |||
db8f902cff | |||
88cda94c0b | |||
09290f88d1 | |||
e5835091c9 | |||
7312ed1f4f | |||
6ae8f843d3 | |||
36b936820b | |||
a3bc6aad2b | |||
edc2f63d93 | |||
ffe0810b12 | |||
40233eb115 | |||
d549d31421 | |||
0725163af8 | |||
712471176b | |||
dcd7b03302 | |||
76205d9cf6 | |||
ec0a0d04cc | |||
7bb01df846 | |||
6142e0b353 | |||
a65990aee3 | |||
259a024eb1 | |||
91402792e6 | |||
eabf4dc046 | |||
5a7bd73565 | |||
4693dbf9c1 | |||
f4f2e51315 | |||
d6b7df6837 | |||
dafba5d0cb | |||
6c71dfbe41 | |||
a6470b1036 | |||
04cda4558e | |||
625ed63482 | |||
a03813315f | |||
a2550f17d7 | |||
15ef5816db | |||
93b52bd0fe | |||
28ff877081 | |||
3e3ecff9df | |||
ce64135ba8 | |||
2af6580ffb | |||
8a3c515176 | |||
d98c4f32ad | |||
e0a4b007d5 | |||
6fb6b2deee | |||
1882862992 | |||
0c6107e675 | |||
bd079978bf | |||
5dec4927cd | |||
62e5217b9e | |||
9823c809e9 | |||
db5737a857 | |||
ca0821a466 | |||
bb4dbc7d3c | |||
7c7fd75376 | |||
0b71433667 | |||
1b499a7594 | |||
da390ee8a3 | |||
d3e68d0e70 | |||
fde28d57aa | |||
6092e504c8 | |||
93ef60ecd2 | |||
461b3f74c9 | |||
3c53b411b0 | |||
8125526061 | |||
2dc81569a8 | |||
fd163f3d36 | |||
a0a918cf0d | |||
4943cf6873 | |||
bc3efc7d63 | |||
f89368f16d | |||
c919a6a605 | |||
0280fca189 | |||
60ee8b95a8 | |||
ca51c14332 | |||
525cf17c6f | |||
406e03c52f | |||
672b33e83a | |||
b19b840628 | |||
a6df4f899c | |||
7686e9c20d | |||
3c5932bc39 | |||
f9a986f489 | |||
0d7458ee90 | |||
ac8b9d6a50 | |||
58cd9c28e7 | |||
5088305a58 | |||
9f847a5e55 | |||
6c04c2f3d3 | |||
925be7b17e | |||
2b132456ac | |||
50e3154a4e | |||
7aef50428f | |||
d8ec0f5025 | |||
0a2f97c034 | |||
3e9cf23578 | |||
7f927ec7be | |||
9de6393cd5 | |||
91e7f65617 | |||
d465454019 | |||
1a81eb97fb | |||
6b3b2e2a7d | |||
82c65454e6 | |||
19710461b6 | |||
a3661d6b85 | |||
58375e4ef4 | |||
ea023e98da | |||
f1065ed1c8 | |||
315519b21f | |||
07c98661de | |||
2cd991a54e | |||
ede251afb3 | |||
a8ce970ca0 | |||
243af26bf6 | |||
0f1db94884 | |||
c304f58ea2 | |||
23f7222cb5 | |||
ce7294d990 | |||
c9fa114d2e | |||
8b6bacb28e | |||
32d8ad7780 | |||
cbf2f90faa | |||
4dde3e3134 | |||
1551e8f1c6 | |||
c84f09e2d2 | |||
57a32669b5 | |||
fe017cbcc8 | |||
5edb03d24b | |||
c5de2232ba | |||
ca9186c301 | |||
8a4b631c13 | |||
ec9ce92f48 | |||
e95d35c37f | |||
36d662b961 | |||
2835f8877e | |||
18cbd2fd30 | |||
a2812fb56f | |||
77effcf649 | |||
7272ef45a0 | |||
56116551ac | |||
4dcda94cb0 | |||
09f59b0182 | |||
ed24184b99 | |||
56bef064b1 | |||
44bb2cf5e4 | |||
e231228721 | |||
b8b2d6dfa9 | |||
3eca6294df | |||
eb037a0284 | |||
a01821f20b | |||
b21036f8df | |||
b43bacb7ac | |||
f7313b255a | |||
7457e1c793 | |||
f7493d823f | |||
7fa7b59ce3 | |||
78a608db50 | |||
93f7dfdaf9 | |||
83c6f256c9 | |||
dee9105794 | |||
3bfc3b46d0 | |||
553f150bee | |||
b3a7e714a5 | |||
cd9841713c | |||
1ea9548d46 | |||
be7642c837 | |||
a418a4e464 | |||
aec035ef1e | |||
6d37993ab9 | |||
72b80ca44e | |||
c10aa9279c | |||
68d809a7d6 | |||
4ad09a419b | |||
ddc6620f09 | |||
4066a65678 | |||
8e46a9ba7a | |||
a5a98b9b00 | |||
a721ab7d6c | |||
7db27ae5e1 | |||
8fd4462025 | |||
c458a15820 | |||
bae3412bee | |||
94cb08bb37 | |||
af888db13e | |||
d24f2408ee | |||
ddd2274015 | |||
99abdb7c67 | |||
f3bb9a8540 | |||
bb94e0d1c5 | |||
1a78d88be2 | |||
3ec61758b3 | |||
83c962166d | |||
e87dee3bdf | |||
b27bfff15a | |||
970fc1cd08 | |||
65391bf14b | |||
8ab116f276 | |||
6f82041ec9 | |||
11059c960e | |||
eee1b8eba1 | |||
f7de8f5575 | |||
3fa0c32064 | |||
aa1e3b8825 |
31
.github/workflows/python-publish.yml
vendored
Normal file
31
.github/workflows/python-publish.yml
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
# This workflows will upload a Python Package using Twine when a release is created
|
||||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
||||
|
||||
name: Upload Python Package
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [created]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.x'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install setuptools wheel twine
|
||||
- name: Build and publish
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||
run: |
|
||||
python setup.py sdist bdist_wheel
|
||||
twine upload dist/*
|
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,3 +1,6 @@
|
||||
# Files generated while testing
|
||||
*-urls-*.txt
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
14
.travis.yml
14
.travis.yml
@ -1,14 +0,0 @@
|
||||
language: python
|
||||
python:
|
||||
- "2.7"
|
||||
- "3.6"
|
||||
- "3.8"
|
||||
os: linux
|
||||
dist: xenial
|
||||
cache: pip
|
||||
install:
|
||||
- pip install pytest
|
||||
before_script:
|
||||
cd tests
|
||||
script:
|
||||
- pytest test_1.py
|
@ -1,6 +1,10 @@
|
||||
{
|
||||
"scanSettings": {
|
||||
"baseBranches": []
|
||||
},
|
||||
"checkRunSettings": {
|
||||
"vulnerableCheckRunConclusionLevel": "failure"
|
||||
"vulnerableCheckRunConclusionLevel": "failure",
|
||||
"displayMode": "diff"
|
||||
},
|
||||
"issueSettings": {
|
||||
"minSeverityLevel": "LOW"
|
||||
|
10
CONTRIBUTORS.md
Normal file
10
CONTRIBUTORS.md
Normal file
@ -0,0 +1,10 @@
|
||||
## AUTHORS
|
||||
- akamhy (<https://github.com/akamhy>)
|
||||
- danvalen1 (<https://github.com/danvalen1>)
|
||||
- AntiCompositeNumber (<https://github.com/AntiCompositeNumber>)
|
||||
- jonasjancarik (<https://github.com/jonasjancarik>)
|
||||
|
||||
## ACKNOWLEDGEMENTS
|
||||
- mhmdiaa (<https://github.com/mhmdiaa>) for <https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050>. known_urls is based on this gist.
|
||||
- datashaman (<https://stackoverflow.com/users/401467/datashaman>) for <https://stackoverflow.com/a/35504626>. _get_response is based on this amazing answer.
|
||||
- dequeued0 (<https://github.com/dequeued0>) for reporting bugs and useful feature requests.
|
2
LICENSE
2
LICENSE
@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2020 akamhy
|
||||
Copyright (c) 2020 waybackpy contributors ( https://github.com/akamhy/waybackpy/graphs/contributors )
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
215
README.md
215
README.md
@ -1,142 +1,129 @@
|
||||
# waybackpy
|
||||
<div align="center">
|
||||
|
||||
[](https://travis-ci.org/akamhy/waybackpy)
|
||||
[](https://pypistats.org/packages/waybackpy)
|
||||
[](https://github.com/akamhy/waybackpy/releases)
|
||||
[](https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade)
|
||||
[](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
|
||||
[](https://codeclimate.com/github/akamhy/waybackpy/maintainability)
|
||||
[](https://www.codefactor.io/repository/github/akamhy/waybackpy)
|
||||
[](https://www.python.org/)
|
||||

|
||||

|
||||
[](https://github.com/akamhy/waybackpy/graphs/commit-activity)
|
||||
[](https://codecov.io/gh/akamhy/waybackpy)
|
||||

|
||||

|
||||
<img src="https://raw.githubusercontent.com/akamhy/waybackpy/master/assets/waybackpy_logo.svg"><br>
|
||||
|
||||
<h3>Python package & CLI tool that interfaces with the Wayback Machine API</h3>
|
||||
|
||||
</div>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://pypi.org/project/waybackpy/"><img alt="pypi" src="https://img.shields.io/pypi/v/waybackpy.svg"></a>
|
||||
<a href="https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md"><img alt="Contributions Welcome" src="https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square"></a>
|
||||
<a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a>
|
||||
<a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub lastest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a>
|
||||
<a href="#"><img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square"></a>
|
||||
</p>
|
||||
|
||||
-----------------------------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
## ⭐️ Introduction
|
||||
Waybackpy is a [Python package](https://www.udacity.com/blog/2021/01/what-is-a-python-package.html) and a CLI tool that interfaces with the Wayback Machine API.
|
||||
|
||||
Wayback Machine has 3 client side APIs.
|
||||
|
||||
- Save API
|
||||
- Availability API
|
||||
- CDX API
|
||||
|
||||
All three of these can be accessed by waybackpy.
|
||||
|
||||
|
||||

|
||||

|
||||
### 🏗 Installation
|
||||
|
||||
Waybackpy is a Python library that interfaces with the [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine) API. Archive pages and retrieve archived pages easily.
|
||||
|
||||
Table of contents
|
||||
=================
|
||||
<!--ts-->
|
||||
|
||||
* [Installation](#installation)
|
||||
|
||||
* [Usage](#usage)
|
||||
* [Saving an url using save()](#capturing-aka-saving-an-url-using-save)
|
||||
* [Receiving the oldest archive for an URL Using oldest()](#receiving-the-oldest-archive-for-an-url-using-oldest)
|
||||
* [Receiving the recent most/newest archive for an URL using newest()](#receiving-the-newest-archive-for-an-url-using-newest)
|
||||
* [Receiving archive close to a specified year, month, day, hour, and minute using near()](#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
|
||||
* [Get the content of webpage using get()](#get-the-content-of-webpage-using-get)
|
||||
* [Count total archives for an URL using total_archives()](#count-total-archives-for-an-url-using-total_archives)
|
||||
|
||||
|
||||
* [Tests](#tests)
|
||||
|
||||
* [Dependency](#dependency)
|
||||
|
||||
* [License](#license)
|
||||
|
||||
<!--te-->
|
||||
|
||||
## Installation
|
||||
Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)):
|
||||
|
||||
```bash
|
||||
pip install waybackpy
|
||||
```
|
||||
|
||||
Install directly from GitHub:
|
||||
|
||||
## Usage
|
||||
|
||||
#### Capturing aka Saving an url Using save()
|
||||
```python
|
||||
import waybackpy
|
||||
# Capturing a new archive on Wayback machine.
|
||||
target_url = waybackpy.Url("https://github.com/akamhy/waybackpy", user_agnet="My-cool-user-agent")
|
||||
archived_url = target_url.save()
|
||||
print(archived_url)
|
||||
```bash
|
||||
pip install git+https://github.com/akamhy/waybackpy.git
|
||||
```
|
||||
This should print an URL similar to the following archived URL:
|
||||
|
||||
> <https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy>
|
||||
### Docker Image
|
||||
Docker Hub : <https://hub.docker.com/r/secsi/waybackpy>
|
||||
|
||||
Docker image is automatically updated on every release by [Regulary and Automatically Updated Docker Images](https://github.com/cybersecsi/RAUDI) (RAUDI).
|
||||
|
||||
RAUDI is a tool by SecSI (<https://secsi.io>), an Italian cybersecurity startup.
|
||||
|
||||
|
||||
#### Receiving the oldest archive for an URL Using oldest()
|
||||
|
||||
### Usage
|
||||
|
||||
#### As a Python package
|
||||
|
||||
##### Save API aka SavePageNow
|
||||
```python
|
||||
import waybackpy
|
||||
# retrieving the oldest archive on Wayback machine.
|
||||
target_url = waybackpy.Url("https://www.google.com/", "My-cool-user-agent")
|
||||
oldest_archive = target_url.oldest()
|
||||
print(oldest_archive)
|
||||
>>> from waybackpy import WaybackMachineSaveAPI
|
||||
>>> url = "https://github.com"
|
||||
>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
>>>
|
||||
>>> save_api = WaybackMachineSaveAPI(url, user_agent)
|
||||
>>> save_api.save()
|
||||
https://web.archive.org/web/20220118125249/https://github.com/
|
||||
>>> save_api.cached_save
|
||||
False
|
||||
>>> save_api.timestamp()
|
||||
datetime.datetime(2022, 1, 18, 12, 52, 49)
|
||||
```
|
||||
This should print the oldest available archive for <https://google.com>.
|
||||
|
||||
> <http://web.archive.org/web/19981111184551/http://google.com:80/>
|
||||
|
||||
|
||||
#### Receiving the newest archive for an URL using newest()
|
||||
##### Availability API
|
||||
```python
|
||||
import waybackpy
|
||||
# retrieving the newest/latest archive on Wayback machine.
|
||||
target_url = waybackpy.Url(url="https://www.google.com/", user_agnet="My-cool-user-agent")
|
||||
newest_archive = target_url.newest()
|
||||
print(newest_archive)
|
||||
>>> from waybackpy import WaybackMachineAvailabilityAPI
|
||||
>>>
|
||||
>>> url = "https://google.com"
|
||||
>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
>>>
|
||||
>>> availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
|
||||
>>>
|
||||
>>> availability_api.oldest()
|
||||
https://web.archive.org/web/19981111184551/http://google.com:80/
|
||||
>>>
|
||||
>>> availability_api.newest()
|
||||
https://web.archive.org/web/20220118150444/https://www.google.com/
|
||||
>>>
|
||||
>>> availability_api.near(year=2010, month=10, day=10, hour=10)
|
||||
https://web.archive.org/web/20101010101708/http://www.google.com/
|
||||
```
|
||||
This print the newest available archive for <https://www.microsoft.com/en-us>, something just like this:
|
||||
|
||||
> <http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/>
|
||||
|
||||
|
||||
#### Receiving archive close to a specified year, month, day, hour, and minute using near()
|
||||
##### CDX API aka CDXServerAPI
|
||||
```python
|
||||
import waybackpy
|
||||
# retriving the the closest archive from a specified year.
|
||||
# supported argumnets are year,month,day,hour and minute
|
||||
target_url = waybackpy.Url(https://www.facebook.com/", "Any-User-Agent")
|
||||
archive_near_year = target_url.near(year=2010)
|
||||
print(archive_near_year)
|
||||
>>> from waybackpy import WaybackMachineCDXServerAPI
|
||||
>>> url = "https://pypi.org"
|
||||
>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
>>> cdx = WaybackMachineCDXServerAPI(url, user_agent, start_timestamp=2016, end_timestamp=2017)
|
||||
>>> for item in cdx.snapshots():
|
||||
... print(item.archive_url)
|
||||
...
|
||||
https://web.archive.org/web/20160110011047/http://pypi.org/
|
||||
https://web.archive.org/web/20160305104847/http://pypi.org/
|
||||
.
|
||||
. # URLS REDACTED FOR READABILITY
|
||||
.
|
||||
https://web.archive.org/web/20171127171549/https://pypi.org/
|
||||
https://web.archive.org/web/20171206002737/http://pypi.org:80/
|
||||
```
|
||||
returns : <http://web.archive.org/web/20100504071154/http://www.facebook.com/>
|
||||
|
||||
> Please note that if you only specify the year, the current month and day are default arguments for month and day respectively. Just putting the year parameter would not return the archive closer to January but the current month you are using the package. You need to specify the month "1" for January , 2 for february and so on.
|
||||
|
||||
> Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
|
||||
> Documentation at <https://github.com/akamhy/waybackpy/wiki/Python-package-docs>.
|
||||
|
||||
|
||||
#### Get the content of webpage using get()
|
||||
```python
|
||||
import waybackpy
|
||||
# retriving the webpage from any url including the archived urls. Don't need to import other libraies :)
|
||||
# supported argumnets encoding and user_agent
|
||||
target = waybackpy.Url("google.com", "any-user_agent")
|
||||
oldest_url = target.oldest()
|
||||
webpage = target.get(oldest_url) # We are getting the source of oldest archive of google.com.
|
||||
print(webpage)
|
||||
#### As a CLI tool
|
||||
```bash
|
||||
$ waybackpy --save --url "https://en.wikipedia.org/wiki/Social_media" --user_agent "my-unique-user-agent"
|
||||
https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_media
|
||||
|
||||
$ waybackpy --oldest --url "https://en.wikipedia.org/wiki/Humanoid" --user_agent "my-unique-user-agent"
|
||||
https://web.archive.org/web/20040415020811/http://en.wikipedia.org:80/wiki/Humanoid
|
||||
|
||||
$ waybackpy --newest --url "https://en.wikipedia.org/wiki/Remote_sensing" --user_agent "my-unique-user-agent"
|
||||
https://web.archive.org/web/20201221130522/https://en.wikipedia.org/wiki/Remote_sensing
|
||||
```
|
||||
> This should print the source code for oldest archive of google.com. If no URL is passed in get() then it should retrive the source code of google.com and not any archive.
|
||||
> CLI documentation is at <https://github.com/akamhy/waybackpy/wiki/CLI-docs>.
|
||||
|
||||
#### Count total archives for an URL using total_archives()
|
||||
```python
|
||||
from waybackpy import Url
|
||||
# retriving the content of a webpage from any url including but not limited to the archived urls.
|
||||
count = Url("https://en.wikipedia.org/wiki/Python (programming language)", "User-Agent").total_archives()
|
||||
print(count)
|
||||
```
|
||||
> This should print an integer (int), which is the number of total archives on archive.org
|
||||
### 🛡 License
|
||||
[](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
|
||||
|
||||
|
||||
## Tests
|
||||
* [Here](https://github.com/akamhy/waybackpy/tree/master/tests)
|
||||
|
||||
|
||||
## Dependency
|
||||
* None, just python standard libraries (re, json, urllib and datetime). Both python 2 and 3 are supported :)
|
||||
|
||||
|
||||
## License
|
||||
[MIT License](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
|
||||
Released under the MIT License. See [license](https://github.com/akamhy/waybackpy/blob/master/LICENSE) for details.
|
||||
|
@ -1 +1 @@
|
||||
theme: jekyll-theme-cayman
|
||||
theme: jekyll-theme-cayman
|
||||
|
1
assets/waybackpy_logo.svg
Normal file
1
assets/waybackpy_logo.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 176.612 41.908" height="158.392" width="667.51" xmlns:v="https://github.com/akamhy/waybackpy"><text transform="matrix(.862888 0 0 1.158899 -.748 -98.312)" y="110.937" x="0.931" xml:space="preserve" font-weight="bold" font-size="28.149" font-family="sans-serif" letter-spacing="0" word-spacing="0" writing-mode="lr-tb" fill="#003dff"><tspan y="110.937" x="0.931"><tspan y="110.937" x="0.931" letter-spacing="3.568" writing-mode="lr-tb">waybackpy</tspan></tspan></text><path d="M.749 0h153.787v4.864H.749zm22.076 37.418h153.787v4.49H22.825z" fill="navy"/><path d="M0 37.418h22.825v4.49H0zM154.536 0h21.702v4.864h-21.702z" fill="#f0f"/></svg>
|
After Width: | Height: | Size: 694 B |
225
index.rst
225
index.rst
@ -1,225 +0,0 @@
|
||||
waybackpy
|
||||
=========
|
||||
|
||||
|Build Status| |Downloads| |Release| |Codacy Badge| |License: MIT|
|
||||
|Maintainability| |CodeFactor| |made-with-python| |pypi| |PyPI - Python
|
||||
Version| |Maintenance| |codecov| |image1| |contributions welcome|
|
||||
|
||||
.. |Build Status| image:: https://img.shields.io/travis/akamhy/waybackpy.svg?label=Travis%20CI&logo=travis&style=flat-square
|
||||
:target: https://travis-ci.org/akamhy/waybackpy
|
||||
.. |Downloads| image:: https://img.shields.io/pypi/dm/waybackpy.svg
|
||||
:target: https://pypistats.org/packages/waybackpy
|
||||
.. |Release| image:: https://img.shields.io/github/v/release/akamhy/waybackpy.svg
|
||||
:target: https://github.com/akamhy/waybackpy/releases
|
||||
.. |Codacy Badge| image:: https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65
|
||||
:target: https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade
|
||||
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
|
||||
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
|
||||
.. |Maintainability| image:: https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability
|
||||
:target: https://codeclimate.com/github/akamhy/waybackpy/maintainability
|
||||
.. |CodeFactor| image:: https://www.codefactor.io/repository/github/akamhy/waybackpy/badge
|
||||
:target: https://www.codefactor.io/repository/github/akamhy/waybackpy
|
||||
.. |made-with-python| image:: https://img.shields.io/badge/Made%20with-Python-1f425f.svg
|
||||
:target: https://www.python.org/
|
||||
.. |pypi| image:: https://img.shields.io/pypi/v/waybackpy.svg
|
||||
.. |PyPI - Python Version| image:: https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square
|
||||
.. |Maintenance| image:: https://img.shields.io/badge/Maintained%3F-yes-green.svg
|
||||
:target: https://github.com/akamhy/waybackpy/graphs/commit-activity
|
||||
.. |codecov| image:: https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg
|
||||
:target: https://codecov.io/gh/akamhy/waybackpy
|
||||
.. |image1| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
|
||||
.. |contributions welcome| image:: https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square
|
||||
|
||||
|
||||
|Internet Archive| |Wayback Machine|
|
||||
|
||||
Waybackpy is a Python library that interfaces with the `Internet
|
||||
Archive`_\ ’s `Wayback Machine`_ API. Archive pages and retrieve
|
||||
archived pages easily.
|
||||
|
||||
.. _Internet Archive: https://en.wikipedia.org/wiki/Internet_Archive
|
||||
.. _Wayback Machine: https://en.wikipedia.org/wiki/Wayback_Machine
|
||||
|
||||
.. |Internet Archive| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png
|
||||
.. |Wayback Machine| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png
|
||||
|
||||
Table of contents
|
||||
=================
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<!--ts-->
|
||||
|
||||
- `Installation`_
|
||||
|
||||
- `Usage`_
|
||||
|
||||
- `Saving an url using save()`_
|
||||
- `Receiving the oldest archive for an URL Using oldest()`_
|
||||
- `Receiving the recent most/newest archive for an URL using
|
||||
newest()`_
|
||||
- `Receiving archive close to a specified year, month, day, hour,
|
||||
and minute using near()`_
|
||||
- `Get the content of webpage using get()`_
|
||||
- `Count total archives for an URL using total_archives()`_
|
||||
|
||||
- `Tests`_
|
||||
|
||||
- `Dependency`_
|
||||
|
||||
- `License`_
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<!--te-->
|
||||
|
||||
.. _Installation: #installation
|
||||
.. _Usage: #usage
|
||||
.. _Saving an url using save(): #capturing-aka-saving-an-url-using-save
|
||||
.. _Receiving the oldest archive for an URL Using oldest(): #receiving-the-oldest-archive-for-an-url-using-oldest
|
||||
.. _Receiving the recent most/newest archive for an URL using newest(): #receiving-the-newest-archive-for-an-url-using-newest
|
||||
.. _Receiving archive close to a specified year, month, day, hour, and minute using near(): #receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near
|
||||
.. _Get the content of webpage using get(): #get-the-content-of-webpage-using-get
|
||||
.. _Count total archives for an URL using total_archives(): #count-total-archives-for-an-url-using-total_archives
|
||||
.. _Tests: #tests
|
||||
.. _Dependency: #dependency
|
||||
.. _License: #license
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Using `pip`_:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
pip install waybackpy
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
Capturing aka Saving an url Using save()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# Capturing a new archive on Wayback machine.
|
||||
target_url = waybackpy.Url("https://github.com/akamhy/waybackpy", user_agnet="My-cool-user-agent")
|
||||
archived_url = target_url.save()
|
||||
print(archived_url)
|
||||
|
||||
This should print an URL similar to the following archived URL:
|
||||
|
||||
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
|
||||
|
||||
.. _pip: https://en.wikipedia.org/wiki/Pip_(package_manager)
|
||||
|
||||
Receiving the oldest archive for an URL Using oldest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# retrieving the oldest archive on Wayback machine.
|
||||
target_url = waybackpy.Url("https://www.google.com/", "My-cool-user-agent")
|
||||
oldest_archive = target_url.oldest()
|
||||
print(oldest_archive)
|
||||
|
||||
This should print the oldest available archive for https://google.com.
|
||||
|
||||
http://web.archive.org/web/19981111184551/http://google.com:80/
|
||||
|
||||
Receiving the newest archive for an URL using newest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# retrieving the newest/latest archive on Wayback machine.
|
||||
target_url = waybackpy.Url(url="https://www.google.com/", user_agnet="My-cool-user-agent")
|
||||
newest_archive = target_url.newest()
|
||||
print(newest_archive)
|
||||
|
||||
This print the newest available archive for
|
||||
https://www.microsoft.com/en-us, something just like this:
|
||||
|
||||
http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/
|
||||
|
||||
Receiving archive close to a specified year, month, day, hour, and minute using near()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# retriving the the closest archive from a specified year.
|
||||
# supported argumnets are year,month,day,hour and minute
|
||||
target_url = waybackpy.Url(https://www.facebook.com/", "Any-User-Agent")
|
||||
archive_near_year = target_url.near(year=2010)
|
||||
print(archive_near_year)
|
||||
|
||||
returns :
|
||||
http://web.archive.org/web/20100504071154/http://www.facebook.com/
|
||||
|
||||
Please note that if you only specify the year, the current month and
|
||||
day are default arguments for month and day respectively. Just
|
||||
putting the year parameter would not return the archive closer to
|
||||
January but the current month you are using the package. You need to
|
||||
specify the month “1” for January , 2 for february and so on.
|
||||
|
||||
..
|
||||
|
||||
Do not pad (don’t use zeros in the month, year, day, minute, and hour
|
||||
arguments). e.g. For January, set month = 1 and not month = 01.
|
||||
|
||||
Get the content of webpage using get()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# retriving the webpage from any url including the archived urls. Don't need to import other libraies :)
|
||||
# supported argumnets encoding and user_agent
|
||||
target = waybackpy.Url("google.com", "any-user_agent")
|
||||
oldest_url = target.oldest()
|
||||
webpage = target.get(oldest_url) # We are getting the source of oldest archive of google.com.
|
||||
print(webpage)
|
||||
|
||||
..
|
||||
|
||||
This should print the source code for oldest archive of google.com.
|
||||
If no URL is passed in get() then it should retrive the source code
|
||||
of google.com and not any archive.
|
||||
|
||||
Count total archives for an URL using total_archives()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
from waybackpy import Url
|
||||
# retriving the content of a webpage from any url including but not limited to the archived urls.
|
||||
count = Url("https://en.wikipedia.org/wiki/Python (programming language)", "User-Agent").total_archives()
|
||||
print(count)
|
||||
|
||||
..
|
||||
|
||||
This should print an integer (int), which is the number of total
|
||||
archives on archive.org
|
||||
|
||||
Tests
|
||||
-----
|
||||
|
||||
- `Here`_
|
||||
|
||||
Dependency
|
||||
----------
|
||||
|
||||
- None, just python standard libraries (re, json, urllib and datetime).
|
||||
Both python 2 and 3 are supported :)
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
`MIT License`_
|
||||
|
||||
.. _Here: https://github.com/akamhy/waybackpy/tree/master/tests
|
||||
.. _MIT License: https://github.com/akamhy/waybackpy/blob/master/LICENSE
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
click
|
||||
requests
|
@ -1,3 +1,7 @@
|
||||
[metadata]
|
||||
description-file = README.md
|
||||
license_file = LICENSE
|
||||
|
||||
[flake8]
|
||||
max-line-length = 88
|
||||
extend-ignore = E203,W503
|
||||
|
74
setup.py
74
setup.py
@ -1,49 +1,51 @@
|
||||
import os.path
|
||||
from setuptools import setup
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), 'README.md')) as f:
|
||||
with open(os.path.join(os.path.dirname(__file__), "README.md")) as f:
|
||||
long_description = f.read()
|
||||
|
||||
about = {}
|
||||
with open(os.path.join(os.path.dirname(__file__), 'waybackpy', '__version__.py')) as f:
|
||||
with open(os.path.join(os.path.dirname(__file__), "waybackpy", "__version__.py")) as f:
|
||||
exec(f.read(), about)
|
||||
|
||||
|
||||
setup(
|
||||
name = about['__title__'],
|
||||
packages = ['waybackpy'],
|
||||
version = about['__version__'],
|
||||
description = about['__description__'],
|
||||
name=about["__title__"],
|
||||
packages=["waybackpy"],
|
||||
version=about["__version__"],
|
||||
description=about["__description__"],
|
||||
long_description=long_description,
|
||||
long_description_content_type='text/markdown',
|
||||
license= about['__license__'],
|
||||
author = about['__author__'],
|
||||
author_email = about['__author_email__'],
|
||||
url = about['__url__'],
|
||||
download_url = 'https://github.com/akamhy/waybackpy/archive/2.0.0.tar.gz',
|
||||
keywords = ['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
|
||||
install_requires=[],
|
||||
python_requires= ">=2.7",
|
||||
long_description_content_type="text/markdown",
|
||||
license=about["__license__"],
|
||||
author=about["__author__"],
|
||||
author_email=about["__author_email__"],
|
||||
url=about["__url__"],
|
||||
download_url="https://github.com/akamhy/waybackpy/archive/3.0.0.tar.gz",
|
||||
keywords=[
|
||||
"Archive Website",
|
||||
"Wayback Machine",
|
||||
"Internet Archive",
|
||||
],
|
||||
install_requires=["requests", "click"],
|
||||
python_requires=">=3.4",
|
||||
classifiers=[
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Intended Audience :: Developers',
|
||||
'Natural Language :: English',
|
||||
'Topic :: Software Development :: Build Tools',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
'Programming Language :: Python',
|
||||
'Programming Language :: Python :: 2',
|
||||
'Programming Language :: Python :: 2.7',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.2',
|
||||
'Programming Language :: Python :: 3.3',
|
||||
'Programming Language :: Python :: 3.4',
|
||||
'Programming Language :: Python :: 3.5',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: Implementation :: CPython',
|
||||
],
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
"Natural Language :: English",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.4",
|
||||
"Programming Language :: Python :: 3.5",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
],
|
||||
entry_points={"console_scripts": ["waybackpy = waybackpy.cli:main"]},
|
||||
project_urls={
|
||||
'Documentation': 'https://waybackpy.readthedocs.io',
|
||||
'Source': 'https://github.com/akamhy/waybackpy',
|
||||
"Documentation": "https://github.com/akamhy/waybackpy/wiki",
|
||||
"Source": "https://github.com/akamhy/waybackpy",
|
||||
"Tracker": "https://github.com/akamhy/waybackpy/issues",
|
||||
},
|
||||
)
|
||||
|
127
tests/test_1.py
127
tests/test_1.py
@ -1,127 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys
|
||||
sys.path.append("..")
|
||||
import waybackpy
|
||||
import pytest
|
||||
import random
|
||||
|
||||
user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
|
||||
|
||||
def test_clean_url():
|
||||
test_url = " https://en.wikipedia.org/wiki/Network security "
|
||||
answer = "https://en.wikipedia.org/wiki/Network_security"
|
||||
target = waybackpy.Url(test_url, user_agent)
|
||||
test_result = target.clean_url()
|
||||
assert answer == test_result
|
||||
|
||||
def test_url_check():
|
||||
broken_url = "http://wwwgooglecom/"
|
||||
with pytest.raises(Exception) as e_info:
|
||||
waybackpy.Url(broken_url, user_agent)
|
||||
|
||||
def test_save():
|
||||
# Test for urls that exist and can be archived.
|
||||
|
||||
url_list = [
|
||||
"en.wikipedia.org",
|
||||
"www.wikidata.org",
|
||||
"commons.wikimedia.org",
|
||||
"www.wiktionary.org",
|
||||
"www.w3schools.com",
|
||||
"www.youtube.com"
|
||||
]
|
||||
x = random.randint(0, len(url_list)-1)
|
||||
url1 = url_list[x]
|
||||
target = waybackpy.Url(url1, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36")
|
||||
archived_url1 = target.save()
|
||||
assert url1 in archived_url1
|
||||
|
||||
if sys.version_info > (3, 6):
|
||||
|
||||
# Test for urls that are incorrect.
|
||||
with pytest.raises(Exception) as e_info:
|
||||
url2 = "ha ha ha ha"
|
||||
waybackpy.Url(url2, user_agent)
|
||||
|
||||
# Test for urls not allowed to archive by robot.txt.
|
||||
with pytest.raises(Exception) as e_info:
|
||||
url3 = "http://www.archive.is/faq.html"
|
||||
target = waybackpy.Url(url3, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0")
|
||||
target.save()
|
||||
|
||||
|
||||
# Non existent urls, test
|
||||
with pytest.raises(Exception) as e_info:
|
||||
url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
|
||||
target = waybackpy.Url(url3, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27")
|
||||
target.save()
|
||||
|
||||
else:
|
||||
pass
|
||||
|
||||
def test_near():
|
||||
url = "google.com"
|
||||
target = waybackpy.Url(url, "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4")
|
||||
archive_near_year = target.near(year=2010)
|
||||
assert "2010" in archive_near_year
|
||||
|
||||
if sys.version_info > (3, 6):
|
||||
archive_near_month_year = target.near( year=2015, month=2)
|
||||
assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year)
|
||||
|
||||
archive_near_day_month_year = target.near(year=2006, month=11, day=15)
|
||||
assert ("20061114" in archive_near_day_month_year) or ("20061115" in archive_near_day_month_year) or ("2006116" in archive_near_day_month_year)
|
||||
|
||||
target = waybackpy.Url("www.python.org", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246")
|
||||
archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15)
|
||||
assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year)
|
||||
|
||||
with pytest.raises(Exception) as e_info:
|
||||
NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
|
||||
target = waybackpy.Url(NeverArchivedUrl, user_agent)
|
||||
target.near(year=2010)
|
||||
else:
|
||||
pass
|
||||
|
||||
def test_oldest():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert "20200504141153" in target.oldest()
|
||||
|
||||
def test_newest():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert url in target.newest()
|
||||
|
||||
def test_get():
|
||||
target = waybackpy.Url("google.com", user_agent)
|
||||
assert "Welcome to Google" in target.get(target.oldest())
|
||||
|
||||
def test_total_archives():
|
||||
if sys.version_info > (3, 6):
|
||||
target = waybackpy.Url(" https://google.com ", user_agent)
|
||||
assert target.total_archives() > 500000
|
||||
else:
|
||||
pass
|
||||
|
||||
target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent)
|
||||
assert target.total_archives() == 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_clean_url()
|
||||
print(".") #1
|
||||
test_url_check()
|
||||
print(".") #1
|
||||
test_get()
|
||||
print(".") #3
|
||||
test_near()
|
||||
print(".") #4
|
||||
test_newest()
|
||||
print(".") #5
|
||||
test_save()
|
||||
print(".") #6
|
||||
test_oldest()
|
||||
print(".") #7
|
||||
test_total_archives()
|
||||
print(".") #8
|
||||
print("OK")
|
@ -1,32 +1,14 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
|
||||
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
|
||||
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
|
||||
# ┃┗┛┗┛┃┗━┓┃━┃┃━┃┃┃┏━┓┃┗━┓┃━┃┏━┛┃┗┛┛┃┏━━┛┃┃━┃┃
|
||||
# ┗┓┏┓┏┛┃┗┛┗┓┃┗━┛┃┃┗━┛┃┃┗┛┗┓┃┗━┓┃┏┓┓┃┃━━━┃┗━┛┃
|
||||
# ━┗┛┗┛━┗━━━┛┗━┓┏┛┗━━━┛┗━━━┛┗━━┛┗┛┗┛┗┛━━━┗━┓┏┛
|
||||
# ━━━━━━━━━━━┏━┛┃━━━━━━━━━━━━━━━━━━━━━━━━┏━┛┃━
|
||||
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
|
||||
|
||||
"""
|
||||
Waybackpy is a Python library that interfaces with the Internet Archive's Wayback Machine API.
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Archive pages and retrieve archived pages easily.
|
||||
|
||||
Usage:
|
||||
>>> import waybackpy
|
||||
>>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
|
||||
>>> new_archive = target_url.save()
|
||||
>>> print(new_archive)
|
||||
https://web.archive.org/web/20200502170312/https://www.python.org/
|
||||
|
||||
Full documentation @ <https://akamhy.github.io/waybackpy/>.
|
||||
:copyright: (c) 2020 by akamhy.
|
||||
:license: MIT
|
||||
"""
|
||||
|
||||
from .wrapper import Url
|
||||
from .__version__ import __title__, __description__, __url__, __version__
|
||||
from .__version__ import __author__, __author_email__, __license__, __copyright__
|
||||
from .cdx_api import WaybackMachineCDXServerAPI
|
||||
from .save_api import WaybackMachineSaveAPI
|
||||
from .availability_api import WaybackMachineAvailabilityAPI
|
||||
from .__version__ import (
|
||||
__title__,
|
||||
__description__,
|
||||
__url__,
|
||||
__version__,
|
||||
__author__,
|
||||
__author_email__,
|
||||
__license__,
|
||||
__copyright__,
|
||||
)
|
||||
|
@ -1,10 +1,11 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__title__ = "waybackpy"
|
||||
__description__ = "A Python library that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
|
||||
__description__ = (
|
||||
"Python package that interfaces with the Internet Archive's Wayback Machine APIs. "
|
||||
"Archive pages and retrieve archived pages easily."
|
||||
)
|
||||
__url__ = "https://akamhy.github.io/waybackpy/"
|
||||
__version__ = "2.0.0"
|
||||
__version__ = "3.0.0"
|
||||
__author__ = "akamhy"
|
||||
__author_email__ = "akash3pro@gmail.com"
|
||||
__author_email__ = "akamhy@yahoo.com"
|
||||
__license__ = "MIT"
|
||||
__copyright__ = "Copyright 2020 akamhy"
|
||||
__copyright__ = "Copyright 2020-2022 Akash Mahanty et al."
|
||||
|
109
waybackpy/availability_api.py
Normal file
109
waybackpy/availability_api.py
Normal file
@ -0,0 +1,109 @@
|
||||
import re
|
||||
import time
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from .__version__ import __version__
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
|
||||
|
||||
def full_url(endpoint, params):
|
||||
if not params:
|
||||
return endpoint.strip()
|
||||
|
||||
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
|
||||
|
||||
for key, val in params.items():
|
||||
key = "filter" if key.startswith("filter") else key
|
||||
key = "collapse" if key.startswith("collapse") else key
|
||||
amp = "" if full_url.endswith("?") else "&"
|
||||
full_url = (
|
||||
full_url
|
||||
+ amp
|
||||
+ "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
|
||||
)
|
||||
return full_url
|
||||
|
||||
|
||||
class WaybackMachineAvailabilityAPI:
|
||||
def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
|
||||
self.url = str(url).strip().replace(" ", "%20")
|
||||
self.user_agent = user_agent
|
||||
self.headers = {"User-Agent": self.user_agent}
|
||||
self.payload = {"url": "{url}".format(url=self.url)}
|
||||
self.endpoint = "https://archive.org/wayback/available"
|
||||
self.JSON = None
|
||||
|
||||
def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
|
||||
return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
|
||||
|
||||
def __repr__(self):
|
||||
return str(self) # self.__str__()
|
||||
|
||||
def __str__(self):
|
||||
if not self.JSON:
|
||||
return None
|
||||
return self.archive_url
|
||||
|
||||
def json(self):
|
||||
self.request_url = full_url(self.endpoint, self.payload)
|
||||
self.response = requests.get(self.request_url, self.headers)
|
||||
self.JSON = self.response.json()
|
||||
return self.JSON
|
||||
|
||||
def timestamp(self):
|
||||
if not self.JSON["archived_snapshots"] or not self.JSON:
|
||||
return datetime.max
|
||||
|
||||
return datetime.strptime(
|
||||
self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
|
||||
)
|
||||
|
||||
@property
|
||||
def archive_url(self):
|
||||
data = self.JSON
|
||||
|
||||
if not data["archived_snapshots"]:
|
||||
archive_url = None
|
||||
else:
|
||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||
archive_url = archive_url.replace(
|
||||
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
|
||||
)
|
||||
return archive_url
|
||||
|
||||
def wayback_timestamp(self, **kwargs):
|
||||
return "".join(
|
||||
str(kwargs[key]).zfill(2)
|
||||
for key in ["year", "month", "day", "hour", "minute"]
|
||||
)
|
||||
|
||||
def oldest(self):
|
||||
return self.near(year=1994)
|
||||
|
||||
def newest(self):
|
||||
return self.near(unix_timestamp=int(time.time()))
|
||||
|
||||
def near(
|
||||
self,
|
||||
year=None,
|
||||
month=None,
|
||||
day=None,
|
||||
hour=None,
|
||||
minute=None,
|
||||
unix_timestamp=None,
|
||||
):
|
||||
if unix_timestamp:
|
||||
timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
|
||||
else:
|
||||
now = datetime.utcnow().timetuple()
|
||||
timestamp = self.wayback_timestamp(
|
||||
year=year if year else now.tm_year,
|
||||
month=month if month else now.tm_mon,
|
||||
day=day if day else now.tm_mday,
|
||||
hour=hour if hour else now.tm_hour,
|
||||
minute=minute if minute else now.tm_min,
|
||||
)
|
||||
|
||||
self.payload["timestamp"] = timestamp
|
||||
self.json()
|
||||
return self
|
185
waybackpy/cdx_api.py
Normal file
185
waybackpy/cdx_api.py
Normal file
@ -0,0 +1,185 @@
|
||||
from .exceptions import WaybackError
|
||||
from .cdx_snapshot import CDXSnapshot
|
||||
from .cdx_utils import (
|
||||
get_total_pages,
|
||||
get_response,
|
||||
check_filters,
|
||||
check_collapses,
|
||||
check_match_type,
|
||||
)
|
||||
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
|
||||
|
||||
class WaybackMachineCDXServerAPI:
|
||||
def __init__(
|
||||
self,
|
||||
url,
|
||||
user_agent=None,
|
||||
start_timestamp=None,
|
||||
end_timestamp=None,
|
||||
filters=[],
|
||||
match_type=None,
|
||||
gzip=None,
|
||||
collapses=[],
|
||||
limit=None,
|
||||
):
|
||||
self.url = str(url).strip().replace(" ", "%20")
|
||||
self.user_agent = str(user_agent) if user_agent else DEFAULT_USER_AGENT
|
||||
self.start_timestamp = str(start_timestamp) if start_timestamp else None
|
||||
self.end_timestamp = str(end_timestamp) if end_timestamp else None
|
||||
self.filters = filters
|
||||
check_filters(self.filters)
|
||||
self.match_type = str(match_type).strip() if match_type else None
|
||||
check_match_type(self.match_type, self.url)
|
||||
self.gzip = gzip if gzip else True
|
||||
self.collapses = collapses
|
||||
check_collapses(self.collapses)
|
||||
self.limit = limit if limit else 5000
|
||||
self.last_api_request_url = None
|
||||
self.use_page = False
|
||||
self.endpoint = "https://web.archive.org/cdx/search/cdx"
|
||||
|
||||
def cdx_api_manager(self, payload, headers, use_page=False):
|
||||
|
||||
total_pages = get_total_pages(self.url, self.user_agent)
|
||||
# If we only have two or less pages of archives then we care for accuracy
|
||||
# pagination API can be lagged sometimes
|
||||
if use_page == True and total_pages >= 2:
|
||||
blank_pages = 0
|
||||
for i in range(total_pages):
|
||||
payload["page"] = str(i)
|
||||
|
||||
url, res = get_response(
|
||||
self.endpoint, params=payload, headers=headers, return_full_url=True
|
||||
)
|
||||
|
||||
self.last_api_request_url = url
|
||||
text = res.text
|
||||
if len(text) == 0:
|
||||
blank_pages += 1
|
||||
|
||||
if blank_pages >= 2:
|
||||
break
|
||||
|
||||
yield text
|
||||
else:
|
||||
|
||||
payload["showResumeKey"] = "true"
|
||||
payload["limit"] = str(self.limit)
|
||||
resumeKey = None
|
||||
|
||||
more = True
|
||||
while more:
|
||||
|
||||
if resumeKey:
|
||||
payload["resumeKey"] = resumeKey
|
||||
|
||||
url, res = get_response(
|
||||
self.endpoint, params=payload, headers=headers, return_full_url=True
|
||||
)
|
||||
|
||||
self.last_api_request_url = url
|
||||
|
||||
text = res.text.strip()
|
||||
lines = text.splitlines()
|
||||
|
||||
more = False
|
||||
|
||||
if len(lines) >= 3:
|
||||
|
||||
second_last_line = lines[-2]
|
||||
|
||||
if len(second_last_line) == 0:
|
||||
|
||||
resumeKey = lines[-1].strip()
|
||||
text = text.replace(resumeKey, "", 1).strip()
|
||||
more = True
|
||||
|
||||
yield text
|
||||
|
||||
def add_payload(self, payload):
|
||||
if self.start_timestamp:
|
||||
payload["from"] = self.start_timestamp
|
||||
|
||||
if self.end_timestamp:
|
||||
payload["to"] = self.end_timestamp
|
||||
|
||||
if self.gzip != True:
|
||||
payload["gzip"] = "false"
|
||||
|
||||
if self.match_type:
|
||||
payload["matchType"] = self.match_type
|
||||
|
||||
if self.filters and len(self.filters) > 0:
|
||||
for i, f in enumerate(self.filters):
|
||||
payload["filter" + str(i)] = f
|
||||
|
||||
if self.collapses and len(self.collapses) > 0:
|
||||
for i, f in enumerate(self.collapses):
|
||||
payload["collapse" + str(i)] = f
|
||||
|
||||
# Don't need to return anything as it's dictionary.
|
||||
payload["url"] = self.url
|
||||
|
||||
def snapshots(self):
|
||||
payload = {}
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
|
||||
self.add_payload(payload)
|
||||
|
||||
if not self.start_timestamp or self.end_timestamp:
|
||||
self.use_page = True
|
||||
|
||||
if self.collapses != []:
|
||||
self.use_page = False
|
||||
|
||||
texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)
|
||||
|
||||
for text in texts:
|
||||
|
||||
if text.isspace() or len(text) <= 1 or not text:
|
||||
continue
|
||||
|
||||
snapshot_list = text.split("\n")
|
||||
|
||||
for snapshot in snapshot_list:
|
||||
|
||||
if len(snapshot) < 46: # 14 + 32 (timestamp+digest)
|
||||
continue
|
||||
|
||||
properties = {
|
||||
"urlkey": None,
|
||||
"timestamp": None,
|
||||
"original": None,
|
||||
"mimetype": None,
|
||||
"statuscode": None,
|
||||
"digest": None,
|
||||
"length": None,
|
||||
}
|
||||
|
||||
prop_values = snapshot.split(" ")
|
||||
|
||||
prop_values_len = len(prop_values)
|
||||
properties_len = len(properties)
|
||||
|
||||
if prop_values_len != properties_len:
|
||||
raise WaybackError(
|
||||
"Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(
|
||||
prop_values_len=prop_values_len,
|
||||
properties_len=properties_len,
|
||||
snapshot=snapshot,
|
||||
)
|
||||
)
|
||||
|
||||
(
|
||||
properties["urlkey"],
|
||||
properties["timestamp"],
|
||||
properties["original"],
|
||||
properties["mimetype"],
|
||||
properties["statuscode"],
|
||||
properties["digest"],
|
||||
properties["length"],
|
||||
) = prop_values
|
||||
|
||||
yield CDXSnapshot(properties)
|
27
waybackpy/cdx_snapshot.py
Normal file
27
waybackpy/cdx_snapshot.py
Normal file
@ -0,0 +1,27 @@
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class CDXSnapshot:
|
||||
def __init__(self, properties):
|
||||
self.urlkey = properties["urlkey"]
|
||||
self.timestamp = properties["timestamp"]
|
||||
self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
|
||||
self.original = properties["original"]
|
||||
self.mimetype = properties["mimetype"]
|
||||
self.statuscode = properties["statuscode"]
|
||||
self.digest = properties["digest"]
|
||||
self.length = properties["length"]
|
||||
self.archive_url = (
|
||||
"https://web.archive.org/web/" + self.timestamp + "/" + self.original
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
|
||||
urlkey=self.urlkey,
|
||||
timestamp=self.timestamp,
|
||||
original=self.original,
|
||||
mimetype=self.mimetype,
|
||||
statuscode=self.statuscode,
|
||||
digest=self.digest,
|
||||
length=self.length,
|
||||
)
|
154
waybackpy/cdx_utils.py
Normal file
154
waybackpy/cdx_utils.py
Normal file
@ -0,0 +1,154 @@
|
||||
import re
|
||||
import requests
|
||||
from urllib3.util.retry import Retry
|
||||
from requests.adapters import HTTPAdapter
|
||||
from .exceptions import WaybackError
|
||||
|
||||
|
||||
def get_total_pages(url, user_agent):
|
||||
request_url = (
|
||||
"https://web.archive.org/cdx/search/cdx?url={url}&showNumPages=true".format(
|
||||
url=url
|
||||
)
|
||||
)
|
||||
headers = {"User-Agent": user_agent}
|
||||
return int((requests.get(request_url, headers=headers).text).strip())
|
||||
|
||||
|
||||
def full_url(endpoint, params):
|
||||
if not params:
|
||||
return endpoint
|
||||
full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
|
||||
for key, val in params.items():
|
||||
key = "filter" if key.startswith("filter") else key
|
||||
key = "collapse" if key.startswith("collapse") else key
|
||||
amp = "" if full_url.endswith("?") else "&"
|
||||
full_url = (
|
||||
full_url
|
||||
+ amp
|
||||
+ "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
|
||||
)
|
||||
return full_url
|
||||
|
||||
|
||||
def get_response(
|
||||
endpoint,
|
||||
params=None,
|
||||
headers=None,
|
||||
return_full_url=False,
|
||||
retries=5,
|
||||
backoff_factor=0.5,
|
||||
no_raise_on_redirects=False,
|
||||
):
|
||||
|
||||
s = requests.Session()
|
||||
|
||||
retries = Retry(
|
||||
total=retries,
|
||||
backoff_factor=backoff_factor,
|
||||
status_forcelist=[500, 502, 503, 504],
|
||||
)
|
||||
|
||||
s.mount("https://", HTTPAdapter(max_retries=retries))
|
||||
|
||||
# The URL with parameters required for the get request
|
||||
url = full_url(endpoint, params)
|
||||
|
||||
try:
|
||||
|
||||
if not return_full_url:
|
||||
return s.get(url, headers=headers)
|
||||
|
||||
return (url, s.get(url, headers=headers))
|
||||
|
||||
except Exception as e:
|
||||
|
||||
reason = str(e)
|
||||
|
||||
if no_raise_on_redirects:
|
||||
if "Exceeded 30 redirects" in reason:
|
||||
return
|
||||
|
||||
exc_message = "Error while retrieving {url}.\n{reason}".format(
|
||||
url=url, reason=reason
|
||||
)
|
||||
|
||||
exc = WaybackError(exc_message)
|
||||
exc.__cause__ = e
|
||||
raise exc
|
||||
|
||||
|
||||
def check_filters(filters):
|
||||
if not isinstance(filters, list):
|
||||
raise WaybackError("filters must be a list.")
|
||||
|
||||
# [!]field:regex
|
||||
for _filter in filters:
|
||||
try:
|
||||
|
||||
match = re.search(
|
||||
r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
|
||||
_filter,
|
||||
)
|
||||
|
||||
key = match.group(1)
|
||||
val = match.group(2)
|
||||
|
||||
except Exception:
|
||||
|
||||
exc_message = (
|
||||
"Filter '{_filter}' is not following the cdx filter syntax.".format(
|
||||
_filter=_filter
|
||||
)
|
||||
)
|
||||
raise WaybackError(exc_message)
|
||||
|
||||
|
||||
def check_collapses(collapses):
|
||||
|
||||
if not isinstance(collapses, list):
|
||||
raise WaybackError("collapses must be a list.")
|
||||
|
||||
if len(collapses) == 0:
|
||||
return
|
||||
|
||||
for collapse in collapses:
|
||||
try:
|
||||
match = re.search(
|
||||
r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
|
||||
collapse,
|
||||
)
|
||||
field = match.group(1)
|
||||
|
||||
N = None
|
||||
if 2 == len(match.groups()):
|
||||
N = match.group(2)
|
||||
|
||||
if N:
|
||||
if not (field + N == collapse):
|
||||
raise Exception
|
||||
else:
|
||||
if not (field == collapse):
|
||||
raise Exception
|
||||
|
||||
except Exception:
|
||||
exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
|
||||
collapse=collapse
|
||||
)
|
||||
raise WaybackError(exc_message)
|
||||
|
||||
|
||||
def check_match_type(match_type, url):
|
||||
if not match_type:
|
||||
return
|
||||
|
||||
if "*" in url:
|
||||
raise WaybackError("Can not use wildcard with match_type argument")
|
||||
|
||||
legal_match_type = ["exact", "prefix", "host", "domain"]
|
||||
|
||||
if match_type not in legal_match_type:
|
||||
exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
|
||||
match_type=match_type
|
||||
)
|
||||
raise WaybackError(exc_message)
|
347
waybackpy/cli.py
Normal file
347
waybackpy/cli.py
Normal file
@ -0,0 +1,347 @@
|
||||
import click
|
||||
import re
|
||||
import os
|
||||
import json as JSON
|
||||
import random
|
||||
import string
|
||||
from .__version__ import __version__
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
from .cdx_api import WaybackMachineCDXServerAPI
|
||||
from .save_api import WaybackMachineSaveAPI
|
||||
from .availability_api import WaybackMachineAvailabilityAPI
|
||||
from .wrapper import Url
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"-u", "--url", help="URL on which Wayback machine operations are to be performed."
|
||||
)
|
||||
@click.option(
|
||||
"-ua",
|
||||
"--user-agent",
|
||||
"--user_agent",
|
||||
default=DEFAULT_USER_AGENT,
|
||||
help="User agent, default user agent is '%s' " % DEFAULT_USER_AGENT,
|
||||
)
|
||||
@click.option(
|
||||
"-v", "--version", is_flag=True, default=False, help="Print waybackpy version."
|
||||
)
|
||||
@click.option(
|
||||
"-n",
|
||||
"--newest",
|
||||
"-au",
|
||||
"--archive_url",
|
||||
"--archive-url",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Fetch the newest archive of the specified URL",
|
||||
)
|
||||
@click.option(
|
||||
"-o",
|
||||
"--oldest",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Fetch the oldest archive of the specified URL",
|
||||
)
|
||||
@click.option(
|
||||
"-j",
|
||||
"--json",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Spit out the JSON data for availability_api commands.",
|
||||
)
|
||||
@click.option(
|
||||
"-N", "--near", default=False, is_flag=True, help="Archive near specified time."
|
||||
)
|
||||
@click.option("-Y", "--year", type=click.IntRange(1994, 9999), help="Year in integer.")
|
||||
@click.option("-M", "--month", type=click.IntRange(1, 12), help="Month in integer.")
|
||||
@click.option("-D", "--day", type=click.IntRange(1, 31), help="Day in integer.")
|
||||
@click.option("-H", "--hour", type=click.IntRange(0, 24), help="Hour in integer.")
|
||||
@click.option("-MIN", "--minute", type=click.IntRange(0, 60), help="Minute in integer.")
|
||||
@click.option(
|
||||
"-s",
|
||||
"--save",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Save the specified URL's webpage and print the archive URL.",
|
||||
)
|
||||
@click.option(
|
||||
"-h",
|
||||
"--headers",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Spit out the headers data for save_api commands.",
|
||||
)
|
||||
@click.option(
|
||||
"-ku",
|
||||
"--known-urls",
|
||||
"--known_urls",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="List known URLs. Uses CDX API.",
|
||||
)
|
||||
@click.option(
|
||||
"-sub",
|
||||
"--subdomain",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Use with '--known_urls' to include known URLs for subdomains.",
|
||||
)
|
||||
@click.option(
|
||||
"-f",
|
||||
"--file",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Use with '--known_urls' to save the URLs in file at current directory.",
|
||||
)
|
||||
@click.option(
|
||||
"-c",
|
||||
"--cdx",
|
||||
default=False,
|
||||
is_flag=True,
|
||||
help="Spit out the headers data for save_api commands.",
|
||||
)
|
||||
@click.option(
|
||||
"-st",
|
||||
"--start-timestamp",
|
||||
"--start_timestamp",
|
||||
)
|
||||
@click.option(
|
||||
"-et",
|
||||
"--end-timestamp",
|
||||
"--end_timestamp",
|
||||
)
|
||||
@click.option(
|
||||
"-f",
|
||||
"--filters",
|
||||
multiple=True,
|
||||
)
|
||||
@click.option(
|
||||
"-mt",
|
||||
"--match-type",
|
||||
"--match_type",
|
||||
)
|
||||
@click.option(
|
||||
"-gz",
|
||||
"--gzip",
|
||||
)
|
||||
@click.option(
|
||||
"-c",
|
||||
"--collapses",
|
||||
multiple=True,
|
||||
)
|
||||
@click.option(
|
||||
"-l",
|
||||
"--limit",
|
||||
)
|
||||
@click.option(
|
||||
"-cp",
|
||||
"--cdx-print",
|
||||
"--cdx_print",
|
||||
multiple=True,
|
||||
)
|
||||
def main(
|
||||
url,
|
||||
user_agent,
|
||||
version,
|
||||
newest,
|
||||
oldest,
|
||||
json,
|
||||
near,
|
||||
year,
|
||||
month,
|
||||
day,
|
||||
hour,
|
||||
minute,
|
||||
save,
|
||||
headers,
|
||||
known_urls,
|
||||
subdomain,
|
||||
file,
|
||||
cdx,
|
||||
start_timestamp,
|
||||
end_timestamp,
|
||||
filters,
|
||||
match_type,
|
||||
gzip,
|
||||
collapses,
|
||||
limit,
|
||||
cdx_print,
|
||||
):
|
||||
"""
|
||||
┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
|
||||
┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
|
||||
┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
|
||||
┃┗┛┗┛┃┗━┓┃━┃┃━┃┃┃┏━┓┃┗━┓┃━┃┏━┛┃┗┛┛┃┏━━┛┃┃━┃┃
|
||||
┗┓┏┓┏┛┃┗┛┗┓┃┗━┛┃┃┗━┛┃┃┗┛┗┓┃┗━┓┃┏┓┓┃┃━━━┃┗━┛┃
|
||||
━┗┛┗┛━┗━━━┛┗━┓┏┛┗━━━┛┗━━━┛┗━━┛┗┛┗┛┗┛━━━┗━┓┏┛
|
||||
━━━━━━━━━━━┏━┛┃━━━━━━━━━━━━━━━━━━━━━━━━┏━┛┃━
|
||||
━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
|
||||
|
||||
waybackpy : Python package & CLI tool that interfaces the Wayback Machine API
|
||||
|
||||
Released under the MIT License.
|
||||
License @ https://github.com/akamhy/waybackpy/blob/master/LICENSE
|
||||
|
||||
Copyright (c) 2020 waybackpy contributors. Contributors list @
|
||||
https://github.com/akamhy/waybackpy/graphs/contributors
|
||||
|
||||
https://github.com/akamhy/waybackpy
|
||||
|
||||
https://pypi.org/project/waybackpy
|
||||
|
||||
"""
|
||||
|
||||
if version:
|
||||
click.echo("waybackpy version %s" % __version__)
|
||||
return
|
||||
|
||||
if not url:
|
||||
click.echo("No URL detected. Please pass an URL.")
|
||||
return
|
||||
|
||||
def echo_availability_api(availability_api_instance):
|
||||
click.echo("Archive URL:")
|
||||
if not availability_api_instance.archive_url:
|
||||
archive_url = (
|
||||
"NO ARCHIVE FOUND - The requested URL is probably "
|
||||
+ "not yet archived or if the URL was recently archived then it is "
|
||||
+ "not yet available via the Wayback Machine's availability API "
|
||||
+ "because of database lag and should be available after some time."
|
||||
)
|
||||
else:
|
||||
archive_url = availability_api_instance.archive_url
|
||||
click.echo(archive_url)
|
||||
if json:
|
||||
click.echo("JSON response:")
|
||||
click.echo(JSON.dumps(availability_api_instance.JSON))
|
||||
|
||||
availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
|
||||
|
||||
if oldest:
|
||||
availability_api.oldest()
|
||||
echo_availability_api(availability_api)
|
||||
return
|
||||
|
||||
if newest:
|
||||
availability_api.newest()
|
||||
echo_availability_api(availability_api)
|
||||
return
|
||||
|
||||
if near:
|
||||
near_args = {}
|
||||
keys = ["year", "month", "day", "hour", "minute"]
|
||||
args_arr = [year, month, day, hour, minute]
|
||||
for key, arg in zip(keys, args_arr):
|
||||
if arg:
|
||||
near_args[key] = arg
|
||||
availability_api.near(**near_args)
|
||||
echo_availability_api(availability_api)
|
||||
return
|
||||
|
||||
if save:
|
||||
save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
|
||||
save_api.save()
|
||||
click.echo("Archive URL:")
|
||||
click.echo(save_api.archive_url)
|
||||
click.echo("Cached save:")
|
||||
click.echo(save_api.cached_save)
|
||||
if headers:
|
||||
click.echo("Save API headers:")
|
||||
click.echo(save_api.headers)
|
||||
return
|
||||
|
||||
def save_urls_on_file(url_gen):
|
||||
domain = None
|
||||
sys_random = random.SystemRandom()
|
||||
uid = "".join(
|
||||
sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
|
||||
)
|
||||
url_count = 0
|
||||
|
||||
for url in url_gen:
|
||||
url_count += 1
|
||||
if not domain:
|
||||
match = re.search("https?://([A-Za-z_0-9.-]+).*", url)
|
||||
|
||||
domain = "domain-unknown"
|
||||
|
||||
if match:
|
||||
domain = match.group(1)
|
||||
|
||||
file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
|
||||
file_path = os.path.join(os.getcwd(), file_name)
|
||||
if not os.path.isfile(file_path):
|
||||
open(file_path, "w+").close()
|
||||
|
||||
with open(file_path, "a") as f:
|
||||
f.write("{url}\n".format(url=url))
|
||||
|
||||
click.echo(url)
|
||||
|
||||
if url_count > 0:
|
||||
click.echo(
|
||||
"\n\n'{file_name}' saved in current working directory".format(
|
||||
file_name=file_name
|
||||
)
|
||||
)
|
||||
else:
|
||||
click.echo("No known URLs found. Please try a diffrent input!")
|
||||
|
||||
if known_urls:
|
||||
wayback = Url(url, user_agent)
|
||||
url_gen = wayback.known_urls(subdomain=subdomain)
|
||||
|
||||
if file:
|
||||
return save_urls_on_file(url_gen)
|
||||
else:
|
||||
for url in url_gen:
|
||||
click.echo(url)
|
||||
|
||||
if cdx:
|
||||
filters = list(filters)
|
||||
collapses = list(collapses)
|
||||
cdx_print = list(cdx_print)
|
||||
|
||||
cdx_api = WaybackMachineCDXServerAPI(
|
||||
url,
|
||||
user_agent=user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
filters=filters,
|
||||
match_type=match_type,
|
||||
gzip=gzip,
|
||||
collapses=collapses,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
snapshots = cdx_api.snapshots()
|
||||
|
||||
for snapshot in snapshots:
|
||||
if len(cdx_print) == 0:
|
||||
click.echo(snapshot)
|
||||
else:
|
||||
output_string = ""
|
||||
if "urlkey" or "url-key" or "url_key" in cdx_print:
|
||||
output_string = output_string + snapshot.urlkey + " "
|
||||
if "timestamp" or "time-stamp" or "time_stamp" in cdx_print:
|
||||
output_string = output_string + snapshot.timestamp + " "
|
||||
if "original" in cdx_print:
|
||||
output_string = output_string + snapshot.original + " "
|
||||
if "original" in cdx_print:
|
||||
output_string = output_string + snapshot.original + " "
|
||||
if "mimetype" or "mime-type" or "mime_type" in cdx_print:
|
||||
output_string = output_string + snapshot.mimetype + " "
|
||||
if "statuscode" or "status-code" or "status_code" in cdx_print:
|
||||
output_string = output_string + snapshot.statuscode + " "
|
||||
if "digest" in cdx_print:
|
||||
output_string = output_string + snapshot.digest + " "
|
||||
if "length" in cdx_print:
|
||||
output_string = output_string + snapshot.length + " "
|
||||
if "archiveurl" or "archive-url" or "archive_url" in cdx_print:
|
||||
output_string = output_string + snapshot.archive_url + " "
|
||||
click.echo(output_string)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -1,6 +1,38 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
waybackpy.exceptions
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
This module contains the set of Waybackpy's exceptions.
|
||||
"""
|
||||
|
||||
|
||||
class WaybackError(Exception):
|
||||
"""
|
||||
Raised when API Service error.
|
||||
Raised when Waybackpy can not return what you asked for.
|
||||
1) Wayback Machine API Service is unreachable/down.
|
||||
2) You passed illegal arguments.
|
||||
"""
|
||||
|
||||
|
||||
class RedirectSaveError(WaybackError):
|
||||
"""
|
||||
Raised when the original URL is redirected and the
|
||||
redirect URL is archived but not the original URL.
|
||||
"""
|
||||
|
||||
|
||||
class URLError(Exception):
|
||||
"""
|
||||
Raised when malformed URLs are passed as arguments.
|
||||
"""
|
||||
|
||||
|
||||
class MaximumRetriesExceeded(WaybackError):
|
||||
"""
|
||||
MaximumRetriesExceeded
|
||||
"""
|
||||
|
||||
|
||||
class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
|
||||
"""
|
||||
MaximumSaveRetriesExceeded
|
||||
"""
|
||||
|
131
waybackpy/save_api.py
Normal file
131
waybackpy/save_api.py
Normal file
@ -0,0 +1,131 @@
|
||||
import re
|
||||
import time
|
||||
import requests
|
||||
|
||||
from datetime import datetime
|
||||
from urllib3.util.retry import Retry
|
||||
from requests.adapters import HTTPAdapter
|
||||
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
from .exceptions import MaximumSaveRetriesExceeded
|
||||
|
||||
|
||||
class WaybackMachineSaveAPI:
|
||||
|
||||
"""
|
||||
WaybackMachineSaveAPI class provides an interface for saving URLs on the
|
||||
Wayback Machine.
|
||||
"""
|
||||
|
||||
def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
|
||||
self.url = str(url).strip().replace(" ", "%20")
|
||||
self.request_url = "https://web.archive.org/save/" + self.url
|
||||
self.user_agent = user_agent
|
||||
self.request_headers = {"User-Agent": self.user_agent}
|
||||
self.max_tries = max_tries
|
||||
self.total_save_retries = 5
|
||||
self.backoff_factor = 0.5
|
||||
self.status_forcelist = [500, 502, 503, 504]
|
||||
self._archive_url = None
|
||||
self.instance_birth_time = datetime.utcnow()
|
||||
|
||||
@property
|
||||
def archive_url(self):
|
||||
|
||||
if self._archive_url:
|
||||
return self._archive_url
|
||||
else:
|
||||
return self.save()
|
||||
|
||||
def get_save_request_headers(self):
|
||||
|
||||
session = requests.Session()
|
||||
retries = Retry(
|
||||
total=self.total_save_retries,
|
||||
backoff_factor=self.backoff_factor,
|
||||
status_forcelist=self.status_forcelist,
|
||||
)
|
||||
session.mount("https://", HTTPAdapter(max_retries=retries))
|
||||
self.response = session.get(self.request_url, headers=self.request_headers)
|
||||
self.headers = self.response.headers
|
||||
self.status_code = self.response.status_code
|
||||
self.response_url = self.response.url
|
||||
|
||||
def archive_url_parser(self):
|
||||
|
||||
regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
|
||||
match = re.search(regex1, str(self.headers))
|
||||
if match:
|
||||
return "https://web.archive.org" + match.group(1)
|
||||
|
||||
regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
|
||||
match = re.search(regex2, str(self.headers))
|
||||
if match:
|
||||
return "https://" + match.group(1)
|
||||
|
||||
regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
|
||||
match = re.search(regex3, str(self.headers))
|
||||
if match:
|
||||
return "https://" + match.group(1)
|
||||
|
||||
if self.response_url:
|
||||
self.response_url = self.response_url.strip()
|
||||
if "web.archive.org/web" in self.response_url:
|
||||
regex = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$"
|
||||
match = re.search(regex, self.response_url)
|
||||
if match:
|
||||
return "https://" + match.group(0)
|
||||
|
||||
def sleep(self, tries):
|
||||
|
||||
sleep_seconds = 5
|
||||
if tries % 3 == 0:
|
||||
sleep_seconds = 10
|
||||
time.sleep(sleep_seconds)
|
||||
|
||||
def timestamp(self):
|
||||
m = re.search(
|
||||
r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
|
||||
)
|
||||
string_timestamp = m.group(1)
|
||||
timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
|
||||
|
||||
timestamp_unixtime = time.mktime(timestamp.timetuple())
|
||||
instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple())
|
||||
|
||||
if timestamp_unixtime < instance_birth_time_unixtime:
|
||||
self.cached_save = True
|
||||
else:
|
||||
self.cached_save = False
|
||||
|
||||
return timestamp
|
||||
|
||||
def save(self):
|
||||
|
||||
saved_archive = None
|
||||
tries = 0
|
||||
|
||||
while True:
|
||||
|
||||
tries += 1
|
||||
|
||||
if tries >= self.max_tries:
|
||||
raise MaximumSaveRetriesExceeded(
|
||||
"Tried %s times but failed to save and return the archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
|
||||
% (str(tries), self.url, self.response_url, str(self.headers)),
|
||||
)
|
||||
|
||||
if not saved_archive:
|
||||
|
||||
if tries > 1:
|
||||
self.sleep(tries)
|
||||
|
||||
self.get_save_request_headers()
|
||||
saved_archive = self.archive_url_parser()
|
||||
|
||||
if not saved_archive:
|
||||
continue
|
||||
else:
|
||||
self._archive_url = saved_archive
|
||||
self.timestamp()
|
||||
return saved_archive
|
11
waybackpy/utils.py
Normal file
11
waybackpy/utils.py
Normal file
@ -0,0 +1,11 @@
|
||||
import requests
|
||||
from .__version__ import __version__
|
||||
|
||||
DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
|
||||
|
||||
|
||||
def latest_version(package_name, headers):
|
||||
request_url = "https://pypi.org/pypi/" + package_name + "/json"
|
||||
response = requests.get(request_url, headers=headers)
|
||||
data = response.json()
|
||||
return data["info"]["version"]
|
@ -1,153 +1,117 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime
|
||||
from waybackpy.exceptions import WaybackError
|
||||
|
||||
if sys.version_info >= (3, 0): # If the python ver >= 3
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import HTTPError, URLError
|
||||
else: # For python2.x
|
||||
from urllib2 import Request, urlopen, HTTPError, URLError
|
||||
|
||||
default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
||||
|
||||
class Url():
|
||||
"""waybackpy Url object"""
|
||||
from .save_api import WaybackMachineSaveAPI
|
||||
from .availability_api import WaybackMachineAvailabilityAPI
|
||||
from .cdx_api import WaybackMachineCDXServerAPI
|
||||
from .utils import DEFAULT_USER_AGENT
|
||||
from .exceptions import WaybackError
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
def __init__(self, url, user_agent=default_UA):
|
||||
class Url:
|
||||
def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
|
||||
self.url = url
|
||||
self.user_agent = user_agent
|
||||
self.url_check() # checks url validity on init.
|
||||
|
||||
def __repr__(self):
|
||||
"""Representation of the object."""
|
||||
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
|
||||
self.user_agent = str(user_agent)
|
||||
self.archive_url = None
|
||||
self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
|
||||
self.url, user_agent=self.user_agent
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
"""String representation of the object."""
|
||||
return "%s" % self.clean_url()
|
||||
if not self.archive_url:
|
||||
self.newest()
|
||||
return self.archive_url
|
||||
|
||||
def __len__(self):
|
||||
"""Length of the URL."""
|
||||
return len(self.clean_url())
|
||||
td_max = timedelta(
|
||||
days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
|
||||
)
|
||||
|
||||
def url_check(self):
|
||||
"""Check for common URL problems."""
|
||||
if "." not in self.url:
|
||||
raise URLError("'%s' is not a vaild url." % self.url)
|
||||
return True
|
||||
if not self.timestamp:
|
||||
self.oldest()
|
||||
|
||||
def clean_url(self):
|
||||
"""Fix the URL, if possible."""
|
||||
return str(self.url).strip().replace(" ","_")
|
||||
if self.timestamp == datetime.max:
|
||||
return td_max.days
|
||||
|
||||
def wayback_timestamp(self, **kwargs):
|
||||
"""Return the formatted the timestamp."""
|
||||
return (
|
||||
str(kwargs["year"])
|
||||
+
|
||||
str(kwargs["month"]).zfill(2)
|
||||
+
|
||||
str(kwargs["day"]).zfill(2)
|
||||
+
|
||||
str(kwargs["hour"]).zfill(2)
|
||||
+
|
||||
str(kwargs["minute"]).zfill(2)
|
||||
)
|
||||
|
||||
def handle_HTTPError(self, e):
|
||||
"""Handle some common HTTPErrors."""
|
||||
if e.code == 404:
|
||||
raise HTTPError(e)
|
||||
if e.code >= 400:
|
||||
raise WaybackError(e)
|
||||
return (datetime.utcnow() - self.timestamp).days
|
||||
|
||||
def save(self):
|
||||
"""Create a new archives for an URL on the Wayback Machine."""
|
||||
request_url = ("https://web.archive.org/save/" + self.clean_url())
|
||||
hdr = { 'User-Agent' : '%s' % self.user_agent } #nosec
|
||||
req = Request(request_url, headers=hdr) #nosec
|
||||
try:
|
||||
response = urlopen(req, timeout=30) #nosec
|
||||
except Exception:
|
||||
try:
|
||||
response = urlopen(req) #nosec
|
||||
except Exception as e:
|
||||
raise WaybackError(e)
|
||||
header = response.headers
|
||||
try:
|
||||
arch = re.search(r"rel=\"memento.*?web\.archive\.org(/web/[0-9]{14}/.*?)>", str(header)).group(1)
|
||||
except KeyError as e:
|
||||
raise WaybackError(e)
|
||||
return "https://web.archive.org" + arch
|
||||
self.wayback_machine_save_api = WaybackMachineSaveAPI(
|
||||
self.url, user_agent=self.user_agent
|
||||
)
|
||||
self.archive_url = self.wayback_machine_save_api.archive_url
|
||||
self.timestamp = self.wayback_machine_save_api.timestamp()
|
||||
self.headers = self.wayback_machine_save_api.headers
|
||||
return self
|
||||
|
||||
def get(self, url=None, user_agent=None, encoding=None):
|
||||
"""Returns the source code of the supplied URL. Auto detects the encoding if not supplied."""
|
||||
if not url:
|
||||
url = self.clean_url()
|
||||
if not user_agent:
|
||||
user_agent = self.user_agent
|
||||
hdr = { 'User-Agent' : '%s' % user_agent }
|
||||
req = Request(url, headers=hdr) #nosec
|
||||
try:
|
||||
resp=urlopen(req) #nosec
|
||||
except URLError:
|
||||
try:
|
||||
resp=urlopen(req) #nosec
|
||||
except URLError as e:
|
||||
raise HTTPError(e)
|
||||
if not encoding:
|
||||
try:
|
||||
encoding= resp.headers['content-type'].split('charset=')[-1]
|
||||
except AttributeError:
|
||||
encoding = "UTF-8"
|
||||
return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))
|
||||
def near(
|
||||
self,
|
||||
year=None,
|
||||
month=None,
|
||||
day=None,
|
||||
hour=None,
|
||||
minute=None,
|
||||
unix_timestamp=None,
|
||||
):
|
||||
|
||||
def near(self, **kwargs):
|
||||
""" Returns the archived from Wayback Machine for an URL closest to the time supplied.
|
||||
Supported params are year, month, day, hour and minute.
|
||||
The non supplied parameters are default to the runtime time.
|
||||
"""
|
||||
year=kwargs.get("year", datetime.utcnow().strftime('%Y'))
|
||||
month=kwargs.get("month", datetime.utcnow().strftime('%m'))
|
||||
day=kwargs.get("day", datetime.utcnow().strftime('%d'))
|
||||
hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
|
||||
minute=kwargs.get("minute", datetime.utcnow().strftime('%M'))
|
||||
timestamp = self.wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
|
||||
request_url = "https://archive.org/wayback/available?url=%s×tamp=%s" % (self.clean_url(), str(timestamp))
|
||||
hdr = { 'User-Agent' : '%s' % self.user_agent }
|
||||
req = Request(request_url, headers=hdr) # nosec
|
||||
try:
|
||||
response = urlopen(req) #nosec
|
||||
except Exception as e:
|
||||
self.handle_HTTPError(e)
|
||||
data = json.loads(response.read().decode("UTF-8"))
|
||||
if not data["archived_snapshots"]:
|
||||
raise WaybackError("'%s' is not yet archived." % url)
|
||||
archive_url = (data["archived_snapshots"]["closest"]["url"])
|
||||
# wayback machine returns http sometimes, idk why? But they support https
|
||||
archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1)
|
||||
return archive_url
|
||||
self.wayback_machine_availability_api.near(
|
||||
year=year,
|
||||
month=month,
|
||||
day=day,
|
||||
hour=hour,
|
||||
minute=minute,
|
||||
unix_timestamp=unix_timestamp,
|
||||
)
|
||||
self.set_availability_api_attrs()
|
||||
return self
|
||||
|
||||
def oldest(self, year=1994):
|
||||
"""Returns the oldest archive from Wayback Machine for an URL."""
|
||||
return self.near(year=year)
|
||||
def oldest(self):
|
||||
self.wayback_machine_availability_api.oldest()
|
||||
self.set_availability_api_attrs()
|
||||
return self
|
||||
|
||||
def newest(self):
|
||||
"""Returns the newest archive on Wayback Machine for an URL, sometimes you may not get the newest archive because wayback machine DB lag."""
|
||||
return self.near()
|
||||
self.wayback_machine_availability_api.newest()
|
||||
self.set_availability_api_attrs()
|
||||
return self
|
||||
|
||||
def total_archives(self):
|
||||
"""Returns the total number of archives on Wayback Machine for an URL."""
|
||||
hdr = { 'User-Agent' : '%s' % self.user_agent }
|
||||
request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode" % self.clean_url()
|
||||
req = Request(request_url, headers=hdr) # nosec
|
||||
try:
|
||||
response = urlopen(req) #nosec
|
||||
except Exception as e:
|
||||
self.handle_HTTPError(e)
|
||||
return str(response.read()).count(",") # Most efficient method to count number of archives (yet)
|
||||
def set_availability_api_attrs(self):
|
||||
self.archive_url = self.wayback_machine_availability_api.archive_url
|
||||
self.JSON = self.wayback_machine_availability_api.JSON
|
||||
self.timestamp = self.wayback_machine_availability_api.timestamp()
|
||||
|
||||
def total_archives(self, start_timestamp=None, end_timestamp=None):
|
||||
cdx = WaybackMachineCDXServerAPI(
|
||||
self.url,
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
)
|
||||
|
||||
count = 0
|
||||
for _ in cdx.snapshots():
|
||||
count = count + 1
|
||||
return count
|
||||
|
||||
def known_urls(
|
||||
self,
|
||||
subdomain=False,
|
||||
host=False,
|
||||
start_timestamp=None,
|
||||
end_timestamp=None,
|
||||
match_type="prefix",
|
||||
):
|
||||
if subdomain:
|
||||
match_type = "domain"
|
||||
if host:
|
||||
match_type = "host"
|
||||
|
||||
cdx = WaybackMachineCDXServerAPI(
|
||||
self.url,
|
||||
user_agent=self.user_agent,
|
||||
start_timestamp=start_timestamp,
|
||||
end_timestamp=end_timestamp,
|
||||
match_type=match_type,
|
||||
collapses=["urlkey"],
|
||||
)
|
||||
|
||||
for snapshot in cdx.snapshots():
|
||||
yield (snapshot.original)
|
||||
|
Reference in New Issue
Block a user