Compare commits
313 Commits
de5a3e1561
52e46fecc2
3b6415abc7
66e16d6d89
16b9bdd7f9
7adc01bff2
9bbd056268
2ab44391cf
cc3628ae18
1d751b942b
261a867a21
2e487e88d3
c8d0ad493a
ce869177fd
58616fb986
4e68cd5743
a7b805292d
6dc6124dc4
5a7fc7d568
5a9c861cad
dd1917c77e
db8f902cff
88cda94c0b
09290f88d1
e5835091c9
7312ed1f4f
6ae8f843d3
36b936820b
a3bc6aad2b
edc2f63d93
ffe0810b12
40233eb115
d549d31421
0725163af8
712471176b
dcd7b03302
76205d9cf6
ec0a0d04cc
7bb01df846
6142e0b353
a65990aee3
259a024eb1
91402792e6
eabf4dc046
5a7bd73565
4693dbf9c1
f4f2e51315
d6b7df6837
dafba5d0cb
6c71dfbe41
a6470b1036
04cda4558e
625ed63482
a03813315f
a2550f17d7
15ef5816db
93b52bd0fe
28ff877081
3e3ecff9df
ce64135ba8
2af6580ffb
8a3c515176
d98c4f32ad
e0a4b007d5
6fb6b2deee
1882862992
0c6107e675
bd079978bf
5dec4927cd
62e5217b9e
9823c809e9
db5737a857
ca0821a466
bb4dbc7d3c
7c7fd75376
0b71433667
1b499a7594
da390ee8a3
d3e68d0e70
fde28d57aa
6092e504c8
93ef60ecd2
461b3f74c9
3c53b411b0
8125526061
2dc81569a8
fd163f3d36
a0a918cf0d
4943cf6873
bc3efc7d63
f89368f16d
c919a6a605
0280fca189
60ee8b95a8
ca51c14332
525cf17c6f
406e03c52f
672b33e83a
b19b840628
a6df4f899c
7686e9c20d
3c5932bc39
f9a986f489
0d7458ee90
ac8b9d6a50
58cd9c28e7
5088305a58
9f847a5e55
6c04c2f3d3
925be7b17e
2b132456ac
50e3154a4e
7aef50428f
d8ec0f5025
0a2f97c034
3e9cf23578
7f927ec7be
9de6393cd5
91e7f65617
d465454019
1a81eb97fb
6b3b2e2a7d
82c65454e6
19710461b6
a3661d6b85
58375e4ef4
ea023e98da
f1065ed1c8
315519b21f
07c98661de
2cd991a54e
ede251afb3
a8ce970ca0
243af26bf6
0f1db94884
c304f58ea2
23f7222cb5
ce7294d990
c9fa114d2e
8b6bacb28e
32d8ad7780
cbf2f90faa
4dde3e3134
1551e8f1c6
c84f09e2d2
57a32669b5
fe017cbcc8
5edb03d24b
c5de2232ba
ca9186c301
8a4b631c13
ec9ce92f48
e95d35c37f
36d662b961
2835f8877e
18cbd2fd30
a2812fb56f
77effcf649
7272ef45a0
56116551ac
4dcda94cb0
09f59b0182
ed24184b99
56bef064b1
44bb2cf5e4
e231228721
b8b2d6dfa9
3eca6294df
eb037a0284
a01821f20b
b21036f8df
b43bacb7ac
f7313b255a
7457e1c793
f7493d823f
7fa7b59ce3
78a608db50
93f7dfdaf9
83c6f256c9
dee9105794
3bfc3b46d0
553f150bee
b3a7e714a5
cd9841713c
1ea9548d46
be7642c837
a418a4e464
aec035ef1e
6d37993ab9
72b80ca44e
c10aa9279c
68d809a7d6
4ad09a419b
ddc6620f09
4066a65678
8e46a9ba7a
a5a98b9b00
a721ab7d6c
7db27ae5e1
8fd4462025
c458a15820
bae3412bee
94cb08bb37
af888db13e
d24f2408ee
ddd2274015
99abdb7c67
f3bb9a8540
bb94e0d1c5
1a78d88be2
3ec61758b3
83c962166d
e87dee3bdf
b27bfff15a
970fc1cd08
65391bf14b
8ab116f276
6f82041ec9
11059c960e
eee1b8eba1
f7de8f5575
3fa0c32064
aa1e3b8825
58d2d585c8
e8efed2e2f
49089b7321
55d8687566
0fa28527af
68259fd2d9
e7086a89d3
e39467227c
ba840404cf
8fbd2d9e55
eebf6043de
3d3b09d6d8
ef15b5863c
256c0cdb6b
12c72a8294
0ad27f5ecc
700b60b5f8
11032596c8
9727f92168
d2893fec13
f1353b2129
c76a95ef90
62d88359ce
9942c474c9
dfb736e794
84d1766917
9d3cdfafb3
20a16bfa45
f2112c73f6
9860527d96
9ac1e877c8
f881705d00
f015c3f4f3
42ac399362
e9d010c793
58a6409528
7ca2029158
80331833f2
5e3d3a815f
6182a18cf4
9bca750310
c22749a6a3
151df94fe3
24540d0b2c
bdfc72d05d
3b104c1a28
fb0d4658a7
48833980e1
0c4f119981
afded51a04
b950616561
444675538f
0ca6710334
01a7c591ad
74d3bc154b
a8e94dfb25
cc38798b32
bc3dd44f27
ba46cdafe2
538afb14e9
7605b614ee
d0a4e25cf5
8c5c0153da
e7dac74906
c686708c9e
f9ae8ada70
e56ece3dc9
db127a5c54
ed497bbd23
45fe07ddb6
0029d63d8a
beb5b625ec
b40d734346
be0a30de85
3a65a60bd6
7b626f5ea5
73371d6c68
8904ba4d67
b4a7f7ea6f
a2ead04021
3513feb075
d34b98373f
38f3b81742
660a826aed
a52d035c0e
6737ce0e26
98cc918c8f
b103bfc6e4
edd05838b8
031212e161
.github/workflows/python-publish.yml (vendored, new file, 31 lines)
@@ -0,0 +1,31 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine
+    - name: Build and publish
+      env:
+        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+      run: |
+        python setup.py sdist bdist_wheel
+        twine upload dist/*
.gitignore (vendored, 3 changes)
@@ -1,3 +1,6 @@
+# Files generated while testing
+*-urls-*.txt
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
.whitesource
@@ -1,6 +1,10 @@
 {
   "scanSettings": {
     "baseBranches": []
   },
   "checkRunSettings": {
-    "vulnerableCheckRunConclusionLevel": "failure"
+    "vulnerableCheckRunConclusionLevel": "failure",
+    "displayMode": "diff"
+  },
+  "issueSettings": {
+    "minSeverityLevel": "LOW"
CONTRIBUTORS.md (new file, 10 lines)
@@ -0,0 +1,10 @@
+## AUTHORS
+- akamhy (<https://github.com/akamhy>)
+- danvalen1 (<https://github.com/danvalen1>)
+- AntiCompositeNumber (<https://github.com/AntiCompositeNumber>)
+- jonasjancarik (<https://github.com/jonasjancarik>)
+
+## ACKNOWLEDGEMENTS
+- mhmdiaa (<https://github.com/mhmdiaa>) for <https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050>. known_urls is based on this gist.
+- datashaman (<https://stackoverflow.com/users/401467/datashaman>) for <https://stackoverflow.com/a/35504626>. _get_response is based on this amazing answer.
+- dequeued0 (<https://github.com/dequeued0>) for reporting bugs and useful feature requests.
LICENSE (2 changes)
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2020 akamhy
+Copyright (c) 2020 waybackpy contributors ( https://github.com/akamhy/waybackpy/graphs/contributors )
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
README.md (236 changes)
@@ -1,147 +1,129 @@
-# waybackpy
-[](https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade)
-[](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
-[](https://www.python.org/)
-[](https://github.com/akamhy/waybackpy/graphs/commit-activity)
+<div align="center">
+
+<img src="https://raw.githubusercontent.com/akamhy/waybackpy/master/assets/waybackpy_logo.svg"><br>
+
+<h3>Python package & CLI tool that interfaces with the Wayback Machine API</h3>
+
+</div>
+
+<p align="center">
+<a href="https://pypi.org/project/waybackpy/"><img alt="pypi" src="https://img.shields.io/pypi/v/waybackpy.svg"></a>
+<a href="https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md"><img alt="Contributions Welcome" src="https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square"></a>
+<a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a>
+<a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub latest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a>
+<a href="#"><img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square"></a>
+</p>
+
+-----------------------------------------------------------------------------------------------------------------------------------------------
+
+## ⭐️ Introduction
+Waybackpy is a [Python package](https://www.udacity.com/blog/2021/01/what-is-a-python-package.html) and a CLI tool that interfaces with the Wayback Machine API.
+
+Wayback Machine has 3 client side APIs.
+
+- Save API
+- Availability API
+- CDX API
+
+All three of these can be accessed by waybackpy.
+
+### 🏗 Installation
-
-
-The waybackpy is a python wrapper for [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine).
-
-Table of contents
-=================
-<!--ts-->
-
-* [Installation](#installation)
-
-* [Usage](#usage)
-  * [Saving an url using save()](#capturing-aka-saving-an-url-using-save)
-  * [Receiving the oldest archive for an URL Using oldest()](#receiving-the-oldest-archive-for-an-url-using-oldest)
-  * [Receiving the recent most/newest archive for an URL using newest()](#receiving-the-newest-archive-for-an-url-using-newest)
-  * [Receiving archive close to a specified year, month, day, hour, and minute using near()](#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
-  * [Get the content of webpage using get()](#get-the-content-of-webpage-using-get)
-
-* [Tests](#tests)
-
-* [Dependency](#dependency)
-
-* [License](#license)
-
-<!--te-->
-
-## Installation
-Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)):
-
-**pip install waybackpy**
-
-## Usage
-
-#### Capturing aka Saving an url Using save()
-
-```diff
-+ waybackpy.save(url, UA=user_agent)
+```bash
+pip install waybackpy
 ```
-> url is mandatory. UA is not, but highly recommended.
+
+Install directly from GitHub:
+
+```bash
+pip install git+https://github.com/akamhy/waybackpy.git
+```
+
+### Docker Image
+Docker Hub : <https://hub.docker.com/r/secsi/waybackpy>
+
+Docker image is automatically updated on every release by [Regularly and Automatically Updated Docker Images](https://github.com/cybersecsi/RAUDI) (RAUDI).
+
+RAUDI is a tool by SecSI (<https://secsi.io>), an Italian cybersecurity startup.
+
+### Usage
+
+#### As a Python package
+
+##### Save API aka SavePageNow
 ```python
-import waybackpy
-# Capturing a new archive on Wayback machine.
-# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
-archived_url = waybackpy.save("https://github.com/akamhy/waybackpy", UA = "Any-User-Agent")
-print(archived_url)
+>>> from waybackpy import WaybackMachineSaveAPI
+>>> url = "https://github.com"
+>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
+>>>
+>>> save_api = WaybackMachineSaveAPI(url, user_agent)
+>>> save_api.save()
+https://web.archive.org/web/20220118125249/https://github.com/
+>>> save_api.cached_save
+False
+>>> save_api.timestamp()
+datetime.datetime(2022, 1, 18, 12, 52, 49)
 ```
-This should print something similar to the following archived URL:
-
-<https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy>
-
-#### Receiving the oldest archive for an URL Using oldest()
-
-```diff
-+ waybackpy.oldest(url, UA=user_agent)
-```
-> url is mandatory. UA is not, but highly recommended.
+
+##### Availability API
 ```python
-import waybackpy
-# retrieving the oldest archive on Wayback machine.
-# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
-oldest_archive = waybackpy.oldest("https://www.google.com/", UA = "Any-User-Agent")
-print(oldest_archive)
+>>> from waybackpy import WaybackMachineAvailabilityAPI
+>>>
+>>> url = "https://google.com"
+>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
+>>>
+>>> availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
+>>>
+>>> availability_api.oldest()
+https://web.archive.org/web/19981111184551/http://google.com:80/
+>>>
+>>> availability_api.newest()
+https://web.archive.org/web/20220118150444/https://www.google.com/
+>>>
+>>> availability_api.near(year=2010, month=10, day=10, hour=10)
+https://web.archive.org/web/20101010101708/http://www.google.com/
 ```
-This returns the oldest available archive for <https://google.com>.
-
-<http://web.archive.org/web/19981111184551/http://google.com:80/>
-
-#### Receiving the newest archive for an URL using newest()
-
-```diff
-+ waybackpy.newest(url, UA=user_agent)
-```
-> url is mandatory. UA is not, but highly recommended.
+
+##### CDX API aka CDXServerAPI
 ```python
-import waybackpy
-# retrieving the newest archive on Wayback machine.
-# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
-newest_archive = waybackpy.newest("https://www.microsoft.com/en-us", UA = "Any-User-Agent")
-print(newest_archive)
+>>> from waybackpy import WaybackMachineCDXServerAPI
+>>> url = "https://pypi.org"
+>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
+>>> cdx = WaybackMachineCDXServerAPI(url, user_agent, start_timestamp=2016, end_timestamp=2017)
+>>> for item in cdx.snapshots():
+...     print(item.archive_url)
+...
+https://web.archive.org/web/20160110011047/http://pypi.org/
+https://web.archive.org/web/20160305104847/http://pypi.org/
+.
+. # URLS REDACTED FOR READABILITY
+.
+https://web.archive.org/web/20171127171549/https://pypi.org/
+https://web.archive.org/web/20171206002737/http://pypi.org:80/
 ```
-This returns the newest available archive for <https://www.microsoft.com/en-us>, something just like this:
-
-<http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/>
+> Documentation at <https://github.com/akamhy/waybackpy/wiki/Python-package-docs>.
-
-#### Receiving archive close to a specified year, month, day, hour, and minute using near()
-
-```diff
-+ waybackpy.near(url, year=2020, month=1, day=1, hour=1, minute=1, UA=user_agent)
-```
+
+#### As a CLI tool
+```bash
+$ waybackpy --save --url "https://en.wikipedia.org/wiki/Social_media" --user_agent "my-unique-user-agent"
+https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_media
+
+$ waybackpy --oldest --url "https://en.wikipedia.org/wiki/Humanoid" --user_agent "my-unique-user-agent"
+https://web.archive.org/web/20040415020811/http://en.wikipedia.org:80/wiki/Humanoid
+
+$ waybackpy --newest --url "https://en.wikipedia.org/wiki/Remote_sensing" --user_agent "my-unique-user-agent"
+https://web.archive.org/web/20201221130522/https://en.wikipedia.org/wiki/Remote_sensing
+```
-> url is mandotory. year,month,day,hour and minute are optional arguments. UA is not mandotory, but higly recomended.
+> CLI documentation is at <https://github.com/akamhy/waybackpy/wiki/CLI-docs>.
+
+### 🛡 License
+[](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
-
-```python
-import waybackpy
-# retriving the the closest archive from a specified year.
-# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
-# supported argumnets are year,month,day,hour and minute
-archive_near_year = waybackpy.near("https://www.facebook.com/", year=2010, UA ="Any-User-Agent")
-print(archive_near_year)
-```
-returns : <http://web.archive.org/web/20100504071154/http://www.facebook.com/>
-
-```waybackpy.near("https://www.facebook.com/", year=2010, month=1, UA ="Any-User-Agent")``` returns: <http://web.archive.org/web/20101111173430/http://www.facebook.com//>
-
-```waybackpy.near("https://www.oracle.com/index.html", year=2019, month=1, day=5, UA ="Any-User-Agent")``` returns: <http://web.archive.org/web/20190105054437/https://www.oracle.com/index.html>
-> Please note that if you only specify the year, the current month and day are default arguments for month and day respectively. Do not expect just putting the year parameter would return the archive closer to January but the current month you are using the package. If you are using it in July 2018 and let's say you use ```waybackpy.near("https://www.facebook.com/", year=2011, UA ="Any-User-Agent")``` then you would be returned the nearest archive to July 2011 and not January 2011. You need to specify the month "1" for January.
-
-> Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
-
-#### Get the content of webpage using get()
-
-```diff
-+ waybackpy.get(url, encoding="UTF-8", UA=user_agent)
-```
-> url is mandatory. UA is not, but highly recommended. encoding is detected automatically, don't specify unless necessary.
-
-```python
-from waybackpy import get
-# retriving the webpage from any url including the archived urls. Don't need to import other libraies :)
-# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
-# supported argumnets are url, encoding and UA
-webpage = get("https://example.com/", UA="User-Agent")
-print(webpage)
-```
-> This should print the source code for <https://example.com/>.
-
-## Dependency
-* None, just python standard libraries. Both python 2 and 3 are supported :)
-
-## License
-
-[MIT License](LICENSE)
+Released under the MIT License. See [license](https://github.com/akamhy/waybackpy/blob/master/LICENSE) for details.
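For orientation, the three interfaces named in the new README are all exported from the top-level package (a minimal sketch, assuming the waybackpy 3.0.0 package from this changeset is installed; the imports alone make no network requests):

```python
# One import per Wayback Machine client-side API, as exported by waybackpy/__init__.py.
from waybackpy import (
    WaybackMachineSaveAPI,          # Save API (SavePageNow)
    WaybackMachineAvailabilityAPI,  # Availability API
    WaybackMachineCDXServerAPI,     # CDX server API
)

print(WaybackMachineSaveAPI, WaybackMachineAvailabilityAPI, WaybackMachineCDXServerAPI)
```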
_config.yml (new file, 1 line)
@@ -0,0 +1 @@
+theme: jekyll-theme-cayman
assets/waybackpy_logo.svg (new file, 1 line, 694 B)
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 176.612 41.908" height="158.392" width="667.51" xmlns:v="https://github.com/akamhy/waybackpy"><text transform="matrix(.862888 0 0 1.158899 -.748 -98.312)" y="110.937" x="0.931" xml:space="preserve" font-weight="bold" font-size="28.149" font-family="sans-serif" letter-spacing="0" word-spacing="0" writing-mode="lr-tb" fill="#003dff"><tspan y="110.937" x="0.931"><tspan y="110.937" x="0.931" letter-spacing="3.568" writing-mode="lr-tb">waybackpy</tspan></tspan></text><path d="M.749 0h153.787v4.864H.749zm22.076 37.418h153.787v4.49H22.825z" fill="navy"/><path d="M0 37.418h22.825v4.49H0zM154.536 0h21.702v4.864h-21.702z" fill="#f0f"/></svg>
requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
+click
+requests
setup.cfg
@@ -1,2 +1,7 @@
 [metadata]
 description-file = README.md
+license_file = LICENSE
+
+[flake8]
+max-line-length = 88
+extend-ignore = E203,W503
setup.py (65 changes)
@@ -1,32 +1,51 @@
 import os.path
 from setuptools import setup
 
-with open(os.path.join(os.path.dirname(__file__), 'README.md')) as f:
+with open(os.path.join(os.path.dirname(__file__), "README.md")) as f:
     long_description = f.read()
 
+about = {}
+with open(os.path.join(os.path.dirname(__file__), "waybackpy", "__version__.py")) as f:
+    exec(f.read(), about)
+
 setup(
-    name = 'waybackpy',
-    packages = ['waybackpy'],
-    version = 'v1.2',
-    description = 'A python wrapper for Internet Archives Wayback Machine',
+    name=about["__title__"],
+    packages=["waybackpy"],
+    version=about["__version__"],
+    description=about["__description__"],
     long_description=long_description,
-    long_description_content_type='text/markdown',
-    license='MIT',
-    author = 'akamhy',
-    author_email = 'akash3pro@gmail.com',
-    url = 'https://github.com/akamhy/waybackpy',
-    download_url = 'https://github.com/akamhy/waybackpy/archive/v1.2.tar.gz',
-    keywords = ['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
-    install_requires=[],
-    python_requires='>=3.6',
+    long_description_content_type="text/markdown",
+    license=about["__license__"],
+    author=about["__author__"],
+    author_email=about["__author_email__"],
+    url=about["__url__"],
+    download_url="https://github.com/akamhy/waybackpy/archive/3.0.0.tar.gz",
+    keywords=[
+        "Archive Website",
+        "Wayback Machine",
+        "Internet Archive",
+    ],
+    install_requires=["requests", "click"],
+    python_requires=">=3.4",
     classifiers=[
-        'Development Status :: 5 - Production/Stable',
-        'Intended Audience :: Developers',
-        'Topic :: Software Development :: Build Tools',
-        'License :: OSI Approved :: MIT License',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.4',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-    ],
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Natural Language :: English",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.4",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: Implementation :: CPython",
+    ],
+    entry_points={"console_scripts": ["waybackpy = waybackpy.cli:main"]},
+    project_urls={
+        "Documentation": "https://github.com/akamhy/waybackpy/wiki",
+        "Source": "https://github.com/akamhy/waybackpy",
+        "Tracker": "https://github.com/akamhy/waybackpy/issues",
+    },
 )
waybackpy/__init__.py
@@ -1,6 +1,14 @@
 # -*- coding: utf-8 -*-
-from .wrapper import save, near, oldest, newest, get
-
-__version__ = "v1.2"
-
-__all__ = ['wrapper', 'exceptions']
+from .wrapper import Url
+from .cdx_api import WaybackMachineCDXServerAPI
+from .save_api import WaybackMachineSaveAPI
+from .availability_api import WaybackMachineAvailabilityAPI
+from .__version__ import (
+    __title__,
+    __description__,
+    __url__,
+    __version__,
+    __author__,
+    __author_email__,
+    __license__,
+    __copyright__,
+)
waybackpy/__version__.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+__title__ = "waybackpy"
+__description__ = (
+    "Python package that interfaces with the Internet Archive's Wayback Machine APIs. "
+    "Archive pages and retrieve archived pages easily."
+)
+__url__ = "https://akamhy.github.io/waybackpy/"
+__version__ = "3.0.0"
+__author__ = "akamhy"
+__author_email__ = "akamhy@yahoo.com"
+__license__ = "MIT"
+__copyright__ = "Copyright 2020-2022 Akash Mahanty et al."
waybackpy/availability_api.py (new file, 109 lines)
@@ -0,0 +1,109 @@
+import re
+import time
+import requests
+from datetime import datetime
+from .__version__ import __version__
+from .utils import DEFAULT_USER_AGENT
+
+
+def full_url(endpoint, params):
+    if not params:
+        return endpoint.strip()
+
+    full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
+
+    for key, val in params.items():
+        key = "filter" if key.startswith("filter") else key
+        key = "collapse" if key.startswith("collapse") else key
+        amp = "" if full_url.endswith("?") else "&"
+        full_url = (
+            full_url
+            + amp
+            + "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
+        )
+    return full_url
+
+
+class WaybackMachineAvailabilityAPI:
+    def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
+        self.url = str(url).strip().replace(" ", "%20")
+        self.user_agent = user_agent
+        self.headers = {"User-Agent": self.user_agent}
+        self.payload = {"url": "{url}".format(url=self.url)}
+        self.endpoint = "https://archive.org/wayback/available"
+        self.JSON = None
+
+    def unix_timestamp_to_wayback_timestamp(self, unix_timestamp):
+        return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
+
+    def __repr__(self):
+        return str(self)  # self.__str__()
+
+    def __str__(self):
+        if not self.JSON:
+            return None
+        return self.archive_url
+
+    def json(self):
+        self.request_url = full_url(self.endpoint, self.payload)
+        self.response = requests.get(self.request_url, headers=self.headers)
+        self.JSON = self.response.json()
+        return self.JSON
+
+    def timestamp(self):
+        if not self.JSON or not self.JSON["archived_snapshots"]:
+            return datetime.max
+
+        return datetime.strptime(
+            self.JSON["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
+        )
+
+    @property
+    def archive_url(self):
+        data = self.JSON
+
+        if not data["archived_snapshots"]:
+            archive_url = None
+        else:
+            archive_url = data["archived_snapshots"]["closest"]["url"]
+            archive_url = archive_url.replace(
+                "http://web.archive.org/web/", "https://web.archive.org/web/", 1
+            )
+        return archive_url
+
+    def wayback_timestamp(self, **kwargs):
+        return "".join(
+            str(kwargs[key]).zfill(2)
+            for key in ["year", "month", "day", "hour", "minute"]
+        )
+
+    def oldest(self):
+        return self.near(year=1994)
+
+    def newest(self):
+        return self.near(unix_timestamp=int(time.time()))
+
+    def near(
+        self,
+        year=None,
+        month=None,
+        day=None,
+        hour=None,
+        minute=None,
+        unix_timestamp=None,
+    ):
+        if unix_timestamp:
+            timestamp = self.unix_timestamp_to_wayback_timestamp(unix_timestamp)
+        else:
+            now = datetime.utcnow().timetuple()
+            timestamp = self.wayback_timestamp(
+                year=year if year else now.tm_year,
+                month=month if month else now.tm_mon,
+                day=day if day else now.tm_mday,
+                hour=hour if hour else now.tm_hour,
+                minute=minute if minute else now.tm_min,
+            )
+
+        self.payload["timestamp"] = timestamp
+        self.json()
+        return self
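A small offline sketch of how near() assembles its query value: wayback_timestamp() zero-pads each field to two digits and concatenates them, so the payload timestamp can be inspected without any request (assuming the package from this changeset is installed):

```python
from waybackpy import WaybackMachineAvailabilityAPI

# Constructing the instance performs no request; only json()/near() do.
api = WaybackMachineAvailabilityAPI("https://example.com")

# wayback_timestamp() is pure string formatting: str(value).zfill(2) per field.
print(api.wayback_timestamp(year=2010, month=10, day=10, hour=10, minute=0))
# -> 201010101000
```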
waybackpy/cdx_api.py (new file, 185 lines)
@@ -0,0 +1,185 @@
+from .exceptions import WaybackError
+from .cdx_snapshot import CDXSnapshot
+from .cdx_utils import (
+    get_total_pages,
+    get_response,
+    check_filters,
+    check_collapses,
+    check_match_type,
+)
+
+from .utils import DEFAULT_USER_AGENT
+
+
+class WaybackMachineCDXServerAPI:
+    def __init__(
+        self,
+        url,
+        user_agent=None,
+        start_timestamp=None,
+        end_timestamp=None,
+        filters=[],
+        match_type=None,
+        gzip=None,
+        collapses=[],
+        limit=None,
+    ):
+        self.url = str(url).strip().replace(" ", "%20")
+        self.user_agent = str(user_agent) if user_agent else DEFAULT_USER_AGENT
+        self.start_timestamp = str(start_timestamp) if start_timestamp else None
+        self.end_timestamp = str(end_timestamp) if end_timestamp else None
+        self.filters = filters
+        check_filters(self.filters)
+        self.match_type = str(match_type).strip() if match_type else None
+        check_match_type(self.match_type, self.url)
+        self.gzip = gzip if gzip else True
+        self.collapses = collapses
+        check_collapses(self.collapses)
+        self.limit = limit if limit else 5000
+        self.last_api_request_url = None
+        self.use_page = False
+        self.endpoint = "https://web.archive.org/cdx/search/cdx"
+
+    def cdx_api_manager(self, payload, headers, use_page=False):
+
+        total_pages = get_total_pages(self.url, self.user_agent)
+        # If we only have two or fewer pages of archives then we care about
+        # accuracy; the pagination API can lag sometimes.
+        if use_page == True and total_pages >= 2:
+            blank_pages = 0
+            for i in range(total_pages):
+                payload["page"] = str(i)
+
+                url, res = get_response(
+                    self.endpoint, params=payload, headers=headers, return_full_url=True
+                )
+
+                self.last_api_request_url = url
+                text = res.text
+                if len(text) == 0:
+                    blank_pages += 1
+
+                if blank_pages >= 2:
+                    break
+
+                yield text
+        else:
+
+            payload["showResumeKey"] = "true"
+            payload["limit"] = str(self.limit)
+            resumeKey = None
+
+            more = True
+            while more:
+
+                if resumeKey:
+                    payload["resumeKey"] = resumeKey
+
+                url, res = get_response(
+                    self.endpoint, params=payload, headers=headers, return_full_url=True
+                )
+
+                self.last_api_request_url = url
+
+                text = res.text.strip()
+                lines = text.splitlines()
+
+                more = False
+
+                if len(lines) >= 3:
+
+                    second_last_line = lines[-2]
+
+                    if len(second_last_line) == 0:
+
+                        resumeKey = lines[-1].strip()
+                        text = text.replace(resumeKey, "", 1).strip()
+                        more = True
+
+                yield text
+
+    def add_payload(self, payload):
+        if self.start_timestamp:
+            payload["from"] = self.start_timestamp
+
+        if self.end_timestamp:
+            payload["to"] = self.end_timestamp
+
+        if self.gzip != True:
+            payload["gzip"] = "false"
+
+        if self.match_type:
+            payload["matchType"] = self.match_type
+
+        if self.filters and len(self.filters) > 0:
+            for i, f in enumerate(self.filters):
+                payload["filter" + str(i)] = f
+
+        if self.collapses and len(self.collapses) > 0:
+            for i, f in enumerate(self.collapses):
+                payload["collapse" + str(i)] = f
+
+        # No need to return anything as payload is a dictionary, mutated in place.
+        payload["url"] = self.url
+
+    def snapshots(self):
+        payload = {}
+        headers = {"User-Agent": self.user_agent}
+
+        self.add_payload(payload)
+
+        if not self.start_timestamp or self.end_timestamp:
+            self.use_page = True
+
+        if self.collapses != []:
+            self.use_page = False
+
+        texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)
+
+        for text in texts:
+
+            if text.isspace() or len(text) <= 1 or not text:
+                continue
+
+            snapshot_list = text.split("\n")
+
+            for snapshot in snapshot_list:
+
+                if len(snapshot) < 46:  # 14 + 32 (timestamp+digest)
+                    continue
+
+                properties = {
+                    "urlkey": None,
+                    "timestamp": None,
+                    "original": None,
+                    "mimetype": None,
+                    "statuscode": None,
+                    "digest": None,
+                    "length": None,
+                }
+
+                prop_values = snapshot.split(" ")
+
+                prop_values_len = len(prop_values)
+                properties_len = len(properties)
+
+                if prop_values_len != properties_len:
+                    raise WaybackError(
+                        "Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(
+                            prop_values_len=prop_values_len,
+                            properties_len=properties_len,
+                            snapshot=snapshot,
+                        )
+                    )
+
+                (
+                    properties["urlkey"],
+                    properties["timestamp"],
+                    properties["original"],
+                    properties["mimetype"],
+                    properties["statuscode"],
+                    properties["digest"],
+                    properties["length"],
+                ) = prop_values
+
+                yield CDXSnapshot(properties)
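A usage sketch for the class above (assumes the package is installed; this makes live CDX requests, so the printed snapshots depend on the archive). The filter string follows the field:regex syntax that check_filters() validates:

```python
from waybackpy import WaybackMachineCDXServerAPI

# Restrict to captures that returned HTTP 200 within a 2016-2017 window.
cdx = WaybackMachineCDXServerAPI(
    "https://example.com",
    user_agent="my-user-agent",
    start_timestamp="2016",
    end_timestamp="2017",
    filters=["statuscode:200"],
)

# snapshots() is a generator yielding CDXSnapshot objects.
for snapshot in cdx.snapshots():
    print(snapshot.timestamp, snapshot.archive_url)
```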
waybackpy/cdx_snapshot.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+from datetime import datetime
+
+
+class CDXSnapshot:
+    def __init__(self, properties):
+        self.urlkey = properties["urlkey"]
+        self.timestamp = properties["timestamp"]
+        self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
+        self.original = properties["original"]
+        self.mimetype = properties["mimetype"]
+        self.statuscode = properties["statuscode"]
+        self.digest = properties["digest"]
+        self.length = properties["length"]
+        self.archive_url = (
+            "https://web.archive.org/web/" + self.timestamp + "/" + self.original
+        )
+
+    def __str__(self):
+        return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
+            urlkey=self.urlkey,
+            timestamp=self.timestamp,
+            original=self.original,
+            mimetype=self.mimetype,
+            statuscode=self.statuscode,
+            digest=self.digest,
+            length=self.length,
+        )
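CDXSnapshot expects the seven space-separated fields of a CDX server response line, the same split performed by snapshots() in cdx_api.py. A sketch with made-up sample values (the urlkey, digest, and length below are illustrative only):

```python
from waybackpy.cdx_snapshot import CDXSnapshot

# urlkey timestamp original mimetype statuscode digest length
line = "org,example)/ 20201126185327 https://example.com/ text/html 200 ABCDEFGHIJKLMNOPQRSTUVWXYZ012345 1358"
keys = ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"]
properties = dict(zip(keys, line.split(" ")))

snapshot = CDXSnapshot(properties)
print(snapshot.archive_url)         # https://web.archive.org/web/20201126185327/https://example.com/
print(snapshot.datetime_timestamp)  # 2020-11-26 18:53:27
```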
waybackpy/cdx_utils.py (new file, 154 lines)
@@ -0,0 +1,154 @@
+import re
+import requests
+from urllib3.util.retry import Retry
+from requests.adapters import HTTPAdapter
+from .exceptions import WaybackError
+
+
+def get_total_pages(url, user_agent):
+    request_url = (
+        "https://web.archive.org/cdx/search/cdx?url={url}&showNumPages=true".format(
+            url=url
+        )
+    )
+    headers = {"User-Agent": user_agent}
+    return int((requests.get(request_url, headers=headers).text).strip())
+
+
+def full_url(endpoint, params):
+    if not params:
+        return endpoint
+    full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
+    for key, val in params.items():
+        key = "filter" if key.startswith("filter") else key
+        key = "collapse" if key.startswith("collapse") else key
+        amp = "" if full_url.endswith("?") else "&"
+        full_url = (
+            full_url
+            + amp
+            + "{key}={val}".format(key=key, val=requests.utils.quote(str(val)))
+        )
+    return full_url
+
+
+def get_response(
+    endpoint,
+    params=None,
+    headers=None,
+    return_full_url=False,
+    retries=5,
+    backoff_factor=0.5,
+    no_raise_on_redirects=False,
+):
+
+    s = requests.Session()
+
+    retries = Retry(
+        total=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=[500, 502, 503, 504],
+    )
+
+    s.mount("https://", HTTPAdapter(max_retries=retries))
+
+    # The URL with parameters required for the get request
+    url = full_url(endpoint, params)
+
+    try:
+
+        if not return_full_url:
+            return s.get(url, headers=headers)
+
+        return (url, s.get(url, headers=headers))
+
+    except Exception as e:
+
+        reason = str(e)
+
+        if no_raise_on_redirects:
+            if "Exceeded 30 redirects" in reason:
+                return
+
+        exc_message = "Error while retrieving {url}.\n{reason}".format(
+            url=url, reason=reason
+        )
+
+        exc = WaybackError(exc_message)
+        exc.__cause__ = e
+        raise exc
+
+
+def check_filters(filters):
+    if not isinstance(filters, list):
+        raise WaybackError("filters must be a list.")
+
+    # [!]field:regex
+    for _filter in filters:
+        try:
+
+            match = re.search(
+                r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
+                _filter,
+            )
+
+            key = match.group(1)
+            val = match.group(2)
+
+        except Exception:
+
+            exc_message = (
+                "Filter '{_filter}' is not following the cdx filter syntax.".format(
+                    _filter=_filter
+                )
+            )
+            raise WaybackError(exc_message)
+
+
+def check_collapses(collapses):
+
+    if not isinstance(collapses, list):
+        raise WaybackError("collapses must be a list.")
+
+    if len(collapses) == 0:
+        return
+
+    for collapse in collapses:
+        try:
+            match = re.search(
+                r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
+                collapse,
+            )
+            field = match.group(1)
+
+            N = None
+            if 2 == len(match.groups()):
+                N = match.group(2)
+
+            if N:
+                if not (field + N == collapse):
+                    raise Exception
+            else:
+                if not (field == collapse):
+                    raise Exception
+
+        except Exception:
+            exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
+                collapse=collapse
+            )
+            raise WaybackError(exc_message)
+
+
+def check_match_type(match_type, url):
+    if not match_type:
+        return
+
+    if "*" in url:
+        raise WaybackError("Can not use wildcard with match_type argument")
+
+    legal_match_type = ["exact", "prefix", "host", "domain"]
+
+    if match_type not in legal_match_type:
+        exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
+            match_type=match_type
+        )
+        raise WaybackError(exc_message)
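To see what full_url() produces, a small offline sketch (assumes the package is installed): numbered filter0/collapse0 keys are renamed to plain filter/collapse, and values are percent-encoded via requests.utils.quote:

```python
from waybackpy.cdx_utils import full_url

params = {
    "url": "https://example.com",
    "filter0": "statuscode:200",   # key becomes "filter"
    "collapse0": "urlkey",         # key becomes "collapse"
}
print(full_url("https://web.archive.org/cdx/search/cdx", params))
# https://web.archive.org/cdx/search/cdx?url=https%3A//example.com&filter=statuscode%3A200&collapse=urlkey
```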
waybackpy/cli.py (new file, 347 lines)
@@ -0,0 +1,347 @@
+import click
+import re
+import os
+import json as JSON
+import random
+import string
+from .__version__ import __version__
+from .utils import DEFAULT_USER_AGENT
+from .cdx_api import WaybackMachineCDXServerAPI
+from .save_api import WaybackMachineSaveAPI
+from .availability_api import WaybackMachineAvailabilityAPI
+from .wrapper import Url
+
+
+@click.command()
+@click.option(
+    "-u", "--url", help="URL on which Wayback machine operations are to be performed."
+)
+@click.option(
+    "-ua",
+    "--user-agent",
+    "--user_agent",
+    default=DEFAULT_USER_AGENT,
+    help="User agent, default user agent is '%s' " % DEFAULT_USER_AGENT,
+)
+@click.option(
+    "-v", "--version", is_flag=True, default=False, help="Print waybackpy version."
+)
+@click.option(
+    "-n",
+    "--newest",
+    "-au",
+    "--archive_url",
+    "--archive-url",
+    default=False,
+    is_flag=True,
+    help="Fetch the newest archive of the specified URL",
+)
+@click.option(
+    "-o",
+    "--oldest",
+    default=False,
+    is_flag=True,
+    help="Fetch the oldest archive of the specified URL",
+)
+@click.option(
+    "-j",
+    "--json",
+    default=False,
+    is_flag=True,
+    help="Spit out the JSON data for availability_api commands.",
+)
+@click.option(
+    "-N", "--near", default=False, is_flag=True, help="Archive near specified time."
+)
+@click.option("-Y", "--year", type=click.IntRange(1994, 9999), help="Year in integer.")
+@click.option("-M", "--month", type=click.IntRange(1, 12), help="Month in integer.")
+@click.option("-D", "--day", type=click.IntRange(1, 31), help="Day in integer.")
+@click.option("-H", "--hour", type=click.IntRange(0, 24), help="Hour in integer.")
+@click.option("-MIN", "--minute", type=click.IntRange(0, 60), help="Minute in integer.")
+@click.option(
+    "-s",
+    "--save",
+    default=False,
+    is_flag=True,
+    help="Save the specified URL's webpage and print the archive URL.",
+)
+@click.option(
+    "-h",
+    "--headers",
+    default=False,
+    is_flag=True,
+    help="Spit out the headers data for save_api commands.",
+)
+@click.option(
+    "-ku",
+    "--known-urls",
+    "--known_urls",
+    default=False,
+    is_flag=True,
+    help="List known URLs. Uses CDX API.",
+)
+@click.option(
+    "-sub",
+    "--subdomain",
+    default=False,
+    is_flag=True,
+    help="Use with '--known_urls' to include known URLs for subdomains.",
+)
+@click.option(
+    "-f",
+    "--file",
+    default=False,
+    is_flag=True,
+    help="Use with '--known_urls' to save the URLs in a file in the current directory.",
+)
+@click.option(
+    "-c",
+    "--cdx",
+    default=False,
+    is_flag=True,
+    help="Spit out the CDX API data.",
+)
+@click.option(
+    "-st",
+    "--start-timestamp",
+    "--start_timestamp",
+)
+@click.option(
+    "-et",
+    "--end-timestamp",
+    "--end_timestamp",
+)
+@click.option(
+    "-f",
+    "--filters",
+    multiple=True,
+)
+@click.option(
+    "-mt",
+    "--match-type",
+    "--match_type",
+)
+@click.option(
+    "-gz",
+    "--gzip",
+)
+@click.option(
+    "-c",
+    "--collapses",
+    multiple=True,
+)
+@click.option(
+    "-l",
+    "--limit",
+)
+@click.option(
+    "-cp",
+    "--cdx-print",
+    "--cdx_print",
+    multiple=True,
+)
+def main(
+    url,
+    user_agent,
+    version,
+    newest,
+    oldest,
+    json,
+    near,
+    year,
+    month,
+    day,
+    hour,
+    minute,
+    save,
+    headers,
+    known_urls,
+    subdomain,
+    file,
+    cdx,
+    start_timestamp,
+    end_timestamp,
+    filters,
+    match_type,
+    gzip,
+    collapses,
+    limit,
+    cdx_print,
+):
+    """
+    ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
+    ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
+    ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
+    ┃┗┛┗┛┃┗━┓┃━┃┃━┃┃┃┏━┓┃┗━┓┃━┃┏━┛┃┗┛┛┃┏━━┛┃┃━┃┃
+    ┗┓┏┓┏┛┃┗┛┗┓┃┗━┛┃┃┗━┛┃┃┗┛┗┓┃┗━┓┃┏┓┓┃┃━━━┃┗━┛┃
+    ━┗┛┗┛━┗━━━┛┗━┓┏┛┗━━━┛┗━━━┛┗━━┛┗┛┗┛┗┛━━━┗━┓┏┛
+    ━━━━━━━━━━━┏━┛┃━━━━━━━━━━━━━━━━━━━━━━━━┏━┛┃━
+    ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
+
+    waybackpy : Python package & CLI tool that interfaces the Wayback Machine API
+
+    Released under the MIT License.
+    License @ https://github.com/akamhy/waybackpy/blob/master/LICENSE
+
+    Copyright (c) 2020 waybackpy contributors. Contributors list @
+    https://github.com/akamhy/waybackpy/graphs/contributors
+
+    https://github.com/akamhy/waybackpy
+
+    https://pypi.org/project/waybackpy
+    """
+
+    if version:
+        click.echo("waybackpy version %s" % __version__)
+        return
+
+    if not url:
+        click.echo("No URL detected. Please pass a URL.")
+        return
+
+    def echo_availability_api(availability_api_instance):
+        click.echo("Archive URL:")
+        if not availability_api_instance.archive_url:
+            archive_url = (
+                "NO ARCHIVE FOUND - The requested URL is probably "
+                + "not yet archived or if the URL was recently archived then it is "
+                + "not yet available via the Wayback Machine's availability API "
+                + "because of database lag and should be available after some time."
+            )
+        else:
+            archive_url = availability_api_instance.archive_url
+        click.echo(archive_url)
+        if json:
+            click.echo("JSON response:")
+            click.echo(JSON.dumps(availability_api_instance.JSON))
+
+    availability_api = WaybackMachineAvailabilityAPI(url, user_agent=user_agent)
+
+    if oldest:
+        availability_api.oldest()
+        echo_availability_api(availability_api)
+        return
+
+    if newest:
+        availability_api.newest()
+        echo_availability_api(availability_api)
+        return
+
+    if near:
+        near_args = {}
+        keys = ["year", "month", "day", "hour", "minute"]
+        args_arr = [year, month, day, hour, minute]
+        for key, arg in zip(keys, args_arr):
+            if arg:
+                near_args[key] = arg
+        availability_api.near(**near_args)
+        echo_availability_api(availability_api)
+        return
+
+    if save:
+        save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
+        save_api.save()
+        click.echo("Archive URL:")
+        click.echo(save_api.archive_url)
+        click.echo("Cached save:")
+        click.echo(save_api.cached_save)
+        if headers:
+            click.echo("Save API headers:")
+            click.echo(save_api.headers)
+        return
+
+    def save_urls_on_file(url_gen):
+        domain = None
+        sys_random = random.SystemRandom()
+        uid = "".join(
+            sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
+        )
+        url_count = 0
+
+        for url in url_gen:
+            url_count += 1
+            if not domain:
+                match = re.search("https?://([A-Za-z_0-9.-]+).*", url)
+
+                domain = "domain-unknown"
+
+                if match:
+                    domain = match.group(1)
+
+            file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
+            file_path = os.path.join(os.getcwd(), file_name)
+            if not os.path.isfile(file_path):
+                open(file_path, "w+").close()
+
+            with open(file_path, "a") as f:
+                f.write("{url}\n".format(url=url))
+
+            click.echo(url)
+
+        if url_count > 0:
+            click.echo(
+                "\n\n'{file_name}' saved in current working directory".format(
+                    file_name=file_name
+                )
+            )
+        else:
+            click.echo("No known URLs found. Please try a different input!")
+
+    if known_urls:
+        wayback = Url(url, user_agent)
+        url_gen = wayback.known_urls(subdomain=subdomain)
+
+        if file:
+            return save_urls_on_file(url_gen)
+        else:
+            for url in url_gen:
+                click.echo(url)
+
+    if cdx:
+        filters = list(filters)
+        collapses = list(collapses)
+        cdx_print = list(cdx_print)
+
+        cdx_api = WaybackMachineCDXServerAPI(
+            url,
+            user_agent=user_agent,
+            start_timestamp=start_timestamp,
+            end_timestamp=end_timestamp,
+            filters=filters,
+            match_type=match_type,
+            gzip=gzip,
+            collapses=collapses,
+            limit=limit,
+        )
+
+        snapshots = cdx_api.snapshots()
+
+        for snapshot in snapshots:
+            if len(cdx_print) == 0:
+                click.echo(snapshot)
+            else:
+                output_string = ""
+                if any(key in cdx_print for key in ("urlkey", "url-key", "url_key")):
+                    output_string = output_string + snapshot.urlkey + " "
+                if any(key in cdx_print for key in ("timestamp", "time-stamp", "time_stamp")):
+                    output_string = output_string + snapshot.timestamp + " "
+                if "original" in cdx_print:
+                    output_string = output_string + snapshot.original + " "
+                if any(key in cdx_print for key in ("mimetype", "mime-type", "mime_type")):
+                    output_string = output_string + snapshot.mimetype + " "
+                if any(key in cdx_print for key in ("statuscode", "status-code", "status_code")):
+                    output_string = output_string + snapshot.statuscode + " "
+                if "digest" in cdx_print:
+                    output_string = output_string + snapshot.digest + " "
+                if "length" in cdx_print:
+                    output_string = output_string + snapshot.length + " "
+                if any(key in cdx_print for key in ("archiveurl", "archive-url", "archive_url")):
+                    output_string = output_string + snapshot.archive_url + " "
+                click.echo(output_string)
+
+
+if __name__ == "__main__":
+    main()
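The command defined above can be exercised in-process with click's test runner instead of a shell; --version is the one flag that needs no network (a sketch assuming the package is installed):

```python
from click.testing import CliRunner

from waybackpy.cli import main

# CliRunner invokes the click command directly and captures its output.
runner = CliRunner()
result = runner.invoke(main, ["--version"])
print(result.output)  # e.g. "waybackpy version 3.0.0"
```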
waybackpy/exceptions.py
@@ -1,38 +1,38 @@
 # -*- coding: utf-8 -*-
 """
 waybackpy.exceptions
 ~~~~~~~~~~~~~~~~~~~
 This module contains the set of Waybackpy's exceptions.
 """
 
-class TooManyArchivingRequests(Exception):
-
-    """Error when a single url reqeusted for archiving too many times in a short timespam.
-    Wayback machine doesn't supports archivng any url too many times in a short period of time.
-    """
+class WaybackError(Exception):
+    """
+    Raised when Waybackpy can not return what you asked for.
+    1) Wayback Machine API Service is unreachable/down.
+    2) You passed illegal arguments.
+    """
 
-class ArchivingNotAllowed(Exception):
-
-    """Files like robots.txt are set to deny robot archiving.
-    Wayback machine respects these file, will not archive.
-    """
+class RedirectSaveError(WaybackError):
+    """
+    Raised when the original URL is redirected and the
+    redirect URL is archived but not the original URL.
+    """
 
-class PageNotSaved(Exception):
-    """
-    When unable to save a webpage.
-    """
+class URLError(Exception):
+    """
+    Raised when malformed URLs are passed as arguments.
+    """
 
-class ArchiveNotFound(Exception):
-    """
-    When a page was never archived but client asks for old archive.
-    """
+class MaximumRetriesExceeded(WaybackError):
+    """
+    MaximumRetriesExceeded
+    """
 
-class UrlNotFound(Exception):
-    """
-    Raised when 404 UrlNotFound.
-    """
-
-class BadGateWay(Exception):
-    """
-    Raised when 502 bad gateway.
-    """
-
-class InvalidUrl(Exception):
-    """
-    Raised when url doesn't follow the standard url format.
-    """
+class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
+    """
+    MaximumSaveRetriesExceeded
+    """
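Because MaximumSaveRetriesExceeded subclasses MaximumRetriesExceeded, which in turn subclasses WaybackError, a single except WaybackError clause catches all of them (a small sketch):

```python
from waybackpy.exceptions import MaximumSaveRetriesExceeded, WaybackError

try:
    # Simulate the error save() raises after exhausting max_tries.
    raise MaximumSaveRetriesExceeded("gave up after 8 tries")
except WaybackError as error:  # also matches the two retry subclasses
    print(type(error).__name__, "-", error)
```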
waybackpy/save_api.py (new file, 131 lines)
@@ -0,0 +1,131 @@
+import re
+import time
+import requests
+
+from datetime import datetime
+from urllib3.util.retry import Retry
+from requests.adapters import HTTPAdapter
+
+from .utils import DEFAULT_USER_AGENT
+from .exceptions import MaximumSaveRetriesExceeded
+
+
+class WaybackMachineSaveAPI:
+
+    """
+    WaybackMachineSaveAPI class provides an interface for saving URLs on the
+    Wayback Machine.
+    """
+
+    def __init__(self, url, user_agent=DEFAULT_USER_AGENT, max_tries=8):
+        self.url = str(url).strip().replace(" ", "%20")
+        self.request_url = "https://web.archive.org/save/" + self.url
+        self.user_agent = user_agent
+        self.request_headers = {"User-Agent": self.user_agent}
+        self.max_tries = max_tries
+        self.total_save_retries = 5
+        self.backoff_factor = 0.5
+        self.status_forcelist = [500, 502, 503, 504]
+        self._archive_url = None
+        self.instance_birth_time = datetime.utcnow()
+
+    @property
+    def archive_url(self):
+
+        if self._archive_url:
+            return self._archive_url
+        else:
+            return self.save()
+
+    def get_save_request_headers(self):
+
+        session = requests.Session()
+        retries = Retry(
+            total=self.total_save_retries,
+            backoff_factor=self.backoff_factor,
+            status_forcelist=self.status_forcelist,
+        )
+        session.mount("https://", HTTPAdapter(max_retries=retries))
+        self.response = session.get(self.request_url, headers=self.request_headers)
+        self.headers = self.response.headers
+        self.status_code = self.response.status_code
+        self.response_url = self.response.url
+
+    def archive_url_parser(self):
+
+        regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
+        match = re.search(regex1, str(self.headers))
+        if match:
+            return "https://web.archive.org" + match.group(1)
+
+        regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
+        match = re.search(regex2, str(self.headers))
+        if match:
+            return "https://" + match.group(1)
+
+        regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
+        match = re.search(regex3, str(self.headers))
+        if match:
+            return "https://" + match.group(1)
+
+        if self.response_url:
+            self.response_url = self.response_url.strip()
+            if "web.archive.org/web" in self.response_url:
+                regex = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$"
+                match = re.search(regex, self.response_url)
+                if match:
+                    return "https://" + match.group(0)
+
+    def sleep(self, tries):
+
+        sleep_seconds = 5
+        if tries % 3 == 0:
+            sleep_seconds = 10
+        time.sleep(sleep_seconds)
+
+    def timestamp(self):
+        m = re.search(
+            r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
+        )
+        string_timestamp = m.group(1)
+        timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
+
+        timestamp_unixtime = time.mktime(timestamp.timetuple())
+        instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple())
+
+        if timestamp_unixtime < instance_birth_time_unixtime:
+            self.cached_save = True
+        else:
+            self.cached_save = False
+
+        return timestamp
+
+    def save(self):
+
+        saved_archive = None
+        tries = 0
+
+        while True:
+
+            tries += 1
+
+            if tries >= self.max_tries:
+                raise MaximumSaveRetriesExceeded(
+                    "Tried %s times but failed to save and return the archive for %s.\nResponse URL:\n%s \nResponse Header:\n%s\n"
+                    % (str(tries), self.url, self.response_url, str(self.headers)),
+                )
+
+            if not saved_archive:
+
+                if tries > 1:
+                    self.sleep(tries)
+
+                self.get_save_request_headers()
+                saved_archive = self.archive_url_parser()
+
+                if not saved_archive:
+                    continue
+                else:
+                    self._archive_url = saved_archive
+                    self.timestamp()
+                    return saved_archive
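A usage sketch for the class above (live SavePageNow requests; the printed values depend on the Wayback Machine). save() polls up to max_tries times, sleeping between attempts, and timestamp() sets cached_save by comparing the archive's timestamp with the instance creation time:

```python
from waybackpy import WaybackMachineSaveAPI

save_api = WaybackMachineSaveAPI(
    "https://example.com",
    user_agent="my-user-agent",
    max_tries=8,
)

archive_url = save_api.save()  # network call; may take several attempts
print(archive_url)             # e.g. https://web.archive.org/web/<14-digit-ts>/https://example.com/
print(save_api.cached_save)    # True if Wayback returned an older capture
print(save_api.timestamp())    # datetime parsed out of the archive URL
```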
waybackpy/utils.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+import requests
+from .__version__ import __version__
+
+DEFAULT_USER_AGENT = "waybackpy %s - https://github.com/akamhy/waybackpy" % __version__
+
+
+def latest_version(package_name, headers):
+    request_url = "https://pypi.org/pypi/" + package_name + "/json"
+    response = requests.get(request_url, headers=headers)
+    data = response.json()
+    return data["info"]["version"]
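latest_version() simply reads info.version from PyPI's JSON endpoint; a short sketch (makes a live request to pypi.org):

```python
from waybackpy.utils import DEFAULT_USER_AGENT, latest_version

# Queries https://pypi.org/pypi/waybackpy/json and returns data["info"]["version"].
print(latest_version("waybackpy", headers={"User-Agent": DEFAULT_USER_AGENT}))
```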
waybackpy/wrapper.py
@@ -1,88 +1,117 @@
-# -*- coding: utf-8 -*-
-import json
-from datetime import datetime
-from waybackpy.exceptions import TooManyArchivingRequests, ArchivingNotAllowed, PageNotSaved, ArchiveNotFound, UrlNotFound, BadGateWay, InvalidUrl
-try:
-    from urllib.request import Request, urlopen
-    from urllib.error import HTTPError
-except ImportError:
-    from urllib2 import Request, urlopen, HTTPError
+from .save_api import WaybackMachineSaveAPI
+from .availability_api import WaybackMachineAvailabilityAPI
+from .cdx_api import WaybackMachineCDXServerAPI
+from .utils import DEFAULT_USER_AGENT
+from .exceptions import WaybackError
+from datetime import datetime, timedelta
 
-default_UA = "waybackpy python package"
+class Url:
+    def __init__(self, url, user_agent=DEFAULT_USER_AGENT):
+        self.url = url
+        self.user_agent = str(user_agent)
+        self.archive_url = None
+        self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
+            self.url, user_agent=self.user_agent
+        )
 
-def clean_url(url):
-    return str(url).strip().replace(" ","_")
+    def __str__(self):
+        if not self.archive_url:
+            self.newest()
+        return self.archive_url
 
-def save(url,UA=default_UA):
-    base_save_url = "https://web.archive.org/save/"
-    request_url = (base_save_url + clean_url(url))
-    hdr = { 'User-Agent' : '%s' % UA } #nosec
-    req = Request(request_url, headers=hdr) #nosec
-    if "." not in url:
-        raise InvalidUrl("'%s' is not a vaild url." % url)
-    try:
-        response = urlopen(req) #nosec
-    except HTTPError as e:
-        if e.code == 502:
-            raise BadGateWay(e)
-        elif e.code == 429:
-            raise TooManyArchivingRequests(e)
-        elif e.code == 404:
-            raise UrlNotFound(e)
-        else:
-            raise PageNotSaved(e)
+    def __len__(self):
+        td_max = timedelta(
+            days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
+        )
 
-    header = response.headers
-    if "exclusion.robots.policy" in str(header):
-        raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." % (url))
-    archive_id = header['Content-Location']
-    archived_url = "https://web.archive.org" + archive_id
-    return archived_url
+        if not self.timestamp:
+            self.oldest()
 
-def get(url,encoding=None,UA=default_UA):
-    hdr = { 'User-Agent' : '%s' % UA }
-    request_url = clean_url(url)
-    req = Request(request_url, headers=hdr) #nosec
-    resp=urlopen(req) #nosec
-    if encoding is None:
-        try:
-            encoding= resp.headers['content-type'].split('charset=')[-1]
-        except AttributeError:
-            encoding = "UTF-8"
-    return resp.read().decode(encoding)
+        if self.timestamp == datetime.max:
+            return td_max.days
 
-def wayback_timestamp(year,month,day,hour,minute):
-    year = str(year)
-    month = str(month).zfill(2)
-    day = str(day).zfill(2)
-    hour = str(hour).zfill(2)
-    minute = str(minute).zfill(2)
-    return (year+month+day+hour+minute)
+        return (datetime.utcnow() - self.timestamp).days
 
-def near(
-    url,
-    year=datetime.utcnow().strftime('%Y'),
-    month=datetime.utcnow().strftime('%m'),
-    day=datetime.utcnow().strftime('%d'),
-    hour=datetime.utcnow().strftime('%H'),
-    minute=datetime.utcnow().strftime('%M'),
-    UA=default_UA,
-):
-    timestamp = wayback_timestamp(year,month,day,hour,minute)
-    request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (clean_url(url), str(timestamp))
-    hdr = { 'User-Agent' : '%s' % UA }
-    req = Request(request_url, headers=hdr) # nosec
-    response = urlopen(req) #nosec
-    data = json.loads(response.read().decode("UTF-8"))
-    if not data["archived_snapshots"]:
-        raise ArchiveNotFound("'%s' is not yet archived." % url)
-
-    archive_url = (data["archived_snapshots"]["closest"]["url"])
-    return archive_url
+    def save(self):
+        self.wayback_machine_save_api = WaybackMachineSaveAPI(
+            self.url, user_agent=self.user_agent
+        )
+        self.archive_url = self.wayback_machine_save_api.archive_url
+        self.timestamp = self.wayback_machine_save_api.timestamp()
+        self.headers = self.wayback_machine_save_api.headers
+        return self
+
+    def near(
+        self,
+        year=None,
+        month=None,
+        day=None,
+        hour=None,
+        minute=None,
+        unix_timestamp=None,
+    ):
+        self.wayback_machine_availability_api.near(
+            year=year,
+            month=month,
+            day=day,
+            hour=hour,
+            minute=minute,
+            unix_timestamp=unix_timestamp,
+        )
+        self.set_availability_api_attrs()
+        return self
 
-def oldest(url,UA=default_UA,year=1994):
-    return near(url,year=year,UA=UA)
+    def oldest(self):
+        self.wayback_machine_availability_api.oldest()
+        self.set_availability_api_attrs()
+        return self
 
-def newest(url,UA=default_UA):
-    return near(url,UA=UA)
+    def newest(self):
+        self.wayback_machine_availability_api.newest()
+        self.set_availability_api_attrs()
+        return self
+
+    def set_availability_api_attrs(self):
+        self.archive_url = self.wayback_machine_availability_api.archive_url
+        self.JSON = self.wayback_machine_availability_api.JSON
+        self.timestamp = self.wayback_machine_availability_api.timestamp()
+
+    def total_archives(self, start_timestamp=None, end_timestamp=None):
+        cdx = WaybackMachineCDXServerAPI(
+            self.url,
+            user_agent=self.user_agent,
+            start_timestamp=start_timestamp,
+            end_timestamp=end_timestamp,
+        )
+
+        count = 0
+        for _ in cdx.snapshots():
+            count = count + 1
+        return count
+
+    def known_urls(
+        self,
+        subdomain=False,
+        host=False,
+        start_timestamp=None,
+        end_timestamp=None,
+        match_type="prefix",
+    ):
+        if subdomain:
+            match_type = "domain"
+        if host:
+            match_type = "host"
+
+        cdx = WaybackMachineCDXServerAPI(
+            self.url,
+            user_agent=self.user_agent,
+            start_timestamp=start_timestamp,
+            end_timestamp=end_timestamp,
+            match_type=match_type,
+            collapses=["urlkey"],
+        )
+
+        for snapshot in cdx.snapshots():
+            yield (snapshot.original)
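A usage sketch for the Url wrapper above, which delegates to the three API classes (assumes the package is installed; oldest() and known_urls() make live API calls):

```python
from waybackpy import Url

wayback = Url("https://example.com", user_agent="my-user-agent")

oldest = wayback.oldest()   # Availability API under the hood; returns self
print(oldest.archive_url)

print(len(wayback))         # days since the fetched capture, per __len__

for url in wayback.known_urls():  # CDX API, collapsed on urlkey
    print(url)
```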