Compare commits
107 Commits
Author | SHA1 | Date | |
---|---|---|---|
77effcf649 | |||
7272ef45a0 | |||
56116551ac | |||
4dcda94cb0 | |||
09f59b0182 | |||
ed24184b99 | |||
56bef064b1 | |||
44bb2cf5e4 | |||
e231228721 | |||
b8b2d6dfa9 | |||
3eca6294df | |||
eb037a0284 | |||
a01821f20b | |||
b21036f8df | |||
b43bacb7ac | |||
f7313b255a | |||
7457e1c793 | |||
f7493d823f | |||
7fa7b59ce3 | |||
78a608db50 | |||
93f7dfdaf9 | |||
83c6f256c9 | |||
dee9105794 | |||
3bfc3b46d0 | |||
553f150bee | |||
b3a7e714a5 | |||
cd9841713c | |||
1ea9548d46 | |||
be7642c837 | |||
a418a4e464 | |||
aec035ef1e | |||
6d37993ab9 | |||
72b80ca44e | |||
c10aa9279c | |||
68d809a7d6 | |||
4ad09a419b | |||
ddc6620f09 | |||
4066a65678 | |||
8e46a9ba7a | |||
a5a98b9b00 | |||
a721ab7d6c | |||
7db27ae5e1 | |||
8fd4462025 | |||
c458a15820 | |||
bae3412bee | |||
94cb08bb37 | |||
af888db13e | |||
d24f2408ee | |||
ddd2274015 | |||
99abdb7c67 | |||
f3bb9a8540 | |||
bb94e0d1c5 | |||
1a78d88be2 | |||
3ec61758b3 | |||
83c962166d | |||
e87dee3bdf | |||
b27bfff15a | |||
970fc1cd08 | |||
65391bf14b | |||
8ab116f276 | |||
6f82041ec9 | |||
11059c960e | |||
eee1b8eba1 | |||
f7de8f5575 | |||
3fa0c32064 | |||
aa1e3b8825 | |||
58d2d585c8 | |||
e8efed2e2f | |||
49089b7321 | |||
55d8687566 | |||
0fa28527af | |||
68259fd2d9 | |||
e7086a89d3 | |||
e39467227c | |||
ba840404cf | |||
8fbd2d9e55 | |||
eebf6043de | |||
3d3b09d6d8 | |||
ef15b5863c | |||
256c0cdb6b | |||
12c72a8294 | |||
0ad27f5ecc | |||
700b60b5f8 | |||
11032596c8 | |||
9727f92168 | |||
d2893fec13 | |||
f1353b2129 | |||
c76a95ef90 | |||
62d88359ce | |||
9942c474c9 | |||
dfb736e794 | |||
84d1766917 | |||
9d3cdfafb3 | |||
20a16bfa45 | |||
f2112c73f6 | |||
9860527d96 | |||
9ac1e877c8 | |||
f881705d00 | |||
f015c3f4f3 | |||
42ac399362 | |||
e9d010c793 | |||
58a6409528 | |||
7ca2029158 | |||
80331833f2 | |||
5e3d3a815f | |||
6182a18cf4 | |||
9bca750310 |
31
.github/workflows/python-publish.yml
vendored
Normal file
31
.github/workflows/python-publish.yml
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
# This workflow will upload a Python Package using Twine when a release is created
|
||||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
||||
|
||||
name: Upload Python Package
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [created]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.x'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install setuptools wheel twine
|
||||
- name: Build and publish
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||
run: |
|
||||
python setup.py sdist bdist_wheel
|
||||
twine upload dist/*
|
23
.travis.yml
23
.travis.yml
@ -1,14 +1,19 @@
|
||||
language: python
|
||||
python:
|
||||
- "2.7"
|
||||
- "3.6"
|
||||
- "3.8"
|
||||
os: linux
|
||||
dist: xenial
|
||||
cache: pip
|
||||
install:
|
||||
- pip install pytest
|
||||
before_script:
|
||||
cd tests
|
||||
python:
|
||||
- 2.7
|
||||
- 3.6
|
||||
- 3.8
|
||||
before_install:
|
||||
- python --version
|
||||
- pip install -U pip
|
||||
- pip install -U pytest
|
||||
- pip install codecov
|
||||
- pip install pytest pytest-cov
|
||||
script:
|
||||
- pytest test_1.py
|
||||
- cd tests
|
||||
- pytest --cov=../waybackpy
|
||||
after_success:
|
||||
- if [[ $TRAVIS_PYTHON_VERSION == 3.8 ]]; then python -m codecov; fi
|
||||
|
322
README.md
322
README.md
@ -1,5 +1,6 @@
|
||||
# waybackpy
|
||||
[](https://travis-ci.org/akamhy/waybackpy)
|
||||
|
||||
[](https://travis-ci.org/akamhy/waybackpy)
|
||||
[](https://pypistats.org/packages/waybackpy)
|
||||
[](https://github.com/akamhy/waybackpy/releases)
|
||||
[](https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade)
|
||||
@ -7,170 +8,281 @@
|
||||
[](https://codeclimate.com/github/akamhy/waybackpy/maintainability)
|
||||
[](https://www.codefactor.io/repository/github/akamhy/waybackpy)
|
||||
[](https://www.python.org/)
|
||||

|
||||

|
||||

|
||||
[](https://github.com/akamhy/waybackpy/graphs/commit-activity)
|
||||
|
||||
[](https://codecov.io/gh/akamhy/waybackpy)
|
||||

|
||||

|
||||
|
||||
|
||||

|
||||

|
||||
|
||||
The waybackpy is a python wrapper for [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine).
|
||||
Waybackpy is a Python library that interfaces with the [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine) API. Archive pages and retrieve archived pages easily.
|
||||
|
||||
Table of contents
|
||||
=================
|
||||
<!--ts-->
|
||||
|
||||
* [Installation](https://github.com/akamhy/waybackpy#installation)
|
||||
* [Installation](#installation)
|
||||
|
||||
* [Usage](https://github.com/akamhy/waybackpy#usage)
|
||||
* [Saving an url using save()](https://github.com/akamhy/waybackpy#capturing-aka-saving-an-url-using-save)
|
||||
* [Receiving the oldest archive for an URL Using oldest()](https://github.com/akamhy/waybackpy#receiving-the-oldest-archive-for-an-url-using-oldest)
|
||||
* [Receiving the recent most/newest archive for an URL using newest()](https://github.com/akamhy/waybackpy#receiving-the-newest-archive-for-an-url-using-newest)
|
||||
* [Receiving archive close to a specified year, month, day, hour, and minute using near()](https://github.com/akamhy/waybackpy#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
|
||||
* [Get the content of webpage using get()](https://github.com/akamhy/waybackpy#get-the-content-of-webpage-using-get)
|
||||
* [Count total archives for an URL using total_archives()](https://github.com/akamhy/waybackpy#count-total-archives-for-an-url-using-total_archives)
|
||||
* [Usage](#usage)
|
||||
* [As a python package](#as-a-python-package)
|
||||
* [Saving an url using save()](#capturing-aka-saving-an-url-using-save)
|
||||
* [Receiving the oldest archive for an URL Using oldest()](#receiving-the-oldest-archive-for-an-url-using-oldest)
|
||||
* [Receiving the recent most/newest archive for an URL using newest()](#receiving-the-newest-archive-for-an-url-using-newest)
|
||||
* [Receiving archive close to a specified year, month, day, hour, and minute using near()](#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
|
||||
* [Get the content of webpage using get()](#get-the-content-of-webpage-using-get)
|
||||
* [Count total archives for an URL using total_archives()](#count-total-archives-for-an-url-using-total_archives)
|
||||
* [With CLI](#with-the-cli)
|
||||
* [Save](#save)
|
||||
* [Oldest archive](#oldest-archive)
|
||||
* [Newest archive](#newest-archive)
|
||||
* [Total archives](#total-number-of-archives)
|
||||
* [Archive near a time](#archive-near-time)
|
||||
* [Get the source code](#get-the-source-code)
|
||||
|
||||
* [Tests](#tests)
|
||||
|
||||
* [Tests](https://github.com/akamhy/waybackpy#tests)
|
||||
* [Dependency](#dependency)
|
||||
|
||||
* [Dependency](https://github.com/akamhy/waybackpy#dependency)
|
||||
|
||||
* [License](https://github.com/akamhy/waybackpy#license)
|
||||
* [License](#license)
|
||||
|
||||
<!--te-->
|
||||
|
||||
## Installation
|
||||
Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)):
|
||||
|
||||
**pip install waybackpy**
|
||||
|
||||
|
||||
```bash
|
||||
pip install waybackpy
|
||||
```
|
||||
or direct from this repository using git.
|
||||
```bash
|
||||
pip install git+https://github.com/akamhy/waybackpy.git
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
#### Capturing aka Saving an url Using save()
|
||||
### As a python package
|
||||
|
||||
```diff
|
||||
+ waybackpy.save(url, UA=user_agent)
|
||||
```
|
||||
> url is mandatory. UA is not, but highly recommended.
|
||||
#### Capturing aka Saving an url using save()
|
||||
```python
|
||||
import waybackpy
|
||||
# Capturing a new archive on Wayback machine.
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
archived_url = waybackpy.save("https://github.com/akamhy/waybackpy", UA = "Any-User-Agent")
|
||||
print(archived_url)
|
||||
|
||||
new_archive_url = waybackpy.Url(
|
||||
|
||||
url = "https://en.wikipedia.org/wiki/Multivariable_calculus",
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
).save()
|
||||
|
||||
print(new_archive_url)
|
||||
```
|
||||
This should print something similar to the following archived URL:
|
||||
|
||||
<https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy>
|
||||
|
||||
#### Receiving the oldest archive for an URL Using oldest()
|
||||
|
||||
```diff
|
||||
+ waybackpy.oldest(url, UA=user_agent)
|
||||
```bash
|
||||
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
|
||||
```
|
||||
> url is mandatory. UA is not, but highly recommended.
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPySaveExample></sub>
|
||||
|
||||
|
||||
|
||||
#### Receiving the oldest archive for an URL using oldest()
|
||||
```python
|
||||
import waybackpy
|
||||
# retrieving the oldest archive on Wayback machine.
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
oldest_archive = waybackpy.oldest("https://www.google.com/", UA = "Any-User-Agent")
|
||||
print(oldest_archive)
|
||||
```
|
||||
This returns the oldest available archive for <https://google.com>.
|
||||
|
||||
<http://web.archive.org/web/19981111184551/http://google.com:80/>
|
||||
oldest_archive_url = waybackpy.Url(
|
||||
|
||||
"https://www.google.com/",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
).oldest()
|
||||
|
||||
print(oldest_archive_url)
|
||||
```
|
||||
```bash
|
||||
http://web.archive.org/web/19981111184551/http://google.com:80/
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyOldestExample></sub>
|
||||
|
||||
|
||||
|
||||
#### Receiving the newest archive for an URL using newest()
|
||||
|
||||
```diff
|
||||
+ waybackpy.newest(url, UA=user_agent)
|
||||
```
|
||||
> url is mandatory. UA is not, but highly recommended.
|
||||
|
||||
|
||||
```python
|
||||
import waybackpy
|
||||
# retrieving the newest archive on Wayback machine.
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
newest_archive = waybackpy.newest("https://www.microsoft.com/en-us", UA = "Any-User-Agent")
|
||||
print(newest_archive)
|
||||
```
|
||||
This returns the newest available archive for <https://www.microsoft.com/en-us>, something just like this:
|
||||
|
||||
<http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/>
|
||||
newest_archive_url = waybackpy.Url(
|
||||
|
||||
"https://www.facebook.com/",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
|
||||
|
||||
).newest()
|
||||
|
||||
print(newest_archive_url)
|
||||
```
|
||||
```bash
|
||||
https://web.archive.org/web/20200714013225/https://www.facebook.com/
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyNewestExample></sub>
|
||||
|
||||
|
||||
|
||||
#### Receiving archive close to a specified year, month, day, hour, and minute using near()
|
||||
|
||||
```diff
|
||||
+ waybackpy.near(url, year=2020, month=1, day=1, hour=1, minute=1, UA=user_agent)
|
||||
```
|
||||
> url is mandatory. year, month, day, hour and minute are optional arguments. UA is not mandatory, but highly recommended.
|
||||
|
||||
|
||||
```python
|
||||
import waybackpy
|
||||
# retrieving the closest archive from a specified year.
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
# supported arguments are year, month, day, hour and minute
|
||||
archive_near_year = waybackpy.near("https://www.facebook.com/", year=2010, UA ="Any-User-Agent")
|
||||
print(archive_near_year)
|
||||
from waybackpy import Url
|
||||
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
|
||||
github_url = "https://github.com/"
|
||||
|
||||
|
||||
github_wayback_obj = Url(github_url, user_agent)
|
||||
|
||||
# Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
|
||||
```
|
||||
returns : <http://web.archive.org/web/20100504071154/http://www.facebook.com/>
|
||||
```python
|
||||
github_archive_near_2010 = github_wayback_obj.near(year=2010)
|
||||
print(github_archive_near_2010)
|
||||
```
|
||||
```bash
|
||||
https://web.archive.org/web/20100719134402/http://github.com/
|
||||
```
|
||||
```python
|
||||
github_archive_near_2011_may = github_wayback_obj.near(year=2011, month=5)
|
||||
print(github_archive_near_2011_may)
|
||||
```
|
||||
```bash
|
||||
https://web.archive.org/web/20110519185447/https://github.com/
|
||||
```
|
||||
```python
|
||||
github_archive_near_2015_january_26 = github_wayback_obj.near(
|
||||
year=2015, month=1, day=26
|
||||
)
|
||||
print(github_archive_near_2015_january_26)
|
||||
```
|
||||
```bash
|
||||
https://web.archive.org/web/20150127031159/https://github.com
|
||||
```
|
||||
```python
|
||||
github_archive_near_2018_4_july_9_2_am = github_wayback_obj.near(
|
||||
year=2018, month=7, day=4, hour = 9, minute = 2
|
||||
)
|
||||
print(github_archive_near_2018_4_july_9_2_am)
|
||||
```
|
||||
```bash
|
||||
https://web.archive.org/web/20180704090245/https://github.com/
|
||||
|
||||
```waybackpy.near("https://www.facebook.com/", year=2010, month=1, UA ="Any-User-Agent")``` returns: <http://web.archive.org/web/20101111173430/http://www.facebook.com//>
|
||||
```
|
||||
|
||||
<sub>The library doesn't support seconds yet. You are encouraged to create a PR ;)</sub>
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyNearExample></sub>
|
||||
|
||||
```waybackpy.near("https://www.oracle.com/index.html", year=2019, month=1, day=5, UA ="Any-User-Agent")``` returns: <http://web.archive.org/web/20190105054437/https://www.oracle.com/index.html>
|
||||
> Please note that if you only specify the year, the current month and day are used as the default values for month and day respectively. Specifying only the year does not return the archive closest to January of that year; it returns the archive closest to the current month (the month in which you are running the package) of that year. For example, if you are using it in July 2018 and call ```waybackpy.near("https://www.facebook.com/", year=2011, UA ="Any-User-Agent")```, you will be returned the archive nearest to July 2011 and not January 2011. You need to specify the month "1" for January.
|
||||
|
||||
> Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
|
||||
|
||||
#### Get the content of webpage using get()
|
||||
|
||||
```diff
|
||||
+ waybackpy.get(url, encoding="UTF-8", UA=user_agent)
|
||||
```
|
||||
> url is mandatory. UA is not, but highly recommended. encoding is detected automatically, don't specify unless necessary.
|
||||
|
||||
```python
|
||||
from waybackpy import get
|
||||
# retrieving the webpage from any url including the archived urls. Don't need to import other libraries :)
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
# supported arguments are url, encoding and UA
|
||||
webpage = get("https://example.com/", UA="User-Agent")
|
||||
print(webpage)
|
||||
import waybackpy
|
||||
|
||||
google_url = "https://www.google.com/"
|
||||
|
||||
User_Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
|
||||
|
||||
waybackpy_url_object = waybackpy.Url(google_url, User_Agent)
|
||||
|
||||
|
||||
# If no argument is passed in get(), it gets the source of the Url used to create the object.
|
||||
current_google_url_source = waybackpy_url_object.get()
|
||||
print(current_google_url_source)
|
||||
|
||||
|
||||
# The following chunk of code will force a new archive of google.com and get the source of the archived page.
|
||||
# waybackpy_url_object.save() type is string.
|
||||
google_newest_archive_source = waybackpy_url_object.get(
|
||||
waybackpy_url_object.save()
|
||||
)
|
||||
print(google_newest_archive_source)
|
||||
|
||||
|
||||
# waybackpy_url_object.oldest() type is str, it's oldest archive of google.com
|
||||
google_oldest_archive_source = waybackpy_url_object.get(
|
||||
waybackpy_url_object.oldest()
|
||||
)
|
||||
print(google_oldest_archive_source)
|
||||
```
|
||||
> This should print the source code for <https://example.com/>.
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyGetExample#main.py></sub>
|
||||
|
||||
|
||||
#### Count total archives for an URL using total_archives()
|
||||
|
||||
```diff
|
||||
+ waybackpy.total_archives(url, UA=user_agent)
|
||||
```
|
||||
> url is mandatory. UA is not, but highly recommended.
|
||||
|
||||
```python
|
||||
from waybackpy import total_archives
|
||||
# retrieving the webpage from any url including the archived urls. Don't need to import other libraries :)
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
# supported arguments are url and UA
|
||||
count = total_archives("https://en.wikipedia.org/wiki/Python (programming language)", UA="User-Agent")
|
||||
print(count)
|
||||
import waybackpy
|
||||
|
||||
URL = "https://en.wikipedia.org/wiki/Python (programming language)"
|
||||
|
||||
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
|
||||
|
||||
archive_count = waybackpy.Url(
|
||||
url=URL,
|
||||
user_agent=UA
|
||||
).total_archives()
|
||||
|
||||
print(archive_count) # total_archives() returns an int
|
||||
```
|
||||
> This should print an integer (int), which is the number of total archives on archive.org
|
||||
```bash
|
||||
2440
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyTotalArchivesExample></sub>
|
||||
|
||||
### With the CLI
|
||||
|
||||
#### Save
|
||||
```bash
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/Social_media" --user_agent "my-unique-user-agent" --save
|
||||
https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_media
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashSave></sub>
|
||||
|
||||
#### Oldest archive
|
||||
```bash
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --oldest
|
||||
https://web.archive.org/web/20040803000845/http://en.wikipedia.org:80/wiki/SpaceX
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashOldest></sub>
|
||||
|
||||
#### Newest archive
|
||||
```bash
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/YouTube" --user_agent "my-unique-user-agent" --newest
|
||||
https://web.archive.org/web/20200606044708/https://en.wikipedia.org/wiki/YouTube
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashNewest></sub>
|
||||
|
||||
#### Total number of archives
|
||||
```bash
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/Linux_kernel" --user_agent "my-unique-user-agent" --total
|
||||
853
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashTotal></sub>
|
||||
|
||||
#### Archive near time
|
||||
```bash
|
||||
$ waybackpy --url facebook.com --user_agent "my-unique-user-agent" --near --year 2012 --month 5 --day 12
|
||||
https://web.archive.org/web/20120512142515/https://www.facebook.com/
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashNear></sub>
|
||||
|
||||
#### Get the source code
|
||||
```bash
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get url # Prints the source code of the url
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get oldest # Prints the source code of the oldest archive
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get newest # Prints the source code of the newest archive
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on wayback machine then print the source code of this archive.
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashGet></sub>
|
||||
|
||||
## Tests
|
||||
* [Here](https://github.com/akamhy/waybackpy/tree/master/tests)
|
||||
|
||||
|
||||
## Dependency
|
||||
* None, just python standard libraries (json, urllib and datetime). Both python 2 and 3 are supported :)
|
||||
* None, just python standard libraries (re, json, urllib, argparse and datetime). Both python 2 and 3 are supported :)
|
||||
|
||||
|
||||
## License
|
||||
|
||||
[MIT License](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
|
||||
|
565
index.rst
565
index.rst
@ -3,9 +3,361 @@ waybackpy
|
||||
|
||||
|Build Status| |Downloads| |Release| |Codacy Badge| |License: MIT|
|
||||
|Maintainability| |CodeFactor| |made-with-python| |pypi| |PyPI - Python
|
||||
Version| |Maintenance|
|
||||
Version| |Maintenance| |codecov| |image12| |contributions welcome|
|
||||
|
||||
.. |Build Status| image:: https://travis-ci.org/akamhy/waybackpy.svg?branch=master
|
||||
|Internet Archive| |Wayback Machine|
|
||||
|
||||
Waybackpy is a Python library that interfaces with the `Internet
|
||||
Archive <https://en.wikipedia.org/wiki/Internet_Archive>`__'s `Wayback
|
||||
Machine <https://en.wikipedia.org/wiki/Wayback_Machine>`__ API. Archive
|
||||
pages and retrieve archived pages easily.
|
||||
|
||||
Table of contents
|
||||
=================
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<!--ts-->
|
||||
|
||||
- `Installation <#installation>`__
|
||||
|
||||
- `Usage <#usage>`__
|
||||
- `As a python package <#as-a-python-package>`__
|
||||
|
||||
- `Saving an url using
|
||||
save() <#capturing-aka-saving-an-url-using-save>`__
|
||||
- `Receiving the oldest archive for an URL Using
|
||||
oldest() <#receiving-the-oldest-archive-for-an-url-using-oldest>`__
|
||||
- `Receiving the recent most/newest archive for an URL using
|
||||
newest() <#receiving-the-newest-archive-for-an-url-using-newest>`__
|
||||
- `Receiving archive close to a specified year, month, day, hour,
|
||||
and minute using
|
||||
near() <#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near>`__
|
||||
- `Get the content of webpage using
|
||||
get() <#get-the-content-of-webpage-using-get>`__
|
||||
- `Count total archives for an URL using
|
||||
total\_archives() <#count-total-archives-for-an-url-using-total_archives>`__
|
||||
|
||||
- `With CLI <#with-the-cli>`__
|
||||
|
||||
- `Save <#save>`__
|
||||
- `Oldest archive <#oldest-archive>`__
|
||||
- `Newest archive <#newest-archive>`__
|
||||
- `Total archives <#total-number-of-archives>`__
|
||||
- `Archive near a time <#archive-near-time>`__
|
||||
- `Get the source code <#get-the-source-code>`__
|
||||
|
||||
- `Tests <#tests>`__
|
||||
|
||||
- `Dependency <#dependency>`__
|
||||
|
||||
- `License <#license>`__
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<!--te-->
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Using `pip <https://en.wikipedia.org/wiki/Pip_(package_manager)>`__:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
pip install waybackpy
|
||||
|
||||
or direct from this repository using git.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
pip install git+https://github.com/akamhy/waybackpy.git
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
As a python package
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Capturing aka Saving an url using save()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
new_archive_url = waybackpy.Url(
|
||||
|
||||
url = "https://en.wikipedia.org/wiki/Multivariable_calculus",
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
).save()
|
||||
|
||||
print(new_archive_url)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPySaveExample\
|
||||
|
||||
Receiving the oldest archive for an URL using oldest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
oldest_archive_url = waybackpy.Url(
|
||||
|
||||
"https://www.google.com/",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
).oldest()
|
||||
|
||||
print(oldest_archive_url)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
http://web.archive.org/web/19981111184551/http://google.com:80/
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyOldestExample\
|
||||
|
||||
Receiving the newest archive for an URL using newest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
newest_archive_url = waybackpy.Url(
|
||||
|
||||
"https://www.facebook.com/",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
|
||||
|
||||
).newest()
|
||||
|
||||
print(newest_archive_url)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20200714013225/https://www.facebook.com/
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyNewestExample\
|
||||
|
||||
Receiving archive close to a specified year, month, day, hour, and minute using near()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
from waybackpy import Url
|
||||
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
|
||||
github_url = "https://github.com/"
|
||||
|
||||
|
||||
github_wayback_obj = Url(github_url, user_agent)
|
||||
|
||||
# Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
|
||||
|
||||
.. code:: python
|
||||
|
||||
github_archive_near_2010 = github_wayback_obj.near(year=2010)
|
||||
print(github_archive_near_2010)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20100719134402/http://github.com/
|
||||
|
||||
.. code:: python
|
||||
|
||||
github_archive_near_2011_may = github_wayback_obj.near(year=2011, month=5)
|
||||
print(github_archive_near_2011_may)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20110519185447/https://github.com/
|
||||
|
||||
.. code:: python
|
||||
|
||||
github_archive_near_2015_january_26 = github_wayback_obj.near(
|
||||
year=2015, month=1, day=26
|
||||
)
|
||||
print(github_archive_near_2015_january_26)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20150127031159/https://github.com
|
||||
|
||||
.. code:: python
|
||||
|
||||
github_archive_near_2018_4_july_9_2_am = github_wayback_obj.near(
|
||||
year=2018, month=7, day=4, hour = 9, minute = 2
|
||||
)
|
||||
print(github_archive_near_2018_4_july_9_2_am)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20180704090245/https://github.com/
|
||||
|
||||
The library doesn't support seconds yet. You are encouraged to create a
|
||||
PR ;)
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyNearExample\
|
||||
|
||||
Get the content of webpage using get()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
google_url = "https://www.google.com/"
|
||||
|
||||
User_Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
|
||||
|
||||
waybackpy_url_object = waybackpy.Url(google_url, User_Agent)
|
||||
|
||||
|
||||
# If no argument is passed in get(), it gets the source of the Url used to create the object.
|
||||
current_google_url_source = waybackpy_url_object.get()
|
||||
print(current_google_url_source)
|
||||
|
||||
|
||||
# The following chunk of code will force a new archive of google.com and get the source of the archived page.
|
||||
# waybackpy_url_object.save() type is string.
|
||||
google_newest_archive_source = waybackpy_url_object.get(
|
||||
waybackpy_url_object.save()
|
||||
)
|
||||
print(google_newest_archive_source)
|
||||
|
||||
|
||||
# waybackpy_url_object.oldest() type is str, it's oldest archive of google.com
|
||||
google_oldest_archive_source = waybackpy_url_object.get(
|
||||
waybackpy_url_object.oldest()
|
||||
)
|
||||
print(google_oldest_archive_source)
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyGetExample#main.py\
|
||||
|
||||
Count total archives for an URL using total\_archives()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
URL = "https://en.wikipedia.org/wiki/Python (programming language)"
|
||||
|
||||
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
|
||||
|
||||
archive_count = waybackpy.Url(
|
||||
url=URL,
|
||||
user_agent=UA
|
||||
).total_archives()
|
||||
|
||||
print(archive_count) # total_archives() returns an int
|
||||
|
||||
.. code:: bash
|
||||
|
||||
2440
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyTotalArchivesExample\
|
||||
|
||||
With the CLI
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Save
|
||||
^^^^
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/Social_media" --user_agent "my-unique-user-agent" --save
|
||||
https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_media
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashSave\
|
||||
|
||||
Oldest archive
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --oldest
|
||||
https://web.archive.org/web/20040803000845/http://en.wikipedia.org:80/wiki/SpaceX
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashOldest\
|
||||
|
||||
Newest archive
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/YouTube" --user_agent "my-unique-user-agent" --newest
|
||||
https://web.archive.org/web/20200606044708/https://en.wikipedia.org/wiki/YouTube
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashNewest\
|
||||
|
||||
Total number of archives
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/Linux_kernel" --user_agent "my-unique-user-agent" --total
|
||||
853
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashTotal\
|
||||
|
||||
Archive near time
|
||||
^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ waybackpy --url facebook.com --user_agent "my-unique-user-agent" --near --year 2012 --month 5 --day 12
|
||||
https://web.archive.org/web/20120512142515/https://www.facebook.com/
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashNear\
|
||||
|
||||
Get the source code
|
||||
^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get url # Prints the source code of the url
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get oldest # Prints the source code of the oldest archive
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get newest # Prints the source code of the newest archive
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on wayback machine then print the source code of this archive.
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashGet\
|
||||
|
||||
Tests
|
||||
-----
|
||||
|
||||
- `Here <https://github.com/akamhy/waybackpy/tree/master/tests>`__
|
||||
|
||||
Dependency
|
||||
----------
|
||||
|
||||
- None, just python standard libraries (re, json, urllib, argparse and datetime).
|
||||
Both python 2 and 3 are supported :)
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
`MIT
|
||||
License <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
|
||||
|
||||
.. |Build Status| image:: https://img.shields.io/travis/akamhy/waybackpy.svg?label=Travis%20CI&logo=travis&style=flat-square
|
||||
:target: https://travis-ci.org/akamhy/waybackpy
|
||||
.. |Downloads| image:: https://img.shields.io/pypi/dm/waybackpy.svg
|
||||
:target: https://pypistats.org/packages/waybackpy
|
||||
@ -21,212 +373,13 @@ Version| |Maintenance|
|
||||
:target: https://www.codefactor.io/repository/github/akamhy/waybackpy
|
||||
.. |made-with-python| image:: https://img.shields.io/badge/Made%20with-Python-1f425f.svg
|
||||
:target: https://www.python.org/
|
||||
.. |pypi| image:: https://img.shields.io/pypi/v/wayback.svg
|
||||
.. |pypi| image:: https://img.shields.io/pypi/v/waybackpy.svg
|
||||
.. |PyPI - Python Version| image:: https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square
|
||||
.. |Maintenance| image:: https://img.shields.io/badge/Maintained%3F-yes-green.svg
|
||||
:target: https://github.com/akamhy/waybackpy/graphs/commit-activity
|
||||
|
||||
|Internet Archive| |Wayback Machine|
|
||||
|
||||
The waybackpy is a python wrapper for `Internet Archive`_\ ’s `Wayback
|
||||
Machine`_.
|
||||
|
||||
.. _Internet Archive: https://en.wikipedia.org/wiki/Internet_Archive
|
||||
.. _Wayback Machine: https://en.wikipedia.org/wiki/Wayback_Machine
|
||||
|
||||
.. |codecov| image:: https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg
|
||||
:target: https://codecov.io/gh/akamhy/waybackpy
|
||||
.. |image12| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
|
||||
.. |contributions welcome| image:: https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square
|
||||
.. |Internet Archive| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png
|
||||
.. |Wayback Machine| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
Using `pip`_:
|
||||
|
||||
**pip install waybackpy**
|
||||
|
||||
.. _pip: https://en.wikipedia.org/wiki/Pip_(package_manager)
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
Archiving aka Saving an url Using save()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: diff
|
||||
|
||||
+ waybackpy.save(url, UA=user_agent)
|
||||
|
||||
..
|
||||
|
||||
url is mandatory. UA is not, but highly recommended.
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# Capturing a new archive on Wayback machine.
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
archived_url = waybackpy.save("https://github.com/akamhy/waybackpy", UA = "Any-User-Agent")
|
||||
print(archived_url)
|
||||
|
||||
This should print something similar to the following archived URL:
|
||||
|
||||
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
|
||||
|
||||
Receiving the oldest archive for an URL Using oldest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: diff
|
||||
|
||||
+ waybackpy.oldest(url, UA=user_agent)
|
||||
|
||||
..
|
||||
|
||||
url is mandatory. UA is not, but highly recommended.
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# retrieving the oldest archive on Wayback machine.
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
oldest_archive = waybackpy.oldest("https://www.google.com/", UA = "Any-User-Agent")
|
||||
print(oldest_archive)
|
||||
|
||||
This returns the oldest available archive for https://google.com.
|
||||
|
||||
http://web.archive.org/web/19981111184551/http://google.com:80/
|
||||
|
||||
Receiving the newest archive for an URL using newest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: diff
|
||||
|
||||
+ waybackpy.newest(url, UA=user_agent)
|
||||
|
||||
..
|
||||
|
||||
url is mandatory. UA is not, but highly recommended.
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# retrieving the newest archive on Wayback machine.
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
newest_archive = waybackpy.newest("https://www.microsoft.com/en-us", UA = "Any-User-Agent")
|
||||
print(newest_archive)
|
||||
|
||||
This returns the newest available archive for
|
||||
https://www.microsoft.com/en-us, something just like this:
|
||||
|
||||
http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/
|
||||
|
||||
Receiving archive close to a specified year, month, day, hour, and minute using near()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: diff
|
||||
|
||||
+ waybackpy.near(url, year=2020, month=1, day=1, hour=1, minute=1, UA=user_agent)
|
||||
|
||||
..
|
||||
|
||||
url is mandatory. year, month, day, hour and minute are optional
|
||||
arguments. UA is not mandatory, but highly recommended.
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
# retrieving the closest archive from a specified year.
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
# supported arguments are year, month, day, hour and minute
|
||||
archive_near_year = waybackpy.near("https://www.facebook.com/", year=2010, UA ="Any-User-Agent")
|
||||
print(archive_near_year)
|
||||
|
||||
returns :
|
||||
http://web.archive.org/web/20100504071154/http://www.facebook.com/
|
||||
|
||||
``waybackpy.near("https://www.facebook.com/", year=2010, month=1, UA ="Any-User-Agent")``
|
||||
returns:
|
||||
http://web.archive.org/web/20101111173430/http://www.facebook.com//
|
||||
|
||||
``waybackpy.near("https://www.oracle.com/index.html", year=2019, month=1, day=5, UA ="Any-User-Agent")``
|
||||
returns:
|
||||
http://web.archive.org/web/20190105054437/https://www.oracle.com/index.html
|
||||
> Please note that if you only specify the year, the current month and
|
||||
day are default arguments for month and day respectively. Do not expect
|
||||
just putting the year parameter would return the archive closer to
|
||||
January but the current month you are using the package. If you are
|
||||
using it in July 2018 and let’s say you use
|
||||
``waybackpy.near("https://www.facebook.com/", year=2011, UA ="Any-User-Agent")``
|
||||
then you would be returned the nearest archive to July 2011 and not
|
||||
January 2011. You need to specify the month “1” for January.
|
||||
|
||||
Do not pad (don’t use zeros in the month, year, day, minute, and hour
|
||||
arguments). e.g. For January, set month = 1 and not month = 01.
|
||||
|
||||
Get the content of webpage using get()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: diff
|
||||
|
||||
+ waybackpy.get(url, encoding="UTF-8", UA=user_agent)
|
||||
|
||||
..
|
||||
|
||||
url is mandatory. UA is not, but highly recommended. encoding is
|
||||
detected automatically, don’t specify unless necessary.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from waybackpy import get
|
||||
# retrieving the webpage from any url including the archived urls. Don't need to import other libraries :)
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
# supported arguments are url, encoding and UA
|
||||
webpage = get("https://example.com/", UA="User-Agent")
|
||||
print(webpage)
|
||||
|
||||
..
|
||||
|
||||
This should print the source code for https://example.com/.
|
||||
|
||||
Count total archives for an URL using total_archives()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: diff
|
||||
|
||||
+ waybackpy.total_archives(url, UA=user_agent)
|
||||
|
||||
..
|
||||
|
||||
url is mandatory. UA is not, but highly recommended.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from waybackpy import total_archives
|
||||
# retrieving the total number of archives for a url. Don't need to import other libraries :)
|
||||
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
|
||||
# supported arguments are url and UA
|
||||
count = total_archives("https://en.wikipedia.org/wiki/Python (programming language)", UA="User-Agent")
|
||||
print(count)
|
||||
|
||||
..
|
||||
|
||||
This should print an integer (int), which is the number of total
|
||||
archives on archive.org
|
||||
|
||||
Tests
|
||||
-----
|
||||
|
||||
- `Here`_
|
||||
|
||||
Dependency
|
||||
----------
|
||||
|
||||
- None, just python standard libraries (json, urllib and datetime).
|
||||
Both python 2 and 3 are supported :)
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
`MIT License`_
|
||||
|
||||
.. _Here: https://github.com/akamhy/waybackpy/tree/master/tests
|
||||
.. _MIT License: https://github.com/akamhy/waybackpy/blob/master/LICENSE
|
||||
|
@ -1,3 +1,7 @@
|
||||
[metadata]
|
||||
description-file = README.md
|
||||
license_file = LICENSE
|
||||
|
||||
[flake8]
|
||||
max-line-length = 88
|
||||
extend-ignore = E203,W503
|
||||
|
9
setup.py
9
setup.py
@ -5,7 +5,7 @@ with open(os.path.join(os.path.dirname(__file__), 'README.md')) as f:
|
||||
long_description = f.read()
|
||||
|
||||
about = {}
|
||||
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'waybackpy', '__version__.py'), 'r', 'utf-8') as f:
|
||||
with open(os.path.join(os.path.dirname(__file__), 'waybackpy', '__version__.py')) as f:
|
||||
exec(f.read(), about)
|
||||
|
||||
setup(
|
||||
@ -19,7 +19,7 @@ setup(
|
||||
author = about['__author__'],
|
||||
author_email = about['__author_email__'],
|
||||
url = about['__url__'],
|
||||
download_url = 'https://github.com/akamhy/waybackpy/archive/v1.4.tar.gz',
|
||||
download_url = 'https://github.com/akamhy/waybackpy/archive/2.1.5.tar.gz',
|
||||
keywords = ['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
|
||||
install_requires=[],
|
||||
python_requires= ">=2.7",
|
||||
@ -42,6 +42,11 @@ setup(
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: Implementation :: CPython',
|
||||
],
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'waybackpy = waybackpy.cli:main'
|
||||
]
|
||||
},
|
||||
project_urls={
|
||||
'Documentation': 'https://waybackpy.readthedocs.io',
|
||||
'Source': 'https://github.com/akamhy/waybackpy',
|
||||
|
@ -1,98 +0,0 @@
|
||||
import sys
|
||||
sys.path.append("..")
|
||||
import waybackpy
|
||||
import pytest
|
||||
|
||||
|
||||
user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
|
||||
|
||||
def test_clean_url():
|
||||
test_url = " https://en.wikipedia.org/wiki/Network security "
|
||||
answer = "https://en.wikipedia.org/wiki/Network_security"
|
||||
test_result = waybackpy.clean_url(test_url)
|
||||
assert answer == test_result
|
||||
|
||||
def test_url_check():
|
||||
InvalidUrl = "http://wwwgooglecom/"
|
||||
with pytest.raises(Exception) as e_info:
|
||||
waybackpy.url_check(InvalidUrl)
|
||||
|
||||
def test_save():
|
||||
# Test for urls that exist and can be archived.
|
||||
url1="https://github.com/akamhy/waybackpy"
|
||||
archived_url1 = waybackpy.save(url1, UA=user_agent)
|
||||
assert url1 in archived_url1
|
||||
|
||||
# Test for urls that are incorrect.
|
||||
with pytest.raises(Exception) as e_info:
|
||||
url2 = "ha ha ha ha"
|
||||
waybackpy.save(url2, UA=user_agent)
|
||||
|
||||
# Test for urls not allowed to archive by robot.txt.
|
||||
with pytest.raises(Exception) as e_info:
|
||||
url3 = "http://www.archive.is/faq.html"
|
||||
waybackpy.save(url3, UA=user_agent)
|
||||
|
||||
# Non existent urls, test
|
||||
with pytest.raises(Exception) as e_info:
|
||||
url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
|
||||
archived_url4 = waybackpy.save(url4, UA=user_agent)
|
||||
|
||||
def test_near():
    """Check that near() returns archives close to the requested point in time.

    Each assertion accepts the requested unit or one of its neighbours,
    because the closest available snapshot may fall slightly before or after
    the requested time.
    """
    url = "google.com"
    archive_near_year = waybackpy.near(url, year=2010, UA=user_agent)
    assert "2010" in archive_near_year

    archive_near_month_year = waybackpy.near(url, year=2015, month=2, UA=user_agent)
    assert any(
        stamp in archive_near_month_year for stamp in ("201501", "201502", "201503")
    )

    archive_near_day_month_year = waybackpy.near(
        url, year=2006, month=11, day=15, UA=user_agent
    )
    # The day after the 15th is "20061116"; the previous literal "2006116"
    # was missing a digit and matched an unrelated 7-character substring.
    assert any(
        stamp in archive_near_day_month_year
        for stamp in ("20061114", "20061115", "20061116")
    )

    archive_near_hour_day_month_year = waybackpy.near(
        "www.python.org", year=2008, month=5, day=9, hour=15, UA=user_agent
    )
    assert any(
        stamp in archive_near_hour_day_month_year
        for stamp in ("2008050913", "2008050914", "2008050915")
    )

    # A URL that was never archived must raise instead of returning a link.
    with pytest.raises(Exception):
        NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
        waybackpy.near(NeverArchivedUrl, year=2010, UA=user_agent)
|
||||
|
||||
def test_oldest():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
archive_oldest = waybackpy.oldest(url, UA=user_agent)
|
||||
assert "20200504141153" in archive_oldest
|
||||
|
||||
def test_newest():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
archive_newest = waybackpy.newest(url, UA=user_agent)
|
||||
assert url in archive_newest
|
||||
|
||||
def test_get():
|
||||
oldest_google_archive = waybackpy.oldest("google.com", UA=user_agent)
|
||||
oldest_google_page_text = waybackpy.get(oldest_google_archive, UA=user_agent)
|
||||
assert "Welcome to Google" in oldest_google_page_text
|
||||
|
||||
def test_total_archives():
|
||||
|
||||
count1 = waybackpy.total_archives("https://en.wikipedia.org/wiki/Python (programming language)", UA=user_agent)
|
||||
assert count1 > 2000
|
||||
|
||||
count2 = waybackpy.total_archives("https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8", UA=user_agent)
|
||||
assert count2 == 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_clean_url()
|
||||
print(".")
|
||||
test_url_check()
|
||||
print(".")
|
||||
test_get()
|
||||
print(".")
|
||||
test_near()
|
||||
print(".")
|
||||
test_newest()
|
||||
print(".")
|
||||
test_save()
|
||||
print(".")
|
||||
test_oldest()
|
||||
print(".")
|
||||
test_total_archives()
|
||||
print(".")
|
97
tests/test_cli.py
Normal file
97
tests/test_cli.py
Normal file
@ -0,0 +1,97 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
import argparse
|
||||
|
||||
sys.path.append("..")
|
||||
import waybackpy.cli as cli # noqa: E402
|
||||
from waybackpy.wrapper import Url # noqa: E402
|
||||
from waybackpy.__version__ import __version__
|
||||
|
||||
codecov_python = False
|
||||
if sys.version_info > (3, 7):
|
||||
codecov_python = True
|
||||
|
||||
# Namespace(day=None, get=None, hour=None, minute=None, month=None, near=False,
|
||||
# newest=False, oldest=False, save=False, total=False, url=None, user_agent=None, version=False, year=None)
|
||||
|
||||
if codecov_python:
|
||||
def test_save():
|
||||
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=True, newest=False, near=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
|
||||
def test_oldest():
|
||||
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=True, save=False, newest=False, near=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
|
||||
def test_newest():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=True, near=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
|
||||
def test_total_archives():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=True, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert isinstance(reply, int)
|
||||
|
||||
def test_near():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=True, get=None, year=2020, month=7, day=15, hour=1, minute=1)
|
||||
reply = cli.args_handler(args)
|
||||
assert "202007" in reply
|
||||
|
||||
|
||||
def test_get():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="url")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="oldest")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="newest")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
|
||||
if codecov_python:
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="save")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="BullShit")
|
||||
reply = cli.args_handler(args)
|
||||
assert "get the source code of the" in reply
|
||||
|
||||
def test_args_handler():
|
||||
args = argparse.Namespace(version=True)
|
||||
reply = cli.args_handler(args)
|
||||
assert __version__ == reply
|
||||
|
||||
args = argparse.Namespace(url=None, version=False)
|
||||
reply = cli.args_handler(args)
|
||||
assert "Specify an URL" in reply
|
||||
|
||||
def test_main():
|
||||
# This also tests the parse_args method in cli.py
|
||||
cli.main(['temp.py', '--version'])
|
194
tests/test_wrapper.py
Normal file
194
tests/test_wrapper.py
Normal file
@ -0,0 +1,194 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys
|
||||
import pytest
|
||||
import random
|
||||
import time
|
||||
|
||||
sys.path.append("..")
|
||||
import waybackpy.wrapper as waybackpy # noqa: E402
|
||||
|
||||
if sys.version_info >= (3, 0): # If the python ver >= 3
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import URLError
|
||||
else: # For python2.x
|
||||
from urllib2 import Request, urlopen, URLError
|
||||
|
||||
user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
|
||||
|
||||
|
||||
def test_clean_url():
|
||||
test_url = " https://en.wikipedia.org/wiki/Network security "
|
||||
answer = "https://en.wikipedia.org/wiki/Network_security"
|
||||
target = waybackpy.Url(test_url, user_agent)
|
||||
test_result = target._clean_url()
|
||||
assert answer == test_result
|
||||
|
||||
def test_dunders():
|
||||
url = "https://en.wikipedia.org/wiki/Network_security"
|
||||
user_agent = "UA"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target)
|
||||
assert len(target) == len(url)
|
||||
assert str(target) == url
|
||||
|
||||
def test_archive_url_parser():
|
||||
request_url = "https://amazon.com"
|
||||
hdr = {"User-Agent": user_agent} # nosec
|
||||
req = Request(request_url, headers=hdr) # nosec
|
||||
header = waybackpy._get_response(req).headers
|
||||
with pytest.raises(Exception):
|
||||
waybackpy._archive_url_parser(header)
|
||||
|
||||
def test_url_check():
|
||||
broken_url = "http://wwwgooglecom/"
|
||||
with pytest.raises(Exception):
|
||||
waybackpy.Url(broken_url, user_agent)
|
||||
|
||||
|
||||
def test_save():
    """Exercise Url.save() against archivable, malformed, blocked and missing URLs.

    The sleeps space out the requests sent to the Wayback Machine between
    the individual cases.
    """
    # Test for urls that exist and can be archived.
    time.sleep(10)

    url_list = [
        "en.wikipedia.org",
        "www.wikidata.org",
        "commons.wikimedia.org",
        "www.wiktionary.org",
        "www.w3schools.com",
        "www.ibm.com",
    ]
    # Pick one site at random for this run.
    url1 = random.choice(url_list)
    target = waybackpy.Url(
        url1,
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
    )
    archived_url1 = target.save()
    assert url1 in archived_url1

    # The failure cases below only run on interpreters newer than 3.6.
    if sys.version_info > (3, 6):

        # Test for urls that are incorrect.
        with pytest.raises(Exception):
            url2 = "ha ha ha ha"
            waybackpy.Url(url2, user_agent)
        time.sleep(5)

        # Test for urls not allowed to archive by robot.txt.
        with pytest.raises(Exception):
            url3 = "http://www.archive.is/faq.html"
            target = waybackpy.Url(
                url3,
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) "
                "Gecko/20100101 Firefox/25.0",
            )
            target.save()

        time.sleep(5)

        # Non existent urls, test
        with pytest.raises(Exception):
            url4 = (
                "https://githfgdhshajagjstgeths537agajaajgsagudadhuss87623"
                "46887adsiugujsdgahub.us"
            )
            # Use url4 here: the previous code passed url3 again, so the
            # non-existent URL was defined but never actually tested.
            target = waybackpy.Url(
                url4,
                "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) "
                "AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 "
                "Safari/533.20.27",
            )
            target.save()
|
||||
|
||||
|
||||
def test_near():
|
||||
time.sleep(10)
|
||||
url = "google.com"
|
||||
target = waybackpy.Url(
|
||||
url,
|
||||
"Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 "
|
||||
"(KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
|
||||
)
|
||||
archive_near_year = target.near(year=2010)
|
||||
assert "2010" in archive_near_year
|
||||
|
||||
if sys.version_info > (3, 6):
|
||||
time.sleep(5)
|
||||
archive_near_month_year = target.near(year=2015, month=2)
|
||||
assert (
|
||||
("201502" in archive_near_month_year)
|
||||
or ("201501" in archive_near_month_year)
|
||||
or ("201503" in archive_near_month_year)
|
||||
)
|
||||
|
||||
target = waybackpy.Url(
|
||||
"www.python.org",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
|
||||
)
|
||||
archive_near_hour_day_month_year = target.near(
|
||||
year=2008, month=5, day=9, hour=15
|
||||
)
|
||||
assert (
|
||||
("2008050915" in archive_near_hour_day_month_year)
|
||||
or ("2008050914" in archive_near_hour_day_month_year)
|
||||
or ("2008050913" in archive_near_hour_day_month_year)
|
||||
)
|
||||
|
||||
with pytest.raises(Exception):
|
||||
NeverArchivedUrl = (
|
||||
"https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
|
||||
)
|
||||
target = waybackpy.Url(NeverArchivedUrl, user_agent)
|
||||
target.near(year=2010)
|
||||
else:
|
||||
pass
|
||||
|
||||
|
||||
def test_oldest():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert "20200504141153" in target.oldest()
|
||||
|
||||
|
||||
def test_newest():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert url in target.newest()
|
||||
|
||||
|
||||
def test_get():
|
||||
target = waybackpy.Url("google.com", user_agent)
|
||||
assert "Welcome to Google" in target.get(target.oldest())
|
||||
|
||||
|
||||
|
||||
def test_wayback_timestamp():
|
||||
ts = waybackpy._wayback_timestamp(
|
||||
year=2020, month=1, day=2, hour=3, minute=4
|
||||
)
|
||||
assert "202001020304" in str(ts)
|
||||
|
||||
|
||||
def test_get_response():
|
||||
hdr = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) "
|
||||
"Gecko/20100101 Firefox/78.0"
|
||||
}
|
||||
req = Request("https://www.google.com", headers=hdr) # nosec
|
||||
response = waybackpy._get_response(req)
|
||||
assert response.code == 200
|
||||
|
||||
|
||||
def test_total_archives():
|
||||
if sys.version_info > (3, 6):
|
||||
target = waybackpy.Url(" https://google.com ", user_agent)
|
||||
assert target.total_archives() > 500000
|
||||
else:
|
||||
pass
|
||||
target = waybackpy.Url(
|
||||
" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent
|
||||
)
|
||||
assert target.total_archives() == 0
|
@ -10,13 +10,15 @@
|
||||
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
|
||||
|
||||
"""
|
||||
A python wrapper for Internet Archive's Wayback Machine API.
|
||||
Waybackpy is a Python library that interfaces with the Internet Archive's Wayback Machine API.
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Archive pages and retrieve archived pages easily.
|
||||
|
||||
Usage:
|
||||
>>> import waybackpy
|
||||
>>> new_archive = waybackpy.save('https://www.python.org')
|
||||
>>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
|
||||
>>> new_archive = target_url.save()
|
||||
>>> print(new_archive)
|
||||
https://web.archive.org/web/20200502170312/https://www.python.org/
|
||||
|
||||
@ -25,6 +27,14 @@ Full documentation @ <https://akamhy.github.io/waybackpy/>.
|
||||
:license: MIT
|
||||
"""
|
||||
|
||||
from .wrapper import save, near, oldest, newest, get, clean_url, url_check, total_archives
|
||||
from .__version__ import __title__, __description__, __url__, __version__
|
||||
from .__version__ import __author__, __author_email__, __license__, __copyright__
|
||||
from .wrapper import Url
|
||||
from .__version__ import (
|
||||
__title__,
|
||||
__description__,
|
||||
__url__,
|
||||
__version__,
|
||||
__author__,
|
||||
__author_email__,
|
||||
__license__,
|
||||
__copyright__,
|
||||
)
|
||||
|
@ -1,7 +1,9 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__title__ = "waybackpy"
|
||||
__description__ = "A python wrapper for Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
|
||||
__description__ = "A Python library that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
|
||||
__url__ = "https://akamhy.github.io/waybackpy/"
|
||||
__version__ = "v1.5"
|
||||
__version__ = "2.1.5"
|
||||
__author__ = "akamhy"
|
||||
__author_email__ = "akash3pro@gmail.com"
|
||||
__license__ = "MIT"
|
||||
|
103
waybackpy/cli.py
Normal file
103
waybackpy/cli.py
Normal file
@ -0,0 +1,103 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import argparse
|
||||
from waybackpy.wrapper import Url
|
||||
from waybackpy.__version__ import __version__
|
||||
|
||||
def _save(obj):
|
||||
return (obj.save())
|
||||
|
||||
def _oldest(obj):
|
||||
return (obj.oldest())
|
||||
|
||||
def _newest(obj):
|
||||
return (obj.newest())
|
||||
|
||||
def _total_archives(obj):
|
||||
return (obj.total_archives())
|
||||
|
||||
def _near(obj, args):
|
||||
_near_args = {}
|
||||
if args.year:
|
||||
_near_args["year"] = args.year
|
||||
if args.month:
|
||||
_near_args["month"] = args.month
|
||||
if args.day:
|
||||
_near_args["day"] = args.day
|
||||
if args.hour:
|
||||
_near_args["hour"] = args.hour
|
||||
if args.minute:
|
||||
_near_args["minute"] = args.minute
|
||||
return (obj.near(**_near_args))
|
||||
|
||||
def _get(obj, args):
|
||||
if args.get.lower() == "url":
|
||||
return (obj.get())
|
||||
|
||||
if args.get.lower() == "oldest":
|
||||
return (obj.get(obj.oldest()))
|
||||
|
||||
if args.get.lower() == "latest" or args.get.lower() == "newest":
|
||||
return (obj.get(obj.newest()))
|
||||
|
||||
if args.get.lower() == "save":
|
||||
return (obj.get(obj.save()))
|
||||
|
||||
return ("Use get as \"--get 'source'\", 'source' can be one of the followings: \
|
||||
\n1) url - get the source code of the url specified using --url/-u.\
|
||||
\n2) oldest - get the source code of the oldest archive for the supplied url.\
|
||||
\n3) newest - get the source code of the newest archive for the supplied url.\
|
||||
\n4) save - Create a new archive and get the source code of this new archive for the supplied url.")
|
||||
|
||||
def args_handler(args):
    """Turn parsed command-line arguments into the value to print.

    Checks run in a fixed order: ``--version`` first, then the presence of
    a URL, then the actions save, oldest, newest, total, near and get. The
    first action that is set wins; with no action a usage hint is returned.
    """
    if args.version:
        return __version__

    if not args.url:
        return "Specify an URL. See --help for help using waybackpy."

    # Only pass a user agent when one was given, so Url's default applies otherwise.
    obj = Url(args.url, args.user_agent) if args.user_agent else Url(args.url)

    # Actions that need nothing beyond the Url object, in priority order.
    simple_actions = (
        ("save", _save),
        ("oldest", _oldest),
        ("newest", _newest),
        ("total", _total_archives),
    )
    for flag, action in simple_actions:
        if getattr(args, flag):
            return action(obj)

    if args.near:
        return _near(obj, args)

    if args.get:
        return _get(obj, args)

    return "Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help for help using waybackpy."
|
||||
|
||||
def parse_args(argv):
    """Parse waybackpy's command-line arguments.

    ``argv`` is the full argument vector with the program name at index 0
    (the same layout as ``sys.argv``); that first element is skipped.
    Returns the ``argparse.Namespace`` produced by the parser.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--url", help="URL on which Wayback machine operations would occur.")
    parser.add_argument("-ua", "--user_agent", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\".")
    parser.add_argument("-s", "--save", action='store_true', help="Save the URL on the Wayback machine.")
    parser.add_argument("-o", "--oldest", action='store_true', help="Oldest archive for the specified URL.")
    parser.add_argument("-n", "--newest", action='store_true', help="Newest archive for the specified URL.")
    parser.add_argument("-t", "--total", action='store_true', help="Total number of archives for the specified URL.")
    parser.add_argument("-g", "--get", help="Prints the source code of the supplied url. Use '--get help' for extended usage.")
    parser.add_argument("-v", "--version", action='store_true', help="Prints the waybackpy version.")
    # The help text previously duplicated the --newest description.
    parser.add_argument("-N", "--near", action='store_true', help="Archive closest to the time specified with --year, --month, --day, --hour and --minute.")
    parser.add_argument("-Y", "--year", type=int, help="Year in integer. For use with --near.")
    parser.add_argument("-M", "--month", type=int, help="Month in integer. For use with --near.")
    parser.add_argument("-D", "--day", type=int, help="Day in integer. For use with --near.")
    parser.add_argument("-H", "--hour", type=int, help="Hour in integer. For use with --near.")
    parser.add_argument("-MIN", "--minute", type=int, help="Minute in integer. For use with --near.")
    # argv[0] is the program name, not an option.
    return parser.parse_args(argv[1:])
|
||||
|
||||
def main(argv):
    """Command-line entry point.

    Parses argv with parse_args, hands the result to args_handler and
    prints whatever args_handler returns.
    """
    print(args_handler(parse_args(argv)))
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv))
|
@ -1,43 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
class TooManyArchivingRequests(Exception):
|
||||
|
||||
"""Error when a single url is requested for archiving too many times in a short timespan.
|
||||
Wayback machine doesn't support archiving any url too many times in a short period of time.
|
||||
class WaybackError(Exception):
|
||||
"""
|
||||
|
||||
class ArchivingNotAllowed(Exception):
    """Raised when archiving is denied by the site.

    Files like robots.txt can be set to deny robot archiving; the Wayback
    Machine respects such files and will not archive the page.
    """
|
||||
|
||||
class PageNotSaved(Exception):
    """Raised when a webpage could not be saved."""
|
||||
|
||||
class ArchiveNotFound(Exception):
    """Raised when an old archive is requested for a page that was never archived."""
|
||||
|
||||
class UrlNotFound(Exception):
    """Raised for a 404 (URL not found) response."""
|
||||
|
||||
class BadGateWay(Exception):
    """Raised for a 502 (bad gateway) response."""
|
||||
|
||||
class WaybackUnavailable(Exception):
    """Raised for a 503 (API service temporarily unavailable) response."""
|
||||
|
||||
class InvalidUrl(Exception):
|
||||
"""
|
||||
Raised when url doesn't follow the standard url format.
|
||||
Raised when API Service error.
|
||||
"""
|
||||
|
@ -1,143 +1,169 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime
|
||||
from waybackpy.exceptions import TooManyArchivingRequests, ArchivingNotAllowed, PageNotSaved, ArchiveNotFound, UrlNotFound, BadGateWay, InvalidUrl, WaybackUnavailable
|
||||
try:
|
||||
from waybackpy.exceptions import WaybackError
|
||||
from waybackpy.__version__ import __version__
|
||||
|
||||
if sys.version_info >= (3, 0): # If the python ver >= 3
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import HTTPError, URLError
|
||||
except ImportError:
|
||||
from urllib2 import Request, urlopen, HTTPError, URLError
|
||||
from urllib.error import URLError
|
||||
else: # For python2.x
|
||||
from urllib2 import Request, urlopen, URLError
|
||||
|
||||
default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
||||
|
||||
|
||||
default_UA = "waybackpy python package"
|
||||
|
||||
def url_check(url):
    """Reject a URL that contains no dot.

    Returns None when "." occurs in url; otherwise raises InvalidUrl with a
    message naming the offending value.
    """
    if "." in url:
        return
    raise InvalidUrl("'%s' is not a vaild url." % url)
|
||||
|
||||
def clean_url(url):
    """Return url as a string, trimmed, with remaining spaces turned into underscores."""
    trimmed = str(url).strip()
    return trimmed.replace(" ","_")
|
||||
|
||||
def wayback_timestamp(**kwargs):
    """Concatenate year, month, day, hour and minute into one timestamp string.

    The year is used as-is; month, day, hour and minute are each zero-padded
    to two characters. All five keyword arguments are required (a missing one
    raises KeyError).
    """
    pieces = [str(kwargs["year"])]
    for field in ("month", "day", "hour", "minute"):
        pieces.append(str(kwargs[field]).zfill(2))
    return "".join(pieces)
|
||||
|
||||
def handle_HTTPError(e):
    """Turn selected HTTP status codes into waybackpy exceptions.

    Raises BadGateWay for 502, WaybackUnavailable for 503,
    TooManyArchivingRequests for 429 and UrlNotFound for 404, each wrapping
    e. For any other status code nothing is raised and None is returned.
    """
    status = e.code
    if status == 502:
        raise BadGateWay(e)
    if status == 503:
        raise WaybackUnavailable(e)
    if status == 429:
        raise TooManyArchivingRequests(e)
    if status == 404:
        raise UrlNotFound(e)
|
||||
|
||||
def save(url, UA=default_UA):
|
||||
url_check(url)
|
||||
request_url = ("https://web.archive.org/save/" + clean_url(url))
|
||||
|
||||
hdr = { 'User-Agent' : '%s' % UA } #nosec
|
||||
req = Request(request_url, headers=hdr) #nosec
|
||||
def _archive_url_parser(header):
    """Pull the archive location out of the response headers.

    The stringified header is searched with two patterns in turn and the
    first capture group of the first pattern that matches is returned.
    Raises WaybackError, quoting the waybackpy version and the header, when
    neither pattern matches.
    """
    text = str(header)
    patterns = (
        # First choice: the entry following rel="memento.
        r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>",
        # Fallback: the X-Cache-Key header value.
        r"X-Cache-Key:\shttps(.*)[A-Z]{2}",
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found:
            return found.group(1)
    raise WaybackError(
        "No archive URL found in the API response. "
        "This version of waybackpy (%s) is likely out of date. Visit "
        "https://github.com/akamhy/waybackpy for the latest version "
        "of waybackpy.\nHeader:\n%s" % (__version__, text)
    )
|
||||
|
||||
|
||||
def _wayback_timestamp(**kwargs):
    """Return a formatted timestamp string (year, month, day, hour, minute).

    Requires the keyword arguments year, month, day, hour and minute; a
    missing one raises KeyError. The year is zero-padded to four characters
    and every other field to two, so the fields always line up even for a
    year below 1000 (previously the year was padded to two characters only).
    """
    widths = (("year", 4), ("month", 2), ("day", 2), ("hour", 2), ("minute", 2))
    return "".join(str(kwargs[key]).zfill(width) for key, width in widths)
|
||||
|
||||
|
||||
def _get_response(req):
|
||||
"""Get response for the supplied request."""
|
||||
try:
|
||||
response = urlopen(req) #nosec
|
||||
except HTTPError as e:
|
||||
if handle_HTTPError(e) is None:
|
||||
raise PageNotSaved(e)
|
||||
except URLError:
|
||||
response = urlopen(req) # nosec
|
||||
except Exception:
|
||||
try:
|
||||
response = urlopen(req) #nosec
|
||||
except URLError as e:
|
||||
raise UrlNotFound(e)
|
||||
response = urlopen(req) # nosec
|
||||
except Exception as e:
|
||||
exc = WaybackError("Error while retrieving %s" % req.full_url)
|
||||
exc.__cause__ = e
|
||||
raise exc
|
||||
return response
|
||||
|
||||
header = response.headers
|
||||
class Url:
|
||||
"""waybackpy Url object"""
|
||||
|
||||
if "exclusion.robots.policy" in str(header):
|
||||
raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." % (url))
|
||||
def __init__(self, url, user_agent=default_UA):
    """Store the URL and user agent, then validate the URL.

    Validation is done by self._url_check(), so anything it raises
    propagates out of the constructor.
    """
    self.url, self.user_agent = url, user_agent
    self._url_check()  # validate as soon as the object is built
|
||||
|
||||
return "https://web.archive.org" + header['Content-Location']
|
||||
def __repr__(self):
    """Return a constructor-like description showing url and user_agent."""
    shown = (self.url, self.user_agent)
    return "waybackpy.Url(url=%s, user_agent=%s)" % shown
|
||||
|
||||
def get(url, encoding=None, UA=default_UA):
|
||||
url_check(url)
|
||||
hdr = { 'User-Agent' : '%s' % UA }
|
||||
req = Request(clean_url(url), headers=hdr) #nosec
|
||||
def __str__(self):
    """Return the cleaned URL (see _clean_url) as a string."""
    cleaned = self._clean_url()
    return "%s" % cleaned
|
||||
|
||||
try:
|
||||
resp=urlopen(req) #nosec
|
||||
except URLError:
|
||||
try:
|
||||
resp=urlopen(req) #nosec
|
||||
except URLError as e:
|
||||
raise UrlNotFound(e)
|
||||
def __len__(self):
    """Return the length of the cleaned URL (see _clean_url)."""
    cleaned = self._clean_url()
    return len(cleaned)
|
||||
|
||||
if encoding is None:
|
||||
try:
|
||||
encoding= resp.headers['content-type'].split('charset=')[-1]
|
||||
except AttributeError:
|
||||
encoding = "UTF-8"
|
||||
def _url_check(self):
    """Check for common URL problems.

    Raises URLError when self.url contains no "."; otherwise returns None.
    """
    if "." in self.url:
        return
    raise URLError("'%s' is not a vaild URL." % self.url)
|
||||
|
||||
return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))
|
||||
def _clean_url(self):
    """Fix the URL, if possible.

    Returns self.url as a string with surrounding whitespace stripped and
    any remaining spaces replaced by underscores.
    """
    trimmed = str(self.url).strip()
    return trimmed.replace(" ", "_")
|
||||
|
||||
def near(url, **kwargs):
|
||||
def save(self):
    """Create a new Wayback Machine archive for this URL.

    Requests the web.archive.org save endpoint for the cleaned URL using
    this object's user agent, then returns "https://" followed by the
    archive location parsed from the response headers.
    """
    target = "https://web.archive.org/save/" + self._clean_url()
    agent_header = {"User-Agent": "%s" % self.user_agent}  # nosec
    reply = _get_response(Request(target, headers=agent_header))  # nosec
    return "https://" + _archive_url_parser(reply.headers)
|
||||
|
||||
try:
|
||||
url = kwargs["url"]
|
||||
except KeyError:
|
||||
url = url
|
||||
def get(self, url="", user_agent="", encoding=""):
|
||||
"""Return the source code of the supplied URL.
|
||||
If encoding is not supplied, it is auto-detected from the response.
|
||||
"""
|
||||
if not url:
|
||||
url = self._clean_url()
|
||||
if not user_agent:
|
||||
user_agent = self.user_agent
|
||||
|
||||
year=kwargs.get("year", datetime.utcnow().strftime('%Y'))
|
||||
month=kwargs.get("month", datetime.utcnow().strftime('%m'))
|
||||
day=kwargs.get("day", datetime.utcnow().strftime('%d'))
|
||||
hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
|
||||
minute=kwargs.get("minute", datetime.utcnow().strftime('%M'))
|
||||
UA=kwargs.get("UA", default_UA)
|
||||
hdr = {"User-Agent": "%s" % user_agent}
|
||||
req = Request(url, headers=hdr) # nosec
|
||||
response = _get_response(req)
|
||||
if not encoding:
|
||||
try:
|
||||
encoding = response.headers["content-type"].split("charset=")[-1]
|
||||
except AttributeError:
|
||||
encoding = "UTF-8"
|
||||
return response.read().decode(encoding.replace("text/html", "UTF-8", 1))
|
||||
|
||||
url_check(url)
|
||||
timestamp = wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
|
||||
request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (clean_url(url), str(timestamp))
|
||||
hdr = { 'User-Agent' : '%s' % UA }
|
||||
req = Request(request_url, headers=hdr) # nosec
|
||||
def near(self, year=None, month=None, day=None, hour=None, minute=None):
|
||||
""" Return the closest Wayback Machine archive to the time supplied.
|
||||
Supported params are year, month, day, hour and minute.
|
||||
Any non-supplied parameters default to the current time.
|
||||
|
||||
try:
|
||||
response = urlopen(req) #nosec
|
||||
except HTTPError as e:
|
||||
handle_HTTPError(e)
|
||||
"""
|
||||
now = datetime.utcnow().timetuple()
|
||||
timestamp = _wayback_timestamp(
|
||||
year=year if year else now.tm_year,
|
||||
month=month if month else now.tm_mon,
|
||||
day=day if day else now.tm_mday,
|
||||
hour=hour if hour else now.tm_hour,
|
||||
minute=minute if minute else now.tm_min,
|
||||
)
|
||||
|
||||
data = json.loads(response.read().decode("UTF-8"))
|
||||
if not data["archived_snapshots"]:
|
||||
raise ArchiveNotFound("'%s' is not yet archived." % url)
|
||||
request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (
|
||||
self._clean_url(),
|
||||
timestamp,
|
||||
)
|
||||
hdr = {"User-Agent": "%s" % self.user_agent}
|
||||
req = Request(request_url, headers=hdr) # nosec
|
||||
response = _get_response(req)
|
||||
data = json.loads(response.read().decode("UTF-8"))
|
||||
if not data["archived_snapshots"]:
|
||||
raise WaybackError(
|
||||
"'%s' is not yet archived. Use wayback.Url(url, user_agent).save() "
|
||||
"to create a new archive." % self._clean_url()
|
||||
)
|
||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||
# wayback machine returns http sometimes, idk why? But they support https
|
||||
archive_url = archive_url.replace(
|
||||
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
|
||||
)
|
||||
return archive_url
|
||||
|
||||
archive_url = (data["archived_snapshots"]["closest"]["url"])
|
||||
# wayback machine returns http sometimes, idk why? But they support https
|
||||
archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1)
|
||||
return archive_url
|
||||
def oldest(self, year=1994):
    """Return the oldest Wayback Machine archive for this URL.

    Implemented as a near() lookup for the given year (1994 by default).
    """
    lookup = {"year": year}
    return self.near(**lookup)
|
||||
|
||||
def oldest(url, UA=default_UA, year=1994):
    """Return what near() gives for url at the given year (1994 by default).

    UA is forwarded to near() unchanged.
    """
    lookup = {"year": year, "UA": UA}
    return near(url, **lookup)
|
||||
def newest(self):
|
||||
"""Return the newest Wayback Machine archive available for this URL.
|
||||
|
||||
def newest(url, UA=default_UA):
|
||||
return near(url, UA=UA)
|
||||
Due to Wayback Machine database lag, this may not always be the
|
||||
most recent archive.
|
||||
"""
|
||||
return self.near()
|
||||
|
||||
def total_archives(url, UA=default_UA):
    """Return the number of entries in the CDX search result for url.

    url is validated with url_check and cleaned with clean_url before being
    put into the web.archive.org CDX query; UA is sent as the User-Agent
    header. The return value is the length of the JSON array in the response.

    Raises whatever handle_HTTPError maps the HTTP status to; for a status
    it does not map, the original HTTPError is re-raised (previously the
    function fell through and failed with UnboundLocalError on `response`).
    """
    # NOTE(review): the length counts every row of the returned JSON array;
    # confirm whether the CDX output includes a header row to be excluded.
    url_check(url)

    hdr = {'User-Agent': '%s' % UA}
    request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json" % clean_url(url)
    req = Request(request_url, headers=hdr)  # nosec

    try:
        response = urlopen(req)  # nosec
    except HTTPError as e:
        handle_HTTPError(e)
        # handle_HTTPError returns normally for codes it does not translate;
        # there is no response to read in that case, so surface the error.
        raise

    return len(json.loads(response.read()))
|
||||
def total_archives(self):
    """Return the total number of Wayback Machine archives for this URL.

    Queries the web.archive.org CDX search endpoint for the cleaned URL,
    asking only for the statuscode field, with this object's user agent.
    """
    agent_header = {"User-Agent": "%s" % self.user_agent}
    cdx_query = (
        "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode"
        % self._clean_url()
    )
    payload = _get_response(Request(cdx_query, headers=agent_header)).read()  # nosec
    # The count is the number of commas in the stringified body; the JSON
    # is deliberately not parsed.
    return str(payload).count(",")
|
||||
|
Reference in New Issue
Block a user