Compare commits

..

78 Commits
v1.1 ... v1.6

Author SHA1 Message Date
akamhy
80331833f2 Update setup.py 2020-05-07 20:12:32 +05:30
akamhy
5e3d3a815f fix 2020-05-07 20:03:17 +05:30
akamhy
6182a18cf4 fix 2020-05-07 20:02:47 +05:30
akamhy
9bca750310 v1.5 2020-05-07 19:59:23 +05:30
akamhy
c22749a6a3 update 2020-05-07 19:54:00 +05:30
akamhy
151df94fe3 license_file = LICENSE 2020-05-07 19:38:19 +05:30
akamhy
24540d0b2c update 2020-05-07 19:33:39 +05:30
akamhy
bdfc72d05d Create __version__.py 2020-05-07 19:16:26 +05:30
akamhy
3b104c1a28 v1.5 2020-05-07 19:03:02 +05:30
akamhy
fb0d4658a7 ce 2020-05-07 19:02:12 +05:30
akamhy
48833980e1 update 2020-05-07 18:58:01 +05:30
akamhy
0c4f119981 Update wrapper.py 2020-05-07 17:25:34 +05:30
akamhy
afded51a04 Update wrapper.py 2020-05-07 17:20:23 +05:30
akamhy
b950616561 Update wrapper.py 2020-05-07 17:17:17 +05:30
akamhy
444675538f fix code Complexity (#8)
* fix code Complexity

* Update wrapper.py

* codefactor badge
2020-05-07 16:51:08 +05:30
akamhy
0ca6710334 Update wrapper.py 2020-05-07 16:24:33 +05:30
akamhy
01a7c591ad retry 2020-05-07 15:46:39 +05:30
akamhy
74d3bc154b fix issue with py2.7 2020-05-07 15:34:41 +05:30
akamhy
a8e94dfb25 Update README.md 2020-05-07 15:14:55 +05:30
akamhy
cc38798b32 Update README.md 2020-05-07 15:14:30 +05:30
akamhy
bc3dd44f27 Update README.md 2020-05-07 15:13:58 +05:30
akamhy
ba46cdafe2 Update README.md 2020-05-07 15:12:37 +05:30
akamhy
538afb14e9 Update test_1.py 2020-05-07 15:06:52 +05:30
akamhy
7605b614ee test for total_archives() 2020-05-07 15:00:28 +05:30
akamhy
d0a4e25cf5 Update __init__.py 2020-05-07 14:53:09 +05:30
akamhy
8c5c0153da + total_archives() 2020-05-07 14:52:05 +05:30
akamhy
e7dac74906 Update __init__.py 2020-05-07 09:06:49 +05:30
akamhy
c686708c9e more testing 2020-05-07 08:59:09 +05:30
akamhy
f9ae8ada70 Update test_1.py 2020-05-07 08:39:24 +05:30
akamhy
e56ece3dc9 Update README.md 2020-05-07 08:23:31 +05:30
akamhy
db127a5c54 always return https 2020-05-06 20:16:25 +05:30
akamhy
ed497bbd23 Update wrapper.py 2020-05-06 20:07:25 +05:30
akamhy
45fe07ddb6 Update wrapper.py 2020-05-06 19:35:01 +05:30
akamhy
0029d63d8a 503 API Service Temporarily Unavailable 2020-05-06 19:22:56 +05:30
akamhy
beb5b625ec Set theme jekyll-theme-cayman 2020-05-06 12:20:43 +05:30
akamhy
b40d734346 Update README.md 2020-05-06 09:18:02 +05:30
akamhy
be0a30de85 Create index.rst 2020-05-05 20:22:46 +05:30
akamhy
3a65a60bd6 Update README.md 2020-05-05 19:08:26 +05:30
akamhy
7b626f5ea5 Update README.md 2020-05-05 17:54:38 +05:30
akamhy
73371d6c68 Update README.md 2020-05-05 17:49:23 +05:30
akamhy
8904ba4d67 Update README.md 2020-05-05 17:47:55 +05:30
akamhy
b4a7f7ea6f Update README.md 2020-05-05 17:47:00 +05:30
akamhy
a2ead04021 Update README.md 2020-05-05 17:44:12 +05:30
akamhy
3513feb075 Update __init__.py 2020-05-05 17:37:38 +05:30
akamhy
d34b98373f Update setup.py 2020-05-05 17:37:16 +05:30
akamhy
38f3b81742 Update .travis.yml 2020-05-05 17:27:58 +05:30
akamhy
660a826aed Update .travis.yml 2020-05-05 17:21:36 +05:30
akamhy
a52d035c0e Update .travis.yml 2020-05-05 17:19:24 +05:30
akamhy
6737ce0e26 Create .travis.yml 2020-05-05 17:14:57 +05:30
akamhy
98cc918c8f Update test_1.py 2020-05-05 17:10:33 +05:30
akamhy
b103bfc6e4 Create test_1.py 2020-05-05 16:29:55 +05:30
akamhy
edd05838b8 v1.3 2020-05-05 11:29:22 +05:30
akamhy
031212e161 v1.3 2020-05-05 11:28:58 +05:30
akamhy
d3bd5b05b5 Update setup.py 2020-05-05 10:50:09 +05:30
akamhy
d6598a67b9 Update setup.py 2020-05-05 10:40:23 +05:30
akamhy
e5a6057249 Update setup.py 2020-05-05 10:39:10 +05:30
akamhy
2a1b3bc6ee Update setup.py 2020-05-05 10:36:05 +05:30
akamhy
b4ca98eca2 Update setup.py 2020-05-05 10:32:06 +05:30
akamhy
36b01754ec Update setup.py 2020-05-05 10:23:38 +05:30
akamhy
3d8bf4eec6 Update setup.py 2020-05-05 10:22:54 +05:30
akamhy
e7761b3709 Update README.md 2020-05-05 10:21:08 +05:30
akamhy
df851dce0c Update setup.py 2020-05-05 10:16:15 +05:30
akamhy
f5acbcfc95 Update exceptions.py 2020-05-05 10:07:27 +05:30
akamhy
44156e5e7e Update exceptions.py 2020-05-05 10:05:47 +05:30
akamhy
a6cb955669 Update wrapper.py 2020-05-05 10:04:40 +05:30
akamhy
8acb14a243 Update wrapper.py 2020-05-05 10:00:29 +05:30
akamhy
7d434c3f0f Update wrapper.py 2020-05-05 09:57:39 +05:30
akamhy
057c61d677 Update wrapper.py 2020-05-05 09:48:39 +05:30
akamhy
6705c04f38 Update wrapper.py 2020-05-05 09:43:13 +05:30
akamhy
e631c0aadb Update README.md 2020-05-05 09:37:53 +05:30
akamhy
423782ea75 Update README.md 2020-05-05 09:36:11 +05:30
whitesource-bolt-for-github[bot]
7944f0878d Add .whitesource configuration file (#6)
Co-authored-by: whitesource-bolt-for-github[bot] <42819689+whitesource-bolt-for-github[bot]@users.noreply.github.com>
2020-05-05 09:33:50 +05:30
akamhy
850b055527 Update README.md 2020-05-05 09:31:43 +05:30
akamhy
32bc765113 Update README.md (#5)
* Update README.md

* Update README.md

* Update README.md
2020-05-05 09:27:02 +05:30
akamhy
09b4ba2649 Version 1.2 with bug fixes and support for webpage retrieval (#4) 2020-05-05 09:03:16 +05:30
akamhy
929790feca Update README.md (#1)
Add usage/ documentaion
2020-05-04 21:06:00 +05:30
akamhy
09a521ae43 Create setup.cfg 2020-05-04 16:23:00 +05:30
akamhy
a503be5a86 Create setup.py 2020-05-04 16:21:24 +05:30
12 changed files with 739 additions and 48 deletions

14
.travis.yml Normal file
View File

@@ -0,0 +1,14 @@
# Travis CI configuration: run the pytest suite on Linux for CPython 2.7/3.6/3.8.
language: python
python:
  - "2.7"
  - "3.6"
  - "3.8"
os: linux
dist: xenial   # Ubuntu 16.04 image; supports all three interpreter versions above
cache: pip     # cache pip downloads between builds
install:
  - pip install pytest
before_script:
  # Tests live in tests/ and rely on sys.path.append("..") to import the package,
  # so the suite must be launched from inside that directory.
  - cd tests
script:
  - pytest test_1.py

8
.whitesource Normal file
View File

@@ -0,0 +1,8 @@
{
"checkRunSettings": {
"vulnerableCheckRunConclusionLevel": "failure"
},
"issueSettings": {
"minSeverityLevel": "LOW"
}
}

178
README.md
View File

@@ -1,2 +1,176 @@
# pywayback # waybackpy
A python wrapper for Internet Archive's Wayback Machine [![Build Status](https://travis-ci.org/akamhy/waybackpy.svg?branch=master)](https://travis-ci.org/akamhy/waybackpy)
[![Downloads](https://img.shields.io/pypi/dm/waybackpy.svg)](https://pypistats.org/packages/waybackpy)
[![Release](https://img.shields.io/github/v/release/akamhy/waybackpy.svg)](https://github.com/akamhy/waybackpy/releases)
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65)](https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=akamhy/waybackpy&amp;utm_campaign=Badge_Grade)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
[![Maintainability](https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability)](https://codeclimate.com/github/akamhy/waybackpy/maintainability)
[![CodeFactor](https://www.codefactor.io/repository/github/akamhy/waybackpy/badge)](https://www.codefactor.io/repository/github/akamhy/waybackpy)
[![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/)
![pypi](https://img.shields.io/pypi/v/waybackpy.svg)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/akamhy/waybackpy/graphs/commit-activity)
![Internet Archive](https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png)
![Wayback Machine](https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png)
The waybackpy is a python wrapper for [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine).
Table of contents
=================
<!--ts-->
* [Installation](https://github.com/akamhy/waybackpy#installation)
* [Usage](https://github.com/akamhy/waybackpy#usage)
* [Saving an url using save()](https://github.com/akamhy/waybackpy#capturing-aka-saving-an-url-using-save)
* [Receiving the oldest archive for an URL Using oldest()](https://github.com/akamhy/waybackpy#receiving-the-oldest-archive-for-an-url-using-oldest)
* [Receiving the recent most/newest archive for an URL using newest()](https://github.com/akamhy/waybackpy#receiving-the-newest-archive-for-an-url-using-newest)
* [Receiving archive close to a specified year, month, day, hour, and minute using near()](https://github.com/akamhy/waybackpy#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
* [Get the content of webpage using get()](https://github.com/akamhy/waybackpy#get-the-content-of-webpage-using-get)
* [Count total archives for an URL using total_archives()](https://github.com/akamhy/waybackpy#count-total-archives-for-an-url-using-total_archives)
* [Tests](https://github.com/akamhy/waybackpy#tests)
* [Dependency](https://github.com/akamhy/waybackpy#dependency)
* [License](https://github.com/akamhy/waybackpy#license)
<!--te-->
## Installation
Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)):
**pip install waybackpy**
## Usage
#### Capturing aka Saving an url Using save()
```diff
+ waybackpy.save(url, UA=user_agent)
```
> url is mandatory. UA is not, but highly recommended.
```python
import waybackpy
# Capturing a new archive on Wayback machine.
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
archived_url = waybackpy.save("https://github.com/akamhy/waybackpy", UA = "Any-User-Agent")
print(archived_url)
```
This should print something similar to the following archived URL:
<https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy>
#### Receiving the oldest archive for an URL Using oldest()
```diff
+ waybackpy.oldest(url, UA=user_agent)
```
> url is mandatory. UA is not, but highly recommended.
```python
import waybackpy
# retrieving the oldest archive on Wayback machine.
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
oldest_archive = waybackpy.oldest("https://www.google.com/", UA = "Any-User-Agent")
print(oldest_archive)
```
This returns the oldest available archive for <https://google.com>.
<http://web.archive.org/web/19981111184551/http://google.com:80/>
#### Receiving the newest archive for an URL using newest()
```diff
+ waybackpy.newest(url, UA=user_agent)
```
> url is mandatory. UA is not, but highly recommended.
```python
import waybackpy
# retrieving the newest archive on Wayback machine.
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
newest_archive = waybackpy.newest("https://www.microsoft.com/en-us", UA = "Any-User-Agent")
print(newest_archive)
```
This returns the newest available archive for <https://www.microsoft.com/en-us>, something just like this:
<http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/>
#### Receiving archive close to a specified year, month, day, hour, and minute using near()
```diff
+ waybackpy.near(url, year=2020, month=1, day=1, hour=1, minute=1, UA=user_agent)
```
> url is mandatory. year, month, day, hour and minute are optional arguments. UA is not mandatory, but highly recommended.
```python
import waybackpy
# retrieving the closest archive from a specified year.
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
# supported arguments are year, month, day, hour and minute
archive_near_year = waybackpy.near("https://www.facebook.com/", year=2010, UA ="Any-User-Agent")
print(archive_near_year)
```
returns : <http://web.archive.org/web/20100504071154/http://www.facebook.com/>
```waybackpy.near("https://www.facebook.com/", year=2010, month=1, UA ="Any-User-Agent")``` returns: <http://web.archive.org/web/20101111173430/http://www.facebook.com//>
```waybackpy.near("https://www.oracle.com/index.html", year=2019, month=1, day=5, UA ="Any-User-Agent")``` returns: <http://web.archive.org/web/20190105054437/https://www.oracle.com/index.html>
> Please note that if you only specify the year, the current month and day are default arguments for month and day respectively. Do not expect just putting the year parameter would return the archive closer to January but the current month you are using the package. If you are using it in July 2018 and let's say you use ```waybackpy.near("https://www.facebook.com/", year=2011, UA ="Any-User-Agent")``` then you would be returned the nearest archive to July 2011 and not January 2011. You need to specify the month "1" for January.
> Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
#### Get the content of webpage using get()
```diff
+ waybackpy.get(url, encoding="UTF-8", UA=user_agent)
```
> url is mandatory. UA is not, but highly recommended. encoding is detected automatically, don't specify unless necessary.
```python
from waybackpy import get
# retriving the webpage from any url including the archived urls. Don't need to import other libraies :)
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
# supported argumnets are url, encoding and UA
webpage = get("https://example.com/", UA="User-Agent")
print(webpage)
```
> This should print the source code for <https://example.com/>.
#### Count total archives for an URL using total_archives()
```diff
+ waybackpy.total_archives(url, UA=user_agent)
```
> url is mandatory. UA is not, but highly recommended.
```python
from waybackpy import total_archives
# retriving the webpage from any url including the archived urls. Don't need to import other libraies :)
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
# supported argumnets are url and UA
count = total_archives("https://en.wikipedia.org/wiki/Python (programming language)", UA="User-Agent")
print(count)
```
> This should print an integer (int), which is the number of total archives on archive.org
## Tests
* [Here](https://github.com/akamhy/waybackpy/tree/master/tests)
## Dependency
* None, just python standard libraries (json, urllib and datetime). Both python 2 and 3 are supported :)
## License
[MIT License](https://github.com/akamhy/waybackpy/blob/master/LICENSE)

1
_config.yml Normal file
View File

@@ -0,0 +1 @@
theme: jekyll-theme-cayman

232
index.rst Normal file
View File

@@ -0,0 +1,232 @@
waybackpy
=========
|Build Status| |Downloads| |Release| |Codacy Badge| |License: MIT|
|Maintainability| |CodeFactor| |made-with-python| |pypi| |PyPI - Python
Version| |Maintenance|
.. |Build Status| image:: https://travis-ci.org/akamhy/waybackpy.svg?branch=master
:target: https://travis-ci.org/akamhy/waybackpy
.. |Downloads| image:: https://img.shields.io/pypi/dm/waybackpy.svg
:target: https://pypistats.org/packages/waybackpy
.. |Release| image:: https://img.shields.io/github/v/release/akamhy/waybackpy.svg
:target: https://github.com/akamhy/waybackpy/releases
.. |Codacy Badge| image:: https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65
:target: https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
.. |Maintainability| image:: https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability
:target: https://codeclimate.com/github/akamhy/waybackpy/maintainability
.. |CodeFactor| image:: https://www.codefactor.io/repository/github/akamhy/waybackpy/badge
:target: https://www.codefactor.io/repository/github/akamhy/waybackpy
.. |made-with-python| image:: https://img.shields.io/badge/Made%20with-Python-1f425f.svg
:target: https://www.python.org/
.. |pypi| image:: https://img.shields.io/pypi/v/waybackpy.svg
.. |PyPI - Python Version| image:: https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square
.. |Maintenance| image:: https://img.shields.io/badge/Maintained%3F-yes-green.svg
:target: https://github.com/akamhy/waybackpy/graphs/commit-activity
|Internet Archive| |Wayback Machine|
The waybackpy is a python wrapper for `Internet Archive`_\ s `Wayback
Machine`_.
.. _Internet Archive: https://en.wikipedia.org/wiki/Internet_Archive
.. _Wayback Machine: https://en.wikipedia.org/wiki/Wayback_Machine
.. |Internet Archive| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png
.. |Wayback Machine| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png
Installation
------------
Using `pip`_:
**pip install waybackpy**
.. _pip: https://en.wikipedia.org/wiki/Pip_(package_manager)
Usage
-----
Archiving aka Saving an url Using save()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: diff
+ waybackpy.save(url, UA=user_agent)
..
url is mandatory. UA is not, but highly recommended.
.. code:: python
import waybackpy
# Capturing a new archive on Wayback machine.
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
archived_url = waybackpy.save("https://github.com/akamhy/waybackpy", UA = "Any-User-Agent")
print(archived_url)
This should print something similar to the following archived URL:
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
Receiving the oldest archive for an URL Using oldest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: diff
+ waybackpy.oldest(url, UA=user_agent)
..
url is mandatory. UA is not, but highly recommended.
.. code:: python
import waybackpy
# retrieving the oldest archive on Wayback machine.
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
oldest_archive = waybackpy.oldest("https://www.google.com/", UA = "Any-User-Agent")
print(oldest_archive)
This returns the oldest available archive for https://google.com.
http://web.archive.org/web/19981111184551/http://google.com:80/
Receiving the newest archive for an URL using newest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: diff
+ waybackpy.newest(url, UA=user_agent)
..
url is mandatory. UA is not, but highly recommended.
.. code:: python
import waybackpy
# retrieving the newest archive on Wayback machine.
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
newest_archive = waybackpy.newest("https://www.microsoft.com/en-us", UA = "Any-User-Agent")
print(newest_archive)
This returns the newest available archive for
https://www.microsoft.com/en-us, something just like this:
http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/
Receiving archive close to a specified year, month, day, hour, and minute using near()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: diff
+ waybackpy.near(url, year=2020, month=1, day=1, hour=1, minute=1, UA=user_agent)
..
url is mandatory. year, month, day, hour and minute are optional
arguments. UA is not mandatory, but highly recommended.
.. code:: python
import waybackpy
# retrieving the closest archive from a specified year.
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
# supported arguments are year, month, day, hour and minute
archive_near_year = waybackpy.near("https://www.facebook.com/", year=2010, UA ="Any-User-Agent")
print(archive_near_year)
returns :
http://web.archive.org/web/20100504071154/http://www.facebook.com/
``waybackpy.near("https://www.facebook.com/", year=2010, month=1, UA ="Any-User-Agent")``
returns:
http://web.archive.org/web/20101111173430/http://www.facebook.com//
``waybackpy.near("https://www.oracle.com/index.html", year=2019, month=1, day=5, UA ="Any-User-Agent")``
returns:
http://web.archive.org/web/20190105054437/https://www.oracle.com/index.html
> Please note that if you only specify the year, the current month and
day are default arguments for month and day respectively. Do not expect
just putting the year parameter would return the archive closer to
January but the current month you are using the package. If you are
using it in July 2018 and lets say you use
``waybackpy.near("https://www.facebook.com/", year=2011, UA ="Any-User-Agent")``
then you would be returned the nearest archive to July 2011 and not
January 2011. You need to specify the month “1” for January.
Do not pad (dont use zeros in the month, year, day, minute, and hour
arguments). e.g. For January, set month = 1 and not month = 01.
Get the content of webpage using get()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: diff
+ waybackpy.get(url, encoding="UTF-8", UA=user_agent)
..
url is mandatory. UA is not, but highly recommended. encoding is
detected automatically, dont specify unless necessary.
.. code:: python
from waybackpy import get
# retriving the webpage from any url including the archived urls. Don't need to import other libraies :)
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
# supported argumnets are url, encoding and UA
webpage = get("https://example.com/", UA="User-Agent")
print(webpage)
..
This should print the source code for https://example.com/.
Count total archives for an URL using total_archives()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: diff
+ waybackpy.total_archives(url, UA=user_agent)
..
url is mandatory. UA is not, but highly recommended.
.. code:: python
from waybackpy import total_archives
# retriving the webpage from any url including the archived urls. Don't need to import other libraies :)
# Default user-agent (UA) is "waybackpy python package", if not specified in the call.
# supported argumnets are url and UA
count = total_archives("https://en.wikipedia.org/wiki/Python (programming language)", UA="User-Agent")
print(count)
..
This should print an integer (int), which is the number of total
archives on archive.org
Tests
-----
- `Here`_
Dependency
----------
- None, just python standard libraries (json, urllib and datetime).
Both python 2 and 3 are supported :)
License
-------
`MIT License`_
.. _Here: https://github.com/akamhy/waybackpy/tree/master/tests
.. _MIT License: https://github.com/akamhy/waybackpy/blob/master/LICENSE

3
setup.cfg Normal file
View File

@@ -0,0 +1,3 @@
# Packaging metadata read by setuptools alongside setup.py.
[metadata]
# Use README.md as the long-description source (legacy hyphenated key;
# newer setuptools spells this "description_file").
description-file = README.md
# Bundle the LICENSE file into built distributions.
license_file = LICENSE

49
setup.py Normal file
View File

@@ -0,0 +1,49 @@
import os.path
from setuptools import setup

# Long description for the PyPI project page, taken straight from README.md.
with open(os.path.join(os.path.dirname(__file__), 'README.md')) as f:
    long_description = f.read()

# Single-source all package metadata from waybackpy/__version__.py so the
# values passed to setup() never disagree with the installed package.
about = {}
with open(os.path.join(os.path.dirname(__file__), 'waybackpy', '__version__.py')) as f:
    exec(f.read(), about)  # nosec - executes our own metadata file, not user input

setup(
    name=about['__title__'],
    packages=['waybackpy'],
    version=about['__version__'],
    description=about['__description__'],
    long_description=long_description,
    long_description_content_type='text/markdown',
    license=about['__license__'],
    author=about['__author__'],
    author_email=about['__author_email__'],
    url=about['__url__'],
    # Derive the tarball URL from the single-sourced version instead of a
    # hard-coded "v1.5", which would silently go stale on every release.
    # (__version__ is "v1.5" today, so the resulting URL is unchanged.)
    download_url='https://github.com/akamhy/waybackpy/archive/%s.tar.gz' % about['__version__'],
    keywords=['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
    install_requires=[],
    python_requires=">=2.7",
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Natural Language :: English',
        'Topic :: Software Development :: Build Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: Implementation :: CPython',
    ],
    project_urls={
        'Documentation': 'https://waybackpy.readthedocs.io',
        'Source': 'https://github.com/akamhy/waybackpy',
    },
)

98
tests/test_1.py Normal file
View File

@@ -0,0 +1,98 @@
"""Integration tests for waybackpy against the live Wayback Machine API."""
import sys
sys.path.append("..")  # allow importing waybackpy from the repo root when run in tests/
import waybackpy
import pytest

user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"


def test_clean_url():
    """clean_url() strips surrounding whitespace and replaces inner spaces with underscores."""
    test_url = " https://en.wikipedia.org/wiki/Network security "
    answer = "https://en.wikipedia.org/wiki/Network_security"
    test_result = waybackpy.clean_url(test_url)
    assert answer == test_result


def test_url_check():
    """url_check() raises for strings that do not contain a dot."""
    InvalidUrl = "http://wwwgooglecom/"
    with pytest.raises(Exception):
        waybackpy.url_check(InvalidUrl)


def test_save():
    """save() archives valid urls and raises for malformed, robots-blocked, or dead ones."""
    # Test for urls that exist and can be archived.
    url1 = "https://github.com/akamhy/waybackpy"
    archived_url1 = waybackpy.save(url1, UA=user_agent)
    assert url1 in archived_url1
    # Test for urls that are incorrect.
    with pytest.raises(Exception):
        url2 = "ha ha ha ha"
        waybackpy.save(url2, UA=user_agent)
    # Test for urls not allowed to archive by robots.txt.
    with pytest.raises(Exception):
        url3 = "http://www.archive.is/faq.html"
        waybackpy.save(url3, UA=user_agent)
    # Non-existent urls must raise as well (unused result binding removed).
    with pytest.raises(Exception):
        url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
        waybackpy.save(url4, UA=user_agent)


def test_near():
    """near() returns an archive whose timestamp is close to the requested moment."""
    url = "google.com"
    archive_near_year = waybackpy.near(url, year=2010, UA=user_agent)
    assert "2010" in archive_near_year
    archive_near_month_year = waybackpy.near(url, year=2015, month=2, UA=user_agent)
    assert any(ts in archive_near_month_year for ts in ("201501", "201502", "201503"))
    archive_near_day_month_year = waybackpy.near(url, year=2006, month=11, day=15, UA=user_agent)
    # BUGFIX: the third alternative was "2006116" (7 digits), which can never
    # occur inside a 14-digit Wayback timestamp; the intended value is "20061116".
    assert any(ts in archive_near_day_month_year for ts in ("20061114", "20061115", "20061116"))
    archive_near_hour_day_month_year = waybackpy.near(
        "www.python.org", year=2008, month=5, day=9, hour=15, UA=user_agent
    )
    assert any(
        ts in archive_near_hour_day_month_year
        for ts in ("2008050915", "2008050914", "2008050913")
    )
    with pytest.raises(Exception):
        NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
        waybackpy.near(NeverArchivedUrl, year=2010, UA=user_agent)


def test_oldest():
    """oldest() returns the first-ever capture of the repository page."""
    url = "github.com/akamhy/waybackpy"
    archive_oldest = waybackpy.oldest(url, UA=user_agent)
    assert "20200504141153" in archive_oldest


def test_newest():
    """newest() returns an archive url that still contains the original url."""
    url = "github.com/akamhy/waybackpy"
    archive_newest = waybackpy.newest(url, UA=user_agent)
    assert url in archive_newest


def test_get():
    """get() fetches the page source of an (archived) url."""
    oldest_google_archive = waybackpy.oldest("google.com", UA=user_agent)
    oldest_google_page_text = waybackpy.get(oldest_google_archive, UA=user_agent)
    assert "Welcome to Google" in oldest_google_page_text


def test_total_archives():
    """total_archives() counts captures; unknown hosts report zero."""
    count1 = waybackpy.total_archives(
        "https://en.wikipedia.org/wiki/Python (programming language)", UA=user_agent
    )
    assert count1 > 2000
    count2 = waybackpy.total_archives(
        "https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8", UA=user_agent
    )
    assert count2 == 0


if __name__ == "__main__":
    # Manual runner: executes each test in the original order, printing a dot per pass.
    test_clean_url()
    print(".")
    test_url_check()
    print(".")
    test_get()
    print(".")
    test_near()
    print(".")
    test_newest()
    print(".")
    test_save()
    print(".")
    test_oldest()
    print(".")
    test_total_archives()
    print(".")

View File

@@ -1,6 +1,30 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from .wrapper import save, near, oldest, newest
__version__ = "1.1" # ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
# ┃┗┛┗┛┃┗━┓┃━┃┃━┃┃┃┏━┓┃┗━┓┃━┃┏━┛┃┗┛┛┃┏━━┛┃┃━┃┃
# ┗┓┏┓┏┛┃┗┛┗┓┃┗━┛┃┃┗━┛┃┃┗┛┗┓┃┗━┓┃┏┓┓┃┃━━━┃┗━┛┃
# ━┗┛┗┛━┗━━━┛┗━┓┏┛┗━━━┛┗━━━┛┗━━┛┗┛┗┛┗┛━━━┗━┓┏┛
# ━━━━━━━━━━━┏━┛┃━━━━━━━━━━━━━━━━━━━━━━━━┏━┛┃━
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
__all__ = ['wrapper', 'exceptions'] """
A python wrapper for Internet Archive's Wayback Machine API.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Archive pages and retrieve archived pages easily.
Usage:
>>> import waybackpy
>>> new_archive = waybackpy.save('https://www.python.org')
>>> print(new_archive)
https://web.archive.org/web/20200502170312/https://www.python.org/
Full documentation @ <https://akamhy.github.io/waybackpy/>.
:copyright: (c) 2020 by akamhy.
:license: MIT
"""
from .wrapper import save, near, oldest, newest, get, clean_url, url_check, total_archives
from .__version__ import __title__, __description__, __url__, __version__
from .__version__ import __author__, __author_email__, __license__, __copyright__

8
waybackpy/__version__.py Normal file
View File

@@ -0,0 +1,8 @@
__title__ = "waybackpy"
__description__ = "A python wrapper for Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
__url__ = "https://akamhy.github.io/waybackpy/"
__version__ = "v1.5"
__author__ = "akamhy"
__author_email__ = "akash3pro@gmail.com"
__license__ = "MIT"
__copyright__ = "Copyright 2020 akamhy"

View File

@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
class TooManyArchivingRequests(Exception): class TooManyArchivingRequests(Exception):
"""
Error when a single url reqeusted for archiving too many times in a short timespam. """Error when a single url reqeusted for archiving too many times in a short timespam.
Wayback machine doesn't supports archivng any url too many times in a short period of time. Wayback machine doesn't supports archivng any url too many times in a short period of time.
""" """
class ArchivingNotAllowed(Exception): class ArchivingNotAllowed(Exception):
"""
Files like robots.txt are set to deny robot archiving. """Files like robots.txt are set to deny robot archiving.
Wayback machine respects these file, will not archive. Wayback machine respects these file, will not archive.
""" """
@@ -32,6 +32,11 @@ class BadGateWay(Exception):
Raised when 502 bad gateway. Raised when 502 bad gateway.
""" """
class WaybackUnavailable(Exception):
"""
Raised when 503 API Service Temporarily Unavailable.
"""
class InvalidUrl(Exception): class InvalidUrl(Exception):
""" """
Raised when url doesn't follow the standard url format. Raised when url doesn't follow the standard url format.

View File

@@ -1,64 +1,125 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json
from datetime import datetime from datetime import datetime
from waybackpy.exceptions import * from waybackpy.exceptions import TooManyArchivingRequests, ArchivingNotAllowed, PageNotSaved, ArchiveNotFound, UrlNotFound, BadGateWay, InvalidUrl, WaybackUnavailable
try: try:
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.error import HTTPError from urllib.error import HTTPError, URLError
except ImportError: except ImportError:
from urllib2 import Request, urlopen, HTTPError from urllib2 import Request, urlopen, HTTPError, URLError
default_UA = "waybackpy python package" default_UA = "waybackpy python package"
def url_check(url):
if "." not in url:
raise InvalidUrl("'%s' is not a vaild url." % url)
def clean_url(url): def clean_url(url):
return str(url).strip().replace(" ","_") return str(url).strip().replace(" ","_")
def save(url,UA=default_UA): def wayback_timestamp(**kwargs):
base_save_url = "https://web.archive.org/save/" return (
request_url = (base_save_url + clean_url(url)) str(kwargs["year"])
hdr = { 'User-Agent' : '%s' % UA } +
req = Request(request_url, headers=hdr) str(kwargs["month"]).zfill(2)
if "." not in url: +
raise InvalidUrl("'%s' is not a vaild url." % url) str(kwargs["day"]).zfill(2)
try: +
response = urlopen(req) #nosec str(kwargs["hour"]).zfill(2)
except HTTPError as e: +
str(kwargs["minute"]).zfill(2)
)
def handle_HTTPError(e):
if e.code == 502: if e.code == 502:
raise BadGateWay(e) raise BadGateWay(e)
elif e.code == 503:
raise WaybackUnavailable(e)
elif e.code == 429: elif e.code == 429:
raise TooManyArchivingRequests(e) raise TooManyArchivingRequests(e)
elif e.code == 404: elif e.code == 404:
raise UrlNotFound(e) raise UrlNotFound(e)
else:
def save(url, UA=default_UA):
url_check(url)
request_url = ("https://web.archive.org/save/" + clean_url(url))
hdr = { 'User-Agent' : '%s' % UA } #nosec
req = Request(request_url, headers=hdr) #nosec
try:
response = urlopen(req) #nosec
except HTTPError as e:
if handle_HTTPError(e) is None:
raise PageNotSaved(e) raise PageNotSaved(e)
except URLError:
try:
response = urlopen(req) #nosec
except URLError as e:
raise UrlNotFound(e)
header = response.headers header = response.headers
if "exclusion.robots.policy" in str(header): if "exclusion.robots.policy" in str(header):
raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." % (url)) raise ArchivingNotAllowed("Can not archive %s. Disabled by site owner." % (url))
archive_id = header['Content-Location']
archived_url = "https://web.archive.org" + archive_id
return archived_url
def near( return "https://web.archive.org" + header['Content-Location']
url,
year=datetime.utcnow().strftime('%Y'), def get(url, encoding=None, UA=default_UA):
month=datetime.utcnow().strftime('%m'), url_check(url)
day=datetime.utcnow().strftime('%d'), hdr = { 'User-Agent' : '%s' % UA }
hour=datetime.utcnow().strftime('%H'), req = Request(clean_url(url), headers=hdr) #nosec
minute=datetime.utcnow().strftime('%M'),
UA=default_UA, try:
): resp=urlopen(req) #nosec
timestamp = str(year)+str(month)+str(day)+str(hour)+str(minute) except URLError:
try:
resp=urlopen(req) #nosec
except URLError as e:
raise UrlNotFound(e)
if encoding is None:
try:
encoding= resp.headers['content-type'].split('charset=')[-1]
except AttributeError:
encoding = "UTF-8"
return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))
def near(url, **kwargs):
try:
url = kwargs["url"]
except KeyError:
url = url
year=kwargs.get("year", datetime.utcnow().strftime('%Y'))
month=kwargs.get("month", datetime.utcnow().strftime('%m'))
day=kwargs.get("day", datetime.utcnow().strftime('%d'))
hour=kwargs.get("hour", datetime.utcnow().strftime('%H'))
minute=kwargs.get("minute", datetime.utcnow().strftime('%M'))
UA=kwargs.get("UA", default_UA)
url_check(url)
timestamp = wayback_timestamp(year=year,month=month,day=day,hour=hour,minute=minute)
request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (clean_url(url), str(timestamp)) request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (clean_url(url), str(timestamp))
hdr = { 'User-Agent' : '%s' % UA } hdr = { 'User-Agent' : '%s' % UA }
req = Request(request_url, headers=hdr) req = Request(request_url, headers=hdr) # nosec
try:
response = urlopen(req) #nosec response = urlopen(req) #nosec
import json except HTTPError as e:
data = json.loads(response.read().decode('utf8')) handle_HTTPError(e)
data = json.loads(response.read().decode("UTF-8"))
if not data["archived_snapshots"]: if not data["archived_snapshots"]:
raise ArchiveNotFound("'%s' is not yet archived." % url) raise ArchiveNotFound("'%s' is not yet archived." % url)
archive_url = (data["archived_snapshots"]["closest"]["url"]) archive_url = (data["archived_snapshots"]["closest"]["url"])
# wayback machine returns http sometimes, idk why? But they support https
archive_url = archive_url.replace("http://web.archive.org/web/","https://web.archive.org/web/",1)
return archive_url return archive_url
def oldest(url, UA=default_UA, year=1994): def oldest(url, UA=default_UA, year=1994):
@@ -66,3 +127,17 @@ def oldest(url,UA=default_UA,year=1994):
def newest(url, UA=default_UA): def newest(url, UA=default_UA):
return near(url, UA=UA) return near(url, UA=UA)
def total_archives(url, UA=default_UA):
url_check(url)
hdr = { 'User-Agent' : '%s' % UA }
request_url = "https://web.archive.org/cdx/search/cdx?url=%s&output=json" % clean_url(url)
req = Request(request_url, headers=hdr) # nosec
try:
response = urlopen(req) #nosec
except HTTPError as e:
handle_HTTPError(e)
return (len(json.loads(response.read())))