Compare commits

...

118 Commits
v1.1 ... 2.0.2

Author SHA1 Message Date
f7de8f5575 sleeps to prevent too many requests in a timeframe 2020-07-18 19:25:19 +05:30
3fa0c32064 V2.0.1 link 2020-07-18 19:09:18 +05:30
aa1e3b8825 V2.0.1 2020-07-18 19:08:39 +05:30
58d2d585c8 No timeout for final try 2020-07-18 18:29:41 +05:30
e8efed2e2f Update test_1.py 2020-07-18 17:24:54 +05:30
49089b7321 2.0.0 link 2020-07-18 17:09:07 +05:30
55d8687566 Update test_1.py 2020-07-18 16:58:23 +05:30
0fa28527af Update index.rst 2020-07-18 16:54:07 +05:30
68259fd2d9 Update index.rst 2020-07-18 16:53:27 +05:30
e7086a89d3 Update index.rst 2020-07-18 16:52:37 +05:30
e39467227c Update index.rst 2020-07-18 16:51:47 +05:30
ba840404cf Update index.rst 2020-07-18 16:50:37 +05:30
8fbd2d9e55 Update index.rst 2020-07-18 16:49:03 +05:30
eebf6043de Update index.rst 2020-07-18 16:48:29 +05:30
3d3b09d6d8 Update README.md 2020-07-18 16:46:40 +05:30
ef15b5863c Update index.rst 2020-07-18 16:44:32 +05:30
256c0cdb6b update test - save 2020-07-18 16:39:35 +05:30
12c72a8294 fix link 2020-07-18 16:30:20 +05:30
0ad27f5ecc update readme for newer oop and some test changes (#12)
* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* docstrings

* user agent ; more variants

* description update

* Update __init__.py

* # -*- coding: utf-8 -*-

* Update test_1.py

* update docs for get()

* Update README.md
2020-07-18 16:22:09 +05:30
700b60b5f8 Update README.md 2020-07-18 08:16:59 +05:30
11032596c8 Update README.md 2020-07-18 08:15:43 +05:30
9727f92168 Update README.md 2020-07-18 08:12:33 +05:30
d2893fec13 Delete CONTRIBUTING.md 2020-07-18 08:12:00 +05:30
f1353b2129 Update CONTRIBUTING.md 2020-07-18 00:58:50 +05:30
c76a95ef90 Create CONTRIBUTING.md (#11) 2020-07-18 00:57:48 +05:30
62d88359ce Update README.md 2020-07-18 00:40:21 +05:30
9942c474c9 Update README.md 2020-07-18 00:35:12 +05:30
dfb736e794 Size 2020-07-18 00:32:00 +05:30
84d1766917 Update README.md 2020-07-18 00:20:58 +05:30
9d3cdfafb3 Update README.md 2020-07-18 00:20:17 +05:30
20a16bfa45 Version 2.0.0 on it's way for release (tommorow) 2020-07-18 00:09:28 +05:30
f2112c73f6 Python 2 support 2020-07-17 21:08:32 +05:30
9860527d96 OOP (#10)
* Update wrapper.py

* Update exceptions.py

* Update __init__.py

* test adjusted for new changes

* Update wrapper.py
2020-07-17 20:50:00 +05:30
9ac1e877c8 Update README.md 2020-07-16 20:39:12 +05:30
f881705d00 detecet python version whith sys.version_info (#9) 2020-06-26 15:48:01 +05:30
f015c3f4f3 test on the worst case possible 2020-05-08 09:56:01 +05:30
42ac399362 Most efficient method to count (yet) 2020-05-08 09:47:13 +05:30
e9d010c793 just count the status code, consumes less memory 2020-05-08 09:28:18 +05:30
58a6409528 v1.6 2020-05-07 20:14:59 +05:30
7ca2029158 Update setup.py 2020-05-07 20:14:40 +05:30
80331833f2 Update setup.py 2020-05-07 20:12:32 +05:30
5e3d3a815f fix 2020-05-07 20:03:17 +05:30
6182a18cf4 fix 2020-05-07 20:02:47 +05:30
9bca750310 v1.5 2020-05-07 19:59:23 +05:30
c22749a6a3 update 2020-05-07 19:54:00 +05:30
151df94fe3 license_file = LICENSE 2020-05-07 19:38:19 +05:30
24540d0b2c update 2020-05-07 19:33:39 +05:30
bdfc72d05d Create __version__.py 2020-05-07 19:16:26 +05:30
3b104c1a28 v1.5 2020-05-07 19:03:02 +05:30
fb0d4658a7 ce 2020-05-07 19:02:12 +05:30
48833980e1 update 2020-05-07 18:58:01 +05:30
0c4f119981 Update wrapper.py 2020-05-07 17:25:34 +05:30
afded51a04 Update wrapper.py 2020-05-07 17:20:23 +05:30
b950616561 Update wrapper.py 2020-05-07 17:17:17 +05:30
444675538f fix code Complexity (#8)
* fix code Complexity

* Update wrapper.py

* codefactor badge
2020-05-07 16:51:08 +05:30
0ca6710334 Update wrapper.py 2020-05-07 16:24:33 +05:30
01a7c591ad retry 2020-05-07 15:46:39 +05:30
74d3bc154b fix issue with py2.7 2020-05-07 15:34:41 +05:30
a8e94dfb25 Update README.md 2020-05-07 15:14:55 +05:30
cc38798b32 Update README.md 2020-05-07 15:14:30 +05:30
bc3dd44f27 Update README.md 2020-05-07 15:13:58 +05:30
ba46cdafe2 Update README.md 2020-05-07 15:12:37 +05:30
538afb14e9 Update test_1.py 2020-05-07 15:06:52 +05:30
7605b614ee test for total_archives() 2020-05-07 15:00:28 +05:30
d0a4e25cf5 Update __init__.py 2020-05-07 14:53:09 +05:30
8c5c0153da + total_archives() 2020-05-07 14:52:05 +05:30
e7dac74906 Update __init__.py 2020-05-07 09:06:49 +05:30
c686708c9e more testing 2020-05-07 08:59:09 +05:30
f9ae8ada70 Update test_1.py 2020-05-07 08:39:24 +05:30
e56ece3dc9 Update README.md 2020-05-07 08:23:31 +05:30
db127a5c54 always return https 2020-05-06 20:16:25 +05:30
ed497bbd23 Update wrapper.py 2020-05-06 20:07:25 +05:30
45fe07ddb6 Update wrapper.py 2020-05-06 19:35:01 +05:30
0029d63d8a 503 API Service Temporarily Unavailable 2020-05-06 19:22:56 +05:30
beb5b625ec Set theme jekyll-theme-cayman 2020-05-06 12:20:43 +05:30
b40d734346 Update README.md 2020-05-06 09:18:02 +05:30
be0a30de85 Create index.rst 2020-05-05 20:22:46 +05:30
3a65a60bd6 Update README.md 2020-05-05 19:08:26 +05:30
7b626f5ea5 Update README.md 2020-05-05 17:54:38 +05:30
73371d6c68 Update README.md 2020-05-05 17:49:23 +05:30
8904ba4d67 Update README.md 2020-05-05 17:47:55 +05:30
b4a7f7ea6f Update README.md 2020-05-05 17:47:00 +05:30
a2ead04021 Update README.md 2020-05-05 17:44:12 +05:30
3513feb075 Update __init__.py 2020-05-05 17:37:38 +05:30
d34b98373f Update setup.py 2020-05-05 17:37:16 +05:30
38f3b81742 Update .travis.yml 2020-05-05 17:27:58 +05:30
660a826aed Update .travis.yml 2020-05-05 17:21:36 +05:30
a52d035c0e Update .travis.yml 2020-05-05 17:19:24 +05:30
6737ce0e26 Create .travis.yml 2020-05-05 17:14:57 +05:30
98cc918c8f Update test_1.py 2020-05-05 17:10:33 +05:30
b103bfc6e4 Create test_1.py 2020-05-05 16:29:55 +05:30
edd05838b8 v1.3 2020-05-05 11:29:22 +05:30
031212e161 v1.3 2020-05-05 11:28:58 +05:30
d3bd5b05b5 Update setup.py 2020-05-05 10:50:09 +05:30
d6598a67b9 Update setup.py 2020-05-05 10:40:23 +05:30
e5a6057249 Update setup.py 2020-05-05 10:39:10 +05:30
2a1b3bc6ee Update setup.py 2020-05-05 10:36:05 +05:30
b4ca98eca2 Update setup.py 2020-05-05 10:32:06 +05:30
36b01754ec Update setup.py 2020-05-05 10:23:38 +05:30
3d8bf4eec6 Update setup.py 2020-05-05 10:22:54 +05:30
e7761b3709 Update README.md 2020-05-05 10:21:08 +05:30
df851dce0c Update setup.py 2020-05-05 10:16:15 +05:30
f5acbcfc95 Update exceptions.py 2020-05-05 10:07:27 +05:30
44156e5e7e Update exceptions.py 2020-05-05 10:05:47 +05:30
a6cb955669 Update wrapper.py 2020-05-05 10:04:40 +05:30
8acb14a243 Update wrapper.py 2020-05-05 10:00:29 +05:30
7d434c3f0f Update wrapper.py 2020-05-05 09:57:39 +05:30
057c61d677 Update wrapper.py 2020-05-05 09:48:39 +05:30
6705c04f38 Update wrapper.py 2020-05-05 09:43:13 +05:30
e631c0aadb Update README.md 2020-05-05 09:37:53 +05:30
423782ea75 Update README.md 2020-05-05 09:36:11 +05:30
7944f0878d Add .whitesource configuration file (#6)
Co-authored-by: whitesource-bolt-for-github[bot] <42819689+whitesource-bolt-for-github[bot]@users.noreply.github.com>
2020-05-05 09:33:50 +05:30
850b055527 Update README.md 2020-05-05 09:31:43 +05:30
32bc765113 Update README.md (#5)
* Update README.md

* Update README.md

* Update README.md
2020-05-05 09:27:02 +05:30
09b4ba2649 Version 1.2 with bug fixes and support for webpage retrieval (#4) 2020-05-05 09:03:16 +05:30
929790feca Update README.md (#1)
Add usage/ documentaion
2020-05-04 21:06:00 +05:30
09a521ae43 Create setup.cfg 2020-05-04 16:23:00 +05:30
a503be5a86 Create setup.py 2020-05-04 16:21:24 +05:30
12 changed files with 759 additions and 96 deletions

14
.travis.yml Normal file
View File

@ -0,0 +1,14 @@
# Travis CI configuration: run the pytest suite on CPython 2.7, 3.6 and 3.8.
language: python
python:
  - "2.7"
  - "3.6"
  - "3.8"
os: linux
dist: xenial
cache: pip
install:
  - pip install pytest
before_script:
  # FIX: the entry needs the "- " list marker; a bare scalar on its own
  # line under "before_script:" is not valid YAML.
  - cd tests
script:
  - pytest test_1.py

8
.whitesource Normal file
View File

@ -0,0 +1,8 @@
{
"checkRunSettings": {
"vulnerableCheckRunConclusionLevel": "failure"
},
"issueSettings": {
"minSeverityLevel": "LOW"
}
}

144
README.md
View File

@ -1,2 +1,142 @@
# pywayback
A python wrapper for Internet Archive's Wayback Machine
# waybackpy
[![Build Status](https://img.shields.io/travis/akamhy/waybackpy.svg?label=Travis%20CI&logo=travis&style=flat-square)](https://travis-ci.org/akamhy/waybackpy)
[![Downloads](https://img.shields.io/pypi/dm/waybackpy.svg)](https://pypistats.org/packages/waybackpy)
[![Release](https://img.shields.io/github/v/release/akamhy/waybackpy.svg)](https://github.com/akamhy/waybackpy/releases)
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65)](https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=akamhy/waybackpy&amp;utm_campaign=Badge_Grade)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
[![Maintainability](https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability)](https://codeclimate.com/github/akamhy/waybackpy/maintainability)
[![CodeFactor](https://www.codefactor.io/repository/github/akamhy/waybackpy/badge)](https://www.codefactor.io/repository/github/akamhy/waybackpy)
[![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/)
![pypi](https://img.shields.io/pypi/v/waybackpy.svg)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/akamhy/waybackpy/graphs/commit-activity)
[![codecov](https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg)](https://codecov.io/gh/akamhy/waybackpy)
![](https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square)
![contributions welcome](https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square)
![Internet Archive](https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png)
![Wayback Machine](https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png)
Waybackpy is a Python library that interfaces with the [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine) API. Archive pages and retrieve archived pages easily.
Table of contents
=================
<!--ts-->
* [Installation](#installation)
* [Usage](#usage)
* [Saving an url using save()](#capturing-aka-saving-an-url-using-save)
* [Receiving the oldest archive for an URL Using oldest()](#receiving-the-oldest-archive-for-an-url-using-oldest)
* [Receiving the recent most/newest archive for an URL using newest()](#receiving-the-newest-archive-for-an-url-using-newest)
* [Receiving archive close to a specified year, month, day, hour, and minute using near()](#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
* [Get the content of webpage using get()](#get-the-content-of-webpage-using-get)
* [Count total archives for an URL using total_archives()](#count-total-archives-for-an-url-using-total_archives)
* [Tests](#tests)
* [Dependency](#dependency)
* [License](#license)
<!--te-->
## Installation
Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)):
```bash
pip install waybackpy
```
## Usage
#### Capturing aka Saving an url Using save()
```python
import waybackpy
# Capturing a new archive on Wayback machine.
target_url = waybackpy.Url("https://github.com/akamhy/waybackpy", user_agent="My-cool-user-agent")
archived_url = target_url.save()
print(archived_url)
```
This should print an URL similar to the following archived URL:
> <https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy>
#### Receiving the oldest archive for an URL Using oldest()
```python
import waybackpy
# retrieving the oldest archive on Wayback machine.
target_url = waybackpy.Url("https://www.google.com/", "My-cool-user-agent")
oldest_archive = target_url.oldest()
print(oldest_archive)
```
This should print the oldest available archive for <https://google.com>.
> <http://web.archive.org/web/19981111184551/http://google.com:80/>
#### Receiving the newest archive for an URL using newest()
```python
import waybackpy
# retrieving the newest/latest archive on Wayback machine.
target_url = waybackpy.Url(url="https://www.google.com/", user_agent="My-cool-user-agent")
newest_archive = target_url.newest()
print(newest_archive)
```
This prints the newest available archive for <https://www.microsoft.com/en-us>, something just like this:
> <http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/>
#### Receiving archive close to a specified year, month, day, hour, and minute using near()
```python
import waybackpy
# retrieving the closest archive from a specified year.
# supported arguments are year, month, day, hour and minute
target_url = waybackpy.Url("https://www.facebook.com/", "Any-User-Agent")
archive_near_year = target_url.near(year=2010)
print(archive_near_year)
```
returns : <http://web.archive.org/web/20100504071154/http://www.facebook.com/>
> Please note that if you only specify the year, the current month and day are used as default arguments for month and day respectively. Passing only the year will not return the archive closest to January but to the current month in which you are using the package. You need to specify month "1" for January, 2 for February and so on.
> Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
#### Get the content of webpage using get()
```python
import waybackpy
# retrieving the webpage from any url including the archived urls. No need to import other libraries :)
# supported arguments: encoding and user_agent
target = waybackpy.Url("google.com", "any-user_agent")
oldest_url = target.oldest()
webpage = target.get(oldest_url) # We are getting the source of oldest archive of google.com.
print(webpage)
```
> This should print the source code for the oldest archive of google.com. If no URL is passed in get() then it should retrieve the source code of google.com and not any archive.
#### Count total archives for an URL using total_archives()
```python
from waybackpy import Url
# retrieving the content of a webpage from any url including but not limited to the archived urls.
count = Url("https://en.wikipedia.org/wiki/Python (programming language)", "User-Agent").total_archives()
print(count)
```
> This should print an integer (int), which is the number of total archives on archive.org
## Tests
* [Here](https://github.com/akamhy/waybackpy/tree/master/tests)
## Dependency
* None, just python standard libraries (re, json, urllib and datetime). Both python 2 and 3 are supported :)
## License
[MIT License](https://github.com/akamhy/waybackpy/blob/master/LICENSE)

1
_config.yml Normal file
View File

@ -0,0 +1 @@
theme: jekyll-theme-cayman

225
index.rst Normal file
View File

@ -0,0 +1,225 @@
waybackpy
=========
|Build Status| |Downloads| |Release| |Codacy Badge| |License: MIT|
|Maintainability| |CodeFactor| |made-with-python| |pypi| |PyPI - Python
Version| |Maintenance| |codecov| |image1| |contributions welcome|
.. |Build Status| image:: https://img.shields.io/travis/akamhy/waybackpy.svg?label=Travis%20CI&logo=travis&style=flat-square
:target: https://travis-ci.org/akamhy/waybackpy
.. |Downloads| image:: https://img.shields.io/pypi/dm/waybackpy.svg
:target: https://pypistats.org/packages/waybackpy
.. |Release| image:: https://img.shields.io/github/v/release/akamhy/waybackpy.svg
:target: https://github.com/akamhy/waybackpy/releases
.. |Codacy Badge| image:: https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65
:target: https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
.. |Maintainability| image:: https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability
:target: https://codeclimate.com/github/akamhy/waybackpy/maintainability
.. |CodeFactor| image:: https://www.codefactor.io/repository/github/akamhy/waybackpy/badge
:target: https://www.codefactor.io/repository/github/akamhy/waybackpy
.. |made-with-python| image:: https://img.shields.io/badge/Made%20with-Python-1f425f.svg
:target: https://www.python.org/
.. |pypi| image:: https://img.shields.io/pypi/v/waybackpy.svg
.. |PyPI - Python Version| image:: https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square
.. |Maintenance| image:: https://img.shields.io/badge/Maintained%3F-yes-green.svg
:target: https://github.com/akamhy/waybackpy/graphs/commit-activity
.. |codecov| image:: https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg
:target: https://codecov.io/gh/akamhy/waybackpy
.. |image1| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
.. |contributions welcome| image:: https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square
|Internet Archive| |Wayback Machine|
Waybackpy is a Python library that interfaces with the `Internet
Archive`_\ s `Wayback Machine`_ API. Archive pages and retrieve
archived pages easily.
.. _Internet Archive: https://en.wikipedia.org/wiki/Internet_Archive
.. _Wayback Machine: https://en.wikipedia.org/wiki/Wayback_Machine
.. |Internet Archive| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png
.. |Wayback Machine| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png
Table of contents
=================
.. raw:: html
<!--ts-->
- `Installation`_
- `Usage`_
- `Saving an url using save()`_
- `Receiving the oldest archive for an URL Using oldest()`_
- `Receiving the recent most/newest archive for an URL using
newest()`_
- `Receiving archive close to a specified year, month, day, hour,
and minute using near()`_
- `Get the content of webpage using get()`_
- `Count total archives for an URL using total_archives()`_
- `Tests`_
- `Dependency`_
- `License`_
.. raw:: html
<!--te-->
.. _Installation: #installation
.. _Usage: #usage
.. _Saving an url using save(): #capturing-aka-saving-an-url-using-save
.. _Receiving the oldest archive for an URL Using oldest(): #receiving-the-oldest-archive-for-an-url-using-oldest
.. _Receiving the recent most/newest archive for an URL using newest(): #receiving-the-newest-archive-for-an-url-using-newest
.. _Receiving archive close to a specified year, month, day, hour, and minute using near(): #receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near
.. _Get the content of webpage using get(): #get-the-content-of-webpage-using-get
.. _Count total archives for an URL using total_archives(): #count-total-archives-for-an-url-using-total_archives
.. _Tests: #tests
.. _Dependency: #dependency
.. _License: #license
Installation
------------
Using `pip`_:
.. code:: bash
pip install waybackpy
Usage
-----
Capturing aka Saving an url Using save()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
# Capturing a new archive on Wayback machine.
target_url = waybackpy.Url("https://github.com/akamhy/waybackpy", user_agent="My-cool-user-agent")
archived_url = target_url.save()
print(archived_url)
This should print an URL similar to the following archived URL:
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
.. _pip: https://en.wikipedia.org/wiki/Pip_(package_manager)
Receiving the oldest archive for an URL Using oldest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
# retrieving the oldest archive on Wayback machine.
target_url = waybackpy.Url("https://www.google.com/", "My-cool-user-agent")
oldest_archive = target_url.oldest()
print(oldest_archive)
This should print the oldest available archive for https://google.com.
http://web.archive.org/web/19981111184551/http://google.com:80/
Receiving the newest archive for an URL using newest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
# retrieving the newest/latest archive on Wayback machine.
target_url = waybackpy.Url(url="https://www.google.com/", user_agent="My-cool-user-agent")
newest_archive = target_url.newest()
print(newest_archive)
This prints the newest available archive for
https://www.microsoft.com/en-us, something just like this:
http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/
Receiving archive close to a specified year, month, day, hour, and minute using near()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
# retrieving the closest archive from a specified year.
# supported arguments are year, month, day, hour and minute
target_url = waybackpy.Url("https://www.facebook.com/", "Any-User-Agent")
archive_near_year = target_url.near(year=2010)
print(archive_near_year)
returns :
http://web.archive.org/web/20100504071154/http://www.facebook.com/
Please note that if you only specify the year, the current month and
day are used as default arguments for month and day respectively.
Passing only the year will not return the archive closest to January
but to the current month in which you are using the package. You need
to specify month "1" for January, 2 for February and so on.
..
Do not pad (don't use zeros in the month, year, day, minute, and hour
arguments). e.g. For January, set month = 1 and not month = 01.
Get the content of webpage using get()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
# retrieving the webpage from any url including the archived urls. No need to import other libraries :)
# supported arguments: encoding and user_agent
target = waybackpy.Url("google.com", "any-user_agent")
oldest_url = target.oldest()
webpage = target.get(oldest_url) # We are getting the source of oldest archive of google.com.
print(webpage)
..
This should print the source code for the oldest archive of google.com.
If no URL is passed in get() then it should retrieve the source code
of google.com and not any archive.
Count total archives for an URL using total_archives()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
from waybackpy import Url
# retrieving the content of a webpage from any url including but not limited to the archived urls.
count = Url("https://en.wikipedia.org/wiki/Python (programming language)", "User-Agent").total_archives()
print(count)
..
This should print an integer (int), which is the number of total
archives on archive.org
Tests
-----
- `Here`_
Dependency
----------
- None, just python standard libraries (re, json, urllib and datetime).
Both python 2 and 3 are supported :)
License
-------
`MIT License`_
.. _Here: https://github.com/akamhy/waybackpy/tree/master/tests
.. _MIT License: https://github.com/akamhy/waybackpy/blob/master/LICENSE

3
setup.cfg Normal file
View File

@ -0,0 +1,3 @@
[metadata]
description-file = README.md
license_file = LICENSE

49
setup.py Normal file
View File

@ -0,0 +1,49 @@
# Packaging script for waybackpy.
import os.path
from setuptools import setup

# Directory containing this script; metadata files are resolved against it.
here = os.path.dirname(__file__)

# PyPI long description is the README shipped alongside this file.
with open(os.path.join(here, 'README.md')) as f:
    long_description = f.read()

# Single source of truth for metadata: waybackpy/__version__.py is executed
# into a plain dict rather than imported (avoids importing the package itself
# during installation).
about = {}
with open(os.path.join(here, 'waybackpy', '__version__.py')) as f:
    exec(f.read(), about)

setup(
    name=about['__title__'],
    packages=['waybackpy'],
    version=about['__version__'],
    description=about['__description__'],
    long_description=long_description,
    long_description_content_type='text/markdown',
    license=about['__license__'],
    author=about['__author__'],
    author_email=about['__author_email__'],
    url=about['__url__'],
    download_url='https://github.com/akamhy/waybackpy/archive/2.0.1.tar.gz',
    keywords=['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
    install_requires=[],
    python_requires=">=2.7",
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Natural Language :: English',
        'Topic :: Software Development :: Build Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: Implementation :: CPython',
    ],
    project_urls={
        'Documentation': 'https://waybackpy.readthedocs.io',
        'Source': 'https://github.com/akamhy/waybackpy',
    },
)

134
tests/test_1.py Normal file
View File

@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
# Test suite for waybackpy; hits the live Wayback Machine API, so every test
# sleeps to avoid rate limiting.
import sys
# Make the repository root importable so the in-tree waybackpy package is used.
sys.path.append("..")
import waybackpy
import pytest
import random
import time
# Default User-Agent header value shared by most tests in this module.
user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
def test_clean_url():
    """clean_url() strips surrounding whitespace and replaces inner spaces with underscores."""
    time.sleep(10)  # throttle: avoid hammering the Wayback Machine API
    raw = " https://en.wikipedia.org/wiki/Network security "
    expected = "https://en.wikipedia.org/wiki/Network_security"
    cleaned = waybackpy.Url(raw, user_agent).clean_url()
    assert cleaned == expected
def test_url_check():
    """Constructing a Url from a malformed URL must raise."""
    time.sleep(10)  # throttle: avoid hammering the Wayback Machine API
    bad_url = "http://wwwgooglecom/"
    with pytest.raises(Exception):
        waybackpy.Url(bad_url, user_agent)
def test_save():
    """save() archives a live URL; malformed, robots-blocked and
    non-existent URLs must raise.

    Bug fix: the non-existent-domain case previously constructed the Url
    with ``url3`` (the robots.txt URL tested just above) instead of
    ``url4``, so the intended input was never exercised.
    """
    time.sleep(10)  # throttle: avoid hammering the Wayback Machine API
    # URLs that exist and are allowed to be archived.
    url_list = [
        "en.wikipedia.org",
        "www.wikidata.org",
        "commons.wikimedia.org",
        "www.wiktionary.org",
        "www.w3schools.com",
        "www.youtube.com",
    ]
    url1 = random.choice(url_list)  # idiomatic replacement for randint + index
    target = waybackpy.Url(url1, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36")
    archived_url1 = target.save()
    assert url1 in archived_url1
    if sys.version_info > (3, 6):
        # Malformed URL: must raise on construction.
        with pytest.raises(Exception):
            url2 = "ha ha ha ha"
            waybackpy.Url(url2, user_agent)
        time.sleep(5)
        # URL whose robots.txt forbids archiving: must raise on save().
        with pytest.raises(Exception):
            url3 = "http://www.archive.is/faq.html"
            target = waybackpy.Url(url3, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0")
            target.save()
        time.sleep(5)
        # Non-existent domain: must raise on save().
        with pytest.raises(Exception):
            url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
            # BUG FIX: was waybackpy.Url(url3, ...), which re-tested the
            # robots.txt URL instead of the non-existent domain.
            target = waybackpy.Url(url4, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27")
            target.save()
def test_near():
    """near() returns the archive closest to the requested timestamp parts."""
    time.sleep(10)  # throttle: avoid hammering the Wayback Machine API
    target = waybackpy.Url("google.com", "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4")
    assert "2010" in target.near(year=2010)
    if sys.version_info > (3, 6):
        time.sleep(5)
        # Year + month: accept the adjacent months as well.
        near_month = target.near(year=2015, month=2)
        assert any(stamp in near_month for stamp in ("201501", "201502", "201503"))
        # Year + month + day + hour: accept the adjacent hours as well.
        target = waybackpy.Url("www.python.org", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246")
        near_hour = target.near(year=2008, month=5, day=9, hour=15)
        assert any(stamp in near_hour for stamp in ("2008050913", "2008050914", "2008050915"))
        # A URL that was never archived must raise.
        with pytest.raises(Exception):
            never_archived = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
            waybackpy.Url(never_archived, user_agent).near(year=2010)
def test_oldest():
    """The oldest capture of the waybackpy repo is the known 2020-05-04 snapshot."""
    time.sleep(10)  # throttle: avoid hammering the Wayback Machine API
    repo = waybackpy.Url("github.com/akamhy/waybackpy", user_agent)
    assert "20200504141153" in repo.oldest()
def test_newest():
    """newest() must return an archive URL that contains the original URL."""
    time.sleep(10)  # throttle: avoid hammering the Wayback Machine API
    repo_url = "github.com/akamhy/waybackpy"
    assert repo_url in waybackpy.Url(repo_url, user_agent).newest()
def test_get():
    """get() on the oldest google.com archive returns that page's source."""
    time.sleep(10)  # throttle: avoid hammering the Wayback Machine API
    target = waybackpy.Url("google.com", user_agent)
    oldest_snapshot = target.oldest()
    assert "Welcome to Google" in target.get(oldest_snapshot)
def test_total_archives():
    """total_archives() is large for google.com and zero for a nonsense URL."""
    time.sleep(10)  # throttle: avoid hammering the Wayback Machine API
    if sys.version_info > (3, 6):
        popular = waybackpy.Url(" https://google.com ", user_agent)
        assert popular.total_archives() > 500000
    time.sleep(5)
    gibberish = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent)
    assert gibberish.total_archives() == 0
if __name__ == "__main__":
    # Run the suite sequentially when executed as a script (same order as
    # before), printing a "." after each test and "OK" at the end.
    checks = (
        test_clean_url,
        test_url_check,
        test_get,
        test_near,
        test_newest,
        test_save,
        test_oldest,
        test_total_archives,
    )
    for check in checks:
        check()
        print(".")
    print("OK")

View File

@ -1,6 +1,32 @@
# -*- coding: utf-8 -*-
from .wrapper import save, near, oldest, newest
__version__ = "1.1"
# ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
# ┃┗┛┗┛┃┗━┓┃━┃┃━┃┃┃┏━┓┃┗━┓┃━┃┏━┛┃┗┛┛┃┏━━┛┃┃━┃┃
# ┗┓┏┓┏┛┃┗┛┗┓┃┗━┛┃┃┗━┛┃┃┗┛┗┓┃┗━┓┃┏┓┓┃┃━━━┃┗━┛┃
# ━┗┛┗┛━┗━━━┛┗━┓┏┛┗━━━┛┗━━━┛┗━━┛┗┛┗┛┗┛━━━┗━┓┏┛
# ━━━━━━━━━━━┏━┛┃━━━━━━━━━━━━━━━━━━━━━━━━┏━┛┃━
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
__all__ = ['wrapper', 'exceptions']
"""
Waybackpy is a Python library that interfaces with the Internet Archive's Wayback Machine API.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Archive pages and retrieve archived pages easily.
Usage:
>>> import waybackpy
>>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
>>> new_archive = target_url.save()
>>> print(new_archive)
https://web.archive.org/web/20200502170312/https://www.python.org/
Full documentation @ <https://akamhy.github.io/waybackpy/>.
:copyright: (c) 2020 by akamhy.
:license: MIT
"""
from .wrapper import Url
from .__version__ import __title__, __description__, __url__, __version__
from .__version__ import __author__, __author_email__, __license__, __copyright__

10
waybackpy/__version__.py Normal file
View File

@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# Single source of truth for waybackpy's package metadata.
# setup.py exec()s this file into a dict to populate its setup() arguments,
# so keep it limited to simple string assignments.
__title__ = "waybackpy"
__description__ = "A Python library that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
__url__ = "https://akamhy.github.io/waybackpy/"
__version__ = "2.0.1"
__author__ = "akamhy"
__author_email__ = "akash3pro@gmail.com"
__license__ = "MIT"
__copyright__ = "Copyright 2020 akamhy"

View File

@ -1,38 +1,6 @@
# -*- coding: utf-8 -*-
class TooManyArchivingRequests(Exception):
class WaybackError(Exception):
"""
Error when a single url reqeusted for archiving too many times in a short timespam.
Wayback machine doesn't supports archivng any url too many times in a short period of time.
"""
class ArchivingNotAllowed(Exception):
"""
Files like robots.txt are set to deny robot archiving.
Wayback machine respects these file, will not archive.
"""
class PageNotSaved(Exception):
"""
When unable to save a webpage.
"""
class ArchiveNotFound(Exception):
"""
When a page was never archived but client asks for old archive.
"""
class UrlNotFound(Exception):
"""
Raised when 404 UrlNotFound.
"""
class BadGateWay(Exception):
"""
Raised when 502 bad gateway.
"""
class InvalidUrl(Exception):
"""
Raised when url doesn't follow the standard url format.
Raised when API Service error.
"""

View File

@ -1,68 +1,153 @@
# -*- coding: utf-8 -*-
import re
import sys
import json
from datetime import datetime
from waybackpy.exceptions import *
try:
from waybackpy.exceptions import WaybackError
if sys.version_info >= (3, 0): # If the python ver >= 3
from urllib.request import Request, urlopen
from urllib.error import HTTPError
except ImportError:
from urllib2 import Request, urlopen, HTTPError
from urllib.error import HTTPError, URLError
else: # For python2.x
from urllib2 import Request, urlopen, HTTPError, URLError
# Default User-Agent sent with every request unless the caller supplies one.
default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"


class Url():
    """waybackpy Url object.

    Wraps a target URL plus a User-Agent string and exposes the Wayback
    Machine operations on it: save(), get(), near()/oldest()/newest()
    and total_archives().
    """

    def __init__(self, url, user_agent=default_UA):
        """Store the URL and User-Agent; validates the URL immediately.

        :param url: the page to archive / look up.
        :param user_agent: UA header sent to archive.org endpoints.
        :raises URLError: if the URL fails the basic sanity check.
        """
        self.url = url
        self.user_agent = user_agent
        self.url_check()  # checks url validity on init.

    def __repr__(self):
        """Representation of the object."""
        return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)

    def __str__(self):
        """String representation of the object."""
        return "%s" % self.clean_url()

    def __len__(self):
        """Length of the (cleaned) URL."""
        return len(self.clean_url())

    def url_check(self):
        """Check for common URL problems.

        Only a heuristic: a URL without any dot cannot be a valid host.
        """
        if "." not in self.url:
            raise URLError("'%s' is not a valid url." % self.url)
        return True

    def clean_url(self):
        """Fix the URL, if possible (spaces are not legal in URLs)."""
        return str(self.url).strip().replace(" ", "_")

    def wayback_timestamp(self, **kwargs):
        """Return the formatted wayback timestamp (YYYYMMDDhhmmish).

        Expects year, month, day, hour and minute keyword arguments;
        every component except the year is zero-padded to two digits.
        """
        return (
            str(kwargs["year"])
            + str(kwargs["month"]).zfill(2)
            + str(kwargs["day"]).zfill(2)
            + str(kwargs["hour"]).zfill(2)
            + str(kwargs["minute"]).zfill(2)
        )

    def handle_HTTPError(self, e):
        """Handle some common HTTPErrors.

        NOTE(review): re-raising as HTTPError(e) does not match the
        urllib HTTPError constructor signature; kept as-is since callers
        may depend on the raised exception types.
        """
        if e.code == 404:
            raise HTTPError(e)
        if e.code >= 400:
            raise WaybackError(e)

    def save(self):
        """Create a new archive for the URL on the Wayback Machine.

        :returns: the full https://web.archive.org/web/<ts>/... URL of
                  the freshly created snapshot.
        :raises WaybackError: when the request fails or the response
                  carries no memento link.
        """
        request_url = "https://web.archive.org/save/" + self.clean_url()
        hdr = {'User-Agent': '%s' % self.user_agent}
        req = Request(request_url, headers=hdr)  # nosec
        try:
            response = urlopen(req, timeout=30)  # nosec
        except Exception:
            try:
                # Final try has no timeout: saving can legitimately be slow.
                response = urlopen(req)  # nosec
            except Exception as e:
                raise WaybackError(e)
        header = response.headers
        try:
            arch = re.search(
                r"rel=\"memento.*?web\.archive\.org(/web/[0-9]{14}/.*?)>",
                str(header)).group(1)
        except (KeyError, AttributeError) as e:
            # Fix: re.search() returns None when no memento link is present,
            # so .group(1) raises AttributeError — the original caught only
            # KeyError and let that escape uncaught.
            raise WaybackError(e)
        return "https://web.archive.org" + arch

    def get(self, url=None, user_agent=None, encoding=None):
        """Return the source code of the supplied URL.

        Auto detects the encoding from the Content-Type header if not
        supplied; defaults to UTF-8 when no charset can be determined.
        """
        if not url:
            url = self.clean_url()
        if not user_agent:
            user_agent = self.user_agent
        hdr = {'User-Agent': '%s' % user_agent}
        req = Request(url, headers=hdr)  # nosec
        try:
            resp = urlopen(req)  # nosec
        except URLError:
            # One retry before giving up; transient failures are common.
            try:
                resp = urlopen(req)  # nosec
            except URLError as e:
                raise HTTPError(e)
        if not encoding:
            try:
                # 'charset=' may be absent; split()[-1] then yields the
                # whole content-type (e.g. "text/html"), patched below.
                encoding = resp.headers['content-type'].split('charset=')[-1]
            except AttributeError:
                encoding = "UTF-8"
        return resp.read().decode(encoding.replace("text/html", "UTF-8", 1))

    def near(self, **kwargs):
        """Return the archived URL closest to the time supplied.

        Supported params are year, month, day, hour and minute; any
        parameter not supplied defaults to the current UTC time.
        :raises WaybackError: when no snapshot exists for the URL.
        """
        year = kwargs.get("year", datetime.utcnow().strftime('%Y'))
        month = kwargs.get("month", datetime.utcnow().strftime('%m'))
        day = kwargs.get("day", datetime.utcnow().strftime('%d'))
        hour = kwargs.get("hour", datetime.utcnow().strftime('%H'))
        minute = kwargs.get("minute", datetime.utcnow().strftime('%M'))
        timestamp = self.wayback_timestamp(
            year=year, month=month, day=day, hour=hour, minute=minute)
        request_url = "https://archive.org/wayback/available?url=%s&timestamp=%s" % (
            self.clean_url(), str(timestamp))
        hdr = {'User-Agent': '%s' % self.user_agent}
        req = Request(request_url, headers=hdr)  # nosec
        try:
            response = urlopen(req)  # nosec
        except Exception as e:
            # NOTE(review): if handle_HTTPError does not raise, `response`
            # below is unbound — preserved from the original control flow.
            self.handle_HTTPError(e)
        data = json.loads(response.read().decode("UTF-8"))
        if not data["archived_snapshots"]:
            # Fix: the original interpolated a bare `url` name that is not
            # defined in this method's scope (NameError at runtime).
            raise WaybackError("'%s' is not yet archived." % self.clean_url())
        archive_url = data["archived_snapshots"]["closest"]["url"]
        # wayback machine returns http sometimes, idk why? But they support https
        archive_url = archive_url.replace(
            "http://web.archive.org/web/", "https://web.archive.org/web/", 1)
        return archive_url

    def oldest(self, year=1994):
        """Return the oldest archive from Wayback Machine for the URL."""
        return self.near(year=year)

    def newest(self):
        """Return the newest archive on Wayback Machine for the URL.

        You may not always get the very newest archive because of
        Wayback Machine DB lag.
        """
        return self.near()

    def total_archives(self):
        """Return the total number of archives on Wayback Machine for the URL."""
        hdr = {'User-Agent': '%s' % self.user_agent}
        request_url = (
            "https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=statuscode"
            % self.clean_url())
        req = Request(request_url, headers=hdr)  # nosec
        try:
            response = urlopen(req)  # nosec
        except Exception as e:
            self.handle_HTTPError(e)
        # One CDX row per archive; counting commas in the raw JSON avoids
        # materialising the parsed list.
        return str(response.read()).count(",")