Compare commits

...

18 Commits
2.1.8 ... 2.2.0

Author SHA1 Message Date
925be7b17e V2.2.0 2020-10-17 17:10:46 +05:30
2b132456ac updated index.rst and minor docs updated. 2020-10-17 16:56:51 +05:30
50e3154a4e lint README.md 2020-10-17 12:01:49 +05:30
7aef50428f add link to the repo 2020-10-17 11:51:56 +05:30
d8ec0f5025 More pythonic code snippets in README (#36) 2020-10-17 11:49:27 +05:30
0a2f97c034 Update README, drop python 2 support
* Drop python 2 support

* updated docs

* added new docs
2020-10-16 22:37:32 +05:30
3e9cf23578 3.9 archive doesn't not exist yet. 2020-10-16 19:43:06 +05:30
7f927ec7be added tests for json and archive_url, updated broken tests (#34)
* added tests for json and archive_url, updated broken tests

* drop 2.7 support
2020-10-16 19:25:45 +05:30
9de6393cd5 Add support for JSON and archive_url (#33)
CLI support for JSON and archive_url attributes
2020-10-16 15:16:18 +05:30
91e7f65617 Fixing len() bug (#32)
* added class functionality

* Update wrapper.py

* style edits

* fixed bug with len() of url()

* fixing len() bug

* fixing len() bug

* squashing bug

* removed test notebook
2020-10-16 10:04:13 +05:30
d465454019 Adding attributes to Url class (#28)
* added class functionality

* Update wrapper.py

* style edits
2020-10-15 22:10:32 +05:30
1a81eb97fb lint 2020-10-03 16:58:11 +05:30
6b3b2e2a7d tests for newly added known_urls feature 2020-10-03 09:33:50 +05:30
82c65454e6 2.1.9 2020-10-03 01:34:15 +05:30
19710461b6 Update setup.py 2020-10-03 01:33:46 +05:30
a3661d6b85 Update index.rst 2020-10-03 01:33:15 +05:30
58375e4ef4 fix broken links 2020-10-03 01:31:28 +05:30
ea023e98da update 2020-10-03 01:22:51 +05:30
12 changed files with 543 additions and 222 deletions

View File

@ -3,7 +3,6 @@ os: linux
dist: xenial
cache: pip
python:
- 2.7
- 3.6
- 3.8
before_install:

View File

@ -1,6 +1,6 @@
MIT License
Copyright (c) 2020 akamhy
Copyright (c) 2020 Akash Mahanty (https://github.com/akamhy)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

190
README.md
View File

@ -17,7 +17,7 @@
![Wayback Machine](https://raw.githubusercontent.com/akamhy/waybackpy/master/assets/waybackpy-colored%20284.png)
Waybackpy is a Python library that interfaces with [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine) API. Archive webpages and retrieve archived webpages easily.
Waybackpy is a Python package that interfaces with [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine) API. Archive webpages and retrieve archived webpages easily.
Table of contents
=================
@ -27,20 +27,24 @@ Table of contents
* [Usage](#usage)
* [As a Python package](#as-a-python-package)
* [Saving an url](#capturing-aka-saving-an-url-using-save)
* [Saving a webpage](#capturing-aka-saving-an-url-using-save)
* [Retrieving archive](#retrieving-the-archive-for-an-url-using-archive_url)
* [Retrieving the oldest archive](#retrieving-the-oldest-archive-for-an-url-using-oldest)
* [Retrieving the recent most/newest archive](#retrieving-the-newest-archive-for-an-url-using-newest)
* [Retrieving the JSON response of availability API](#retrieving-the-json-reponse-for-the-avaliblity-api-request)
* [Retrieving archive close to a specified year, month, day, hour, and minute](#retrieving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
* [Get the content of webpage](#get-the-content-of-webpage-using-get)
* [Count total archives for an URL](#count-total-archives-for-an-url-using-total_archives)
* [List of URLs that Wayback Machine knows and has archived for a domain name](#retrieving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
* [List of URLs that Wayback Machine knows and has archived for a domain name](#list-of-urls-that-wayback-machine-knows-and-has-archived-for-a-domain-name)
* [As a Command-line tool](#with-the-command-line-interface)
* [Save](#save)
* [Oldest archive](#oldest-archive)
* [Newest archive](#newest-archive)
* [With the Command-line interface](#with-the-command-line-interface)
* [Saving webpage](#save)
* [Archive URL](#get-archive-url)
* [Oldest archive URL](#oldest-archive)
* [Newest archive URL](#newest-archive)
* [JSON response of API](#get-json-data-of-avaialblity-api)
* [Total archives](#total-number-of-archives)
* [Archive near a time](#archive-near-time)
* [Archive near specified time](#archive-near-time)
* [Get the source code](#get-the-source-code)
* [Fetch all the URLs that the Wayback Machine knows for a domain](#fetch-all-the-urls-that-the-wayback-machine-knows-for-a-domain)
@ -77,33 +81,49 @@ pip install git+https://github.com/akamhy/waybackpy.git
```python
import waybackpy
new_archive_url = waybackpy.Url(
url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
url = "https://en.wikipedia.org/wiki/Multivariable_calculus",
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
).save()
print(new_archive_url)
waybackpy_url_obj = waybackpy.Url(url, user_agent)
archive = waybackpy_url_obj.save()
print(archive)
```
```bash
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus
```
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPySaveExample></sub>
#### Retrieving the archive for an URL using archive_url
```python
import waybackpy
url = "https://www.google.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
waybackpy_url_obj = waybackpy.Url(url, user_agent)
archive_url = waybackpy_url_obj.archive_url
print(archive_url)
```
```bash
https://web.archive.org/web/20201016153320/https://www.google.com/
```
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyArchiveUrl></sub>
#### Retrieving the oldest archive for an URL using oldest()
```python
import waybackpy
oldest_archive_url = waybackpy.Url(
"https://www.google.com/",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
).oldest()
url = "https://www.google.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
waybackpy_url_obj = waybackpy.Url(url, user_agent)
oldest_archive_url = waybackpy_url_obj.oldest()
print(oldest_archive_url)
```
@ -118,78 +138,89 @@ http://web.archive.org/web/19981111184551/http://google.com:80/
```python
import waybackpy
newest_archive_url = waybackpy.Url(
"https://www.facebook.com/",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
).newest()
url = "https://www.facebook.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
waybackpy_url_obj = waybackpy.Url(url, user_agent)
newest_archive_url = waybackpy_url_obj.newest()
print(newest_archive_url)
```
```bash
https://web.archive.org/web/20200714013225/https://www.facebook.com/
https://web.archive.org/web/20201016150543/https://www.facebook.com/
```
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyNewestExample></sub>
#### Retrieving the JSON reponse for the avaliblity API request
```python
import waybackpy
url = "https://www.facebook.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
waybackpy_url_obj = waybackpy.Url(url, user_agent)
json_dict = waybackpy_url_obj.JSON
print(json_dict)
```
```javascript
{'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'available': True, 'url': 'http://web.archive.org/web/20201016150543/https://www.facebook.com/', 'timestamp': '20201016150543', 'status': '200'}}}
```
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyJSON></sub>
#### Retrieving archive close to a specified year, month, day, hour, and minute using near()
```python
from waybackpy import Url
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
github_url = "https://github.com/"
url = "https://github.com/"
github_wayback_obj = Url(github_url, user_agent)
waybackpy_url_obj = Url(url, user_agent)
# Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
```
```python
github_archive_near_2010 = github_wayback_obj.near(year=2010)
github_archive_near_2010 = waybackpy_url_obj.near(year=2010)
print(github_archive_near_2010)
```
```bash
https://web.archive.org/web/20100719134402/http://github.com/
https://web.archive.org/web/20101018053604/http://github.com:80/
```
```python
github_archive_near_2011_may = github_wayback_obj.near(year=2011, month=5)
github_archive_near_2011_may = waybackpy_url_obj.near(year=2011, month=5)
print(github_archive_near_2011_may)
```
```bash
https://web.archive.org/web/20110519185447/https://github.com/
https://web.archive.org/web/20110518233639/https://github.com/
```
```python
github_archive_near_2015_january_26 = github_wayback_obj.near(
year=2015, month=1, day=26
)
github_archive_near_2015_january_26 = waybackpy_url_obj.near(year=2015, month=1, day=26)
print(github_archive_near_2015_january_26)
```
```bash
https://web.archive.org/web/20150127031159/https://github.com
https://web.archive.org/web/20150125102636/https://github.com/
```
```python
github_archive_near_2018_4_july_9_2_am = github_wayback_obj.near(
year=2018, month=7, day=4, hour = 9, minute = 2
)
github_archive_near_2018_4_july_9_2_am = waybackpy_url_obj.near(year=2018, month=7, day=4, hour=9, minute=2)
print(github_archive_near_2018_4_july_9_2_am)
```
```bash
https://web.archive.org/web/20180704090245/https://github.com/
```
<sub>The library doesn't supports seconds yet. You are encourged to create a PR ;)</sub>
<sub>The package doesn't support second argument yet. You are encourged to create a PR ;)</sub>
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyNearExample></sub>
@ -212,16 +243,12 @@ print(current_google_url_source)
# The following chunk of code will force a new archive of google.com and get the source of the archived page.
# waybackpy_url_object.save() type is string.
google_newest_archive_source = waybackpy_url_object.get(
waybackpy_url_object.save()
)
google_newest_archive_source = waybackpy_url_object.get(waybackpy_url_object.save())
print(google_newest_archive_source)
# waybackpy_url_object.oldest() type is str, it's oldest archive of google.com
google_oldest_archive_source = waybackpy_url_object.get(
waybackpy_url_object.oldest()
)
google_oldest_archive_source = waybackpy_url_object.get(waybackpy_url_object.oldest())
print(google_oldest_archive_source)
```
@ -233,24 +260,22 @@ print(google_oldest_archive_source)
import waybackpy
URL = "https://en.wikipedia.org/wiki/Python (programming language)"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
archive_count = waybackpy.Url(
url=URL,
user_agent=UA
).total_archives()
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
archive_count = waybackpy_url_object.total_archives()
print(archive_count) # total_archives() returns an int
```
```bash
2440
2516
```
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyTotalArchivesExample></sub>
#### List of URLs that Wayback Machine knows and has archived for a domain name
#### List of URLs that Wayback Machine knows and has archived for a domain name
1) If alive=True is set, waybackpy will check all URLs to identify the alive URLs. Don't use with popular websites like google or it would take too long.
2) To include URLs from subdomain set sundomain=True
@ -261,8 +286,8 @@ import waybackpy
URL = "akamhy.github.io"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
known_urls = waybackpy.Url(url=URL, user_agent=UA).known_urls(alive=True, subdomain=False) # alive and subdomain are optional.
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
known_urls = waybackpy_url_object.known_urls(alive=True, subdomain=False) # alive and subdomain are optional.
print(known_urls) # known_urls() returns list of URLs
```
@ -286,6 +311,15 @@ https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashSave></sub>
#### Get archive URL
```bash
$ waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --archive_url
https://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX
```
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashArchiveUrl></sub>
#### Oldest archive
```bash
@ -304,6 +338,20 @@ https://web.archive.org/web/20200606044708/https://en.wikipedia.org/wiki/YouTube
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashNewest></sub>
#### Get JSON data of avaialblity API
```bash
waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --json
```
```javascript
{'archived_snapshots': {'closest': {'timestamp': '20201007132458', 'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX'}}, 'url': 'https://en.wikipedia.org/wiki/SpaceX'}
```
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashJSON></sub>
#### Total number of archives
```bash
@ -334,7 +382,8 @@ waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashGet></sub>
#### Fetch all the URLs that the Wayback Machine knows for a domain
#### Fetch all the URLs that the Wayback Machine knows for a domain
1) You can add the '--alive' flag to only fetch alive links.
2) You can add the '--subdomain' flag to add subdomains.
3) '--alive' and '--subdomain' flags can be used simultaneously.
@ -345,19 +394,19 @@ pip install waybackpy
# Ignore the above installation line.
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls
# Prints all known URLs under akamhy.github.io
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --alive
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --alive
# Prints all known URLs under akamhy.github.io which are still working and not dead links.
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain
# Prints all known URLs under akamhy.github.io inclusing subdomain
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain --alive
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain --alive
# Prints all known URLs under akamhy.github.io including subdomain which are not dead links and still alive.
```
@ -368,9 +417,20 @@ waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --sub
[Here](https://github.com/akamhy/waybackpy/tree/master/tests)
To run tests locally:
```bash
pip install -U pytest
pip install codecov
pip install pytest pytest-cov
cd tests
pytest --cov=../waybackpy
python -m codecov #For reporting coverage on Codecov
```
## Dependency
None, just python standard libraries (re, json, urllib, argparse and datetime). Both python 2 and 3 are supported :)
None, just pre-installed [python standard libraries](https://docs.python.org/3/library/).
## Packaging

319
index.rst
View File

@ -5,12 +5,14 @@ waybackpy
|Codacy Badge| |Maintainability| |CodeFactor| |made-with-python| |pypi|
|PyPI - Python Version| |Maintenance| |Repo size| |License: MIT|
|Internet Archive| |Wayback Machine|
.. figure:: https://raw.githubusercontent.com/akamhy/waybackpy/master/assets/waybackpy-colored%20284.png
:alt: Wayback Machine
Waybackpy is a Python library that interfaces with the `Internet
Wayback Machine
Waybackpy is a Python package that interfaces with `Internet
Archive <https://en.wikipedia.org/wiki/Internet_Archive>`__'s `Wayback
Machine <https://en.wikipedia.org/wiki/Wayback_Machine>`__ API. Archive
pages and retrieve archived pages easily.
webpages and retrieve archived webpages easily.
Table of contents
=================
@ -24,33 +26,46 @@ Table of contents
- `Usage <#usage>`__
- `As a Python package <#as-a-python-package>`__
- `Saving an url using
save() <#capturing-aka-saving-an-url-using-save>`__
- `Receiving the oldest archive for an URL Using
oldest() <#receiving-the-oldest-archive-for-an-url-using-oldest>`__
- `Receiving the recent most/newest archive for an URL using
newest() <#receiving-the-newest-archive-for-an-url-using-newest>`__
- `Receiving archive close to a specified year, month, day, hour,
and minute using
near() <#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near>`__
- `Get the content of webpage using
get() <#get-the-content-of-webpage-using-get>`__
- `Count total archives for an URL using
total\_archives() <#count-total-archives-for-an-url-using-total_archives>`__
- `Saving a webpage <#capturing-aka-saving-an-url-using-save>`__
- `Retrieving
archive <#retrieving-the-archive-for-an-url-using-archive_url>`__
- `Retrieving the oldest
archive <#retrieving-the-oldest-archive-for-an-url-using-oldest>`__
- `Retrieving the recent most/newest
archive <#retrieving-the-newest-archive-for-an-url-using-newest>`__
- `Retrieving the JSON response of availability
API <#retrieving-the-json-reponse-for-the-avaliblity-api-request>`__
- `Retrieving archive close to a specified year, month, day, hour,
and
minute <#retrieving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near>`__
- `Get the content of
webpage <#get-the-content-of-webpage-using-get>`__
- `Count total archives for an
URL <#count-total-archives-for-an-url-using-total_archives>`__
- `List of URLs that Wayback Machine knows and has archived for a
domain
name <#list-of-urls-that-wayback-machine-knows-and-has-archived-for-a-domain-name>`__
- `With Command-line interface <#with-the-command-line-interface>`__
- `With the Command-line
interface <#with-the-command-line-interface>`__
- `Save <#save>`__
- `Oldest archive <#oldest-archive>`__
- `Newest archive <#newest-archive>`__
- `Saving webpage <#save>`__
- `Archive URL <#get-archive-url>`__
- `Oldest archive URL <#oldest-archive>`__
- `Newest archive URL <#newest-archive>`__
- `JSON response of API <#get-json-data-of-avaialblity-api>`__
- `Total archives <#total-number-of-archives>`__
- `Archive near a time <#archive-near-time>`__
- `Archive near specified time <#archive-near-time>`__
- `Get the source code <#get-the-source-code>`__
- `Fetch all the URLs that the Wayback Machine knows for a
domain <#fetch-all-the-urls-that-the-wayback-machine-knows-for-a-domain>`__
- `Tests <#tests>`__
- `Dependency <#dependency>`__
- `Packaging <#packaging>`__
- `License <#license>`__
.. raw:: html
@ -85,36 +100,53 @@ Capturing aka Saving an url using save()
import waybackpy
new_archive_url = waybackpy.Url(
url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
url = "https://en.wikipedia.org/wiki/Multivariable_calculus",
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
).save()
print(new_archive_url)
waybackpy_url_obj = waybackpy.Url(url, user_agent)
archive = waybackpy_url_obj.save()
print(archive)
.. code:: bash
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus
Try this out in your browser @
https://repl.it/@akamhy/WaybackPySaveExample\
Receiving the oldest archive for an URL using oldest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Retrieving the archive for an URL using archive\_url
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
oldest_archive_url = waybackpy.Url(
url = "https://www.google.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
"https://www.google.com/",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
).oldest()
waybackpy_url_obj = waybackpy.Url(url, user_agent)
archive_url = waybackpy_url_obj.archive_url
print(archive_url)
.. code:: bash
https://web.archive.org/web/20201016153320/https://www.google.com/
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyArchiveUrl\
Retrieving the oldest archive for an URL using oldest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
url = "https://www.google.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
waybackpy_url_obj = waybackpy.Url(url, user_agent)
oldest_archive_url = waybackpy_url_obj.oldest()
print(oldest_archive_url)
.. code:: bash
@ -124,86 +156,99 @@ Receiving the oldest archive for an URL using oldest()
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyOldestExample\
Receiving the newest archive for an URL using newest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Retrieving the newest archive for an URL using newest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
newest_archive_url = waybackpy.Url(
"https://www.facebook.com/",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
).newest()
url = "https://www.facebook.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
waybackpy_url_obj = waybackpy.Url(url, user_agent)
newest_archive_url = waybackpy_url_obj.newest()
print(newest_archive_url)
.. code:: bash
https://web.archive.org/web/20200714013225/https://www.facebook.com/
https://web.archive.org/web/20201016150543/https://www.facebook.com/
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyNewestExample\
Receiving archive close to a specified year, month, day, hour, and minute using near()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Retrieving the JSON reponse for the avaliblity API request
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
import waybackpy
url = "https://www.facebook.com/"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
waybackpy_url_obj = waybackpy.Url(url, user_agent)
json_dict = waybackpy_url_obj.JSON
print(json_dict)
.. code:: javascript
{'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'available': True, 'url': 'http://web.archive.org/web/20201016150543/https://www.facebook.com/', 'timestamp': '20201016150543', 'status': '200'}}}
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyJSON\
Retrieving archive close to a specified year, month, day, hour, and minute using near()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: python
from waybackpy import Url
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
github_url = "https://github.com/"
url = "https://github.com/"
github_wayback_obj = Url(github_url, user_agent)
waybackpy_url_obj = Url(url, user_agent)
# Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
.. code:: python
github_archive_near_2010 = github_wayback_obj.near(year=2010)
github_archive_near_2010 = waybackpy_url_obj.near(year=2010)
print(github_archive_near_2010)
.. code:: bash
https://web.archive.org/web/20100719134402/http://github.com/
https://web.archive.org/web/20101018053604/http://github.com:80/
.. code:: python
github_archive_near_2011_may = github_wayback_obj.near(year=2011, month=5)
github_archive_near_2011_may = waybackpy_url_obj.near(year=2011, month=5)
print(github_archive_near_2011_may)
.. code:: bash
https://web.archive.org/web/20110519185447/https://github.com/
https://web.archive.org/web/20110518233639/https://github.com/
.. code:: python
github_archive_near_2015_january_26 = github_wayback_obj.near(
year=2015, month=1, day=26
)
github_archive_near_2015_january_26 = waybackpy_url_obj.near(year=2015, month=1, day=26)
print(github_archive_near_2015_january_26)
.. code:: bash
https://web.archive.org/web/20150127031159/https://github.com
https://web.archive.org/web/20150125102636/https://github.com/
.. code:: python
github_archive_near_2018_4_july_9_2_am = github_wayback_obj.near(
year=2018, month=7, day=4, hour = 9, minute = 2
)
github_archive_near_2018_4_july_9_2_am = waybackpy_url_obj.near(year=2018, month=7, day=4, hour=9, minute=2)
print(github_archive_near_2018_4_july_9_2_am)
.. code:: bash
https://web.archive.org/web/20180704090245/https://github.com/
The library doesn't supports seconds yet. You are encourged to create a
PR ;)
The package doesn't support second argument yet. You are encourged to
create a PR ;)
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyNearExample\
@ -229,16 +274,12 @@ Get the content of webpage using get()
# The following chunk of code will force a new archive of google.com and get the source of the archived page.
# waybackpy_url_object.save() type is string.
google_newest_archive_source = waybackpy_url_object.get(
waybackpy_url_object.save()
)
google_newest_archive_source = waybackpy_url_object.get(waybackpy_url_object.save())
print(google_newest_archive_source)
# waybackpy_url_object.oldest() type is str, it's oldest archive of google.com
google_oldest_archive_source = waybackpy_url_object.get(
waybackpy_url_object.oldest()
)
google_oldest_archive_source = waybackpy_url_object.get(waybackpy_url_object.oldest())
print(google_oldest_archive_source)
Try this out in your browser @
@ -252,23 +293,50 @@ Count total archives for an URL using total\_archives()
import waybackpy
URL = "https://en.wikipedia.org/wiki/Python (programming language)"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
archive_count = waybackpy.Url(
url=URL,
user_agent=UA
).total_archives()
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
archive_count = waybackpy_url_object.total_archives()
print(archive_count) # total_archives() returns an int
.. code:: bash
2440
2516
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyTotalArchivesExample\
List of URLs that Wayback Machine knows and has archived for a domain name
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1) If alive=True is set, waybackpy will check all URLs to identify the
alive URLs. Don't use with popular websites like google or it would
take too long.
2) To include URLs from subdomain set sundomain=True
.. code:: python
import waybackpy
URL = "akamhy.github.io"
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
known_urls = waybackpy_url_object.known_urls(alive=True, subdomain=False) # alive and subdomain are optional.
print(known_urls) # known_urls() returns list of URLs
.. code:: bash
['http://akamhy.github.io',
'https://akamhy.github.io/waybackpy/',
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=a418a4e4641a1dbaad8f3bfbf293fad21a75ff11',
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=f881705d00bf47b5bf0c58808efe29eecba2226c']
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyKnownURLsToWayBackMachineExample#main.py\
With the Command-line interface
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -283,6 +351,17 @@ Save
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashSave\
Get archive URL
^^^^^^^^^^^^^^^
.. code:: bash
$ waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --archive_url
https://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashArchiveUrl\
Oldest archive
^^^^^^^^^^^^^^
@ -305,6 +384,20 @@ Newest archive
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashNewest\
Get JSON data of avaialblity API
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: bash
waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --json
.. code:: javascript
{'archived_snapshots': {'closest': {'timestamp': '20201007132458', 'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX'}}, 'url': 'https://en.wikipedia.org/wiki/SpaceX'}
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashJSON\
Total number of archives
^^^^^^^^^^^^^^^^^^^^^^^^
@ -332,30 +425,84 @@ Get the source code
.. code:: bash
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get url # Prints the source code of the url
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get oldest # Prints the source code of the oldest archive
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get newest # Prints the source code of the newest archive
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on wayback machine then print the source code of this archive.
waybackpy --url google.com --user_agent "my-unique-user-agent" --get url # Prints the source code of the url
waybackpy --url google.com --user_agent "my-unique-user-agent" --get oldest # Prints the source code of the oldest archive
waybackpy --url google.com --user_agent "my-unique-user-agent" --get newest # Prints the source code of the newest archive
waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on wayback machine then print the source code of this archive.
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashGet\
Fetch all the URLs that the Wayback Machine knows for a domain
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1) You can add the '--alive' flag to only fetch alive links.
2) You can add the '--subdomain' flag to add subdomains.
3) '--alive' and '--subdomain' flags can be used simultaneously.
4) All links will be saved in a file, and the file will be created in
the current working directory.
.. code:: bash
pip install waybackpy
# Ignore the above installation line.
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls
# Prints all known URLs under akamhy.github.io
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --alive
# Prints all known URLs under akamhy.github.io which are still working and not dead links.
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain
# Prints all known URLs under akamhy.github.io inclusing subdomain
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain --alive
# Prints all known URLs under akamhy.github.io including subdomain which are not dead links and still alive.
Try this out in your browser @
https://repl.it/@akamhy/WaybackpyKnownUrlsFromWaybackMachine#main.sh\
Tests
-----
- `Here <https://github.com/akamhy/waybackpy/tree/master/tests>`__
`Here <https://github.com/akamhy/waybackpy/tree/master/tests>`__
To run tests locally:
.. code:: bash
pip install -U pytest
pip install codecov
pip install pytest pytest-cov
cd tests
pytest --cov=../waybackpy
python -m codecov #For reporting coverage on Codecov
Dependency
----------
- None, just python standard libraries (re, json, urllib, argparse and
datetime). Both python 2 and 3 are supported :)
None, just pre-installed `python standard
libraries <https://docs.python.org/3/library/>`__.
Packaging
---------
1. Increment version.
2. Build package ``python setup.py sdist bdist_wheel``.
3. Sign & upload the package ``twine upload -s dist/*``.
License
-------
`MIT
License <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
Released under the MIT License. See
`license <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
for details.
.. |contributions welcome| image:: https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square
.. |Build Status| image:: https://img.shields.io/travis/akamhy/waybackpy.svg?label=Travis%20CI&logo=travis&style=flat-square
@ -381,6 +528,4 @@ License <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
:target: https://github.com/akamhy/waybackpy/graphs/commit-activity
.. |Repo size| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
.. |Internet Archive| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png
.. |Wayback Machine| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE

View File

@ -19,10 +19,10 @@ setup(
author = about['__author__'],
author_email = about['__author_email__'],
url = about['__url__'],
download_url = 'https://github.com/akamhy/waybackpy/archive/2.1.8.tar.gz',
keywords = ['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
download_url = 'https://github.com/akamhy/waybackpy/archive/2.2.0.tar.gz',
keywords = ['waybackpy', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
install_requires=[],
python_requires= ">=2.7",
python_requires= ">=3.2",
classifiers=[
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
@ -30,8 +30,6 @@ setup(
'Topic :: Software Development :: Build Tools',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.2',
'Programming Language :: Python :: 3.3',

View File

@ -19,69 +19,87 @@ if sys.version_info > (3, 7):
if codecov_python:
def test_save():
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=True, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
oldest=False, save=True, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
reply = cli.args_handler(args)
assert "pypi.org/user/akamhy" in reply
assert "pypi.org/user/akamhy" in str(reply)
def test_json():
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, json=True, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
reply = cli.args_handler(args)
assert "archived_snapshots" in str(reply)
def test_archive_url():
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, json=False, archive_url=True, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
reply = cli.args_handler(args)
assert "https://web.archive.org/web/" in str(reply)
def test_oldest():
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=True, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
oldest=True, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
reply = cli.args_handler(args)
assert "pypi.org/user/akamhy" in reply
assert "pypi.org/user/akamhy" in str(reply)
def test_newest():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, newest=True, near=False, alive=False, subdomain=False, known_urls=False, get=None)
oldest=False, save=False, json=False, archive_url=False, newest=True, near=False, alive=False, subdomain=False, known_urls=False, get=None)
reply = cli.args_handler(args)
assert "pypi.org/user/akamhy" in reply
assert "pypi.org/user/akamhy" in str(reply)
def test_total_archives():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=True, version=False,
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
reply = cli.args_handler(args)
assert isinstance(reply, int)
def test_known_urls():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://akamhy.github.io", total=False, version=False,
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=True, subdomain=True, known_urls=True, get=None)
reply = cli.args_handler(args)
assert "github" in str(reply)
def test_near():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, newest=False, near=True, alive=False, subdomain=False, known_urls=False, get=None, year=2020, month=7, day=15, hour=1, minute=1)
oldest=False, save=False, json=False, archive_url=False, newest=False, near=True, alive=False, subdomain=False, known_urls=False, get=None, year=2020, month=7, day=15, hour=1, minute=1)
reply = cli.args_handler(args)
assert "202007" in reply
assert "202007" in str(reply)
def test_get():
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="url")
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="url")
reply = cli.args_handler(args)
assert "waybackpy" in reply
assert "waybackpy" in str(reply)
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="oldest")
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="oldest")
reply = cli.args_handler(args)
assert "waybackpy" in reply
assert "waybackpy" in str(reply)
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="newest")
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="newest")
reply = cli.args_handler(args)
assert "waybackpy" in reply
assert "waybackpy" in str(reply)
if codecov_python:
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="save")
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="save")
reply = cli.args_handler(args)
assert "waybackpy" in reply
assert "waybackpy" in str(reply)
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="BullShit")
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="BullShit")
reply = cli.args_handler(args)
assert "get the source code of the" in reply
assert "get the source code of the" in str(reply)
def test_args_handler():
args = argparse.Namespace(version=True)
@ -90,7 +108,7 @@ def test_args_handler():
args = argparse.Namespace(url=None, version=False)
reply = cli.args_handler(args)
assert ("waybackpy %s" % (__version__)) in reply
assert ("waybackpy %s" % (__version__)) in str(reply)
def test_main():
# This also tests the parse_args method in cli.py

View File

@ -2,7 +2,7 @@
import sys
import pytest
import random
import time
sys.path.append("..")
import waybackpy.wrapper as waybackpy # noqa: E402
@ -28,8 +28,7 @@ def test_dunders():
user_agent = "UA"
target = waybackpy.Url(url, user_agent)
assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target)
assert len(target) == len(url)
assert str(target) == url
assert "en.wikipedia.org" in str(target)
def test_archive_url_parser():
request_url = "https://amazon.com"
@ -47,7 +46,6 @@ def test_url_check():
def test_save():
# Test for urls that exist and can be archived.
time.sleep(10)
url_list = [
"en.wikipedia.org",
@ -64,7 +62,7 @@ def test_save():
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
)
archived_url1 = target.save()
archived_url1 = str(target.save())
assert url1 in archived_url1
if sys.version_info > (3, 6):
@ -73,18 +71,16 @@ def test_save():
with pytest.raises(Exception):
url2 = "ha ha ha ha"
waybackpy.Url(url2, user_agent)
time.sleep(5)
url3 = "http://www.archive.is/faq.html"
# Test for urls not allowed to archive by robot.txt. Doesn't works anymore. Find alternatives.
# with pytest.raises(Exception):
#
#
# target = waybackpy.Url(
# url3,
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) "
# "Gecko/20100101 Firefox/25.0",
# )
# target.save()
# time.sleep(5)
# Non existent urls, test
with pytest.raises(Exception):
target = waybackpy.Url(
@ -100,7 +96,6 @@ def test_save():
def test_near():
time.sleep(10)
url = "google.com"
target = waybackpy.Url(
url,
@ -108,11 +103,10 @@ def test_near():
"(KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
)
archive_near_year = target.near(year=2010)
assert "2010" in archive_near_year
assert "2010" in str(archive_near_year)
if sys.version_info > (3, 6):
time.sleep(5)
archive_near_month_year = target.near(year=2015, month=2)
archive_near_month_year = str(target.near(year=2015, month=2))
assert (
("201502" in archive_near_month_year)
or ("201501" in archive_near_month_year)
@ -124,9 +118,9 @@ def test_near():
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
)
archive_near_hour_day_month_year = target.near(
archive_near_hour_day_month_year = str(target.near(
year=2008, month=5, day=9, hour=15
)
))
assert (
("2008050915" in archive_near_hour_day_month_year)
or ("2008050914" in archive_near_hour_day_month_year)
@ -146,13 +140,22 @@ def test_near():
def test_oldest():
url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent)
assert "20200504141153" in target.oldest()
assert "20200504141153" in str(target.oldest())
def test_json():
url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent)
assert "archived_snapshots" in str(target.JSON)
def test_archive_url():
url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent)
assert "github.com/akamhy" in str(target.archive_url)
def test_newest():
url = "github.com/akamhy/waybackpy"
target = waybackpy.Url(url, user_agent)
assert url in target.newest()
assert url in str(target.newest())
def test_get():
@ -188,3 +191,11 @@ def test_total_archives():
" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent
)
assert target.total_archives() == 0
def test_known_urls():
target = waybackpy.Url("akamhy.github.io", user_agent)
assert len(target.known_urls(alive=True, subdomain=True)) > 2
target = waybackpy.Url("akamhy.github.io", user_agent)
assert len(target.known_urls()) > 3

View File

@ -10,7 +10,7 @@
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
"""
Waybackpy is a Python library that interfaces with the Internet Archive's Wayback Machine API.
Waybackpy is a Python package that interfaces with the Internet Archive's Wayback Machine API.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Archive pages and retrieve archived pages easily.

View File

@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
__title__ = "waybackpy"
__description__ = "A Python library that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
__description__ = "A Python package that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
__url__ = "https://akamhy.github.io/waybackpy/"
__version__ = "2.1.8"
__version__ = "2.2.0"
__author__ = "akamhy"
__author_email__ = "akash3pro@gmail.com"
__license__ = "MIT"

View File

@ -10,6 +10,12 @@ from waybackpy.__version__ import __version__
def _save(obj):
return (obj.save())
def _archive_url(obj):
return (obj.archive_url)
def _json(obj):
return (obj.JSON)
def _oldest(obj):
return (obj.oldest())
@ -34,6 +40,10 @@ def _near(obj, args):
return (obj.near(**_near_args))
def _known_urls(obj, args):
"""Abbreviations:
sd = subdomain
al = alive
"""
sd = False
al = False
if args.subdomain:
@ -48,7 +58,8 @@ def _known_urls(obj, args):
if m:
domain = m.group(1)
else:
domain = "waybackpy-known"
domain = "domain-unknown"
dir_path = os.path.abspath(os.getcwd())
file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls)
text = "\n".join(url_list) + "\n"
@ -67,6 +78,9 @@ def _get(obj, args):
if args.get.lower() == "url":
return (obj.get())
if args.get.lower() == "archive_url":
return (obj.get(obj.archive_url))
if args.get.lower() == "oldest":
return (obj.get(obj.oldest()))
@ -78,9 +92,10 @@ def _get(obj, args):
return ("Use get as \"--get 'source'\", 'source' can be one of the followings: \
\n1) url - get the source code of the url specified using --url/-u.\
\n2) oldest - get the source code of the oldest archive for the supplied url.\
\n3) newest - get the source code of the newest archive for the supplied url.\
\n4) save - Create a new archive and get the source code of this new archive for the supplied url.")
\n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\
\n3) oldest - get the source code of the oldest archive for the supplied url.\
\n4) newest - get the source code of the newest archive for the supplied url.\
\n5) save - Create a new archive and get the source code of this new archive for the supplied url.")
def args_handler(args):
if args.version:
@ -96,6 +111,10 @@ def args_handler(args):
if args.save:
return _save(obj)
if args.archive_url:
return _archive_url(obj)
if args.json:
return _json(obj)
if args.oldest:
return _oldest(obj)
if args.newest:
@ -118,19 +137,25 @@ def parse_args(argv):
userAgentArg = parser.add_argument_group('User Agent')
userAgentArg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"")
saveArg = parser.add_argument_group("Create new archive/save URL")
saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine")
auArg = parser.add_argument_group("Get the latest Archive")
auArg.add_argument("--archive_url", "-au", action='store_true', help="Get the latest archive URL, alias for --newest")
jsonArg = parser.add_argument_group("Get the JSON data")
jsonArg.add_argument("--json", "-j", action='store_true', help="JSON data of the availability API request")
oldestArg = parser.add_argument_group("Oldest archive")
oldestArg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL")
newestArg = parser.add_argument_group("Newest archive")
newestArg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL")
totalArg = parser.add_argument_group("Total number of archives")
totalArg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL")
getArg = parser.add_argument_group("Get source code")
getArg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage")
@ -151,7 +176,7 @@ def parse_args(argv):
nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version")
return parser.parse_args(argv[1:])
def main(argv=None):

View File

@ -2,5 +2,5 @@
class WaybackError(Exception):
"""
Raised when API Service error.
Raised when Wayback Machine API Service is unreachable/down.
"""

View File

@ -3,7 +3,7 @@
import re
import sys
import json
from datetime import datetime
from datetime import datetime, timedelta
from waybackpy.exceptions import WaybackError
from waybackpy.__version__ import __version__
@ -69,21 +69,77 @@ class Url:
self.url = url
self.user_agent = user_agent
self._url_check() # checks url validity on init.
self.JSON = self._JSON() # JSON of most recent archive
self.archive_url = self._archive_url() # URL of archive
self.timestamp = self._archive_timestamp() # timestamp for last archive
def __repr__(self):
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
def __str__(self):
return "%s" % self._clean_url()
return "%s" % self.archive_url
def __len__(self):
return len(self._clean_url())
td_max = timedelta(days=999999999,
hours=23,
minutes=59,
seconds=59,
microseconds=999999)
if self.timestamp == datetime.max:
return td_max.days
else:
diff = datetime.utcnow() - self.timestamp
return diff.days
def _url_check(self):
"""Check for common URL problems."""
if "." not in self.url:
raise URLError("'%s' is not a vaild URL." % self.url)
def _JSON(self):
request_url = "https://archive.org/wayback/available?url=%s" % (
self._clean_url(),
)
hdr = {"User-Agent": "%s" % self.user_agent}
req = Request(request_url, headers=hdr) # nosec
response = _get_response(req)
data_string = response.read().decode("UTF-8")
data = json.loads(data_string)
return data
def _archive_url(self):
"""Get URL of archive."""
data = self.JSON
if not data["archived_snapshots"]:
archive_url = None
else:
archive_url = data["archived_snapshots"]["closest"]["url"]
archive_url = archive_url.replace(
"http://web.archive.org/web/",
"https://web.archive.org/web/",
1
)
return archive_url
def _archive_timestamp(self):
"""Get timestamp of last archive."""
data = self.JSON
if not data["archived_snapshots"]:
time = datetime.max
else:
time = datetime.strptime(data["archived_snapshots"]
["closest"]
["timestamp"],
'%Y%m%d%H%M%S')
return time
def _clean_url(self):
"""Fix the URL, if possible."""
return str(self.url).strip().replace(" ", "_")
@ -94,13 +150,15 @@ class Url:
hdr = {"User-Agent": "%s" % self.user_agent} # nosec
req = Request(request_url, headers=hdr) # nosec
header = _get_response(req).headers
return "https://" + _archive_url_parser(header)
self.archive_url = "https://" + _archive_url_parser(header)
self.timestamp = datetime.utcnow()
return self
def get(self, url="", user_agent="", encoding=""):
"""Return the source code of the supplied URL.
If encoding is not supplied, it is auto-detected from the response.
"""
if not url:
url = self._clean_url()
@ -146,11 +204,18 @@ class Url:
"to create a new archive." % self._clean_url()
)
archive_url = data["archived_snapshots"]["closest"]["url"]
# wayback machine returns http sometimes, idk why? But they support https
archive_url = archive_url.replace(
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
)
return archive_url
self.archive_url = archive_url
self.timestamp = datetime.strptime(data["archived_snapshots"]
["closest"]
["timestamp"],
'%Y%m%d%H%M%S')
return self
def oldest(self, year=1994):
"""Return the oldest Wayback Machine archive for this URL."""
@ -190,13 +255,13 @@ class Url:
if subdomain:
request_url = (
"https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
"https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
% self._clean_url()
)
else:
request_url = (
"http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
"http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
% self._clean_url()
)
@ -213,12 +278,12 @@ class Url:
for url in url_list:
try:
urlopen(url)
urlopen(url) # nosec
except:
continue
tmp_url_list.append(url)
url_list = tmp_url_list
return url_list
return url_list