Compare commits
18 Commits
Author | SHA1 | Date | |
---|---|---|---|
925be7b17e | |||
2b132456ac | |||
50e3154a4e | |||
7aef50428f | |||
d8ec0f5025 | |||
0a2f97c034 | |||
3e9cf23578 | |||
7f927ec7be | |||
9de6393cd5 | |||
91e7f65617 | |||
d465454019 | |||
1a81eb97fb | |||
6b3b2e2a7d | |||
82c65454e6 | |||
19710461b6 | |||
a3661d6b85 | |||
58375e4ef4 | |||
ea023e98da |
@ -3,7 +3,6 @@ os: linux
|
||||
dist: xenial
|
||||
cache: pip
|
||||
python:
|
||||
- 2.7
|
||||
- 3.6
|
||||
- 3.8
|
||||
before_install:
|
||||
|
2
LICENSE
2
LICENSE
@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2020 akamhy
|
||||
Copyright (c) 2020 Akash Mahanty (https://github.com/akamhy)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
190
README.md
190
README.md
@ -17,7 +17,7 @@
|
||||
|
||||

|
||||
|
||||
Waybackpy is a Python library that interfaces with [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine) API. Archive webpages and retrieve archived webpages easily.
|
||||
Waybackpy is a Python package that interfaces with [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine) API. Archive webpages and retrieve archived webpages easily.
|
||||
|
||||
Table of contents
|
||||
=================
|
||||
@ -27,20 +27,24 @@ Table of contents
|
||||
|
||||
* [Usage](#usage)
|
||||
* [As a Python package](#as-a-python-package)
|
||||
* [Saving an url](#capturing-aka-saving-an-url-using-save)
|
||||
* [Saving a webpage](#capturing-aka-saving-an-url-using-save)
|
||||
* [Retrieving archive](#retrieving-the-archive-for-an-url-using-archive_url)
|
||||
* [Retrieving the oldest archive](#retrieving-the-oldest-archive-for-an-url-using-oldest)
|
||||
* [Retrieving the recent most/newest archive](#retrieving-the-newest-archive-for-an-url-using-newest)
|
||||
* [Retrieving the JSON response of availability API](#retrieving-the-json-reponse-for-the-avaliblity-api-request)
|
||||
* [Retrieving archive close to a specified year, month, day, hour, and minute](#retrieving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
|
||||
* [Get the content of webpage](#get-the-content-of-webpage-using-get)
|
||||
* [Count total archives for an URL](#count-total-archives-for-an-url-using-total_archives)
|
||||
* [List of URLs that Wayback Machine knows and has archived for a domain name](#retrieving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
|
||||
* [List of URLs that Wayback Machine knows and has archived for a domain name](#list-of-urls-that-wayback-machine-knows-and-has-archived-for-a-domain-name)
|
||||
|
||||
* [As a Command-line tool](#with-the-command-line-interface)
|
||||
* [Save](#save)
|
||||
* [Oldest archive](#oldest-archive)
|
||||
* [Newest archive](#newest-archive)
|
||||
* [With the Command-line interface](#with-the-command-line-interface)
|
||||
* [Saving webpage](#save)
|
||||
* [Archive URL](#get-archive-url)
|
||||
* [Oldest archive URL](#oldest-archive)
|
||||
* [Newest archive URL](#newest-archive)
|
||||
* [JSON response of API](#get-json-data-of-avaialblity-api)
|
||||
* [Total archives](#total-number-of-archives)
|
||||
* [Archive near a time](#archive-near-time)
|
||||
* [Archive near specified time](#archive-near-time)
|
||||
* [Get the source code](#get-the-source-code)
|
||||
* [Fetch all the URLs that the Wayback Machine knows for a domain](#fetch-all-the-urls-that-the-wayback-machine-knows-for-a-domain)
|
||||
|
||||
@ -77,33 +81,49 @@ pip install git+https://github.com/akamhy/waybackpy.git
|
||||
```python
|
||||
import waybackpy
|
||||
|
||||
new_archive_url = waybackpy.Url(
|
||||
url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
url = "https://en.wikipedia.org/wiki/Multivariable_calculus",
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
).save()
|
||||
|
||||
print(new_archive_url)
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
archive = waybackpy_url_obj.save()
|
||||
print(archive)
|
||||
```
|
||||
|
||||
```bash
|
||||
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
|
||||
https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus
|
||||
```
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPySaveExample></sub>
|
||||
|
||||
#### Retrieving the archive for an URL using archive_url
|
||||
|
||||
```python
|
||||
import waybackpy
|
||||
|
||||
url = "https://www.google.com/"
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
archive_url = waybackpy_url_obj.archive_url
|
||||
print(archive_url)
|
||||
```
|
||||
|
||||
```bash
|
||||
https://web.archive.org/web/20201016153320/https://www.google.com/
|
||||
```
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyArchiveUrl></sub>
|
||||
|
||||
#### Retrieving the oldest archive for an URL using oldest()
|
||||
|
||||
```python
|
||||
import waybackpy
|
||||
|
||||
oldest_archive_url = waybackpy.Url(
|
||||
|
||||
"https://www.google.com/",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
).oldest()
|
||||
url = "https://www.google.com/"
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
oldest_archive_url = waybackpy_url_obj.oldest()
|
||||
print(oldest_archive_url)
|
||||
```
|
||||
|
||||
@ -118,78 +138,89 @@ http://web.archive.org/web/19981111184551/http://google.com:80/
|
||||
```python
|
||||
import waybackpy
|
||||
|
||||
newest_archive_url = waybackpy.Url(
|
||||
|
||||
"https://www.facebook.com/",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
|
||||
|
||||
).newest()
|
||||
url = "https://www.facebook.com/"
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
|
||||
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
newest_archive_url = waybackpy_url_obj.newest()
|
||||
print(newest_archive_url)
|
||||
```
|
||||
|
||||
```bash
|
||||
https://web.archive.org/web/20200714013225/https://www.facebook.com/
|
||||
https://web.archive.org/web/20201016150543/https://www.facebook.com/
|
||||
```
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyNewestExample></sub>
|
||||
|
||||
#### Retrieving the JSON reponse for the avaliblity API request
|
||||
|
||||
```python
|
||||
import waybackpy
|
||||
|
||||
url = "https://www.facebook.com/"
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
|
||||
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
json_dict = waybackpy_url_obj.JSON
|
||||
print(json_dict)
|
||||
```
|
||||
|
||||
```javascript
|
||||
{'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'available': True, 'url': 'http://web.archive.org/web/20201016150543/https://www.facebook.com/', 'timestamp': '20201016150543', 'status': '200'}}}
|
||||
```
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyJSON></sub>
|
||||
|
||||
#### Retrieving archive close to a specified year, month, day, hour, and minute using near()
|
||||
|
||||
```python
|
||||
from waybackpy import Url
|
||||
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
|
||||
github_url = "https://github.com/"
|
||||
url = "https://github.com/"
|
||||
|
||||
|
||||
github_wayback_obj = Url(github_url, user_agent)
|
||||
waybackpy_url_obj = Url(url, user_agent)
|
||||
|
||||
# Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
|
||||
```
|
||||
|
||||
```python
|
||||
github_archive_near_2010 = github_wayback_obj.near(year=2010)
|
||||
github_archive_near_2010 = waybackpy_url_obj.near(year=2010)
|
||||
print(github_archive_near_2010)
|
||||
```
|
||||
|
||||
```bash
|
||||
https://web.archive.org/web/20100719134402/http://github.com/
|
||||
https://web.archive.org/web/20101018053604/http://github.com:80/
|
||||
```
|
||||
|
||||
```python
|
||||
github_archive_near_2011_may = github_wayback_obj.near(year=2011, month=5)
|
||||
github_archive_near_2011_may = waybackpy_url_obj.near(year=2011, month=5)
|
||||
print(github_archive_near_2011_may)
|
||||
```
|
||||
|
||||
```bash
|
||||
https://web.archive.org/web/20110519185447/https://github.com/
|
||||
https://web.archive.org/web/20110518233639/https://github.com/
|
||||
```
|
||||
|
||||
```python
|
||||
github_archive_near_2015_january_26 = github_wayback_obj.near(
|
||||
year=2015, month=1, day=26
|
||||
)
|
||||
github_archive_near_2015_january_26 = waybackpy_url_obj.near(year=2015, month=1, day=26)
|
||||
print(github_archive_near_2015_january_26)
|
||||
```
|
||||
|
||||
```bash
|
||||
https://web.archive.org/web/20150127031159/https://github.com
|
||||
https://web.archive.org/web/20150125102636/https://github.com/
|
||||
```
|
||||
|
||||
```python
|
||||
github_archive_near_2018_4_july_9_2_am = github_wayback_obj.near(
|
||||
year=2018, month=7, day=4, hour = 9, minute = 2
|
||||
)
|
||||
github_archive_near_2018_4_july_9_2_am = waybackpy_url_obj.near(year=2018, month=7, day=4, hour=9, minute=2)
|
||||
print(github_archive_near_2018_4_july_9_2_am)
|
||||
```
|
||||
|
||||
```bash
|
||||
https://web.archive.org/web/20180704090245/https://github.com/
|
||||
|
||||
```
|
||||
|
||||
<sub>The library doesn't support seconds yet. You are encouraged to create a PR ;)</sub>
|
||||
<sub>The package doesn't support a seconds argument yet. You are encouraged to create a PR ;)</sub>
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyNearExample></sub>
|
||||
|
||||
@ -212,16 +243,12 @@ print(current_google_url_source)
|
||||
|
||||
# The following chunk of code will force a new archive of google.com and get the source of the archived page.
|
||||
# waybackpy_url_object.save() type is string.
|
||||
google_newest_archive_source = waybackpy_url_object.get(
|
||||
waybackpy_url_object.save()
|
||||
)
|
||||
google_newest_archive_source = waybackpy_url_object.get(waybackpy_url_object.save())
|
||||
print(google_newest_archive_source)
|
||||
|
||||
|
||||
# waybackpy_url_object.oldest() type is str, it's oldest archive of google.com
|
||||
google_oldest_archive_source = waybackpy_url_object.get(
|
||||
waybackpy_url_object.oldest()
|
||||
)
|
||||
google_oldest_archive_source = waybackpy_url_object.get(waybackpy_url_object.oldest())
|
||||
print(google_oldest_archive_source)
|
||||
```
|
||||
|
||||
@ -233,24 +260,22 @@ print(google_oldest_archive_source)
|
||||
import waybackpy
|
||||
|
||||
URL = "https://en.wikipedia.org/wiki/Python (programming language)"
|
||||
|
||||
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
|
||||
|
||||
archive_count = waybackpy.Url(
|
||||
url=URL,
|
||||
user_agent=UA
|
||||
).total_archives()
|
||||
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
|
||||
|
||||
archive_count = waybackpy_url_object.total_archives()
|
||||
|
||||
print(archive_count) # total_archives() returns an int
|
||||
```
|
||||
|
||||
```bash
|
||||
2440
|
||||
2516
|
||||
```
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyTotalArchivesExample></sub>
|
||||
|
||||
#### List of URLs that Wayback Machine knows and has archived for a domain name
|
||||
#### List of URLs that Wayback Machine knows and has archived for a domain name
|
||||
|
||||
1) If alive=True is set, waybackpy will check all URLs to identify the alive URLs. Don't use with popular websites like google or it would take too long.
|
||||
2) To include URLs from subdomains, set subdomain=True
|
||||
@ -261,8 +286,8 @@ import waybackpy
|
||||
URL = "akamhy.github.io"
|
||||
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
|
||||
|
||||
known_urls = waybackpy.Url(url=URL, user_agent=UA).known_urls(alive=True, subdomain=False) # alive and subdomain are optional.
|
||||
|
||||
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
|
||||
known_urls = waybackpy_url_object.known_urls(alive=True, subdomain=False) # alive and subdomain are optional.
|
||||
print(known_urls) # known_urls() returns list of URLs
|
||||
```
|
||||
|
||||
@ -286,6 +311,15 @@ https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashSave></sub>
|
||||
|
||||
#### Get archive URL
|
||||
|
||||
```bash
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --archive_url
|
||||
https://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX
|
||||
```
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashArchiveUrl></sub>
|
||||
|
||||
#### Oldest archive
|
||||
|
||||
```bash
|
||||
@ -304,6 +338,20 @@ https://web.archive.org/web/20200606044708/https://en.wikipedia.org/wiki/YouTube
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashNewest></sub>
|
||||
|
||||
#### Get JSON data of avaialblity API
|
||||
|
||||
```bash
|
||||
waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --json
|
||||
|
||||
```
|
||||
|
||||
```javascript
|
||||
{'archived_snapshots': {'closest': {'timestamp': '20201007132458', 'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX'}}, 'url': 'https://en.wikipedia.org/wiki/SpaceX'}
|
||||
|
||||
```
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashJSON></sub>
|
||||
|
||||
#### Total number of archives
|
||||
|
||||
```bash
|
||||
@ -334,7 +382,8 @@ waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save
|
||||
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyBashGet></sub>
|
||||
|
||||
#### Fetch all the URLs that the Wayback Machine knows for a domain
|
||||
#### Fetch all the URLs that the Wayback Machine knows for a domain
|
||||
|
||||
1) You can add the '--alive' flag to only fetch alive links.
|
||||
2) You can add the '--subdomain' flag to add subdomains.
|
||||
3) '--alive' and '--subdomain' flags can be used simultaneously.
|
||||
@ -345,19 +394,19 @@ pip install waybackpy
|
||||
|
||||
# Ignore the above installation line.
|
||||
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls
|
||||
# Prints all known URLs under akamhy.github.io
|
||||
|
||||
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --alive
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --alive
|
||||
# Prints all known URLs under akamhy.github.io which are still working and not dead links.
|
||||
|
||||
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain
|
||||
# Prints all known URLs under akamhy.github.io including subdomains
|
||||
|
||||
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain --alive
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain --alive
|
||||
# Prints all known URLs under akamhy.github.io including subdomain which are not dead links and still alive.
|
||||
|
||||
```
|
||||
@ -368,9 +417,20 @@ waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --sub
|
||||
|
||||
[Here](https://github.com/akamhy/waybackpy/tree/master/tests)
|
||||
|
||||
To run tests locally:
|
||||
|
||||
```bash
|
||||
pip install -U pytest
|
||||
pip install codecov
|
||||
pip install pytest pytest-cov
|
||||
cd tests
|
||||
pytest --cov=../waybackpy
|
||||
python -m codecov #For reporting coverage on Codecov
|
||||
```
|
||||
|
||||
## Dependency
|
||||
|
||||
None, just python standard libraries (re, json, urllib, argparse and datetime). Both python 2 and 3 are supported :)
|
||||
None, just pre-installed [python standard libraries](https://docs.python.org/3/library/).
|
||||
|
||||
## Packaging
|
||||
|
||||
|
319
index.rst
319
index.rst
@ -5,12 +5,14 @@ waybackpy
|
||||
|Codacy Badge| |Maintainability| |CodeFactor| |made-with-python| |pypi|
|
||||
|PyPI - Python Version| |Maintenance| |Repo size| |License: MIT|
|
||||
|
||||
|Internet Archive| |Wayback Machine|
|
||||
.. figure:: https://raw.githubusercontent.com/akamhy/waybackpy/master/assets/waybackpy-colored%20284.png
|
||||
:alt: Wayback Machine
|
||||
|
||||
Waybackpy is a Python library that interfaces with the `Internet
|
||||
Wayback Machine
|
||||
Waybackpy is a Python package that interfaces with `Internet
|
||||
Archive <https://en.wikipedia.org/wiki/Internet_Archive>`__'s `Wayback
|
||||
Machine <https://en.wikipedia.org/wiki/Wayback_Machine>`__ API. Archive
|
||||
pages and retrieve archived pages easily.
|
||||
webpages and retrieve archived webpages easily.
|
||||
|
||||
Table of contents
|
||||
=================
|
||||
@ -24,33 +26,46 @@ Table of contents
|
||||
- `Usage <#usage>`__
|
||||
- `As a Python package <#as-a-python-package>`__
|
||||
|
||||
- `Saving an url using
|
||||
save() <#capturing-aka-saving-an-url-using-save>`__
|
||||
- `Receiving the oldest archive for an URL Using
|
||||
oldest() <#receiving-the-oldest-archive-for-an-url-using-oldest>`__
|
||||
- `Receiving the recent most/newest archive for an URL using
|
||||
newest() <#receiving-the-newest-archive-for-an-url-using-newest>`__
|
||||
- `Receiving archive close to a specified year, month, day, hour,
|
||||
and minute using
|
||||
near() <#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near>`__
|
||||
- `Get the content of webpage using
|
||||
get() <#get-the-content-of-webpage-using-get>`__
|
||||
- `Count total archives for an URL using
|
||||
total\_archives() <#count-total-archives-for-an-url-using-total_archives>`__
|
||||
- `Saving a webpage <#capturing-aka-saving-an-url-using-save>`__
|
||||
- `Retrieving
|
||||
archive <#retrieving-the-archive-for-an-url-using-archive_url>`__
|
||||
- `Retrieving the oldest
|
||||
archive <#retrieving-the-oldest-archive-for-an-url-using-oldest>`__
|
||||
- `Retrieving the recent most/newest
|
||||
archive <#retrieving-the-newest-archive-for-an-url-using-newest>`__
|
||||
- `Retrieving the JSON response of availability
|
||||
API <#retrieving-the-json-reponse-for-the-avaliblity-api-request>`__
|
||||
- `Retrieving archive close to a specified year, month, day, hour,
|
||||
and
|
||||
minute <#retrieving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near>`__
|
||||
- `Get the content of
|
||||
webpage <#get-the-content-of-webpage-using-get>`__
|
||||
- `Count total archives for an
|
||||
URL <#count-total-archives-for-an-url-using-total_archives>`__
|
||||
- `List of URLs that Wayback Machine knows and has archived for a
|
||||
domain
|
||||
name <#list-of-urls-that-wayback-machine-knows-and-has-archived-for-a-domain-name>`__
|
||||
|
||||
- `With Command-line interface <#with-the-command-line-interface>`__
|
||||
- `With the Command-line
|
||||
interface <#with-the-command-line-interface>`__
|
||||
|
||||
- `Save <#save>`__
|
||||
- `Oldest archive <#oldest-archive>`__
|
||||
- `Newest archive <#newest-archive>`__
|
||||
- `Saving webpage <#save>`__
|
||||
- `Archive URL <#get-archive-url>`__
|
||||
- `Oldest archive URL <#oldest-archive>`__
|
||||
- `Newest archive URL <#newest-archive>`__
|
||||
- `JSON response of API <#get-json-data-of-avaialblity-api>`__
|
||||
- `Total archives <#total-number-of-archives>`__
|
||||
- `Archive near a time <#archive-near-time>`__
|
||||
- `Archive near specified time <#archive-near-time>`__
|
||||
- `Get the source code <#get-the-source-code>`__
|
||||
- `Fetch all the URLs that the Wayback Machine knows for a
|
||||
domain <#fetch-all-the-urls-that-the-wayback-machine-knows-for-a-domain>`__
|
||||
|
||||
- `Tests <#tests>`__
|
||||
|
||||
- `Dependency <#dependency>`__
|
||||
|
||||
- `Packaging <#packaging>`__
|
||||
|
||||
- `License <#license>`__
|
||||
|
||||
.. raw:: html
|
||||
@ -85,36 +100,53 @@ Capturing aka Saving an url using save()
|
||||
|
||||
import waybackpy
|
||||
|
||||
new_archive_url = waybackpy.Url(
|
||||
url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
url = "https://en.wikipedia.org/wiki/Multivariable_calculus",
|
||||
user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
).save()
|
||||
|
||||
print(new_archive_url)
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
archive = waybackpy_url_obj.save()
|
||||
print(archive)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy
|
||||
https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPySaveExample\
|
||||
|
||||
Receiving the oldest archive for an URL using oldest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Retrieving the archive for an URL using archive\_url
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
oldest_archive_url = waybackpy.Url(
|
||||
url = "https://www.google.com/"
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
"https://www.google.com/",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
).oldest()
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
archive_url = waybackpy_url_obj.archive_url
|
||||
print(archive_url)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20201016153320/https://www.google.com/
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyArchiveUrl\
|
||||
|
||||
Retrieving the oldest archive for an URL using oldest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
url = "https://www.google.com/"
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0"
|
||||
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
oldest_archive_url = waybackpy_url_obj.oldest()
|
||||
print(oldest_archive_url)
|
||||
|
||||
.. code:: bash
|
||||
@ -124,86 +156,99 @@ Receiving the oldest archive for an URL using oldest()
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyOldestExample\
|
||||
|
||||
Receiving the newest archive for an URL using newest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Retrieving the newest archive for an URL using newest()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
newest_archive_url = waybackpy.Url(
|
||||
|
||||
"https://www.facebook.com/",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
|
||||
|
||||
).newest()
|
||||
url = "https://www.facebook.com/"
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
|
||||
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
newest_archive_url = waybackpy_url_obj.newest()
|
||||
print(newest_archive_url)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20200714013225/https://www.facebook.com/
|
||||
https://web.archive.org/web/20201016150543/https://www.facebook.com/
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyNewestExample\
|
||||
|
||||
Receiving archive close to a specified year, month, day, hour, and minute using near()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Retrieving the JSON reponse for the avaliblity API request
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
url = "https://www.facebook.com/"
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0"
|
||||
|
||||
waybackpy_url_obj = waybackpy.Url(url, user_agent)
|
||||
json_dict = waybackpy_url_obj.JSON
|
||||
print(json_dict)
|
||||
|
||||
.. code:: javascript
|
||||
|
||||
{'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'available': True, 'url': 'http://web.archive.org/web/20201016150543/https://www.facebook.com/', 'timestamp': '20201016150543', 'status': '200'}}}
|
||||
|
||||
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyJSON\
|
||||
|
||||
Retrieving archive close to a specified year, month, day, hour, and minute using near()
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: python
|
||||
|
||||
from waybackpy import Url
|
||||
|
||||
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0"
|
||||
github_url = "https://github.com/"
|
||||
url = "https://github.com/"
|
||||
|
||||
|
||||
github_wayback_obj = Url(github_url, user_agent)
|
||||
waybackpy_url_obj = Url(url, user_agent)
|
||||
|
||||
# Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
|
||||
|
||||
.. code:: python
|
||||
|
||||
github_archive_near_2010 = github_wayback_obj.near(year=2010)
|
||||
github_archive_near_2010 = waybackpy_url_obj.near(year=2010)
|
||||
print(github_archive_near_2010)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20100719134402/http://github.com/
|
||||
https://web.archive.org/web/20101018053604/http://github.com:80/
|
||||
|
||||
.. code:: python
|
||||
|
||||
github_archive_near_2011_may = github_wayback_obj.near(year=2011, month=5)
|
||||
github_archive_near_2011_may = waybackpy_url_obj.near(year=2011, month=5)
|
||||
print(github_archive_near_2011_may)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20110519185447/https://github.com/
|
||||
https://web.archive.org/web/20110518233639/https://github.com/
|
||||
|
||||
.. code:: python
|
||||
|
||||
github_archive_near_2015_january_26 = github_wayback_obj.near(
|
||||
year=2015, month=1, day=26
|
||||
)
|
||||
github_archive_near_2015_january_26 = waybackpy_url_obj.near(year=2015, month=1, day=26)
|
||||
print(github_archive_near_2015_january_26)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20150127031159/https://github.com
|
||||
https://web.archive.org/web/20150125102636/https://github.com/
|
||||
|
||||
.. code:: python
|
||||
|
||||
github_archive_near_2018_4_july_9_2_am = github_wayback_obj.near(
|
||||
year=2018, month=7, day=4, hour = 9, minute = 2
|
||||
)
|
||||
github_archive_near_2018_4_july_9_2_am = waybackpy_url_obj.near(year=2018, month=7, day=4, hour=9, minute=2)
|
||||
print(github_archive_near_2018_4_july_9_2_am)
|
||||
|
||||
.. code:: bash
|
||||
|
||||
https://web.archive.org/web/20180704090245/https://github.com/
|
||||
|
||||
The library doesn't support seconds yet. You are encouraged to create a
|
||||
PR ;)
|
||||
The package doesn't support a seconds argument yet. You are encouraged to
|
||||
create a PR ;)
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyNearExample\
|
||||
@ -229,16 +274,12 @@ Get the content of webpage using get()
|
||||
|
||||
# The following chunk of code will force a new archive of google.com and get the source of the archived page.
|
||||
# waybackpy_url_object.save() type is string.
|
||||
google_newest_archive_source = waybackpy_url_object.get(
|
||||
waybackpy_url_object.save()
|
||||
)
|
||||
google_newest_archive_source = waybackpy_url_object.get(waybackpy_url_object.save())
|
||||
print(google_newest_archive_source)
|
||||
|
||||
|
||||
# waybackpy_url_object.oldest() type is str, it's oldest archive of google.com
|
||||
google_oldest_archive_source = waybackpy_url_object.get(
|
||||
waybackpy_url_object.oldest()
|
||||
)
|
||||
google_oldest_archive_source = waybackpy_url_object.get(waybackpy_url_object.oldest())
|
||||
print(google_oldest_archive_source)
|
||||
|
||||
Try this out in your browser @
|
||||
@ -252,23 +293,50 @@ Count total archives for an URL using total\_archives()
|
||||
import waybackpy
|
||||
|
||||
URL = "https://en.wikipedia.org/wiki/Python (programming language)"
|
||||
|
||||
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
|
||||
|
||||
archive_count = waybackpy.Url(
|
||||
url=URL,
|
||||
user_agent=UA
|
||||
).total_archives()
|
||||
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
|
||||
|
||||
archive_count = waybackpy_url_object.total_archives()
|
||||
|
||||
print(archive_count) # total_archives() returns an int
|
||||
|
||||
.. code:: bash
|
||||
|
||||
2440
|
||||
2516
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyTotalArchivesExample\
|
||||
|
||||
List of URLs that Wayback Machine knows and has archived for a domain name
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
1) If alive=True is set, waybackpy will check all URLs to identify the
|
||||
alive URLs. Don't use with popular websites like google or it would
|
||||
take too long.
|
||||
2) To include URLs from subdomains, set subdomain=True
|
||||
|
||||
.. code:: python
|
||||
|
||||
import waybackpy
|
||||
|
||||
URL = "akamhy.github.io"
|
||||
UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4"
|
||||
|
||||
waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA)
|
||||
known_urls = waybackpy_url_object.known_urls(alive=True, subdomain=False) # alive and subdomain are optional.
|
||||
print(known_urls) # known_urls() returns list of URLs
|
||||
|
||||
.. code:: bash
|
||||
|
||||
['http://akamhy.github.io',
|
||||
'https://akamhy.github.io/waybackpy/',
|
||||
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=a418a4e4641a1dbaad8f3bfbf293fad21a75ff11',
|
||||
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=f881705d00bf47b5bf0c58808efe29eecba2226c']
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyKnownURLsToWayBackMachineExample#main.py\
|
||||
|
||||
With the Command-line interface
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -283,6 +351,17 @@ Save
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashSave\
|
||||
|
||||
Get archive URL
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --archive_url
|
||||
https://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashArchiveUrl\
|
||||
|
||||
Oldest archive
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
@ -305,6 +384,20 @@ Newest archive
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashNewest\
|
||||
|
||||
Get JSON data of availability API
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. code:: bash
|
||||
|
||||
waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --json
|
||||
|
||||
.. code:: javascript
|
||||
|
||||
{'archived_snapshots': {'closest': {'timestamp': '20201007132458', 'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX'}}, 'url': 'https://en.wikipedia.org/wiki/SpaceX'}
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashJSON\
|
||||
|
||||
Total number of archives
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
@ -332,30 +425,84 @@ Get the source code
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get url # Prints the source code of the url
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get oldest # Prints the source code of the oldest archive
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get newest # Prints the source code of the newest archive
|
||||
$ waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on wayback machine then print the source code of this archive.
|
||||
waybackpy --url google.com --user_agent "my-unique-user-agent" --get url # Prints the source code of the url
|
||||
waybackpy --url google.com --user_agent "my-unique-user-agent" --get oldest # Prints the source code of the oldest archive
|
||||
waybackpy --url google.com --user_agent "my-unique-user-agent" --get newest # Prints the source code of the newest archive
|
||||
waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on wayback machine then print the source code of this archive.
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyBashGet\
|
||||
|
||||
Fetch all the URLs that the Wayback Machine knows for a domain
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
1) You can add the '--alive' flag to only fetch alive links.
|
||||
2) You can add the '--subdomain' flag to add subdomains.
|
||||
3) '--alive' and '--subdomain' flags can be used simultaneously.
|
||||
4) All links will be saved in a file, and the file will be created in
|
||||
the current working directory.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
pip install waybackpy
|
||||
|
||||
# Ignore the above installation line.
|
||||
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls
|
||||
# Prints all known URLs under akamhy.github.io
|
||||
|
||||
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --alive
|
||||
# Prints all known URLs under akamhy.github.io which are still working and not dead links.
|
||||
|
||||
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain
|
||||
# Prints all known URLs under akamhy.github.io including subdomains
|
||||
|
||||
|
||||
waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain --alive
|
||||
# Prints all known URLs under akamhy.github.io including subdomain which are not dead links and still alive.
|
||||
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackpyKnownUrlsFromWaybackMachine#main.sh\
|
||||
|
||||
Tests
|
||||
-----
|
||||
|
||||
- `Here <https://github.com/akamhy/waybackpy/tree/master/tests>`__
|
||||
`Here <https://github.com/akamhy/waybackpy/tree/master/tests>`__
|
||||
|
||||
To run tests locally:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
pip install -U pytest
|
||||
pip install codecov
|
||||
pip install pytest pytest-cov
|
||||
cd tests
|
||||
pytest --cov=../waybackpy
|
||||
python -m codecov #For reporting coverage on Codecov
|
||||
|
||||
Dependency
|
||||
----------
|
||||
|
||||
- None, just python standard libraries (re, json, urllib, argparse and
|
||||
datetime). Both python 2 and 3 are supported :)
|
||||
None, just pre-installed `python standard
|
||||
libraries <https://docs.python.org/3/library/>`__.
|
||||
|
||||
Packaging
|
||||
---------
|
||||
|
||||
1. Increment version.
|
||||
|
||||
2. Build package ``python setup.py sdist bdist_wheel``.
|
||||
|
||||
3. Sign & upload the package ``twine upload -s dist/*``.
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
`MIT
|
||||
License <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
|
||||
Released under the MIT License. See
|
||||
`license <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
|
||||
for details.
|
||||
|
||||
.. |contributions welcome| image:: https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square
|
||||
.. |Build Status| image:: https://img.shields.io/travis/akamhy/waybackpy.svg?label=Travis%20CI&logo=travis&style=flat-square
|
||||
@ -381,6 +528,4 @@ License <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
|
||||
:target: https://github.com/akamhy/waybackpy/graphs/commit-activity
|
||||
.. |Repo size| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
|
||||
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
|
||||
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
|
||||
.. |Internet Archive| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png
|
||||
.. |Wayback Machine| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png
|
||||
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
|
8
setup.py
8
setup.py
@ -19,10 +19,10 @@ setup(
|
||||
author = about['__author__'],
|
||||
author_email = about['__author_email__'],
|
||||
url = about['__url__'],
|
||||
download_url = 'https://github.com/akamhy/waybackpy/archive/2.1.8.tar.gz',
|
||||
keywords = ['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
|
||||
download_url = 'https://github.com/akamhy/waybackpy/archive/2.2.0.tar.gz',
|
||||
keywords = ['waybackpy', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
|
||||
install_requires=[],
|
||||
python_requires= ">=2.7",
|
||||
python_requires= ">=3.2",
|
||||
classifiers=[
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Intended Audience :: Developers',
|
||||
@ -30,8 +30,6 @@ setup(
|
||||
'Topic :: Software Development :: Build Tools',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
'Programming Language :: Python',
|
||||
'Programming Language :: Python :: 2',
|
||||
'Programming Language :: Python :: 2.7',
|
||||
'Programming Language :: Python :: 3',
|
||||
'Programming Language :: Python :: 3.2',
|
||||
'Programming Language :: Python :: 3.3',
|
||||
|
@ -19,69 +19,87 @@ if sys.version_info > (3, 7):
|
||||
if codecov_python:
|
||||
def test_save():
|
||||
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=True, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
oldest=False, save=True, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
assert "pypi.org/user/akamhy" in str(reply)
|
||||
|
||||
def test_json():
|
||||
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, json=True, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "archived_snapshots" in str(reply)
|
||||
|
||||
def test_archive_url():
|
||||
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, json=False, archive_url=True, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "https://web.archive.org/web/" in str(reply)
|
||||
|
||||
def test_oldest():
|
||||
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=True, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
oldest=True, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
assert "pypi.org/user/akamhy" in str(reply)
|
||||
|
||||
def test_newest():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=True, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=True, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
assert "pypi.org/user/akamhy" in str(reply)
|
||||
|
||||
def test_total_archives():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=True, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert isinstance(reply, int)
|
||||
|
||||
def test_known_urls():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://akamhy.github.io", total=False, version=False,
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=True, subdomain=True, known_urls=True, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "github" in str(reply)
|
||||
|
||||
def test_near():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=True, alive=False, subdomain=False, known_urls=False, get=None, year=2020, month=7, day=15, hour=1, minute=1)
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=False, near=True, alive=False, subdomain=False, known_urls=False, get=None, year=2020, month=7, day=15, hour=1, minute=1)
|
||||
reply = cli.args_handler(args)
|
||||
assert "202007" in reply
|
||||
|
||||
assert "202007" in str(reply)
|
||||
|
||||
def test_get():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="url")
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="url")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
assert "waybackpy" in str(reply)
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="oldest")
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="oldest")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
assert "waybackpy" in str(reply)
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="newest")
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="newest")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
assert "waybackpy" in str(reply)
|
||||
|
||||
if codecov_python:
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="save")
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="save")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
assert "waybackpy" in str(reply)
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="BullShit")
|
||||
oldest=False, save=False, json=False, archive_url=False, newest=False, near=False, alive=False, subdomain=False, known_urls=False, get="BullShit")
|
||||
reply = cli.args_handler(args)
|
||||
assert "get the source code of the" in reply
|
||||
assert "get the source code of the" in str(reply)
|
||||
|
||||
def test_args_handler():
|
||||
args = argparse.Namespace(version=True)
|
||||
@ -90,7 +108,7 @@ def test_args_handler():
|
||||
|
||||
args = argparse.Namespace(url=None, version=False)
|
||||
reply = cli.args_handler(args)
|
||||
assert ("waybackpy %s" % (__version__)) in reply
|
||||
assert ("waybackpy %s" % (__version__)) in str(reply)
|
||||
|
||||
def test_main():
|
||||
# This also tests the parse_args method in cli.py
|
||||
|
@ -2,7 +2,7 @@
|
||||
import sys
|
||||
import pytest
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
sys.path.append("..")
|
||||
import waybackpy.wrapper as waybackpy # noqa: E402
|
||||
@ -28,8 +28,7 @@ def test_dunders():
|
||||
user_agent = "UA"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target)
|
||||
assert len(target) == len(url)
|
||||
assert str(target) == url
|
||||
assert "en.wikipedia.org" in str(target)
|
||||
|
||||
def test_archive_url_parser():
|
||||
request_url = "https://amazon.com"
|
||||
@ -47,7 +46,6 @@ def test_url_check():
|
||||
|
||||
def test_save():
|
||||
# Test for urls that exist and can be archived.
|
||||
time.sleep(10)
|
||||
|
||||
url_list = [
|
||||
"en.wikipedia.org",
|
||||
@ -64,7 +62,7 @@ def test_save():
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
|
||||
)
|
||||
archived_url1 = target.save()
|
||||
archived_url1 = str(target.save())
|
||||
assert url1 in archived_url1
|
||||
|
||||
if sys.version_info > (3, 6):
|
||||
@ -73,18 +71,16 @@ def test_save():
|
||||
with pytest.raises(Exception):
|
||||
url2 = "ha ha ha ha"
|
||||
waybackpy.Url(url2, user_agent)
|
||||
time.sleep(5)
|
||||
url3 = "http://www.archive.is/faq.html"
|
||||
# Test for URLs that robots.txt does not allow to be archived. This no longer works; find an alternative.
|
||||
# with pytest.raises(Exception):
|
||||
#
|
||||
#
|
||||
# target = waybackpy.Url(
|
||||
# url3,
|
||||
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) "
|
||||
# "Gecko/20100101 Firefox/25.0",
|
||||
# )
|
||||
# target.save()
|
||||
# time.sleep(5)
|
||||
# Non existent urls, test
|
||||
with pytest.raises(Exception):
|
||||
target = waybackpy.Url(
|
||||
@ -100,7 +96,6 @@ def test_save():
|
||||
|
||||
|
||||
def test_near():
|
||||
time.sleep(10)
|
||||
url = "google.com"
|
||||
target = waybackpy.Url(
|
||||
url,
|
||||
@ -108,11 +103,10 @@ def test_near():
|
||||
"(KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
|
||||
)
|
||||
archive_near_year = target.near(year=2010)
|
||||
assert "2010" in archive_near_year
|
||||
assert "2010" in str(archive_near_year)
|
||||
|
||||
if sys.version_info > (3, 6):
|
||||
time.sleep(5)
|
||||
archive_near_month_year = target.near(year=2015, month=2)
|
||||
archive_near_month_year = str(target.near(year=2015, month=2))
|
||||
assert (
|
||||
("201502" in archive_near_month_year)
|
||||
or ("201501" in archive_near_month_year)
|
||||
@ -124,9 +118,9 @@ def test_near():
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
|
||||
)
|
||||
archive_near_hour_day_month_year = target.near(
|
||||
archive_near_hour_day_month_year = str(target.near(
|
||||
year=2008, month=5, day=9, hour=15
|
||||
)
|
||||
))
|
||||
assert (
|
||||
("2008050915" in archive_near_hour_day_month_year)
|
||||
or ("2008050914" in archive_near_hour_day_month_year)
|
||||
@ -146,13 +140,22 @@ def test_near():
|
||||
def test_oldest():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert "20200504141153" in target.oldest()
|
||||
assert "20200504141153" in str(target.oldest())
|
||||
|
||||
def test_json():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert "archived_snapshots" in str(target.JSON)
|
||||
|
||||
def test_archive_url():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert "github.com/akamhy" in str(target.archive_url)
|
||||
|
||||
def test_newest():
|
||||
url = "github.com/akamhy/waybackpy"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert url in target.newest()
|
||||
assert url in str(target.newest())
|
||||
|
||||
|
||||
def test_get():
|
||||
@ -188,3 +191,11 @@ def test_total_archives():
|
||||
" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent
|
||||
)
|
||||
assert target.total_archives() == 0
|
||||
|
||||
def test_known_urls():
|
||||
|
||||
target = waybackpy.Url("akamhy.github.io", user_agent)
|
||||
assert len(target.known_urls(alive=True, subdomain=True)) > 2
|
||||
|
||||
target = waybackpy.Url("akamhy.github.io", user_agent)
|
||||
assert len(target.known_urls()) > 3
|
||||
|
@ -10,7 +10,7 @@
|
||||
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━
|
||||
|
||||
"""
|
||||
Waybackpy is a Python library that interfaces with the Internet Archive's Wayback Machine API.
|
||||
Waybackpy is a Python package that interfaces with the Internet Archive's Wayback Machine API.
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Archive pages and retrieve archived pages easily.
|
||||
|
@ -1,9 +1,9 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
__title__ = "waybackpy"
|
||||
__description__ = "A Python library that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
|
||||
__description__ = "A Python package that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
|
||||
__url__ = "https://akamhy.github.io/waybackpy/"
|
||||
__version__ = "2.1.8"
|
||||
__version__ = "2.2.0"
|
||||
__author__ = "akamhy"
|
||||
__author_email__ = "akash3pro@gmail.com"
|
||||
__license__ = "MIT"
|
||||
|
@ -10,6 +10,12 @@ from waybackpy.__version__ import __version__
|
||||
def _save(obj):
|
||||
return (obj.save())
|
||||
|
||||
def _archive_url(obj):
|
||||
return (obj.archive_url)
|
||||
|
||||
def _json(obj):
|
||||
return (obj.JSON)
|
||||
|
||||
def _oldest(obj):
|
||||
return (obj.oldest())
|
||||
|
||||
@ -34,6 +40,10 @@ def _near(obj, args):
|
||||
return (obj.near(**_near_args))
|
||||
|
||||
def _known_urls(obj, args):
|
||||
"""Abbreviations:
|
||||
sd = subdomain
|
||||
al = alive
|
||||
"""
|
||||
sd = False
|
||||
al = False
|
||||
if args.subdomain:
|
||||
@ -48,7 +58,8 @@ def _known_urls(obj, args):
|
||||
if m:
|
||||
domain = m.group(1)
|
||||
else:
|
||||
domain = "waybackpy-known"
|
||||
domain = "domain-unknown"
|
||||
|
||||
dir_path = os.path.abspath(os.getcwd())
|
||||
file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls)
|
||||
text = "\n".join(url_list) + "\n"
|
||||
@ -67,6 +78,9 @@ def _get(obj, args):
|
||||
if args.get.lower() == "url":
|
||||
return (obj.get())
|
||||
|
||||
if args.get.lower() == "archive_url":
|
||||
return (obj.get(obj.archive_url))
|
||||
|
||||
if args.get.lower() == "oldest":
|
||||
return (obj.get(obj.oldest()))
|
||||
|
||||
@ -78,9 +92,10 @@ def _get(obj, args):
|
||||
|
||||
return ("Use get as \"--get 'source'\", 'source' can be one of the followings: \
|
||||
\n1) url - get the source code of the url specified using --url/-u.\
|
||||
\n2) oldest - get the source code of the oldest archive for the supplied url.\
|
||||
\n3) newest - get the source code of the newest archive for the supplied url.\
|
||||
\n4) save - Create a new archive and get the source code of this new archive for the supplied url.")
|
||||
\n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\
|
||||
\n3) oldest - get the source code of the oldest archive for the supplied url.\
|
||||
\n4) newest - get the source code of the newest archive for the supplied url.\
|
||||
\n5) save - Create a new archive and get the source code of this new archive for the supplied url.")
|
||||
|
||||
def args_handler(args):
|
||||
if args.version:
|
||||
@ -96,6 +111,10 @@ def args_handler(args):
|
||||
|
||||
if args.save:
|
||||
return _save(obj)
|
||||
if args.archive_url:
|
||||
return _archive_url(obj)
|
||||
if args.json:
|
||||
return _json(obj)
|
||||
if args.oldest:
|
||||
return _oldest(obj)
|
||||
if args.newest:
|
||||
@ -118,19 +137,25 @@ def parse_args(argv):
|
||||
|
||||
userAgentArg = parser.add_argument_group('User Agent')
|
||||
userAgentArg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"")
|
||||
|
||||
|
||||
saveArg = parser.add_argument_group("Create new archive/save URL")
|
||||
saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine")
|
||||
|
||||
|
||||
auArg = parser.add_argument_group("Get the latest Archive")
|
||||
auArg.add_argument("--archive_url", "-au", action='store_true', help="Get the latest archive URL, alias for --newest")
|
||||
|
||||
jsonArg = parser.add_argument_group("Get the JSON data")
|
||||
jsonArg.add_argument("--json", "-j", action='store_true', help="JSON data of the availability API request")
|
||||
|
||||
oldestArg = parser.add_argument_group("Oldest archive")
|
||||
oldestArg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL")
|
||||
|
||||
|
||||
newestArg = parser.add_argument_group("Newest archive")
|
||||
newestArg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL")
|
||||
|
||||
|
||||
totalArg = parser.add_argument_group("Total number of archives")
|
||||
totalArg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL")
|
||||
|
||||
|
||||
getArg = parser.add_argument_group("Get source code")
|
||||
getArg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage")
|
||||
|
||||
@ -151,7 +176,7 @@ def parse_args(argv):
|
||||
nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
|
||||
|
||||
parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version")
|
||||
|
||||
|
||||
return parser.parse_args(argv[1:])
|
||||
|
||||
def main(argv=None):
|
||||
|
@ -2,5 +2,5 @@
|
||||
|
||||
class WaybackError(Exception):
|
||||
"""
|
||||
Raised when API Service error.
|
||||
Raised when Wayback Machine API Service is unreachable/down.
|
||||
"""
|
||||
|
@ -3,7 +3,7 @@
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
from waybackpy.exceptions import WaybackError
|
||||
from waybackpy.__version__ import __version__
|
||||
|
||||
@ -69,21 +69,77 @@ class Url:
|
||||
self.url = url
|
||||
self.user_agent = user_agent
|
||||
self._url_check() # checks url validity on init.
|
||||
self.JSON = self._JSON() # JSON of most recent archive
|
||||
self.archive_url = self._archive_url() # URL of archive
|
||||
self.timestamp = self._archive_timestamp() # timestamp for last archive
|
||||
|
||||
def __repr__(self):
|
||||
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
|
||||
|
||||
def __str__(self):
|
||||
return "%s" % self._clean_url()
|
||||
return "%s" % self.archive_url
|
||||
|
||||
def __len__(self):
|
||||
return len(self._clean_url())
|
||||
td_max = timedelta(days=999999999,
|
||||
hours=23,
|
||||
minutes=59,
|
||||
seconds=59,
|
||||
microseconds=999999)
|
||||
if self.timestamp == datetime.max:
|
||||
return td_max.days
|
||||
else:
|
||||
diff = datetime.utcnow() - self.timestamp
|
||||
return diff.days
|
||||
|
||||
def _url_check(self):
|
||||
"""Check for common URL problems."""
|
||||
if "." not in self.url:
|
||||
raise URLError("'%s' is not a vaild URL." % self.url)
|
||||
|
||||
def _JSON(self):
|
||||
request_url = "https://archive.org/wayback/available?url=%s" % (
|
||||
self._clean_url(),
|
||||
)
|
||||
|
||||
hdr = {"User-Agent": "%s" % self.user_agent}
|
||||
req = Request(request_url, headers=hdr) # nosec
|
||||
response = _get_response(req)
|
||||
data_string = response.read().decode("UTF-8")
|
||||
data = json.loads(data_string)
|
||||
|
||||
return data
|
||||
|
||||
def _archive_url(self):
|
||||
"""Get URL of archive."""
|
||||
data = self.JSON
|
||||
|
||||
if not data["archived_snapshots"]:
|
||||
archive_url = None
|
||||
else:
|
||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||
archive_url = archive_url.replace(
|
||||
"http://web.archive.org/web/",
|
||||
"https://web.archive.org/web/",
|
||||
1
|
||||
)
|
||||
|
||||
return archive_url
|
||||
|
||||
def _archive_timestamp(self):
|
||||
"""Get timestamp of last archive."""
|
||||
data = self.JSON
|
||||
|
||||
if not data["archived_snapshots"]:
|
||||
time = datetime.max
|
||||
|
||||
else:
|
||||
time = datetime.strptime(data["archived_snapshots"]
|
||||
["closest"]
|
||||
["timestamp"],
|
||||
'%Y%m%d%H%M%S')
|
||||
|
||||
return time
|
||||
|
||||
def _clean_url(self):
|
||||
"""Fix the URL, if possible."""
|
||||
return str(self.url).strip().replace(" ", "_")
|
||||
@ -94,13 +150,15 @@ class Url:
|
||||
hdr = {"User-Agent": "%s" % self.user_agent} # nosec
|
||||
req = Request(request_url, headers=hdr) # nosec
|
||||
header = _get_response(req).headers
|
||||
return "https://" + _archive_url_parser(header)
|
||||
self.archive_url = "https://" + _archive_url_parser(header)
|
||||
self.timestamp = datetime.utcnow()
|
||||
return self
|
||||
|
||||
def get(self, url="", user_agent="", encoding=""):
|
||||
"""Return the source code of the supplied URL.
|
||||
If encoding is not supplied, it is auto-detected from the response.
|
||||
"""
|
||||
|
||||
|
||||
if not url:
|
||||
url = self._clean_url()
|
||||
|
||||
@ -146,11 +204,18 @@ class Url:
|
||||
"to create a new archive." % self._clean_url()
|
||||
)
|
||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||
# wayback machine returns http sometimes, idk why? But they support https
|
||||
archive_url = archive_url.replace(
|
||||
"http://web.archive.org/web/", "https://web.archive.org/web/", 1
|
||||
)
|
||||
return archive_url
|
||||
|
||||
self.archive_url = archive_url
|
||||
self.timestamp = datetime.strptime(data["archived_snapshots"]
|
||||
["closest"]
|
||||
["timestamp"],
|
||||
'%Y%m%d%H%M%S')
|
||||
|
||||
return self
|
||||
|
||||
|
||||
def oldest(self, year=1994):
|
||||
"""Return the oldest Wayback Machine archive for this URL."""
|
||||
@ -190,13 +255,13 @@ class Url:
|
||||
|
||||
if subdomain:
|
||||
request_url = (
|
||||
"https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
|
||||
"https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
|
||||
% self._clean_url()
|
||||
)
|
||||
|
||||
else:
|
||||
request_url = (
|
||||
"http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
|
||||
"http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
|
||||
% self._clean_url()
|
||||
)
|
||||
|
||||
@ -213,12 +278,12 @@ class Url:
|
||||
for url in url_list:
|
||||
|
||||
try:
|
||||
urlopen(url)
|
||||
urlopen(url) # nosec
|
||||
except:
|
||||
continue
|
||||
|
||||
tmp_url_list.append(url)
|
||||
|
||||
url_list = tmp_url_list
|
||||
|
||||
return url_list
|
||||
|
||||
return url_list
|
||||
|
Reference in New Issue
Block a user