From ca0821a4666662692a3edfd4c0c9c84777b78328 Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Sat, 2 Jan 2021 12:20:43 +0530 Subject: [PATCH] Wiki docs (#58) * move docs to wiki * Update README.md * Update setup.py --- README.md | 373 +----------------------------------------------------- setup.py | 2 +- 2 files changed, 7 insertions(+), 368 deletions(-) diff --git a/README.md b/README.md index 59629c3..ed826a9 100644 --- a/README.md +++ b/README.md @@ -21,34 +21,13 @@ -Table of contents -================= +## Table of contents + * [Installation](#installation) -* [Usage](#usage) - * [As a Python package](#as-a-python-package) - * [Saving a webpage](#capturing-aka-saving-an-url-using-save) - * [Retrieving archive](#retrieving-the-archive-for-an-url-using-archive_url) - * [Retrieving the oldest archive](#retrieving-the-oldest-archive-for-an-url-using-oldest) - * [Retrieving the latest/newest archive](#retrieving-the-newest-archive-for-an-url-using-newest) - * [Retrieving the JSON response of availability API](#retrieving-the-json-response-for-the-availability-api-request) - * [Retrieving archive close to a specified year, month, day, hour, and minute](#retrieving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near) - * [Get the content of webpage](#get-the-content-of-webpage-using-get) - * [Count total archives for an URL](#count-total-archives-for-an-url-using-total_archives) - * [List of URLs that Wayback Machine knows and has archived for a domain name](#list-of-urls-that-wayback-machine-knows-and-has-archived-for-a-domain-name) - - * [With the Command-line interface](#with-the-command-line-interface) - * [Saving webpage](#save) - * [Archive URL](#get-archive-url) - * [Oldest archive URL](#oldest-archive) - * [Newest archive URL](#newest-archive) - * [JSON response of API](#get-json-data-of-avaialblity-api) - * [Total archives](#total-number-of-archives) - * [Archive near specified time](#archive-near-time) - * [Get the source code](#get-the-source-code) - * [Fetch all the URLs that the Wayback Machine knows for a domain](#fetch-all-the-urls-that-the-wayback-machine-knows-for-a-domain) +* [Documentation and Wiki](https://github.com/akamhy/waybackpy/wiki) * [Tests](#tests) @@ -58,7 +37,7 @@ Table of contents -## Installation +### Installation Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)): @@ -72,348 +51,8 @@ or direct from this repository using git. pip install git+https://github.com/akamhy/waybackpy.git ``` -## Usage -### As a Python package - -#### Capturing aka Saving an URL using save() - -```python -import waybackpy - -url = "https://en.wikipedia.org/wiki/Multivariable_calculus" -user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" - -waybackpy_url_obj = waybackpy.Url(url, user_agent) -archive = waybackpy_url_obj.save() -print(archive) -``` - -```bash -https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus -``` - -Try this out in your browser @ - -#### Retrieving the archive for an URL using archive_url - -```python -import waybackpy - -url = "https://www.google.com/" -user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0" - -waybackpy_url_obj = waybackpy.Url(url, user_agent) -archive_url = waybackpy_url_obj.archive_url -print(archive_url) -``` - -```bash -https://web.archive.org/web/20201016153320/https://www.google.com/ -``` - -Try this out in your browser @ - -#### Retrieving the oldest archive for an URL using oldest() - -```python -import waybackpy - -url = "https://www.google.com/" -user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0" - -waybackpy_url_obj = waybackpy.Url(url, user_agent) -oldest_archive_url = waybackpy_url_obj.oldest() -print(oldest_archive_url) -``` - -```bash -http://web.archive.org/web/19981111184551/http://google.com:80/ -``` - -Try this out in your browser @ - -#### Retrieving the newest archive for an URL using newest() - -```python -import waybackpy - -url = "https://www.facebook.com/" -user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0" - -waybackpy_url_obj = waybackpy.Url(url, user_agent) -newest_archive_url = waybackpy_url_obj.newest() -print(newest_archive_url) -``` - -```bash -https://web.archive.org/web/20201016150543/https://www.facebook.com/ -``` - -Try this out in your browser @ - -#### Retrieving the JSON response for the availability API request - -```python -import waybackpy - -url = "https://www.facebook.com/" -user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0" - -waybackpy_url_obj = waybackpy.Url(url, user_agent) -json_dict = waybackpy_url_obj.JSON -print(json_dict) -``` - -```javascript -{'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'available': True, 'url': 'http://web.archive.org/web/20201016150543/https://www.facebook.com/', 'timestamp': '20201016150543', 'status': '200'}}} -``` - -Try this out in your browser @ - -#### Retrieving archive close to a specified year, month, day, hour, and minute using near() - -```python -from waybackpy import Url - -user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0" -url = "https://github.com/" - -waybackpy_url_obj = Url(url, user_agent) - -# Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01. -``` - -```python -github_archive_near_2010 = waybackpy_url_obj.near(year=2010) -print(github_archive_near_2010) -``` - -```bash -https://web.archive.org/web/20101018053604/http://github.com:80/ -``` - -```python -github_archive_near_2011_may = waybackpy_url_obj.near(year=2011, month=5) -print(github_archive_near_2011_may) -``` - -```bash -https://web.archive.org/web/20110518233639/https://github.com/ -``` - -```python -github_archive_near_2015_january_26 = waybackpy_url_obj.near(year=2015, month=1, day=26) -print(github_archive_near_2015_january_26) -``` - -```bash -https://web.archive.org/web/20150125102636/https://github.com/ -``` - -```python -github_archive_near_2018_4_july_9_2_am = waybackpy_url_obj.near(year=2018, month=7, day=4, hour=9, minute=2) -print(github_archive_near_2018_4_july_9_2_am) -``` - -```bash -https://web.archive.org/web/20180704090245/https://github.com/ -``` - -The package doesn't support the seconds' argument yet. You are encouraged to create a PR ;) - -Try this out in your browser @ - -#### Get the content of webpage using get() - -```python -import waybackpy - -google_url = "https://www.google.com/" - -User_Agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36" - -waybackpy_url_object = waybackpy.Url(google_url, User_Agent) - - -# If no argument is passed in get(), it gets the source of the Url used to create the object. -current_google_url_source = waybackpy_url_object.get() -print(current_google_url_source) - - -# The following chunk of code will force a new archive of google.com and get the source of the archived page. -# waybackpy_url_object.save() type is string. -google_newest_archive_source = waybackpy_url_object.get(waybackpy_url_object.save()) -print(google_newest_archive_source) - - -# waybackpy_url_object.oldest() type is str, it's oldest archive of google.com -google_oldest_archive_source = waybackpy_url_object.get(waybackpy_url_object.oldest()) -print(google_oldest_archive_source) -``` - -Try this out in your browser @ - -#### Count total archives for an URL using total_archives() - -```python -import waybackpy - -URL = "https://en.wikipedia.org/wiki/Python (programming language)" -UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4" - -waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA) - -archive_count = waybackpy_url_object.total_archives() - -print(archive_count) # total_archives() returns an int -``` - -```bash -2516 -``` - -Try this out in your browser @ - -#### List of URLs that Wayback Machine knows and has archived for a domain name - -1) If alive=True is set, waybackpy will check all URLs to identify the alive URLs. Don't use with popular websites like google or it would take too long. -2) To include URLs from subdomain set sundomain=True - -```python -import waybackpy - -URL = "akamhy.github.io" -UA = "Mozilla/5.0 (iPad; CPU OS 8_1_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B435 Safari/600.1.4" - -waybackpy_url_object = waybackpy.Url(url=URL, user_agent=UA) -known_urls = waybackpy_url_object.known_urls(alive=True, subdomain=False) # alive and subdomain are optional. -print(known_urls) # known_urls() returns list of URLs -``` - -```bash -['http://akamhy.github.io', -'https://akamhy.github.io/waybackpy/', -'https://akamhy.github.io/waybackpy/assets/css/style.css?v=a418a4e4641a1dbaad8f3bfbf293fad21a75ff11', -'https://akamhy.github.io/waybackpy/assets/css/style.css?v=f881705d00bf47b5bf0c58808efe29eecba2226c'] -``` - -Try this out in your browser @ - -### With the Command-line interface - -#### Save - -```bash -$ waybackpy --url "https://en.wikipedia.org/wiki/Social_media" --user_agent "my-unique-user-agent" --save -https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_media -``` - -Try this out in your browser @ - -#### Get archive URL - -```bash -$ waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --archive_url -https://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX -``` - -Try this out in your browser @ - -#### Oldest archive - -```bash -$ waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --oldest -https://web.archive.org/web/20040803000845/http://en.wikipedia.org:80/wiki/SpaceX -``` - -Try this out in your browser @ - -#### Newest archive - -```bash -$ waybackpy --url "https://en.wikipedia.org/wiki/YouTube" --user_agent "my-unique-user-agent" --newest -https://web.archive.org/web/20200606044708/https://en.wikipedia.org/wiki/YouTube -``` - -Try this out in your browser @ - -#### Get JSON data of avaialblity API - -```bash -waybackpy --url "https://en.wikipedia.org/wiki/SpaceX" --user_agent "my-unique-user-agent" --json - -``` - -```javascript -{'archived_snapshots': {'closest': {'timestamp': '20201007132458', 'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX'}}, 'url': 'https://en.wikipedia.org/wiki/SpaceX'} - -``` - -Try this out in your browser @ - -#### Total number of archives - -```bash -$ waybackpy --url "https://en.wikipedia.org/wiki/Linux_kernel" --user_agent "my-unique-user-agent" --total -853 - -``` - -Try this out in your browser @ - -#### Archive near time - -```bash -$ waybackpy --url facebook.com --user_agent "my-unique-user-agent" --near --year 2012 --month 5 --day 12 -https://web.archive.org/web/20120512142515/https://www.facebook.com/ -``` - -Try this out in your browser @ - -#### Get the source code - -```bash -waybackpy --url google.com --user_agent "my-unique-user-agent" --get url # Prints the source code of the URL -waybackpy --url google.com --user_agent "my-unique-user-agent" --get oldest # Prints the source code of the oldest archive -waybackpy --url google.com --user_agent "my-unique-user-agent" --get newest # Prints the source code of the newest archive -waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on Wayback machine then print the source code of this archive. -``` - -Try this out in your browser @ - -#### Fetch all the URLs that the Wayback Machine knows for a domain - -1) You can add the '--alive' flag to only fetch alive links. -2) You can add the '--subdomain' flag to add subdomains. -3) '--alive' and '--subdomain' flags can be used simultaneously. -4) All links will be saved in a file, and the file will be created in the current working directory. - -```bash -pip install waybackpy - -# Ignore the above installation line. - -waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls -# Prints all known URLs under akamhy.github.io - - -waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --alive -# Prints all known URLs under akamhy.github.io which are still working and not dead links. - - -waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain -# Prints all known URLs under akamhy.github.io including subdomain - - -waybackpy --url akamhy.github.io --user_agent "my-user-agent" --known_urls --subdomain --alive -# Prints all known URLs under akamhy.github.io including subdomain which are not dead links and still alive. - -``` - -Try this out in your browser @ - -## Tests +### Tests To run tests locally: @@ -438,7 +77,7 @@ bash <(curl -s https://codecov.io/bash) -t SECRET_CODECOV_TOKEN You can find the tests [here](https://github.com/akamhy/waybackpy/tree/master/tests). -## Packaging +### Packaging 1. Increment version. diff --git a/setup.py b/setup.py index 5bb28d6..e014a57 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ setup( ], entry_points={"console_scripts": ["waybackpy = waybackpy.cli:main"]}, project_urls={ - "Documentation": "https://akamhy.github.io/waybackpy/", + "Documentation": "https://github.com/akamhy/waybackpy/wiki", "Source": "https://github.com/akamhy/waybackpy", "Tracker": "https://github.com/akamhy/waybackpy/issues", },