Compare commits
204 Commits
SHA1
----
5a9c861cad
dd1917c77e
db8f902cff
88cda94c0b
09290f88d1
e5835091c9
7312ed1f4f
6ae8f843d3
36b936820b
a3bc6aad2b
edc2f63d93
ffe0810b12
40233eb115
d549d31421
0725163af8
712471176b
dcd7b03302
76205d9cf6
ec0a0d04cc
7bb01df846
6142e0b353
a65990aee3
259a024eb1
91402792e6
eabf4dc046
5a7bd73565
4693dbf9c1
f4f2e51315
d6b7df6837
dafba5d0cb
6c71dfbe41
a6470b1036
04cda4558e
625ed63482
a03813315f
a2550f17d7
15ef5816db
93b52bd0fe
28ff877081
3e3ecff9df
ce64135ba8
2af6580ffb
8a3c515176
d98c4f32ad
e0a4b007d5
6fb6b2deee
1882862992
0c6107e675
bd079978bf
5dec4927cd
62e5217b9e
9823c809e9
db5737a857
ca0821a466
bb4dbc7d3c
7c7fd75376
0b71433667
1b499a7594
da390ee8a3
d3e68d0e70
fde28d57aa
6092e504c8
93ef60ecd2
461b3f74c9
3c53b411b0
8125526061
2dc81569a8
fd163f3d36
a0a918cf0d
4943cf6873
bc3efc7d63
f89368f16d
c919a6a605
0280fca189
60ee8b95a8
ca51c14332
525cf17c6f
406e03c52f
672b33e83a
b19b840628
a6df4f899c
7686e9c20d
3c5932bc39
f9a986f489
0d7458ee90
ac8b9d6a50
58cd9c28e7
5088305a58
9f847a5e55
6c04c2f3d3
925be7b17e
2b132456ac
50e3154a4e
7aef50428f
d8ec0f5025
0a2f97c034
3e9cf23578
7f927ec7be
9de6393cd5
91e7f65617
d465454019
1a81eb97fb
6b3b2e2a7d
82c65454e6
19710461b6
a3661d6b85
58375e4ef4
ea023e98da
f1065ed1c8
315519b21f
07c98661de
2cd991a54e
ede251afb3
a8ce970ca0
243af26bf6
0f1db94884
c304f58ea2
23f7222cb5
ce7294d990
c9fa114d2e
8b6bacb28e
32d8ad7780
cbf2f90faa
4dde3e3134
1551e8f1c6
c84f09e2d2
57a32669b5
fe017cbcc8
5edb03d24b
c5de2232ba
ca9186c301
8a4b631c13
ec9ce92f48
e95d35c37f
36d662b961
2835f8877e
18cbd2fd30
a2812fb56f
77effcf649
7272ef45a0
56116551ac
4dcda94cb0
09f59b0182
ed24184b99
56bef064b1
44bb2cf5e4
e231228721
b8b2d6dfa9
3eca6294df
eb037a0284
a01821f20b
b21036f8df
b43bacb7ac
f7313b255a
7457e1c793
f7493d823f
7fa7b59ce3
78a608db50
93f7dfdaf9
83c6f256c9
dee9105794
3bfc3b46d0
553f150bee
b3a7e714a5
cd9841713c
1ea9548d46
be7642c837
a418a4e464
aec035ef1e
6d37993ab9
72b80ca44e
c10aa9279c
68d809a7d6
4ad09a419b
ddc6620f09
4066a65678
8e46a9ba7a
a5a98b9b00
a721ab7d6c
7db27ae5e1
8fd4462025
c458a15820
bae3412bee
94cb08bb37
af888db13e
d24f2408ee
ddd2274015
99abdb7c67
f3bb9a8540
bb94e0d1c5
1a78d88be2
3ec61758b3
83c962166d
e87dee3bdf
b27bfff15a
970fc1cd08
65391bf14b
8ab116f276
6f82041ec9
11059c960e
eee1b8eba1
f7de8f5575
3fa0c32064
aa1e3b8825
.github/workflows/ci.yml (new file, 42 lines)

@@ -0,0 +1,42 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: CI

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.8']

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest codecov pytest-cov
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest --cov=waybackpy tests/
    - name: Upload coverage to Codecov
      run: |
        bash <(curl -s https://codecov.io/bash) -t ${{ secrets.CODECOV_TOKEN }}
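The test step above is easy to reproduce locally; a minimal sketch, assuming pytest and pytest-cov are installed in the current environment:

```python
# Local equivalent of the workflow's "Test with pytest" step (a sketch, not
# part of the repository): runs `pytest --cov=waybackpy tests/` via pytest's API.
import pytest

exit_code = pytest.main(["--cov=waybackpy", "tests/"])
raise SystemExit(exit_code)
```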
.github/workflows/python-publish.yml (new file, 31 lines)

@@ -0,0 +1,31 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

on:
  release:
    types: [created]

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install setuptools wheel twine
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        python setup.py sdist bdist_wheel
        twine upload dist/*
.gitignore (3 changes)

@@ -1,3 +1,6 @@
# Files generated while testing
*-urls-*.txt

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
.pep8speaks.yml (new file, 4 lines)

@@ -0,0 +1,4 @@
# File : .pep8speaks.yml

scanner:
    diff_only: True  # If True, errors caused by only the patch are shown
.pyup.yml (new file, 5 lines)

@@ -0,0 +1,5 @@
# autogenerated pyup.io config file
# see https://pyup.io/docs/configuration/ for all available options

schedule: ''
update: false
.travis.yml (deleted, 14 lines)

@@ -1,14 +0,0 @@
language: python
python:
  - "2.7"
  - "3.6"
  - "3.8"
os: linux
dist: xenial
cache: pip
install:
  - pip install pytest
before_script:
  cd tests
script:
  - pytest test_1.py
CONTRIBUTING.md (new file, 58 lines)

@@ -0,0 +1,58 @@
# Contributing to waybackpy

We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:

- Reporting a bug
- Discussing the current state of the code
- Submitting a fix
- Proposing new features
- Becoming a maintainer

## We Develop with GitHub

We use GitHub to host code, to track issues and feature requests, as well as to accept pull requests.

## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests

Pull requests are the best way to propose changes to the codebase (we use [GitHub Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests:

1. Fork the repo and create your branch from `master`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. Issue that pull request!

## Any contributions you make will be under the MIT Software License

In short, when you submit code changes, your submissions are understood to be under the same [MIT License](https://github.com/akamhy/waybackpy/blob/master/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern.

## Report bugs using GitHub's [issues](https://github.com/akamhy/waybackpy/issues)

We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/akamhy/waybackpy/issues/new); it's that easy!

## Write bug reports with detail, background, and sample code

**Great Bug Reports** tend to have:

- A quick summary and/or background
- Steps to reproduce
  - Be specific!
  - Give sample code if you can.
- What you expected would happen
- What actually happens
- Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)

People *love* thorough bug reports. I'm not even kidding.

## Use a Consistent Coding Style

* You can try running `flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics` for style unification.

## License

By contributing, you agree that your contributions will be licensed under its [MIT License](https://github.com/akamhy/waybackpy/blob/master/LICENSE).

## References

This document is forked from [this gist](https://gist.github.com/briandk/3d2e8b3ec8daf5a27a62) by [briandk](https://github.com/briandk), which was itself adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md).
CONTRIBUTORS.md (new file, 9 lines)

@@ -0,0 +1,9 @@
## AUTHORS
- akamhy (<https://github.com/akamhy>)
- danvalen1 (<https://github.com/danvalen1>)
- AntiCompositeNumber (<https://github.com/AntiCompositeNumber>)

## ACKNOWLEDGEMENTS
- mhmdiaa (<https://github.com/mhmdiaa>) for <https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050>. known_urls is based on this gist.
- datashaman (<https://stackoverflow.com/users/401467/datashaman>) for <https://stackoverflow.com/a/35504626>. _get_response is based on this amazing answer.
- dequeued0 (<https://github.com/dequeued0>) for reporting bugs and useful feature requests.
LICENSE (2 changes)

@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2020 akamhy
Copyright (c) 2020 waybackpy contributors ( https://github.com/akamhy/waybackpy/graphs/contributors )

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
README.md (205 changes)

@@ -1,142 +1,111 @@
# waybackpy
<div align="center">

[Travis CI build](https://travis-ci.org/akamhy/waybackpy)
[PyPI downloads](https://pypistats.org/packages/waybackpy)
[Latest release](https://github.com/akamhy/waybackpy/releases)
[Codacy grade](https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade)
[License: MIT](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
[Maintainability](https://codeclimate.com/github/akamhy/waybackpy/maintainability)
[CodeFactor](https://www.codefactor.io/repository/github/akamhy/waybackpy)
[Made with Python](https://www.python.org/)
[Maintenance](https://github.com/akamhy/waybackpy/graphs/commit-activity)
[codecov](https://codecov.io/gh/akamhy/waybackpy)
<img src="https://raw.githubusercontent.com/akamhy/waybackpy/master/assets/waybackpy_logo.svg"><br>

<h2>Python package & CLI tool that interfaces with the Wayback Machine API</h2>

</div>

Waybackpy is a Python library that interfaces with the [Internet Archive](https://en.wikipedia.org/wiki/Internet_Archive)'s [Wayback Machine](https://en.wikipedia.org/wiki/Wayback_Machine) API. Archive pages and retrieve archived pages easily.
<p align="center">
<a href="https://pypi.org/project/waybackpy/"><img alt="pypi" src="https://img.shields.io/pypi/v/waybackpy.svg"></a>
<a href="https://github.com/akamhy/waybackpy/actions?query=workflow%3ACI"><img alt="Build Status" src="https://github.com/akamhy/waybackpy/workflows/CI/badge.svg"></a>
<a href="https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade"><img alt="Codacy Badge" src="https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65"></a>
<a href="https://codecov.io/gh/akamhy/waybackpy"><img alt="codecov" src="https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg"></a>
<a href="https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md"><img alt="Contributions Welcome" src="https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square"></a>
<a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a>
<a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub lastest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a>
<a href="#"><img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square"></a>
</p>

Table of contents
=================
<!--ts-->
-----------------------------------------------------------------------------------------------------------------------------------------------

* [Installation](#installation)
### Installation

* [Usage](#usage)
  * [Saving an url using save()](#capturing-aka-saving-an-url-using-save)
  * [Receiving the oldest archive for an URL Using oldest()](#receiving-the-oldest-archive-for-an-url-using-oldest)
  * [Receiving the recent most/newest archive for an URL using newest()](#receiving-the-newest-archive-for-an-url-using-newest)
  * [Receiving archive close to a specified year, month, day, hour, and minute using near()](#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
  * [Get the content of webpage using get()](#get-the-content-of-webpage-using-get)
  * [Count total archives for an URL using total_archives()](#count-total-archives-for-an-url-using-total_archives)

* [Tests](#tests)

* [Dependency](#dependency)

* [License](#license)

<!--te-->

## Installation
Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)):

```bash
pip install waybackpy
```

Install directly from GitHub:

## Usage

#### Capturing aka Saving an url Using save()
```python
import waybackpy
# Capturing a new archive on Wayback Machine.
target_url = waybackpy.Url("https://github.com/akamhy/waybackpy", user_agent="My-cool-user-agent")
archived_url = target_url.save()
print(archived_url)
```
```bash
pip install git+https://github.com/akamhy/waybackpy.git
```
This should print an URL similar to the following archived URL:

> <https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy>
### Supported Features

- Archive webpage
- Retrieve all archives of a webpage/domain
- Retrieve archive close to a date or timestamp
- Retrieve all archives which have a particular prefix
- Get source code of the archive easily
- CDX API support
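The CDX support listed above is exercised by tests/test_cdx.py later in this compare; a minimal sketch of the same API, with the class, parameters, and snapshot attributes taken from those tests (the prefix and limit values here are illustrative):

```python
# Minimal CDX sketch: list archives for a URL prefix. Cdx, its keyword
# arguments, and snapshot.archive_url all appear in tests/test_cdx.py and
# tests/test_snapshot.py in this compare.
from waybackpy.cdx import Cdx

user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
cdx = Cdx(url="akamhy.github.io", user_agent=user_agent, match_type="prefix", limit=50)

for snapshot in cdx.snapshots():
    # Each snapshot carries CDX fields such as timestamp, original and archive_url.
    print(snapshot.archive_url)
```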
#### Receiving the oldest archive for an URL Using oldest()
### Usage

#### As a Python package
```python
import waybackpy
# retrieving the oldest archive on Wayback Machine.
target_url = waybackpy.Url("https://www.google.com/", "My-cool-user-agent")
oldest_archive = target_url.oldest()
print(oldest_archive)
```
```python
>>> import waybackpy

>>> url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

>>> wayback = waybackpy.Url(url, user_agent)

>>> archive = wayback.save()
>>> archive.archive_url
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'

>>> archive.timestamp
datetime.datetime(2021, 1, 4, 17, 35, 12, 691741)

>>> oldest_archive = wayback.oldest()
>>> oldest_archive.archive_url
'https://web.archive.org/web/20050422130129/http://en.wikipedia.org:80/wiki/Multivariable_calculus'

>>> archive_close_to_2010_feb = wayback.near(year=2010, month=2)
>>> archive_close_to_2010_feb.archive_url
'https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus'

>>> wayback.newest().archive_url
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'
```
This should print the oldest available archive for <https://google.com>.

> <http://web.archive.org/web/19981111184551/http://google.com:80/>
> Full Python package documentation can be found at <https://github.com/akamhy/waybackpy/wiki/Python-package-docs>.

#### Receiving the newest archive for an URL using newest()
```python
import waybackpy
# retrieving the newest/latest archive on Wayback Machine.
target_url = waybackpy.Url(url="https://www.google.com/", user_agent="My-cool-user-agent")
newest_archive = target_url.newest()
print(newest_archive)
```

#### As a CLI tool
```bash
$ waybackpy --save --url "https://en.wikipedia.org/wiki/Social_media" --user_agent "my-unique-user-agent"
https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_media

$ waybackpy --oldest --url "https://en.wikipedia.org/wiki/Humanoid" --user_agent "my-unique-user-agent"
https://web.archive.org/web/20040415020811/http://en.wikipedia.org:80/wiki/Humanoid

$ waybackpy --newest --url "https://en.wikipedia.org/wiki/Remote_sensing" --user_agent "my-unique-user-agent"
https://web.archive.org/web/20201221130522/https://en.wikipedia.org/wiki/Remote_sensing

$ waybackpy --total --url "https://en.wikipedia.org/wiki/Linux_kernel" --user_agent "my-unique-user-agent"
1904

$ waybackpy --known_urls --url akamhy.github.io --user_agent "my-unique-user-agent" --file
https://akamhy.github.io
https://akamhy.github.io/assets/js/scale.fix.js
https://akamhy.github.io/favicon.ico
https://akamhy.github.io/robots.txt
https://akamhy.github.io/waybackpy/

'akamhy.github.io-urls-iftor2.txt' saved in current working directory
```
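The console script is wired up in setup.py below (`waybackpy = waybackpy.cli:main`), so the CLI can also be driven from Python; a small sketch, assuming the package is installed so the `waybackpy` executable is on PATH:

```python
# Sketch: shell out to the installed waybackpy CLI and capture the archive URL.
# The --newest/--url/--user_agent flags are the ones shown in the CLI examples above.
import subprocess

def newest_archive(url, user_agent="my-unique-user-agent"):
    result = subprocess.run(
        ["waybackpy", "--newest", "--url", url, "--user_agent", user_agent],
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout.strip()

print(newest_archive("https://en.wikipedia.org/wiki/Remote_sensing"))
```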
This prints the newest available archive for <https://www.microsoft.com/en-us>, something just like this:

> <http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/>

#### Receiving archive close to a specified year, month, day, hour, and minute using near()
```python
import waybackpy
# retrieving the closest archive from a specified year.
# supported arguments are year, month, day, hour and minute
target_url = waybackpy.Url("https://www.facebook.com/", "Any-User-Agent")
archive_near_year = target_url.near(year=2010)
print(archive_near_year)
```
returns: <http://web.archive.org/web/20100504071154/http://www.facebook.com/>

> Please note that if you only specify the year, the current month and day are default arguments for month and day respectively. Passing only the year parameter would not return the archive closest to January but to the current month you are using the package in. You need to specify the month "1" for January, 2 for February and so on.

> Do not pad (don't use zeros in the month, year, day, minute, and hour arguments). e.g. For January, set month = 1 and not month = 01.
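To pin the lookup to a specific point rather than inherit the current date, pass the finer-grained arguments explicitly; a short sketch using only the near() parameters documented above:

```python
import waybackpy

# Explicit month and day avoid the current-date defaults described above.
target_url = waybackpy.Url("https://www.facebook.com/", "Any-User-Agent")
january_2010_archive = target_url.near(year=2010, month=1, day=1)
print(january_2010_archive)
```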
#### Get the content of webpage using get()
```python
import waybackpy
# retrieving the webpage from any url, including archived urls. No need to import other libraries :)
# supported arguments are encoding and user_agent
target = waybackpy.Url("google.com", "any-user_agent")
oldest_url = target.oldest()
webpage = target.get(oldest_url)  # We are getting the source of the oldest archive of google.com.
print(webpage)
```
> This should print the source code for the oldest archive of google.com. If no URL is passed to get(), it retrieves the source code of google.com itself, not of any archive.

#### Count total archives for an URL using total_archives()
```python
from waybackpy import Url
# retrieving the total number of archives for a webpage.
count = Url("https://en.wikipedia.org/wiki/Python (programming language)", "User-Agent").total_archives()
print(count)
```
> This should print an integer (int), which is the number of total archives on archive.org.

## Tests
* [Here](https://github.com/akamhy/waybackpy/tree/master/tests)

## Dependency
* None, just Python standard libraries (re, json, urllib and datetime). Both Python 2 and 3 are supported :)

> Full CLI documentation can be found at <https://github.com/akamhy/waybackpy/wiki/CLI-docs>.

## License
[MIT License](https://github.com/akamhy/waybackpy/blob/master/LICENSE)

Released under the MIT License. See
[license](https://github.com/akamhy/waybackpy/blob/master/LICENSE) for details.

-----------------------------------------------------------------------------------------------------------------------------------------------
@@ -1 +1 @@
theme: jekyll-theme-cayman
theme: jekyll-theme-cayman
assets/waybackpy_logo.svg (new file, 1 line)

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 176.612 41.908" height="158.392" width="667.51" xmlns:v="https://github.com/akamhy/waybackpy"><text transform="matrix(.862888 0 0 1.158899 -.748 -98.312)" y="110.937" x="0.931" xml:space="preserve" font-weight="bold" font-size="28.149" font-family="sans-serif" letter-spacing="0" word-spacing="0" writing-mode="lr-tb" fill="#003dff"><tspan y="110.937" x="0.931"><tspan y="110.937" x="0.931" letter-spacing="3.568" writing-mode="lr-tb">waybackpy</tspan></tspan></text><path d="M.749 0h153.787v4.864H.749zm22.076 37.418h153.787v4.49H22.825z" fill="navy"/><path d="M0 37.418h22.825v4.49H0zM154.536 0h21.702v4.864h-21.702z" fill="#f0f"/></svg>
index.rst (deleted, 225 lines)

@@ -1,225 +0,0 @@
waybackpy
=========

|Build Status| |Downloads| |Release| |Codacy Badge| |License: MIT|
|Maintainability| |CodeFactor| |made-with-python| |pypi| |PyPI - Python
Version| |Maintenance| |codecov| |image1| |contributions welcome|

.. |Build Status| image:: https://img.shields.io/travis/akamhy/waybackpy.svg?label=Travis%20CI&logo=travis&style=flat-square
   :target: https://travis-ci.org/akamhy/waybackpy
.. |Downloads| image:: https://img.shields.io/pypi/dm/waybackpy.svg
   :target: https://pypistats.org/packages/waybackpy
.. |Release| image:: https://img.shields.io/github/v/release/akamhy/waybackpy.svg
   :target: https://github.com/akamhy/waybackpy/releases
.. |Codacy Badge| image:: https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65
   :target: https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
   :target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
.. |Maintainability| image:: https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability
   :target: https://codeclimate.com/github/akamhy/waybackpy/maintainability
.. |CodeFactor| image:: https://www.codefactor.io/repository/github/akamhy/waybackpy/badge
   :target: https://www.codefactor.io/repository/github/akamhy/waybackpy
.. |made-with-python| image:: https://img.shields.io/badge/Made%20with-Python-1f425f.svg
   :target: https://www.python.org/
.. |pypi| image:: https://img.shields.io/pypi/v/waybackpy.svg
.. |PyPI - Python Version| image:: https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square
.. |Maintenance| image:: https://img.shields.io/badge/Maintained%3F-yes-green.svg
   :target: https://github.com/akamhy/waybackpy/graphs/commit-activity
.. |codecov| image:: https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg
   :target: https://codecov.io/gh/akamhy/waybackpy
.. |image1| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
.. |contributions welcome| image:: https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square

|Internet Archive| |Wayback Machine|

Waybackpy is a Python library that interfaces with the `Internet
Archive`_\ 's `Wayback Machine`_ API. Archive pages and retrieve
archived pages easily.

.. _Internet Archive: https://en.wikipedia.org/wiki/Internet_Archive
.. _Wayback Machine: https://en.wikipedia.org/wiki/Wayback_Machine

.. |Internet Archive| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png
.. |Wayback Machine| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png

Table of contents
=================

.. raw:: html

   <!--ts-->

- `Installation`_

- `Usage`_

  - `Saving an url using save()`_
  - `Receiving the oldest archive for an URL Using oldest()`_
  - `Receiving the recent most/newest archive for an URL using
    newest()`_
  - `Receiving archive close to a specified year, month, day, hour,
    and minute using near()`_
  - `Get the content of webpage using get()`_
  - `Count total archives for an URL using total_archives()`_

- `Tests`_

- `Dependency`_

- `License`_

.. raw:: html

   <!--te-->

.. _Installation: #installation
.. _Usage: #usage
.. _Saving an url using save(): #capturing-aka-saving-an-url-using-save
.. _Receiving the oldest archive for an URL Using oldest(): #receiving-the-oldest-archive-for-an-url-using-oldest
.. _Receiving the recent most/newest archive for an URL using newest(): #receiving-the-newest-archive-for-an-url-using-newest
.. _Receiving archive close to a specified year, month, day, hour, and minute using near(): #receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near
.. _Get the content of webpage using get(): #get-the-content-of-webpage-using-get
.. _Count total archives for an URL using total_archives(): #count-total-archives-for-an-url-using-total_archives
.. _Tests: #tests
.. _Dependency: #dependency
.. _License: #license

Installation
------------

Using `pip`_:

.. code:: bash

   pip install waybackpy

Usage
-----

Capturing aka Saving an url Using save()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

   import waybackpy
   # Capturing a new archive on Wayback Machine.
   target_url = waybackpy.Url("https://github.com/akamhy/waybackpy", user_agent="My-cool-user-agent")
   archived_url = target_url.save()
   print(archived_url)

This should print an URL similar to the following archived URL:

https://web.archive.org/web/20200504141153/https://github.com/akamhy/waybackpy

.. _pip: https://en.wikipedia.org/wiki/Pip_(package_manager)

Receiving the oldest archive for an URL Using oldest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

   import waybackpy
   # retrieving the oldest archive on Wayback Machine.
   target_url = waybackpy.Url("https://www.google.com/", "My-cool-user-agent")
   oldest_archive = target_url.oldest()
   print(oldest_archive)

This should print the oldest available archive for https://google.com.

http://web.archive.org/web/19981111184551/http://google.com:80/

Receiving the newest archive for an URL using newest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

   import waybackpy
   # retrieving the newest/latest archive on Wayback Machine.
   target_url = waybackpy.Url(url="https://www.google.com/", user_agent="My-cool-user-agent")
   newest_archive = target_url.newest()
   print(newest_archive)

This prints the newest available archive for
https://www.microsoft.com/en-us, something just like this:

http://web.archive.org/web/20200429033402/https://www.microsoft.com/en-us/

Receiving archive close to a specified year, month, day, hour, and minute using near()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

   import waybackpy
   # retrieving the closest archive from a specified year.
   # supported arguments are year, month, day, hour and minute
   target_url = waybackpy.Url("https://www.facebook.com/", "Any-User-Agent")
   archive_near_year = target_url.near(year=2010)
   print(archive_near_year)

returns:
http://web.archive.org/web/20100504071154/http://www.facebook.com/

   Please note that if you only specify the year, the current month and
   day are default arguments for month and day respectively. Passing
   only the year parameter would not return the archive closest to
   January but to the current month you are using the package in. You
   need to specify the month "1" for January, 2 for February and so on.

..

   Do not pad (don't use zeros in the month, year, day, minute, and hour
   arguments). e.g. For January, set month = 1 and not month = 01.

Get the content of webpage using get()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

   import waybackpy
   # retrieving the webpage from any url, including archived urls. No need to import other libraries :)
   # supported arguments are encoding and user_agent
   target = waybackpy.Url("google.com", "any-user_agent")
   oldest_url = target.oldest()
   webpage = target.get(oldest_url)  # We are getting the source of the oldest archive of google.com.
   print(webpage)

..

   This should print the source code for the oldest archive of google.com.
   If no URL is passed to get(), it retrieves the source code
   of google.com itself, not of any archive.

Count total archives for an URL using total_archives()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

   from waybackpy import Url
   # retrieving the total number of archives for a webpage.
   count = Url("https://en.wikipedia.org/wiki/Python (programming language)", "User-Agent").total_archives()
   print(count)

..

   This should print an integer (int), which is the number of total
   archives on archive.org.

Tests
-----

- `Here`_

Dependency
----------

- None, just Python standard libraries (re, json, urllib and datetime).
  Both Python 2 and 3 are supported :)

License
-------

`MIT License`_

.. _Here: https://github.com/akamhy/waybackpy/tree/master/tests
.. _MIT License: https://github.com/akamhy/waybackpy/blob/master/LICENSE
requirements.txt (new file, 1 line)

@@ -0,0 +1 @@
requests>=2.24.0
@@ -1,3 +1,7 @@
[metadata]
description-file = README.md
license_file = LICENSE

[flake8]
max-line-length = 88
extend-ignore = E203,W503
setup.py (77 changes)

@@ -1,49 +1,54 @@
import os.path
from setuptools import setup

with open(os.path.join(os.path.dirname(__file__), 'README.md')) as f:
with open(os.path.join(os.path.dirname(__file__), "README.md")) as f:
    long_description = f.read()

about = {}
with open(os.path.join(os.path.dirname(__file__), 'waybackpy', '__version__.py')) as f:
with open(os.path.join(os.path.dirname(__file__), "waybackpy", "__version__.py")) as f:
    exec(f.read(), about)


setup(
    name = about['__title__'],
    packages = ['waybackpy'],
    version = about['__version__'],
    description = about['__description__'],
    name=about["__title__"],
    packages=["waybackpy"],
    version=about["__version__"],
    description=about["__description__"],
    long_description=long_description,
    long_description_content_type='text/markdown',
    license= about['__license__'],
    author = about['__author__'],
    author_email = about['__author_email__'],
    url = about['__url__'],
    download_url = 'https://github.com/akamhy/waybackpy/archive/2.0.0.tar.gz',
    keywords = ['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
    install_requires=[],
    python_requires= ">=2.7",
    long_description_content_type="text/markdown",
    license=about["__license__"],
    author=about["__author__"],
    author_email=about["__author_email__"],
    url=about["__url__"],
    download_url="https://github.com/akamhy/waybackpy/archive/2.4.3.tar.gz",
    keywords=[
        "Archive It",
        "Archive Website",
        "Wayback Machine",
        "waybackurls",
        "Internet Archive",
    ],
    install_requires=["requests"],
    python_requires=">=3.4",
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Natural Language :: English',
        'Topic :: Software Development :: Build Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: Implementation :: CPython',
    ],
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "Topic :: Software Development :: Build Tools",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.4",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: Implementation :: CPython",
    ],
    entry_points={"console_scripts": ["waybackpy = waybackpy.cli:main"]},
    project_urls={
        'Documentation': 'https://waybackpy.readthedocs.io',
        'Source': 'https://github.com/akamhy/waybackpy',
        "Documentation": "https://github.com/akamhy/waybackpy/wiki",
        "Source": "https://github.com/akamhy/waybackpy",
        "Tracker": "https://github.com/akamhy/waybackpy/issues",
    },
)
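The metadata pattern above (exec-ing waybackpy/__version__.py into a dict) is worth calling out; a standalone sketch of the same idea, with the file path relative to `__file__` as the only assumption:

```python
# Sketch of setup.py's metadata loading: run waybackpy/__version__.py in an
# empty namespace and read the dunder attributes it defines.
import os.path

about = {}
version_file = os.path.join(os.path.dirname(__file__), "waybackpy", "__version__.py")
with open(version_file) as f:
    exec(f.read(), about)

print(about["__title__"], about["__version__"])
```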
tests/__init__.py (new file, 0 lines)
tests/test_1.py (deleted, 127 lines)

@@ -1,127 +0,0 @@
# -*- coding: utf-8 -*-
import sys

sys.path.append("..")
import waybackpy
import pytest
import random


user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"


def test_clean_url():
    test_url = " https://en.wikipedia.org/wiki/Network security "
    answer = "https://en.wikipedia.org/wiki/Network_security"
    target = waybackpy.Url(test_url, user_agent)
    test_result = target.clean_url()
    assert answer == test_result


def test_url_check():
    broken_url = "http://wwwgooglecom/"
    with pytest.raises(Exception) as e_info:
        waybackpy.Url(broken_url, user_agent)


def test_save():
    # Test for urls that exist and can be archived.

    url_list = [
        "en.wikipedia.org",
        "www.wikidata.org",
        "commons.wikimedia.org",
        "www.wiktionary.org",
        "www.w3schools.com",
        "www.youtube.com"
    ]
    x = random.randint(0, len(url_list) - 1)
    url1 = url_list[x]
    target = waybackpy.Url(url1, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36")
    archived_url1 = target.save()
    assert url1 in archived_url1

    if sys.version_info > (3, 6):

        # Test for urls that are incorrect.
        with pytest.raises(Exception) as e_info:
            url2 = "ha ha ha ha"
            waybackpy.Url(url2, user_agent)

        # Test for urls not allowed to archive by robot.txt.
        with pytest.raises(Exception) as e_info:
            url3 = "http://www.archive.is/faq.html"
            target = waybackpy.Url(url3, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0")
            target.save()

        # Non existent urls, test
        with pytest.raises(Exception) as e_info:
            url4 = "https://githfgdhshajagjstgeths537agajaajgsagudadhuss8762346887adsiugujsdgahub.us"
            target = waybackpy.Url(url4, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27")
            target.save()

    else:
        pass


def test_near():
    url = "google.com"
    target = waybackpy.Url(url, "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4")
    archive_near_year = target.near(year=2010)
    assert "2010" in archive_near_year

    if sys.version_info > (3, 6):
        archive_near_month_year = target.near(year=2015, month=2)
        assert ("201502" in archive_near_month_year) or ("201501" in archive_near_month_year) or ("201503" in archive_near_month_year)

        archive_near_day_month_year = target.near(year=2006, month=11, day=15)
        assert ("20061114" in archive_near_day_month_year) or ("20061115" in archive_near_day_month_year) or ("20061116" in archive_near_day_month_year)

        target = waybackpy.Url("www.python.org", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246")
        archive_near_hour_day_month_year = target.near(year=2008, month=5, day=9, hour=15)
        assert ("2008050915" in archive_near_hour_day_month_year) or ("2008050914" in archive_near_hour_day_month_year) or ("2008050913" in archive_near_hour_day_month_year)

        with pytest.raises(Exception) as e_info:
            NeverArchivedUrl = "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
            target = waybackpy.Url(NeverArchivedUrl, user_agent)
            target.near(year=2010)
    else:
        pass


def test_oldest():
    url = "github.com/akamhy/waybackpy"
    target = waybackpy.Url(url, user_agent)
    assert "20200504141153" in target.oldest()


def test_newest():
    url = "github.com/akamhy/waybackpy"
    target = waybackpy.Url(url, user_agent)
    assert url in target.newest()


def test_get():
    target = waybackpy.Url("google.com", user_agent)
    assert "Welcome to Google" in target.get(target.oldest())


def test_total_archives():
    if sys.version_info > (3, 6):
        target = waybackpy.Url(" https://google.com ", user_agent)
        assert target.total_archives() > 500000
    else:
        pass

    target = waybackpy.Url(" https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent)
    assert target.total_archives() == 0


if __name__ == "__main__":
    test_clean_url()
    print(".")  # 1
    test_url_check()
    print(".")  # 2
    test_get()
    print(".")  # 3
    test_near()
    print(".")  # 4
    test_newest()
    print(".")  # 5
    test_save()
    print(".")  # 6
    test_oldest()
    print(".")  # 7
    test_total_archives()
    print(".")  # 8
    print("OK")
tests/test_cdx.py (new file, 93 lines)

@@ -0,0 +1,93 @@
import pytest
from waybackpy.cdx import Cdx
from waybackpy.exceptions import WaybackError


def test_all_cdx():
    url = "akamhy.github.io"
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, \
like Gecko) Chrome/45.0.2454.85 Safari/537.36"
    cdx = Cdx(
        url=url,
        user_agent=user_agent,
        start_timestamp=2017,
        end_timestamp=2020,
        filters=[
            "statuscode:200",
            "mimetype:text/html",
            "timestamp:20201002182319",
            "original:https://akamhy.github.io/",
        ],
        gzip=False,
        collapses=["timestamp:10", "digest"],
        limit=50,
        match_type="prefix",
    )
    snapshots = cdx.snapshots()
    for snapshot in snapshots:
        ans = snapshot.archive_url
        assert "https://web.archive.org/web/20201002182319/https://akamhy.github.io/" == ans

    url = "akahfjgjkmhy.gihthub.ip"
    cdx = Cdx(
        url=url,
        user_agent=user_agent,
        start_timestamp=None,
        end_timestamp=None,
        filters=[],
        match_type=None,
        gzip=True,
        collapses=[],
        limit=10,
    )

    snapshots = cdx.snapshots()
    print(snapshots)
    i = 0
    for _ in snapshots:
        i += 1
    assert i == 0

    url = "https://github.com/akamhy/waybackpy/*"
    cdx = Cdx(url=url, user_agent=user_agent, limit=50)
    snapshots = cdx.snapshots()

    for snapshot in snapshots:
        print(snapshot.archive_url)

    url = "https://github.com/akamhy/waybackpy"
    with pytest.raises(WaybackError):
        cdx = Cdx(url=url, user_agent=user_agent, limit=50, filters=["ghddhfhj"])
        snapshots = cdx.snapshots()

    with pytest.raises(WaybackError):
        cdx = Cdx(url=url, user_agent=user_agent, collapses=["timestamp", "ghdd:hfhj"])
        snapshots = cdx.snapshots()

    url = "https://github.com"
    cdx = Cdx(url=url, user_agent=user_agent, limit=50)
    snapshots = cdx.snapshots()
    c = 0
    for snapshot in snapshots:
        c += 1
        if c > 100:
            break

    url = "https://github.com/*"
    cdx = Cdx(url=url, user_agent=user_agent, collapses=["timestamp"])
    snapshots = cdx.snapshots()
    c = 0
    for snapshot in snapshots:
        c += 1
        if c > 30529:  # default limit is 10k
            break

    url = "https://github.com/*"
    cdx = Cdx(url=url, user_agent=user_agent)
    c = 0
    snapshots = cdx.snapshots()

    for snapshot in snapshots:
        c += 1
        if c > 100529:
            break
tests/test_cli.py (new file, 359 lines)

@@ -0,0 +1,359 @@
import sys
import os
import pytest
import random
import string
import argparse

import waybackpy.cli as cli
from waybackpy.wrapper import Url  # noqa: E402
from waybackpy.__version__ import __version__


def test_save():

    args = argparse.Namespace(
        user_agent=None,
        url="https://hfjfjfjfyu6r6rfjvj.fjhgjhfjgvjm",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=True,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get=None,
    )
    reply = cli.args_handler(args)
    assert ("could happen because either your waybackpy" in str(reply)) or (
        "cannot be archived by wayback machine as it is a redirect" in str(reply)
    )


def test_json():
    args = argparse.Namespace(
        user_agent=None,
        url="https://pypi.org/user/akamhy/",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=True,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get=None,
    )
    reply = cli.args_handler(args)
    assert "archived_snapshots" in str(reply)


def test_archive_url():
    args = argparse.Namespace(
        user_agent=None,
        url="https://pypi.org/user/akamhy/",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=True,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get=None,
    )
    reply = cli.args_handler(args)
    assert "https://web.archive.org/web/" in str(reply)


def test_oldest():
    args = argparse.Namespace(
        user_agent=None,
        url="https://pypi.org/user/akamhy/",
        total=False,
        version=False,
        file=False,
        oldest=True,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get=None,
    )
    reply = cli.args_handler(args)
    assert "pypi.org/user/akamhy" in str(reply)

    uid = "".join(
        random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )
    url = "https://pypi.org/yfvjvycyc667r67ed67r" + uid
    args = argparse.Namespace(
        user_agent=None,
        url=url,
        total=False,
        version=False,
        file=False,
        oldest=True,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get=None,
    )
    reply = cli.args_handler(args)
    assert "Can not find archive for" in str(reply)


def test_newest():
    args = argparse.Namespace(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
        url="https://pypi.org/user/akamhy/",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=True,
        near=False,
        subdomain=False,
        known_urls=False,
        get=None,
    )
    reply = cli.args_handler(args)
    assert "pypi.org/user/akamhy" in str(reply)

    uid = "".join(
        random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )
    url = "https://pypi.org/yfvjvycyc667r67ed67r" + uid
    args = argparse.Namespace(
        user_agent=None,
        url=url,
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=True,
        near=False,
        subdomain=False,
        known_urls=False,
        get=None,
    )
    reply = cli.args_handler(args)
    assert "Can not find archive for" in str(reply)


def test_total_archives():
    args = argparse.Namespace(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
        url="https://pypi.org/user/akamhy/",
        total=True,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get=None,
    )
    reply = cli.args_handler(args)
    assert isinstance(reply, int)


def test_known_urls():
    args = argparse.Namespace(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
        url="https://www.keybr.com",
        total=False,
        version=False,
        file=True,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=True,
        get=None,
    )
    reply = cli.args_handler(args)
    assert "keybr" in str(reply)


def test_near():
    args = argparse.Namespace(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
        url="https://pypi.org/user/akamhy/",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=True,
        subdomain=False,
        known_urls=False,
        get=None,
        year=2020,
        month=7,
        day=15,
        hour=1,
        minute=1,
    )
    reply = cli.args_handler(args)
    assert "202007" in str(reply)

    uid = "".join(
        random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )
    url = "https://pypi.org/yfvjvycyc667r67ed67r" + uid
    args = argparse.Namespace(
        user_agent=None,
        url=url,
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=True,
        subdomain=False,
        known_urls=False,
        get=None,
        year=2020,
        month=7,
        day=15,
        hour=1,
        minute=1,
    )
    reply = cli.args_handler(args)
    assert "Can not find archive for" in str(reply)


def test_get():
    args = argparse.Namespace(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
        url="https://github.com/akamhy",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get="url",
    )
    reply = cli.args_handler(args)
    assert "waybackpy" in str(reply)

    args = argparse.Namespace(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
        url="https://github.com/akamhy/waybackpy",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get="oldest",
    )
    reply = cli.args_handler(args)
    assert "waybackpy" in str(reply)

    args = argparse.Namespace(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
        url="https://akamhy.github.io/waybackpy/",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get="newest",
    )
    reply = cli.args_handler(args)
    assert "waybackpy" in str(reply)

    args = argparse.Namespace(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
        url="https://pypi.org/user/akamhy/",
        total=False,
        version=False,
        file=False,
        oldest=False,
        save=False,
        json=False,
        archive_url=False,
        newest=False,
        near=False,
        subdomain=False,
        known_urls=False,
        get="foobar",
    )
    reply = cli.args_handler(args)
    assert "get the source code of the" in str(reply)


def test_args_handler():
    args = argparse.Namespace(version=True)
    reply = cli.args_handler(args)
    assert ("waybackpy version %s" % (__version__)) == reply

    args = argparse.Namespace(url=None, version=False)
    reply = cli.args_handler(args)
    assert ("waybackpy %s" % (__version__)) in str(reply)


def test_main():
    # This also tests the parse_args method in cli.py
    cli.main(["temp.py", "--version"])
tests/test_snapshot.py (new file, 40 lines)

@@ -0,0 +1,40 @@
import pytest

from waybackpy.snapshot import CdxSnapshot, datetime


def test_CdxSnapshot():
    sample_input = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
    prop_values = sample_input.split(" ")
    properties = {}
    (
        properties["urlkey"],
        properties["timestamp"],
        properties["original"],
        properties["mimetype"],
        properties["statuscode"],
        properties["digest"],
        properties["length"],
    ) = prop_values

    snapshot = CdxSnapshot(properties)

    assert properties["urlkey"] == snapshot.urlkey
    assert properties["timestamp"] == snapshot.timestamp
    assert properties["original"] == snapshot.original
    assert properties["mimetype"] == snapshot.mimetype
    assert properties["statuscode"] == snapshot.statuscode
    assert properties["digest"] == snapshot.digest
    assert properties["length"] == snapshot.length
    assert (
        datetime.strptime(properties["timestamp"], "%Y%m%d%H%M%S")
        == snapshot.datetime_timestamp
    )
    archive_url = (
        "https://web.archive.org/web/"
        + properties["timestamp"]
        + "/"
        + properties["original"]
    )
    assert archive_url == snapshot.archive_url
    assert sample_input == str(snapshot)
186 tests/test_utils.py Normal file
@@ -0,0 +1,186 @@
import pytest
import json

from waybackpy.utils import (
    _cleaned_url,
    _url_check,
    _full_url,
    URLError,
    WaybackError,
    _get_total_pages,
    _archive_url_parser,
    _wayback_timestamp,
    _get_response,
    _check_match_type,
    _check_collapses,
    _check_filters,
    _timestamp_manager,
)


def test_timestamp_manager():
    timestamp = True
    data = {}
    assert _timestamp_manager(timestamp, data)

    data = """
    {"archived_snapshots": {"closest": {"timestamp": "20210109155628", "available": true, "status": "200", "url": "http://web.archive.org/web/20210109155628/https://www.google.com/"}}, "url": "https://www.google.com/"}
    """
    data = json.loads(data)
    assert data["archived_snapshots"]["closest"]["timestamp"] == "20210109155628"


def test_check_filters():
    filters = []
    _check_filters(filters)

    filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"]
    _check_filters(filters)

    with pytest.raises(WaybackError):
        _check_filters("not-list")


def test_check_collapses():
    collapses = []
    _check_collapses(collapses)

    collapses = ["timestamp:10"]
    _check_collapses(collapses)

    collapses = ["urlkey"]
    _check_collapses(collapses)

    collapses = "urlkey"  # NOT LIST
    with pytest.raises(WaybackError):
        _check_collapses(collapses)

    collapses = ["also illegal collapse"]
    with pytest.raises(WaybackError):
        _check_collapses(collapses)


def test_check_match_type():
    assert _check_match_type(None, "url") is None
    match_type = "exact"
    url = "test_url"
    assert _check_match_type(match_type, url) is None

    url = "has * in it"
    with pytest.raises(WaybackError):
        _check_match_type("domain", url)

    with pytest.raises(WaybackError):
        _check_match_type("not a valid type", "url")


def test_cleaned_url():
    test_url = " https://en.wikipedia.org/wiki/Network security "
    answer = "https://en.wikipedia.org/wiki/Network%20security"
    assert answer == _cleaned_url(test_url)


def test_url_check():
    good_url = "https://akamhy.github.io"
    assert _url_check(good_url) is None

    bad_url = "https://github-com"
    with pytest.raises(URLError):
        _url_check(bad_url)


def test_full_url():
    params = {}
    endpoint = "https://web.archive.org/cdx/search/cdx"
    assert endpoint == _full_url(endpoint, params)

    params = {"a": "1"}
    assert "https://web.archive.org/cdx/search/cdx?a=1" == _full_url(endpoint, params)
    assert "https://web.archive.org/cdx/search/cdx?a=1" == _full_url(
        endpoint + "?", params
    )

    params["b"] = 2
    assert "https://web.archive.org/cdx/search/cdx?a=1&b=2" == _full_url(
        endpoint + "?", params
    )

    params["c"] = "foo bar"
    assert "https://web.archive.org/cdx/search/cdx?a=1&b=2&c=foo%20bar" == _full_url(
        endpoint + "?", params
    )


def test_get_total_pages():
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
    url = "github.com*"
    assert 212890 <= _get_total_pages(url, user_agent)

    url = "https://zenodo.org/record/4416138"
    assert 2 >= _get_total_pages(url, user_agent)


def test_archive_url_parser():
    perfect_header = """
    {'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
    """

    archive = _archive_url_parser(
        perfect_header, "https://www.scribbr.com/citing-sources/et-al/"
    )
    assert "web.archive.org/web/20210102094009" in archive

    header = """
    vhgvkjv
    Content-Location: /web/20201126185327/https://www.scribbr.com/citing-sources/et-al
    ghvjkbjmmcmhj
    """
    archive = _archive_url_parser(
        header, "https://www.scribbr.com/citing-sources/et-al/"
    )
    assert "20201126185327" in archive

    header = """
    hfjkfjfcjhmghmvjm
    X-Cache-Key: https://web.archive.org/web/20171128185327/https://www.scribbr.com/citing-sources/et-al/US
    yfu,u,gikgkikik
    """
    archive = _archive_url_parser(
        header, "https://www.scribbr.com/citing-sources/et-al/"
    )
    assert "20171128185327" in archive

    # The below header should result in an exception
    no_archive_header = """
    {'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:42:45 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Cache-Control': 'no-cache', 'X-App-Server': 'wwwb-app52', 'X-ts': '523', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0'}
    """

    with pytest.raises(WaybackError):
        _archive_url_parser(
            no_archive_header, "https://www.scribbr.com/citing-sources/et-al/"
        )


def test_wayback_timestamp():
    ts = _wayback_timestamp(year=2020, month=1, day=2, hour=3, minute=4)
    assert "202001020304" in str(ts)


def test_get_response():
    endpoint = "https://www.google.com"
    user_agent = (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
    )
    headers = {"User-Agent": "%s" % user_agent}
    response = _get_response(endpoint, params=None, headers=headers)
    assert response.status_code == 200

    endpoint = "http/wwhfhfvhvjhmom"
    with pytest.raises(WaybackError):
        _get_response(endpoint, params=None, headers=headers)

    endpoint = "https://akamhy.github.io"
    url, response = _get_response(
        endpoint, params=None, headers=headers, return_full_url=True
    )
    assert endpoint == url
28 tests/test_wrapper.py Normal file
@@ -0,0 +1,28 @@
import pytest

from waybackpy.wrapper import Url


user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"


def test_url_check():
    """No API Use"""
    broken_url = "http://wwwgooglecom/"
    with pytest.raises(Exception):
        Url(broken_url, user_agent)


def test_near():
    with pytest.raises(Exception):
        NeverArchivedUrl = (
            "https://ee_3n.wrihkeipef4edia.org/rwti5r_ki/Nertr6w_rork_rse7c_urity"
        )
        target = Url(NeverArchivedUrl, user_agent)
        target.near(year=2010)


def test_json():
    url = "github.com/akamhy/waybackpy"
    target = Url(url, user_agent)
    assert "archived_snapshots" in str(target.JSON)
waybackpy/__init__.py
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

# ┏┓┏┓┏┓━━━━━━━━━━┏━━┓━━━━━━━━━━┏┓━━┏━━━┓━━━━━
# ┃┃┃┃┃┃━━━━━━━━━━┃┏┓┃━━━━━━━━━━┃┃━━┃┏━┓┃━━━━━
# ┃┃┃┃┃┃┏━━┓━┏┓━┏┓┃┗┛┗┓┏━━┓━┏━━┓┃┃┏┓┃┗━┛┃┏┓━┏┓
@@ -10,23 +8,50 @@
# ━━━━━━━━━━━┗━━┛━━━━━━━━━━━━━━━━━━━━━━━━┗━━┛━

"""
Waybackpy is a Python library that interfaces with the Internet Archive's Wayback Machine API.
Waybackpy is a Python package & command-line program that interfaces with the Internet Archive's Wayback Machine API.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Archive pages and retrieve archived pages easily.
Archive webpages and retrieve archived URLs easily.

Usage:
>>> import waybackpy
>>> target_url = waybackpy.Url('https://www.python.org', 'Your-apps-cool-user-agent')
>>> new_archive = target_url.save()
>>> print(new_archive)
https://web.archive.org/web/20200502170312/https://www.python.org/
>>> import waybackpy

Full documentation @ <https://akamhy.github.io/waybackpy/>.
:copyright: (c) 2020 by akamhy.
>>> url = "https://en.wikipedia.org/wiki/Multivariable_calculus"
>>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"

>>> wayback = waybackpy.Url(url, user_agent)

>>> archive = wayback.save()
>>> str(archive)
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'

>>> archive.timestamp
datetime.datetime(2021, 1, 4, 17, 35, 12, 691741)

>>> oldest_archive = wayback.oldest()
>>> str(oldest_archive)
'https://web.archive.org/web/20050422130129/http://en.wikipedia.org:80/wiki/Multivariable_calculus'

>>> archive_close_to_2010_feb = wayback.near(year=2010, month=2)
>>> str(archive_close_to_2010_feb)
'https://web.archive.org/web/20100215001541/http://en.wikipedia.org:80/wiki/Multivariable_calculus'

>>> str(wayback.newest())
'https://web.archive.org/web/20210104173410/https://en.wikipedia.org/wiki/Multivariable_calculus'

Full documentation @ <https://github.com/akamhy/waybackpy/wiki>.
:copyright: (c) 2020-2021 Akash Mahanty et al.
:license: MIT
"""

from .wrapper import Url
from .__version__ import __title__, __description__, __url__, __version__
from .__version__ import __author__, __author_email__, __license__, __copyright__
from .wrapper import Url, Cdx
from .__version__ import (
    __title__,
    __description__,
    __url__,
    __version__,
    __author__,
    __author_email__,
    __license__,
    __copyright__,
)
waybackpy/__version__.py
@@ -1,10 +1,11 @@
# -*- coding: utf-8 -*-

__title__ = "waybackpy"
__description__ = "A Python library that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
__description__ = (
    "A Python package that interfaces with the Internet Archive's Wayback Machine API. "
    "Archive pages and retrieve archived pages easily."
)
__url__ = "https://akamhy.github.io/waybackpy/"
__version__ = "2.0.0"
__version__ = "2.4.3"
__author__ = "akamhy"
__author_email__ = "akash3pro@gmail.com"
__author_email__ = "akamhy@yahoo.com"
__license__ = "MIT"
__copyright__ = "Copyright 2020 akamhy"
__copyright__ = "Copyright 2020-2021 Akash Mahanty et al."
229 waybackpy/cdx.py Normal file
@@ -0,0 +1,229 @@
from .snapshot import CdxSnapshot
from .exceptions import WaybackError
from .utils import (
    _get_total_pages,
    _get_response,
    default_user_agent,
    _check_filters,
    _check_collapses,
    _check_match_type,
    _add_payload,
)

# TODO : Threading support for the pagination API. It's designed for threading.
# TODO : Add a get method here if the type is valid HTML, SVG or similar, but not a WARC. Test it.


class Cdx:
    def __init__(
        self,
        url,
        user_agent=None,
        start_timestamp=None,
        end_timestamp=None,
        filters=[],
        match_type=None,
        gzip=None,
        collapses=[],
        limit=None,
    ):
        self.url = str(url).strip()
        self.user_agent = str(user_agent) if user_agent else default_user_agent
        self.start_timestamp = str(start_timestamp) if start_timestamp else None
        self.end_timestamp = str(end_timestamp) if end_timestamp else None
        self.filters = filters
        _check_filters(self.filters)
        self.match_type = str(match_type).strip() if match_type else None
        _check_match_type(self.match_type, self.url)
        self.gzip = gzip if gzip else True
        self.collapses = collapses
        _check_collapses(self.collapses)
        self.limit = limit if limit else 5000
        self.last_api_request_url = None
        self.use_page = False

    def cdx_api_manager(self, payload, headers, use_page=False):
        """Acts as a switch between the normal API and the pagination API.

        Parameters
        ----------
        self : waybackpy.cdx.Cdx
            The instance itself

        payload : dict
            GET request parameters as name-value pairs

        headers : dict
            The headers for making the GET request.

        use_page : bool
            If True use the pagination API, else the normal resume-key based API.

        We have two options for getting the snapshots; this method selects
        between the pagination API and the normal one, which uses a
        resumption key for sequential querying of CDX data. For very large
        queries (for example, domain queries) it may be useful to perform
        queries in parallel and also estimate the total size of the query.

        Read more about the pagination API at:
        https://web.archive.org/web/20201228063237/https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api

        If use_page is False we use the normal sequential query API,
        else the pagination API.

        Two mutually exclusive cases are possible:

        1) The pagination API is selected:

           a) get the total number of pages to read, using _get_total_pages()

           b) then use a for loop to fetch all the pages and yield the response text

        2) The normal sequential query API is selected:

           a) pass showResumeKey=true to ask the API to append a resumption key
              at the bottom of the response

           b) check if the page has more than 3 lines; if not, return the text

           c) if it has at least three lines, check the second-last line for zero length

           d) if the second-last line has zero length, assume the last line contains
              the resumption key; set resumeKey and remove it from the text

           e) if the second-last line has non-zero length, return the text as-is,
              as there is no resumption key

           f) if we find the resumption key, set the "more" flag to True; it is
              reset to False on each iteration, so the loop stops and the function
              returns once no key is found.
        """

        endpoint = "https://web.archive.org/cdx/search/cdx"
        total_pages = _get_total_pages(self.url, self.user_agent)
        # If we only have two or fewer pages of archives then we care about accuracy;
        # the pagination API can lag sometimes.
        if use_page and total_pages >= 2:
            blank_pages = 0
            for i in range(total_pages):
                payload["page"] = str(i)
                url, res = _get_response(
                    endpoint, params=payload, headers=headers, return_full_url=True
                )

                self.last_api_request_url = url
                text = res.text
                if len(text) == 0:
                    blank_pages += 1

                if blank_pages >= 2:
                    break

                yield text
        else:

            payload["showResumeKey"] = "true"
            payload["limit"] = str(self.limit)
            resumeKey = None

            more = True
            while more:

                if resumeKey:
                    payload["resumeKey"] = resumeKey

                url, res = _get_response(
                    endpoint, params=payload, headers=headers, return_full_url=True
                )

                self.last_api_request_url = url

                text = res.text.strip()
                lines = text.splitlines()

                more = False

                if len(lines) >= 3:

                    second_last_line = lines[-2]

                    if len(second_last_line) == 0:

                        resumeKey = lines[-1].strip()
                        text = text.replace(resumeKey, "", 1).strip()
                        more = True

                yield text

    def snapshots(self):
        """
        This function yields snapshots encapsulated
        in CdxSnapshot for increased usability.

        All the GET request values are set if the conditions match.

        If the input specifies neither start_timestamp nor end_timestamp
        and does not use any collapses, we use the pagination API, as it
        returns archives starting from the first archive and the most
        recent archive is on the last page.
        """
        payload = {}
        headers = {"User-Agent": self.user_agent}

        _add_payload(self, payload)

        if not self.start_timestamp and not self.end_timestamp:
            self.use_page = True

        if self.collapses != []:
            self.use_page = False

        texts = self.cdx_api_manager(payload, headers, use_page=self.use_page)

        for text in texts:

            if not text or text.isspace() or len(text) <= 1:
                continue

            snapshot_list = text.split("\n")

            for snapshot in snapshot_list:

                if len(snapshot) < 46:  # 14 + 32 (timestamp + digest)
                    continue

                properties = {
                    "urlkey": None,
                    "timestamp": None,
                    "original": None,
                    "mimetype": None,
                    "statuscode": None,
                    "digest": None,
                    "length": None,
                }

                prop_values = snapshot.split(" ")

                prop_values_len = len(prop_values)
                properties_len = len(properties)

                if prop_values_len != properties_len:
                    raise WaybackError(
                        "Snapshot returned by Cdx API has {prop_values_len} properties instead of expected {properties_len} properties.\nInvolved Snapshot : {snapshot}".format(
                            prop_values_len=prop_values_len,
                            properties_len=properties_len,
                            snapshot=snapshot,
                        )
                    )

                (
                    properties["urlkey"],
                    properties["timestamp"],
                    properties["original"],
                    properties["mimetype"],
                    properties["statuscode"],
                    properties["digest"],
                    properties["length"],
                ) = prop_values

                yield CdxSnapshot(properties)
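
For orientation, a minimal usage sketch of the Cdx class above (not part of the diff; the URL is a placeholder, and the attribute names come from CdxSnapshot in snapshot.py):

    from waybackpy.cdx import Cdx

    cdx = Cdx("akamhy.github.io", limit=100)  # default_user_agent is used when none is given
    for snapshot in cdx.snapshots():          # lazily yields CdxSnapshot objects
        print(snapshot.timestamp, snapshot.archive_url)
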
334 waybackpy/cli.py Normal file
@@ -0,0 +1,334 @@
import os
import re
import sys
import json
import random
import string
import argparse

from .wrapper import Url
from .exceptions import WaybackError
from .__version__ import __version__


def _save(obj):
    try:
        return obj.save()
    except Exception as err:
        e = str(err)
        m = re.search(r"Header:\n(.*)", e)
        if m:
            header = m.group(1)
        if "No archive URL found in the API response" in e:
            return (
                "\n[waybackpy] Can not save/archive your link.\n[waybackpy] This "
                "could happen because either your waybackpy ({version}) is likely out of "
                "date or Wayback Machine is malfunctioning.\n[waybackpy] Visit "
                "https://github.com/akamhy/waybackpy for the latest version of "
                "waybackpy.\n[waybackpy] API response Header :\n{header}".format(
                    version=__version__, header=header
                )
            )
        if "URL cannot be archived by wayback machine as it is a redirect" in e:
            return "URL cannot be archived by wayback machine as it is a redirect"
        raise WaybackError(err)


def _archive_url(obj):
    return obj.archive_url


def _json(obj):
    return json.dumps(obj.JSON)


def no_archive_handler(e, obj):
    m = re.search(r"archive\sfor\s\'(.*?)\'\stry", str(e))
    if m:
        url = m.group(1)
        ua = obj.user_agent
        if "github.com/akamhy/waybackpy" in ua:
            ua = "YOUR_USER_AGENT_HERE"
        return (
            "\n[Waybackpy] Can not find archive for '{url}'.\n[Waybackpy] You can"
            " save the URL using the following command:\n[Waybackpy] waybackpy --"
            'user_agent "{user_agent}" --url "{url}" --save'.format(
                url=url, user_agent=ua
            )
        )
    raise WaybackError(e)


def _oldest(obj):
    try:
        return obj.oldest()
    except Exception as e:
        return no_archive_handler(e, obj)


def _newest(obj):
    try:
        return obj.newest()
    except Exception as e:
        return no_archive_handler(e, obj)


def _total_archives(obj):
    return obj.total_archives()


def _near(obj, args):
    _near_args = {}
    args_arr = [args.year, args.month, args.day, args.hour, args.minute]
    keys = ["year", "month", "day", "hour", "minute"]

    for key, arg in zip(keys, args_arr):
        if arg:
            _near_args[key] = arg

    try:
        return obj.near(**_near_args)
    except Exception as e:
        return no_archive_handler(e, obj)


def _save_urls_on_file(url_gen):
    domain = None
    sys_random = random.SystemRandom()
    uid = "".join(
        sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )
    url_count = 0

    for url in url_gen:
        url_count += 1
        if not domain:
            m = re.search("https?://([A-Za-z_0-9.-]+).*", url)

            domain = "domain-unknown"

            if m:
                domain = m.group(1)

        file_name = "{domain}-urls-{uid}.txt".format(domain=domain, uid=uid)
        file_path = os.path.join(os.getcwd(), file_name)
        if not os.path.isfile(file_path):
            open(file_path, "w+").close()

        with open(file_path, "a") as f:
            f.write("{url}\n".format(url=url))

        print(url)

    if url_count > 0:
        return "\n\n'{file_name}' saved in current working directory".format(
            file_name=file_name
        )
    else:
        return "No known URLs found. Please try a different input!"


def _known_urls(obj, args):
    """
    Known URLs for a domain.
    """

    subdomain = bool(args.subdomain)

    url_gen = obj.known_urls(subdomain=subdomain)

    if args.file:
        return _save_urls_on_file(url_gen)
    else:
        for url in url_gen:
            print(url)
        return "\n"


def _get(obj, args):
    if args.get.lower() == "url":
        return obj.get()
    if args.get.lower() == "archive_url":
        return obj.get(obj.archive_url)
    if args.get.lower() == "oldest":
        return obj.get(obj.oldest())
    if args.get.lower() == "latest" or args.get.lower() == "newest":
        return obj.get(obj.newest())
    if args.get.lower() == "save":
        return obj.get(obj.save())
    return "Use get as \"--get 'source'\", 'source' can be one of the following: \
\n1) url - get the source code of the url specified using --url/-u.\
\n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\
\n3) oldest - get the source code of the oldest archive for the supplied url.\
\n4) newest - get the source code of the newest archive for the supplied url.\
\n5) save - Create a new archive and get the source code of this new archive for the supplied url."


def args_handler(args):
    if args.version:
        return "waybackpy version {version}".format(version=__version__)

    if not args.url:
        return "waybackpy {version} \nSee 'waybackpy --help' for help using this tool.".format(
            version=__version__
        )

    obj = Url(args.url)
    if args.user_agent:
        obj = Url(args.url, args.user_agent)

    if args.save:
        output = _save(obj)
    elif args.archive_url:
        output = _archive_url(obj)
    elif args.json:
        output = _json(obj)
    elif args.oldest:
        output = _oldest(obj)
    elif args.newest:
        output = _newest(obj)
    elif args.known_urls:
        output = _known_urls(obj, args)
    elif args.total:
        output = _total_archives(obj)
    elif args.near:
        return _near(obj, args)
    elif args.get:
        output = _get(obj, args)
    else:
        output = (
            "You only specified the URL. But you also need to specify the operation."
            "\nSee 'waybackpy --help' for help using this tool."
        )
    return output


def add_requiredArgs(requiredArgs):
    requiredArgs.add_argument(
        "--url", "-u", help="URL on which Wayback machine operations would occur"
    )


def add_userAgentArg(userAgentArg):
    help_text = 'User agent, default user_agent is "waybackpy python package - https://github.com/akamhy/waybackpy"'
    userAgentArg.add_argument("--user_agent", "-ua", help=help_text)


def add_saveArg(saveArg):
    saveArg.add_argument(
        "--save", "-s", action="store_true", help="Save the URL on the Wayback machine"
    )


def add_auArg(auArg):
    auArg.add_argument(
        "--archive_url",
        "-au",
        action="store_true",
        help="Get the latest archive URL, alias for --newest",
    )


def add_jsonArg(jsonArg):
    jsonArg.add_argument(
        "--json",
        "-j",
        action="store_true",
        help="JSON data of the availability API request",
    )


def add_oldestArg(oldestArg):
    oldestArg.add_argument(
        "--oldest",
        "-o",
        action="store_true",
        help="Oldest archive for the specified URL",
    )


def add_newestArg(newestArg):
    newestArg.add_argument(
        "--newest",
        "-n",
        action="store_true",
        help="Newest archive for the specified URL",
    )


def add_totalArg(totalArg):
    totalArg.add_argument(
        "--total",
        "-t",
        action="store_true",
        help="Total number of archives for the specified URL",
    )


def add_getArg(getArg):
    getArg.add_argument(
        "--get",
        "-g",
        help="Prints the source code of the supplied url. Use '--get help' for extended usage",
    )


def add_knownUrlArg(knownUrlArg):
    knownUrlArg.add_argument(
        "--known_urls", "-ku", action="store_true", help="URLs known for the domain."
    )
    help_text = "Use with '--known_urls' to include known URLs for subdomains."
    knownUrlArg.add_argument("--subdomain", "-sub", action="store_true", help=help_text)
    knownUrlArg.add_argument(
        "--file",
        "-f",
        action="store_true",
        help="Save the URLs in a file in the current directory.",
    )


def add_nearArg(nearArg):
    nearArg.add_argument(
        "--near", "-N", action="store_true", help="Archive near specified time"
    )


def add_nearArgs(nearArgs):
    nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer")
    nearArgs.add_argument("--month", "-M", type=int, help="Month in integer")
    nearArgs.add_argument("--day", "-D", type=int, help="Day in integer")
    nearArgs.add_argument("--hour", "-H", type=int, help="Hour in integer")
    nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")


def parse_args(argv):
    parser = argparse.ArgumentParser()
    add_requiredArgs(parser.add_argument_group("URL argument (required)"))
    add_userAgentArg(parser.add_argument_group("User Agent"))
    add_saveArg(parser.add_argument_group("Create new archive/save URL"))
    add_auArg(parser.add_argument_group("Get the latest Archive"))
    add_jsonArg(parser.add_argument_group("Get the JSON data"))
    add_oldestArg(parser.add_argument_group("Oldest archive"))
    add_newestArg(parser.add_argument_group("Newest archive"))
    add_totalArg(parser.add_argument_group("Total number of archives"))
    add_getArg(parser.add_argument_group("Get source code"))
    add_knownUrlArg(
        parser.add_argument_group(
            "URLs known and archived to Wayback Machine for the site."
        )
    )
    add_nearArg(parser.add_argument_group("Archive close to time specified"))
    add_nearArgs(parser.add_argument_group("Arguments that are used only with --near"))
    parser.add_argument(
        "--version", "-v", action="store_true", help="Waybackpy version"
    )
    return parser.parse_args(argv[1:])


def main(argv=None):
    argv = sys.argv if argv is None else argv
    print(args_handler(parse_args(argv)))


if __name__ == "__main__":
    sys.exit(main(sys.argv))
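
The entry point above can also be exercised directly from Python, the same way test_main does; parse_args skips argv[0], so the first element is just a program name (a sketch, not part of the diff):

    from waybackpy import cli

    cli.main(["waybackpy", "--version"])                                 # prints "waybackpy version ..."
    cli.main(["waybackpy", "--url", "https://example.com", "--oldest"])  # prints the oldest archive URL
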
waybackpy/exceptions.py
@@ -1,6 +1,26 @@
# -*- coding: utf-8 -*-
"""
waybackpy.exceptions
~~~~~~~~~~~~~~~~~~~~
This module contains the set of Waybackpy's exceptions.
"""


class WaybackError(Exception):
    """
    Raised when API Service error.
    Raised when Waybackpy can not return what you asked for.
    1) Wayback Machine API Service is unreachable/down.
    2) You passed illegal arguments.
    """


class RedirectSaveError(WaybackError):
    """
    Raised when the original URL is redirected and the
    redirect URL is archived but not the original URL.
    """


class URLError(Exception):
    """
    Raised when malformed URLs are passed as arguments.
    """
51 waybackpy/snapshot.py Normal file
@@ -0,0 +1,51 @@
from datetime import datetime


class CdxSnapshot:
    """
    This class encapsulates the snapshots for greater usability.

    Raw snapshot data looks like:
    org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
    """

    def __init__(self, properties):
        """
        Parameters
        ----------
        self : waybackpy.snapshot.CdxSnapshot
            The instance itself

        properties : dict
            A dict containing all of the 7 CDX snapshot properties.
        """
        self.urlkey = properties["urlkey"]
        self.timestamp = properties["timestamp"]
        self.datetime_timestamp = datetime.strptime(self.timestamp, "%Y%m%d%H%M%S")
        self.original = properties["original"]
        self.mimetype = properties["mimetype"]
        self.statuscode = properties["statuscode"]
        self.digest = properties["digest"]
        self.length = properties["length"]
        self.archive_url = (
            "https://web.archive.org/web/" + self.timestamp + "/" + self.original
        )

    def __str__(self):
        """Returns the CDX snapshot line.

        Output format:
        org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415
        """
        return "{urlkey} {timestamp} {original} {mimetype} {statuscode} {digest} {length}".format(
            urlkey=self.urlkey,
            timestamp=self.timestamp,
            original=self.original,
            mimetype=self.mimetype,
            statuscode=self.statuscode,
            digest=self.digest,
            length=self.length,
        )
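
For reference, a raw CDX line maps onto CdxSnapshot by splitting on spaces, exactly as the snapshots() generator in cdx.py does (a sketch, not part of the diff; the sample line is the one from the docstring above):

    from waybackpy.snapshot import CdxSnapshot

    line = "org,archive)/ 20080126045828 http://github.com text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
    keys = ["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"]
    snapshot = CdxSnapshot(dict(zip(keys, line.split(" "))))  # map the 7 fields to their names
    print(snapshot.archive_url)  # https://web.archive.org/web/20080126045828/http://github.com
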
564 waybackpy/utils.py Normal file
@@ -0,0 +1,564 @@
import re
import time
import requests
from datetime import datetime

from .exceptions import WaybackError, URLError, RedirectSaveError
from .__version__ import __version__

from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

quote = requests.utils.quote
default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"


def _latest_version(package_name, headers):
    """Returns the latest version of package_name.

    Parameters
    ----------
    package_name : str
        The name of the python package

    headers : dict
        Headers that will be used while making get requests

    Return type is str

    Uses the API <https://pypi.org/pypi/> to get the latest version of
    waybackpy, but can be used to get the latest version of any package
    on PyPI.
    """

    request_url = "https://pypi.org/pypi/" + package_name + "/json"
    response = _get_response(request_url, headers=headers)
    data = response.json()
    return data["info"]["version"]


def _unix_timestamp_to_wayback_timestamp(unix_timestamp):
    """Returns the unix timestamp converted to a Wayback Machine timestamp string.

    Parameters
    ----------
    unix_timestamp : str, int or float
        Unix timestamp that needs to be converted to a Wayback Machine
        timestamp (YYYYMMDDhhmmss)

    Converts the input unix_timestamp and returns it as a Wayback Machine
    timestamp string; unix_timestamp may be str, float or int.
    """

    return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
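

# A quick sanity check of the conversion above (doctest-style sketch, not part
# of the library source; 1577836800 is 2020-01-01 00:00:00 UTC):
# >>> _unix_timestamp_to_wayback_timestamp(1577836800)
# '20200101000000'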


def _add_payload(instance, payload):
    """Adds payload from instance that can be used to make GET requests.

    Parameters
    ----------
    instance : waybackpy.cdx.Cdx
        instance of the Cdx class

    payload : dict
        A dict onto which we need to add keys and values based on instance.

    instance is an object of the Cdx class and it contains the data required
    to fill the payload dictionary.
    """

    if instance.start_timestamp:
        payload["from"] = instance.start_timestamp

    if instance.end_timestamp:
        payload["to"] = instance.end_timestamp

    if instance.gzip is not True:
        payload["gzip"] = "false"

    if instance.match_type:
        payload["matchType"] = instance.match_type

    if instance.filters and len(instance.filters) > 0:
        for i, f in enumerate(instance.filters):
            payload["filter" + str(i)] = f

    if instance.collapses and len(instance.collapses) > 0:
        for i, f in enumerate(instance.collapses):
            payload["collapse" + str(i)] = f

    # No need to return anything; the payload dict is mutated in place.
    payload["url"] = instance.url


def _timestamp_manager(timestamp, data):
    """Returns the timestamp.

    Parameters
    ----------
    timestamp : datetime.datetime
        datetime object

    data : dict
        A python dictionary, the loaded JSON of the availability API.

    Return type:
        datetime.datetime

    If timestamp is not None, return timestamp itself.
    If timestamp is None, return the value from the last fetched API data.
    If there is no timestamp and archived_snapshots can not be read from
    the data, return datetime.max.
    """

    if timestamp:
        return timestamp

    if not data["archived_snapshots"]:
        return datetime.max

    return datetime.strptime(
        data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
    )
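

# The three branches above, in doctest style (a sketch, not part of the library
# source; the JSON shape matches the availability API sample in tests/test_utils.py):
# >>> _timestamp_manager(None, {"archived_snapshots": {}}) == datetime.max
# True
# >>> _timestamp_manager(None, {"archived_snapshots": {"closest": {"timestamp": "20210109155628"}}})
# datetime.datetime(2021, 1, 9, 15, 56, 28)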


def _check_match_type(match_type, url):
    """Checks the validity of the match_type parameter of the CDX GET requests.

    Parameters
    ----------
    match_type : str
        One of "exact", "prefix", "host" or "domain".
        See https://github.com/akamhy/waybackpy/wiki/Python-package-docs#url-match-scope

    url : str
        The URL used to create the waybackpy Url object.

    Raises WaybackError if match_type is not valid.
    """

    if not match_type:
        return

    if "*" in url:
        raise WaybackError("Can not use wildcard with match_type argument")

    legal_match_type = ["exact", "prefix", "host", "domain"]

    if match_type not in legal_match_type:
        exc_message = "{match_type} is not an allowed match type.\nUse one from 'exact', 'prefix', 'host' or 'domain'".format(
            match_type=match_type
        )
        raise WaybackError(exc_message)


def _check_collapses(collapses):
    """Checks the validity of the collapse parameter of the CDX GET request.

    collapses is a list of one or more 'field' or 'field:N' strings, where
    field is one of urlkey, timestamp, original, mimetype, statuscode,
    digest and length, and N is the number of leading characters of the
    field to test.

    Parameters
    ----------
    collapses : list

    Raises WaybackError if the collapses are not valid.
    """

    if not isinstance(collapses, list):
        raise WaybackError("collapses must be a list.")

    if len(collapses) == 0:
        return

    for collapse in collapses:
        try:
            match = re.search(
                r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)(:?[0-9]{1,99})?",
                collapse,
            )
            field = match.group(1)

            N = None
            if 2 == len(match.groups()):
                N = match.group(2)

            if N:
                if not (field + N == collapse):
                    raise Exception
            else:
                if not (field == collapse):
                    raise Exception

        except Exception:
            exc_message = "collapse argument '{collapse}' is not following the cdx collapse syntax.".format(
                collapse=collapse
            )
            raise WaybackError(exc_message)
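

# What passes and what raises, mirroring tests/test_utils.py (a doctest-style
# sketch, not part of the library source):
# >>> _check_collapses(["urlkey"])                  # valid: bare field
# >>> _check_collapses(["timestamp:10"])            # valid: field:N form
# >>> _check_collapses("urlkey")                    # not a list -> WaybackError
# >>> _check_collapses(["also illegal collapse"])   # bad syntax -> WaybackError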


def _check_filters(filters):
    """Checks the validity of the filter parameter of the CDX GET request.

    Any number of filter params of the following form may be specified:
    filters=["[!]field:regex"]

    Parameters
    ----------
    filters : list

    Raises WaybackError if the filters are not valid.
    """

    if not isinstance(filters, list):
        raise WaybackError("filters must be a list.")

    # [!]field:regex
    for _filter in filters:
        try:

            match = re.search(
                r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):(.*)",
                _filter,
            )

            key = match.group(1)
            val = match.group(2)

        except Exception:

            exc_message = (
                "Filter '{_filter}' is not following the cdx filter syntax.".format(
                    _filter=_filter
                )
            )
            raise WaybackError(exc_message)


def _cleaned_url(url):
    """Sanitize the URL.

    Strip leading/trailing whitespace and percent-encode inner spaces.
    """
    return str(url).strip().replace(" ", "%20")
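

# The whole sanitization is trimming plus space-encoding, as exercised by
# test_cleaned_url (doctest-style sketch, not part of the library source):
# >>> _cleaned_url(" https://en.wikipedia.org/wiki/Network security ")
# 'https://en.wikipedia.org/wiki/Network%20security'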


def _url_check(url):
    """
    Check for common URL problems.
    What we are checking:
    1) The URL must contain a '.'; URLs without one are rejected.

    If you know of any other common problems, please create a PR on the GitHub repo.
    """

    if "." not in url:
        exc_message = "'{url}' is not a valid URL.".format(url=url)
        raise URLError(exc_message)


def _full_url(endpoint, params):
    """API endpoint + GET parameters = full_url

    Parameters
    ----------
    endpoint : str
        The API endpoint

    params : dict
        Dictionary that has name-value pairs.

    Return type is str
    """

    if not params:
        return endpoint

    full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
    for key, val in params.items():
        key = "filter" if key.startswith("filter") else key
        key = "collapse" if key.startswith("collapse") else key
        amp = "" if full_url.endswith("?") else "&"
        full_url = full_url + amp + "{key}={val}".format(key=key, val=quote(str(val)))
    return full_url
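

# How parameters are serialized, mirroring test_full_url (a doctest-style
# sketch, not part of the library source). Note that numbered keys such as
# filter0/filter1 and collapse0/collapse1 are rewritten back to repeated
# "filter" and "collapse" parameters by the loop above:
# >>> _full_url("https://web.archive.org/cdx/search/cdx", {"a": "1", "c": "foo bar"})
# 'https://web.archive.org/cdx/search/cdx?a=1&c=foo%20bar'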


def _get_total_pages(url, user_agent):
    """
    If showNumPages is passed to the cdx API, it returns
    the 'number of archive pages', and each page has many archives.

    This func returns the number of pages of archives (type int).
    """
    total_pages_url = (
        "https://web.archive.org/cdx/search/cdx?url={url}&showNumPages=true".format(
            url=url
        )
    )
    headers = {"User-Agent": user_agent}
    return int((_get_response(total_pages_url, headers=headers).text).strip())


def _archive_url_parser(
    header, url, latest_version=__version__, instance=None, response=None
):
    """Returns the archive after parsing it from the response header.

    Parameters
    ----------
    header : str
        The response header of WayBack Machine's Save API

    url : str
        The input url, the one used to create the Url object.

    latest_version : str
        The latest version of waybackpy (default is __version__)

    instance : waybackpy.wrapper.Url
        Instance of the Url class


    The Wayback Machine's save API doesn't
    return a JSON response; we are required
    to read the header of the API response
    and find the archive URL.

    This method has some regular expressions
    that are used to search for the archive url
    in the response header of the Save API.

    Two cases are possible:
    1) Either we find the archive url in
    the header.

    2) Or we don't find the archive url in
    the API header.

    If we found the archive URL we return it.

    Return format:
    web.archive.org/web/<TIMESTAMP>/<URL>

    And if we couldn't find it, we raise
    WaybackError with an error message.
    """

    if "save redirected" in header and instance:
        time.sleep(60)  # make up for archiving time

        now = datetime.utcnow().timetuple()
        timestamp = _wayback_timestamp(
            year=now.tm_year,
            month=now.tm_mon,
            day=now.tm_mday,
            hour=now.tm_hour,
            minute=now.tm_min,
        )

        return_str = "web.archive.org/web/{timestamp}/{url}".format(
            timestamp=timestamp, url=url
        )
        url = "https://" + return_str

        headers = {"User-Agent": instance.user_agent}

        res = _get_response(url, headers=headers)

        if res.status_code < 400:
            return "web.archive.org/web/{timestamp}/{url}".format(
                timestamp=timestamp, url=url
            )

    # Regex1
    m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
    if m:
        return "web.archive.org" + m.group(1)

    # Regex2
    m = re.search(
        r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
    )
    if m:
        return m.group(1)

    # Regex3
    m = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
    if m:
        return m.group(1)

    if response:
        if response.url:
            if "web.archive.org/web" in response.url:
                m = re.search(
                    r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$",
                    str(response.url).strip(),
                )
                if m:
                    return m.group(0)

    if instance:
        newest_archive = None
        try:
            newest_archive = instance.newest()
        except WaybackError:
            pass  # We don't care as this is a save request

        if newest_archive:
            minutes_old = (
                datetime.utcnow() - newest_archive.timestamp
            ).total_seconds() / 60.0

            if minutes_old <= 30:
                archive_url = newest_archive.archive_url
                m = re.search(r"web\.archive\.org/web/[0-9]{14}/.*", archive_url)
                if m:
                    instance.cached_save = True
                    return m.group(0)

    if __version__ == latest_version:
        exc_message = (
            "No archive URL found in the API response. "
            "If '{url}' can be accessed via your web browser then either "
            "Wayback Machine is malfunctioning or it refused to archive your URL."
            "\nHeader:\n{header}".format(url=url, header=header)
        )

        if "save redirected" == header.strip():
            raise RedirectSaveError(
                "URL cannot be archived by wayback machine as it is a redirect.\nHeader:\n{header}".format(
                    header=header
                )
            )
    else:
        exc_message = (
            "No archive URL found in the API response. "
            "If '{url}' can be accessed via your web browser then either "
            "this version of waybackpy ({version}) is out of date or WayBack "
            "Machine is malfunctioning. Visit 'https://github.com/akamhy/waybackpy' "
            "for the latest version of waybackpy.\nHeader:\n{header}".format(
                url=url, version=__version__, header=header
            )
        )

    raise WaybackError(exc_message)


def _wayback_timestamp(**kwargs):
    """Returns a valid waybackpy timestamp.

    The standard archive URL format is
    https://web.archive.org/web/20191214041711/https://www.youtube.com

    If we break it down into three parts:
    1 ) The start (https://web.archive.org/web/)
    2 ) timestamp (20191214041711)
    3 ) https://www.youtube.com, the original URL


    The near method of the Url class in wrapper.py takes year, month, day, hour
    and minute as arguments; their type is int.

    This method takes those integers, converts them to a
    wayback machine timestamp and returns it.


    zfill(2) pads single-digit days, months, hours etc. with a leading zero.

    Return type is string.
    """

    return "".join(
        str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
    )
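

# Worked example, matching test_wayback_timestamp (a doctest-style sketch, not
# part of the library source); zfill(2) pads the single-digit fields:
# >>> _wayback_timestamp(year=2020, month=1, day=2, hour=3, minute=4)
# '202001020304'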


def _get_response(
    endpoint,
    params=None,
    headers=None,
    return_full_url=False,
    retries=5,
    backoff_factor=0.5,
    no_raise_on_redirects=False,
):
    """Makes GET requests.

    Parameters
    ----------
    endpoint : str
        The API endpoint.

    params : dict
        The GET request parameters. (default is None)

    headers : dict
        Headers for the GET request. (default is None)

    return_full_url : bool
        Determines whether the full URL used for the request is returned
        along with the response. (default is False)

    retries : int
        Maximum number of retries for the GET request. (default is 5)

    backoff_factor : float
        The factor by which we determine the next retry time after a wait.
        https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html
        (default is 0.5)

    no_raise_on_redirects : bool
        If the request is redirected more than 30 times (the default limit
        of requests), return None instead of raising an exception.
        (default is False)


    To handle WaybackError:
        from waybackpy.exceptions import WaybackError

        try:
            ...
        except WaybackError as e:
            # handle it
    """

    # From https://stackoverflow.com/a/35504626
    # By https://stackoverflow.com/users/401467/datashaman

    s = requests.Session()

    retries = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[500, 502, 503, 504],
    )

    s.mount("https://", HTTPAdapter(max_retries=retries))

    # The URL with parameters required for the GET request
    url = _full_url(endpoint, params)

    try:

        if not return_full_url:
            return s.get(url, headers=headers)

        return (url, s.get(url, headers=headers))

    except Exception as e:

        reason = str(e)

        if no_raise_on_redirects:
            if "Exceeded 30 redirects" in reason:
                return

        exc_message = "Error while retrieving {url}.\n{reason}".format(
            url=url, reason=reason
        )

        exc = WaybackError(exc_message)
        exc.__cause__ = e
        raise exc
@ -1,153 +1,500 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
from datetime import datetime
|
||||
from waybackpy.exceptions import WaybackError
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
if sys.version_info >= (3, 0): # If the python ver >= 3
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import HTTPError, URLError
|
||||
else: # For python2.x
|
||||
from urllib2 import Request, urlopen, HTTPError, URLError
|
||||
|
||||
default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
||||
|
||||
class Url():
|
||||
"""waybackpy Url object"""
|
||||
from .exceptions import WaybackError
|
||||
from .cdx import Cdx
|
||||
from .utils import (
|
||||
_archive_url_parser,
|
||||
_wayback_timestamp,
|
||||
_get_response,
|
||||
default_user_agent,
|
||||
_url_check,
|
||||
_cleaned_url,
|
||||
_timestamp_manager,
|
||||
_unix_timestamp_to_wayback_timestamp,
|
||||
_latest_version,
|
||||
)
|
||||
|
||||
|
||||
def __init__(self, url, user_agent=default_UA):
|
||||
class Url:
|
||||
"""
|
||||
|
||||
Attributes
|
||||
----------
|
||||
url : str
|
||||
The input URL, wayback machine API operations are performed
|
||||
on this URL after sanatizing it.
|
||||
|
||||
user_agent : str
|
||||
The user_agent used while making the GET requests to the
|
||||
Wayback machine APIs
|
||||
|
||||
_archive_url : str
|
||||
Caches the last fetched archive.
|
||||
|
||||
timestamp : datetime.datetime
|
||||
timestamp of the archive URL as datetime object for
|
||||
greater usability
|
||||
|
||||
_JSON : dict
|
||||
Caches the last fetched availability API data
|
||||
|
||||
latest_version : str
|
||||
The latest version of waybackpy on PyPi
|
||||
|
||||
cached_save : bool
|
||||
Flag to check if WayBack machine returned a cached
|
||||
archive instead of creating a new archive. WayBack
|
||||
machine allows only one 1 archive for an URL in
|
||||
30 minutes. If the archive returned by WayBack machine
|
||||
is older than 3 minutes than this flag is set to True
|
||||
|
||||
Methods turned properties
|
||||
----------
|
||||
JSON : dict
|
||||
JSON response of availability API as dictionary / loaded JSON
|
||||
|
||||
archive_url : str
|
||||
Return the archive url, returns str
|
||||
|
||||
_timestamp : datetime.datetime
|
||||
Sets the value of self.timestamp if still not set
|
||||
|
||||
Methods
|
||||
-------
|
||||
save()
|
||||
Archives the URL on WayBack machine
|
||||
|
||||
get(url="", user_agent="", encoding="")
|
||||
Gets the source of archive url, can also be used to get source
|
||||
of any URL if passed into it.
|
||||
|
||||
near(year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None)
|
||||
Wayback Machine can have many archives for a URL/webpage, sometimes we want
|
||||
archive close to a specific time.
|
||||
This method takes year, month, day, hour, minute and unix_timestamp as input.
|
||||
|
||||
oldest(year=1994)
|
||||
The oldest archive of an URL.
|
||||
|
||||
newest()
|
||||
The newest archive of an URL
|
||||
|
||||
total_archives(start_timestamp=None, end_timestamp=None)
|
||||
total number of archives of an URL, the timeframe can be confined by
|
||||
start_timestamp and end_timestamp
|
||||
|
||||
known_urls(subdomain=False, host=False, start_timestamp=None, end_timestamp=None, match_type="prefix")
|
||||
Known URLs for an URL, subdomain, URL as prefix etc.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, url, user_agent=default_user_agent):
|
||||
self.url = url
|
||||
self.user_agent = user_agent
|
||||
self.url_check() # checks url validity on init.
|
||||
self.user_agent = str(user_agent)
|
||||
_url_check(self.url)
|
||||
self._archive_url = None
|
||||
self.timestamp = None
|
||||
self._JSON = None
|
||||
self.latest_version = None
|
||||
self.cached_save = False
|
||||
|
||||
def __repr__(self):
|
||||
"""Representation of the object."""
|
||||
return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
|
||||
return "waybackpy.Url(url={url}, user_agent={user_agent})".format(
|
||||
url=self.url, user_agent=self.user_agent
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
"""String representation of the object."""
|
||||
return "%s" % self.clean_url()
|
||||
if not self._archive_url:
|
||||
self._archive_url = self.archive_url
|
||||
|
||||
return "{archive_url}".format(archive_url=self._archive_url)
|
||||
|
||||
def __len__(self):
|
||||
"""Length of the URL."""
|
||||
return len(self.clean_url())
|
||||
"""Number of days between today and the date of archive based on the timestamp
|
||||
|
||||
def url_check(self):
|
||||
"""Check for common URL problems."""
|
||||
if "." not in self.url:
|
||||
raise URLError("'%s' is not a vaild url." % self.url)
|
||||
return True
|
||||
len() of waybackpy.wrapper.Url should return
|
||||
the number of days between today and the
|
||||
archive timestamp.
|
||||
|
||||
def clean_url(self):
|
||||
"""Fix the URL, if possible."""
|
||||
return str(self.url).strip().replace(" ","_")
|
||||
|

    @property
    def JSON(self):
        """Return the JSON response of the availability API as a dictionary.

        return type : dict
        """
        # If the user used near() or any method that depends on it, we are
        # certain that a loaded dictionary is cached in self._JSON;
        # return the loaded JSON data.
        if self._JSON:
            return self._JSON

        # No cached data found: fetch the data, cache it and return it.
        endpoint = "https://archive.org/wayback/available"
        headers = {"User-Agent": self.user_agent}
        payload = {"url": "{url}".format(url=_cleaned_url(self.url))}
        response = _get_response(endpoint, params=payload, headers=headers)
        self._JSON = response.json()
        return self._JSON
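
    # For reference, the availability API responds with JSON shaped roughly
    # like the following (a trimmed sketch; the values are placeholders):
    #
    #     {
    #         "url": "https://example.com",
    #         "archived_snapshots": {
    #             "closest": {
    #                 "available": true,
    #                 "url": "http://web.archive.org/web/20210101000000/https://example.com/",
    #                 "timestamp": "20210101000000",
    #                 "status": "200"
    #             }
    #         }
    #     }
    #
    # An unarchived URL yields an empty "archived_snapshots" object, which is
    # what the archive_url property below checks for.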

    @property
    def archive_url(self):
        """Return the archive URL.

        return type : str
        """

        if self._archive_url:
            return self._archive_url

        data = self.JSON

        if not data["archived_snapshots"]:
            archive_url = None
        else:
            archive_url = data["archived_snapshots"]["closest"]["url"]
            # The Wayback Machine sometimes returns an http URL even though
            # https is supported; normalize the scheme.
            archive_url = archive_url.replace(
                "http://web.archive.org/web/", "https://web.archive.org/web/", 1
            )
        self._archive_url = archive_url
        return archive_url

    @property
    def _timestamp(self):
        """Return the timestamp of the archive; used to lazily
        populate self.timestamp when it is not yet set.

        Return type : datetime.datetime
        """
        return _timestamp_manager(self.timestamp, self.JSON)

    def save(self):
        """Save/archive the URL on the Wayback Machine.

        To save a webpage on the WayBack Machine we send a GET request
        to https://web.archive.org/save/ and read the archive URL from
        the headers of the API response.

        _get_response() takes care of the GET request.

        _archive_url_parser() parses the archive URL from the headers.

        return type : waybackpy.wrapper.Url
        """
        request_url = "https://web.archive.org/save/" + _cleaned_url(self.url)
        headers = {"User-Agent": self.user_agent}

        response = _get_response(
            request_url,
            params=None,
            headers=headers,
            backoff_factor=2,
            no_raise_on_redirects=True,
        )

        if not self.latest_version:
            self.latest_version = _latest_version("waybackpy", headers=headers)
        if response:
            res_headers = response.headers
        else:
            res_headers = "save redirected"
        self._archive_url = "https://" + _archive_url_parser(
            res_headers,
            self.url,
            latest_version=self.latest_version,
            instance=self,
            response=response,
        )

        # Extract the 14-digit timestamp from the archive URL; if the
        # snapshot is more than three minutes old, the save API most
        # likely served a cached capture instead of a fresh one.
        m = re.search(
            r"https?://web.archive.org/web/([0-9]{14})/http", self._archive_url
        )
        str_ts = m.group(1)
        ts = datetime.strptime(str_ts, "%Y%m%d%H%M%S")
        now = datetime.utcnow()
        total_seconds = int((now - ts).total_seconds())

        if total_seconds > 60 * 3:
            self.cached_save = True

        self.timestamp = ts

        return self
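
    # A plausible stand-alone sketch of archive-URL parsing (assumption: the
    # real _archive_url_parser lives in waybackpy's utils module and handles
    # more edge cases; this mirrors the memento-header regex that an older
    # in-class save() used):
    #
    #     import re
    #
    #     def parse_archive_path(header_text):
    #         """Return the /web/<14-digit-timestamp>/... path, or None."""
    #         match = re.search(
    #             r"rel=\"memento.*?web\.archive\.org(/web/[0-9]{14}/.*?)>",
    #             str(header_text),
    #         )
    #         return match.group(1) if match else None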

    def get(self, url="", user_agent="", encoding=""):
        """GET the source of the archive or of any other URL.

        url : str, waybackpy.wrapper.Url
            If supplied, the method returns the source code of
            this URL instead of the last fetched archive.

        user_agent : str
            The user agent for the GET request to the API.

        encoding : str
            Set this if you use an encoding that cannot be
            detected from response.encoding.

        Returns the source code of the last fetched archive URL if no
        URL is passed to this method, else the source code of the URL
        that was passed.

        If encoding is not supplied, it is auto-detected from the
        response itself by the requests package.
        """

        if not url and self._archive_url:
            url = self._archive_url

        elif not url and not self._archive_url:
            url = _cleaned_url(self.url)

        if not user_agent:
            user_agent = self.user_agent

        headers = {"User-Agent": str(user_agent)}
        response = _get_response(str(url), params=None, headers=headers)

        if not encoding:
            try:
                encoding = response.encoding
            except AttributeError:
                encoding = "UTF-8"

        return response.content.decode(encoding.replace("text/html", "UTF-8", 1))
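
    # Illustrative sketch of get() (the URL and user agent are placeholders):
    #
    #     url = Url("https://example.com", "my-user-agent/1.0")
    #     archived_html = url.newest().get()          # source of newest archive
    #     live_html = url.get("https://example.com")  # source of the live page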

    def near(
        self,
        year=None,
        month=None,
        day=None,
        hour=None,
        minute=None,
        unix_timestamp=None,
    ):
        """Return the archive closest to the supplied time.

        Parameters
        ----------

        year : int
            Archive close to this year.

        month : int
            Archive close to this month.

        day : int
            Archive close to this day.

        hour : int
            Archive close to this hour.

        minute : int
            Archive close to this minute.

        unix_timestamp : str, float or int
            Archive close to this unix timestamp.

        The Wayback Machine can have many archives of a webpage;
        sometimes we want the archive closest to a specific time.

        This method takes year, month, day, hour and minute as input.
        The input type must be integer. Any non-supplied parameters
        default to the current time.

        We convert the input to a Wayback Machine timestamp using
        _wayback_timestamp(), which returns a string.

        We use the Wayback Machine's availability API
        (https://archive.org/wayback/available)
        to get the archive closest to the timestamp.

        We set self._archive_url to the archive found, if any.
        If an archive is found, we set self.timestamp to its timestamp.
        We set self._JSON to the response of the availability API.

        And finally return self.
        """

        if unix_timestamp:
            timestamp = _unix_timestamp_to_wayback_timestamp(unix_timestamp)
        else:
            now = datetime.utcnow().timetuple()
            timestamp = _wayback_timestamp(
                year=year if year else now.tm_year,
                month=month if month else now.tm_mon,
                day=day if day else now.tm_mday,
                hour=hour if hour else now.tm_hour,
                minute=minute if minute else now.tm_min,
            )

        endpoint = "https://archive.org/wayback/available"
        headers = {"User-Agent": self.user_agent}
        payload = {
            "url": "{url}".format(url=_cleaned_url(self.url)),
            "timestamp": timestamp,
        }
        response = _get_response(endpoint, params=payload, headers=headers)
        data = response.json()

        if not data["archived_snapshots"]:
            raise WaybackError(
                "Can not find archive for '{url}', try later or use "
                "wayback.Url(url, user_agent).save() to create a new archive."
                "\nAPI response:\n{text}".format(
                    url=_cleaned_url(self.url), text=response.text
                )
            )
        archive_url = data["archived_snapshots"]["closest"]["url"]
        archive_url = archive_url.replace(
            "http://web.archive.org/web/", "https://web.archive.org/web/", 1
        )

        self._archive_url = archive_url
        self.timestamp = datetime.strptime(
            data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
        )
        self._JSON = data

        return self
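
    # A plausible sketch of the timestamp helper (assumption: the real
    # _wayback_timestamp lives in waybackpy's utils module; this mirrors the
    # zfill-based builder an older in-class wayback_timestamp() used):
    #
    #     def _wayback_timestamp(**kwargs):
    #         """Build a YYYYMMDDhhmm Wayback Machine timestamp string."""
    #         return "".join(
    #             str(kwargs[key]).zfill(2)
    #             for key in ("year", "month", "day", "hour", "minute")
    #         )
    #
    #     # _wayback_timestamp(year=2021, month=1, day=2, hour=3, minute=4)
    #     # -> "202101020304"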

    def oldest(self, year=1994):
        """Return the earliest/oldest Wayback Machine archive for the webpage.

        The Wayback Machine started archiving the internet around 1997,
        so there can be no archive older than that; we use 1994 as the
        default year from which to look for the oldest archive.

        We simply pass the year to near() and return its result.
        """

        return self.near(year=year)

    def newest(self):
        """Return the newest Wayback Machine archive available.

        We return the return value of self.near(), which defaults to
        the current UTC time.

        Due to Wayback Machine database lag, this may not always be the
        most recent archive.

        return type : waybackpy.wrapper.Url
        """

        return self.near()

    def total_archives(self, start_timestamp=None, end_timestamp=None):
        """Return the total number of archives for an URL.

        Parameters
        ----------
        start_timestamp : str
            1 to 14 digit string of numbers; you are not required to
            pass a full 14 digit timestamp.

        end_timestamp : str
            1 to 14 digit string of numbers; you are not required to
            pass a full 14 digit timestamp.

        return type : int

        A webpage can have multiple archives on the Wayback Machine.
        If someone wants to count the total number of archives of a
        webpage on the Wayback Machine, they can use this method.
        """

        cdx = Cdx(
            _cleaned_url(self.url),
            user_agent=self.user_agent,
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
        )

        # cdx.snapshots() is a generator, not a list, so count by iterating.
        i = 0
        for _ in cdx.snapshots():
            i = i + 1
        return i
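
    # Illustrative sketch of total_archives() (the timestamps are
    # placeholders; shorter prefixes such as "2020" are also accepted):
    #
    #     url = Url("https://example.com", "my-user-agent/1.0")
    #     count = url.total_archives(
    #         start_timestamp="20200101", end_timestamp="20210101"
    #     )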

    def known_urls(
        self,
        subdomain=False,
        host=False,
        start_timestamp=None,
        end_timestamp=None,
        match_type="prefix",
    ):
        """Yield known URLs from the CDX API.

        Parameters
        ----------

        subdomain : bool
            If True, fetch subdomain URLs along with the host URLs.

        host : bool
            Only fetch host URLs.

        start_timestamp : str
            1 to 14 digit string of numbers; you are not required to
            pass a full 14 digit timestamp.

        end_timestamp : str
            1 to 14 digit string of numbers; you are not required to
            pass a full 14 digit timestamp.

        match_type : str
            One of exact, prefix, host and domain.

        Yield type : str (each snapshot's original URL)

        Yields the URLs known to exist for the given input,
        defaulting to the input URL as a prefix.

        Based on:
        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
        By Mohammed Diaa (https://github.com/mhmdiaa)
        """

        if subdomain:
            match_type = "domain"
        if host:
            match_type = "host"

        cdx = Cdx(
            _cleaned_url(self.url),
            user_agent=self.user_agent,
            start_timestamp=start_timestamp,
            end_timestamp=end_timestamp,
            match_type=match_type,
            collapses=["urlkey"],
        )

        for snapshot in cdx.snapshots():
            yield snapshot.original
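
# A minimal end-to-end sketch (assumptions: this module defines Url as above
# with its helpers importable, and network access is available; the URL and
# user agent are placeholders):
if __name__ == "__main__":
    agent = "Mozilla/5.0 (compatible; waybackpy-demo/1.0)"
    page = Url("https://example.com", agent)
    print(page.oldest())                      # oldest archive URL
    print(len(page))                          # days since that archive
    for known_url in page.known_urls(host=True):
        print(known_url)                      # known URLs on the host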