Compare commits
20 Commits
Author | SHA1 | Date | |
---|---|---|---|
57a32669b5 | |||
fe017cbcc8 | |||
5edb03d24b | |||
c5de2232ba | |||
ca9186c301 | |||
8a4b631c13 | |||
ec9ce92f48 | |||
e95d35c37f | |||
36d662b961 | |||
2835f8877e | |||
18cbd2fd30 | |||
a2812fb56f | |||
77effcf649 | |||
7272ef45a0 | |||
56116551ac | |||
4dcda94cb0 | |||
09f59b0182 | |||
ed24184b99 | |||
56bef064b1 | |||
44bb2cf5e4 |
20
README.md
20
README.md
@ -1,19 +1,19 @@
|
||||
# waybackpy
|
||||
|
||||

|
||||
[](https://travis-ci.org/akamhy/waybackpy)
|
||||
[](https://pypistats.org/packages/waybackpy)
|
||||
[](https://codecov.io/gh/akamhy/waybackpy)
|
||||
[](https://pepy.tech/project/waybackpy/month)
|
||||
[](https://github.com/akamhy/waybackpy/releases)
|
||||
[](https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade)
|
||||
[](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
|
||||
[](https://codeclimate.com/github/akamhy/waybackpy/maintainability)
|
||||
[](https://www.codefactor.io/repository/github/akamhy/waybackpy)
|
||||
[](https://www.python.org/)
|
||||

|
||||
[](https://pypi.org/project/waybackpy/)
|
||||

|
||||
[](https://github.com/akamhy/waybackpy/graphs/commit-activity)
|
||||
[](https://codecov.io/gh/akamhy/waybackpy)
|
||||

|
||||

|
||||

|
||||
[](https://github.com/akamhy/waybackpy/blob/master/LICENSE)
|
||||
|
||||
|
||||

|
||||
@ -28,14 +28,14 @@ Table of contents
|
||||
* [Installation](#installation)
|
||||
|
||||
* [Usage](#usage)
|
||||
* [As a python package](#as-a-python-package)
|
||||
* [As a Python package](#as-a-python-package)
|
||||
* [Saving an url using save()](#capturing-aka-saving-an-url-using-save)
|
||||
* [Receiving the oldest archive for an URL Using oldest()](#receiving-the-oldest-archive-for-an-url-using-oldest)
|
||||
* [Receiving the most recent/newest archive for an URL using newest()](#receiving-the-newest-archive-for-an-url-using-newest)
|
||||
* [Receiving archive close to a specified year, month, day, hour, and minute using near()](#receiving-archive-close-to-a-specified-year-month-day-hour-and-minute-using-near)
|
||||
* [Get the content of webpage using get()](#get-the-content-of-webpage-using-get)
|
||||
* [Count total archives for an URL using total_archives()](#count-total-archives-for-an-url-using-total_archives)
|
||||
* [With CLI](#with-the-cli)
|
||||
* [With the Command-line interface](#with-the-command-line-interface)
|
||||
* [Save](#save)
|
||||
* [Oldest archive](#oldest-archive)
|
||||
* [Newest archive](#newest-archive)
|
||||
@ -63,7 +63,7 @@ pip install git+https://github.com/akamhy/waybackpy.git
|
||||
|
||||
## Usage
|
||||
|
||||
### As a python package
|
||||
### As a Python package
|
||||
|
||||
#### Capturing aka Saving an url using save()
|
||||
```python
|
||||
@ -230,7 +230,7 @@ print(archive_count) # total_archives() returns an int
|
||||
```
|
||||
<sub>Try this out in your browser @ <https://repl.it/@akamhy/WaybackPyTotalArchivesExample></sub>
|
||||
|
||||
### With the CLI
|
||||
### With the Command-line interface
|
||||
|
||||
#### Save
|
||||
```bash
|
||||
|
37
index.rst
37
index.rst
@ -1,9 +1,9 @@
|
||||
waybackpy
|
||||
=========
|
||||
|
||||
|Build Status| |Downloads| |Release| |Codacy Badge| |License: MIT|
|
||||
|Maintainability| |CodeFactor| |made-with-python| |pypi| |PyPI - Python
|
||||
Version| |Maintenance| |codecov| |image12| |contributions welcome|
|
||||
|contributions welcome| |Build Status| |codecov| |Downloads| |Release|
|
||||
|Codacy Badge| |Maintainability| |CodeFactor| |made-with-python| |pypi|
|
||||
|PyPI - Python Version| |Maintenance| |Repo size| |License: MIT|
|
||||
|
||||
|Internet Archive| |Wayback Machine|
|
||||
|
||||
@ -22,7 +22,7 @@ Table of contents
|
||||
- `Installation <#installation>`__
|
||||
|
||||
- `Usage <#usage>`__
|
||||
- `As a python package <#as-a-python-package>`__
|
||||
- `As a Python package <#as-a-python-package>`__
|
||||
|
||||
- `Saving an url using
|
||||
save() <#capturing-aka-saving-an-url-using-save>`__
|
||||
@ -38,7 +38,7 @@ Table of contents
|
||||
- `Count total archives for an URL using
|
||||
total\_archives() <#count-total-archives-for-an-url-using-total_archives>`__
|
||||
|
||||
- `With CLI <#with-the-cli>`__
|
||||
- `With the Command-line interface <#with-the-command-line-interface>`__
|
||||
|
||||
- `Save <#save>`__
|
||||
- `Oldest archive <#oldest-archive>`__
|
||||
@ -75,7 +75,7 @@ or direct from this repository using git.
|
||||
Usage
|
||||
-----
|
||||
|
||||
As a python package
|
||||
As a Python package
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Capturing aka Saving an url using save()
|
||||
@ -269,8 +269,8 @@ Count total archives for an URL using total\_archives()
|
||||
Try this out in your browser @
|
||||
https://repl.it/@akamhy/WaybackPyTotalArchivesExample\
|
||||
|
||||
With the CLI
|
||||
~~~~~~~~~~~~
|
||||
With the Command-line interface
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Save
|
||||
^^^^
|
||||
@ -348,8 +348,8 @@ Tests
|
||||
Dependency
|
||||
----------
|
||||
|
||||
- None, just python standard libraries (re, json, urllib, argparse and datetime).
|
||||
Both python 2 and 3 are supported :)
|
||||
- None, just Python standard libraries (re, json, urllib, argparse and
|
||||
datetime). Both Python 2 and 3 are supported :)
|
||||
|
||||
License
|
||||
-------
|
||||
@ -357,16 +357,17 @@ License
|
||||
`MIT
|
||||
License <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
|
||||
|
||||
.. |contributions welcome| image:: https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square
|
||||
.. |Build Status| image:: https://img.shields.io/travis/akamhy/waybackpy.svg?label=Travis%20CI&logo=travis&style=flat-square
|
||||
:target: https://travis-ci.org/akamhy/waybackpy
|
||||
.. |Downloads| image:: https://img.shields.io/pypi/dm/waybackpy.svg
|
||||
:target: https://pypistats.org/packages/waybackpy
|
||||
.. |codecov| image:: https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg
|
||||
:target: https://codecov.io/gh/akamhy/waybackpy
|
||||
.. |Downloads| image:: https://pepy.tech/badge/waybackpy/month
|
||||
:target: https://pepy.tech/project/waybackpy/month
|
||||
.. |Release| image:: https://img.shields.io/github/v/release/akamhy/waybackpy.svg
|
||||
:target: https://github.com/akamhy/waybackpy/releases
|
||||
.. |Codacy Badge| image:: https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65
|
||||
:target: https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade
|
||||
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
|
||||
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
|
||||
.. |Maintainability| image:: https://api.codeclimate.com/v1/badges/942f13d8177a56c1c906/maintainability
|
||||
:target: https://codeclimate.com/github/akamhy/waybackpy/maintainability
|
||||
.. |CodeFactor| image:: https://www.codefactor.io/repository/github/akamhy/waybackpy/badge
|
||||
@ -374,12 +375,12 @@ License <https://github.com/akamhy/waybackpy/blob/master/LICENSE>`__
|
||||
.. |made-with-python| image:: https://img.shields.io/badge/Made%20with-Python-1f425f.svg
|
||||
:target: https://www.python.org/
|
||||
.. |pypi| image:: https://img.shields.io/pypi/v/waybackpy.svg
|
||||
:target: https://pypi.org/project/waybackpy/
|
||||
.. |PyPI - Python Version| image:: https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square
|
||||
.. |Maintenance| image:: https://img.shields.io/badge/Maintained%3F-yes-green.svg
|
||||
:target: https://github.com/akamhy/waybackpy/graphs/commit-activity
|
||||
.. |codecov| image:: https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg
|
||||
:target: https://codecov.io/gh/akamhy/waybackpy
|
||||
.. |image12| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
|
||||
.. |contributions welcome| image:: https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square
|
||||
.. |Repo size| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
|
||||
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
|
||||
:target: https://github.com/akamhy/waybackpy/blob/master/LICENSE
|
||||
.. |Internet Archive| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/8/84/Internet_Archive_logo_and_wordmark.svg/84px-Internet_Archive_logo_and_wordmark.svg.png
|
||||
.. |Wayback Machine| image:: https://upload.wikimedia.org/wikipedia/commons/thumb/0/01/Wayback_Machine_logo_2010.svg/284px-Wayback_Machine_logo_2010.svg.png
|
||||
|
2
setup.py
2
setup.py
@ -19,7 +19,7 @@ setup(
|
||||
author = about['__author__'],
|
||||
author_email = about['__author_email__'],
|
||||
url = about['__url__'],
|
||||
download_url = 'https://github.com/akamhy/waybackpy/archive/2.1.3.tar.gz',
|
||||
download_url = 'https://github.com/akamhy/waybackpy/archive/2.1.7.tar.gz',
|
||||
keywords = ['wayback', 'archive', 'archive website', 'wayback machine', 'Internet Archive'],
|
||||
install_requires=[],
|
||||
python_requires= ">=2.7",
|
||||
|
97
tests/test_cli.py
Normal file
97
tests/test_cli.py
Normal file
@ -0,0 +1,97 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
import argparse
|
||||
|
||||
sys.path.append("..")
|
||||
import waybackpy.cli as cli # noqa: E402
|
||||
from waybackpy.wrapper import Url # noqa: E402
|
||||
from waybackpy.__version__ import __version__
|
||||
|
||||
codecov_python = False
|
||||
if sys.version_info > (3, 7):
|
||||
codecov_python = True
|
||||
|
||||
# Namespace(day=None, get=None, hour=None, minute=None, month=None, near=False,
|
||||
# newest=False, oldest=False, save=False, total=False, url=None, user_agent=None, version=False, year=None)
|
||||
|
||||
if codecov_python:
|
||||
def test_save():
|
||||
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=True, newest=False, near=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
|
||||
def test_oldest():
|
||||
args = argparse.Namespace(user_agent=None, url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=True, save=False, newest=False, near=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
|
||||
def test_newest():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=True, near=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert "pypi.org/user/akamhy" in reply
|
||||
|
||||
def test_total_archives():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=True, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get=None)
|
||||
reply = cli.args_handler(args)
|
||||
assert isinstance(reply, int)
|
||||
|
||||
def test_near():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=True, get=None, year=2020, month=7, day=15, hour=1, minute=1)
|
||||
reply = cli.args_handler(args)
|
||||
assert "202007" in reply
|
||||
|
||||
|
||||
def test_get():
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="url")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="oldest")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="newest")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
|
||||
if codecov_python:
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="save")
|
||||
reply = cli.args_handler(args)
|
||||
assert "waybackpy" in reply
|
||||
|
||||
args = argparse.Namespace(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
|
||||
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", url="https://pypi.org/user/akamhy/", total=False, version=False,
|
||||
oldest=False, save=False, newest=False, near=False, get="BullShit")
|
||||
reply = cli.args_handler(args)
|
||||
assert "get the source code of the" in reply
|
||||
|
||||
def test_args_handler():
|
||||
args = argparse.Namespace(version=True)
|
||||
reply = cli.args_handler(args)
|
||||
assert __version__ == reply
|
||||
|
||||
args = argparse.Namespace(url=None, version=False)
|
||||
reply = cli.args_handler(args)
|
||||
assert "Specify an URL" in reply
|
||||
|
||||
def test_main():
|
||||
# This also exercises the parse_args function in cli.py
|
||||
cli.main(['temp.py', '--version'])
|
@ -23,6 +23,21 @@ def test_clean_url():
|
||||
test_result = target._clean_url()
|
||||
assert answer == test_result
|
||||
|
||||
def test_dunders():
|
||||
url = "https://en.wikipedia.org/wiki/Network_security"
|
||||
user_agent = "UA"
|
||||
target = waybackpy.Url(url, user_agent)
|
||||
assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target)
|
||||
assert len(target) == len(url)
|
||||
assert str(target) == url
|
||||
|
||||
def test_archive_url_parser():
|
||||
request_url = "https://amazon.com"
|
||||
hdr = {"User-Agent": user_agent} # nosec
|
||||
req = Request(request_url, headers=hdr) # nosec
|
||||
header = waybackpy._get_response(req).headers
|
||||
with pytest.raises(Exception):
|
||||
waybackpy._archive_url_parser(header)
|
||||
|
||||
def test_url_check():
|
||||
broken_url = "http://wwwgooglecom/"
|
||||
@ -40,7 +55,7 @@ def test_save():
|
||||
"commons.wikimedia.org",
|
||||
"www.wiktionary.org",
|
||||
"www.w3schools.com",
|
||||
"twitter.com",
|
||||
"www.ibm.com",
|
||||
]
|
||||
x = random.randint(0, len(url_list) - 1)
|
||||
url1 = url_list[x]
|
||||
@ -59,17 +74,16 @@ def test_save():
|
||||
url2 = "ha ha ha ha"
|
||||
waybackpy.Url(url2, user_agent)
|
||||
time.sleep(5)
|
||||
# Test for urls not allowed to archive by robot.txt.
|
||||
with pytest.raises(Exception):
|
||||
url3 = "http://www.archive.is/faq.html"
|
||||
target = waybackpy.Url(
|
||||
url3,
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) "
|
||||
"Gecko/20100101 Firefox/25.0",
|
||||
)
|
||||
target.save()
|
||||
|
||||
time.sleep(5)
|
||||
# Test for URLs that robots.txt does not allow to be archived. Doesn't work anymore; find alternatives.
|
||||
# with pytest.raises(Exception):
|
||||
# url3 = "http://www.archive.is/faq.html"
|
||||
# target = waybackpy.Url(
|
||||
# url3,
|
||||
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) "
|
||||
# "Gecko/20100101 Firefox/25.0",
|
||||
# )
|
||||
# target.save()
|
||||
# time.sleep(5)
|
||||
# Non existent urls, test
|
||||
with pytest.raises(Exception):
|
||||
url4 = (
|
||||
|
@ -3,7 +3,7 @@
|
||||
__title__ = "waybackpy"
|
||||
__description__ = "A Python library that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily."
|
||||
__url__ = "https://akamhy.github.io/waybackpy/"
|
||||
__version__ = "2.1.3"
|
||||
__version__ = "2.1.7"
|
||||
__author__ = "akamhy"
|
||||
__author_email__ = "akash3pro@gmail.com"
|
||||
__license__ = "MIT"
|
||||
|
@ -1,20 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
import argparse
|
||||
from waybackpy.wrapper import Url
|
||||
from waybackpy.__version__ import __version__
|
||||
|
||||
def _save(obj):
|
||||
print(obj.save())
|
||||
return (obj.save())
|
||||
|
||||
def _oldest(obj):
|
||||
print(obj.oldest())
|
||||
return (obj.oldest())
|
||||
|
||||
def _newest(obj):
|
||||
print(obj.newest())
|
||||
return (obj.newest())
|
||||
|
||||
def _total_archives(obj):
|
||||
print(obj.total_archives())
|
||||
return (obj.total_archives())
|
||||
|
||||
def _near(obj, args):
|
||||
_near_args = {}
|
||||
@ -28,29 +29,54 @@ def _near(obj, args):
|
||||
_near_args["hour"] = args.hour
|
||||
if args.minute:
|
||||
_near_args["minute"] = args.minute
|
||||
print(obj.near(**_near_args))
|
||||
return (obj.near(**_near_args))
|
||||
|
||||
def _get(obj, args):
|
||||
if args.get.lower() == "url":
|
||||
print(obj.get())
|
||||
return (obj.get())
|
||||
|
||||
elif args.get.lower() == "oldest":
|
||||
print(obj.get(obj.oldest()))
|
||||
if args.get.lower() == "oldest":
|
||||
return (obj.get(obj.oldest()))
|
||||
|
||||
elif args.get.lower() == "latest" or args.get.lower() == "newest":
|
||||
print(obj.get(obj.newest()))
|
||||
if args.get.lower() == "latest" or args.get.lower() == "newest":
|
||||
return (obj.get(obj.newest()))
|
||||
|
||||
elif args.get.lower() == "save":
|
||||
print(obj.get(obj.save()))
|
||||
if args.get.lower() == "save":
|
||||
return (obj.get(obj.save()))
|
||||
|
||||
else:
|
||||
print("Use get as \"--get 'source'\", 'source' can be one of the followings: \
|
||||
return ("Use get as \"--get 'source'\", 'source' can be one of the followings: \
|
||||
\n1) url - get the source code of the url specified using --url/-u.\
|
||||
\n2) oldest - get the source code of the oldest archive for the supplied url.\
|
||||
\n3) newest - get the source code of the newest archive for the supplied url.\
|
||||
\n4) save - Create a new archive and get the source code of this new archive for the supplied url.")
|
||||
|
||||
def main():
|
||||
def args_handler(args):
|
||||
if args.version:
|
||||
return (__version__)
|
||||
|
||||
if not args.url:
|
||||
return ("Specify an URL. See --help for help using waybackpy.")
|
||||
|
||||
if args.user_agent:
|
||||
obj = Url(args.url, args.user_agent)
|
||||
else:
|
||||
obj = Url(args.url)
|
||||
|
||||
if args.save:
|
||||
return _save(obj)
|
||||
if args.oldest:
|
||||
return _oldest(obj)
|
||||
if args.newest:
|
||||
return _newest(obj)
|
||||
if args.total:
|
||||
return _total_archives(obj)
|
||||
if args.near:
|
||||
return _near(obj, args)
|
||||
if args.get:
|
||||
return _get(obj, args)
|
||||
return ("Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help for help using waybackpy.")
|
||||
|
||||
def parse_args(argv):
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-u", "--url", help="URL on which Wayback machine operations would occur.")
|
||||
parser.add_argument("-ua", "--user_agent", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\".")
|
||||
@ -60,45 +86,20 @@ def main():
|
||||
parser.add_argument("-t", "--total", action='store_true', help="Total number of archives for the specified URL.")
|
||||
parser.add_argument("-g", "--get", help="Prints the source code of the supplied url. Use '--get help' for extended usage.")
|
||||
parser.add_argument("-v", "--version", action='store_true', help="Prints the waybackpy version.")
|
||||
|
||||
parser.add_argument("-N", "--near", action='store_true', help="Latest/Newest archive for the specified URL.")
|
||||
parser.add_argument("-Y", "--year", type=int, help="Year in integer. For use with --near.")
|
||||
parser.add_argument("-M", "--month", type=int, help="Month in integer. For use with --near.")
|
||||
parser.add_argument("-D", "--day", type=int, help="Day in integer. For use with --near.")
|
||||
parser.add_argument("-H", "--hour", type=int, help="Hour in integer. For use with --near.")
|
||||
parser.add_argument("-MIN", "--minute", type=int, help="Minute in integer. For use with --near.")
|
||||
return parser.parse_args(argv[1:])
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.version:
|
||||
print(__version__)
|
||||
return
|
||||
|
||||
if not args.url:
|
||||
print("Specify an URL. See --help")
|
||||
return
|
||||
|
||||
# create the object with or without the user_agent
|
||||
if args.user_agent:
|
||||
obj = Url(args.url, args.user_agent)
|
||||
else:
|
||||
obj = Url(args.url)
|
||||
|
||||
if args.save:
|
||||
_save(obj)
|
||||
elif args.oldest:
|
||||
_oldest(obj)
|
||||
elif args.newest:
|
||||
_newest(obj)
|
||||
elif args.total:
|
||||
_total_archives(obj)
|
||||
elif args.near:
|
||||
_near(obj, args)
|
||||
elif args.get:
|
||||
_get(obj, args)
|
||||
else:
|
||||
print("Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help")
|
||||
|
||||
def main(argv=None):
|
||||
if argv is None:
|
||||
argv = sys.argv
|
||||
args = parse_args(argv)
|
||||
output = args_handler(args)
|
||||
print(output)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
sys.exit(main(sys.argv))
|
||||
|
@ -19,12 +19,18 @@ default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
|
||||
def _archive_url_parser(header):
|
||||
"""Parse out the archive from header."""
|
||||
# Regex1
|
||||
arch = re.search(
|
||||
r"Content-Location: (/web/[0-9]{14}/.*)", str(header)
|
||||
)
|
||||
if arch:
|
||||
return "web.archive.org" + arch.group(1)
|
||||
# Regex2
|
||||
arch = re.search(
|
||||
r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
|
||||
)
|
||||
if arch:
|
||||
return arch.group(1)
|
||||
# Regex2
|
||||
# Regex3
|
||||
arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
|
||||
if arch:
|
||||
return arch.group(1)
|
||||
@ -51,10 +57,11 @@ def _get_response(req):
|
||||
try:
|
||||
response = urlopen(req) # nosec
|
||||
except Exception as e:
|
||||
raise WaybackError(e)
|
||||
exc = WaybackError("Error while retrieving %s" % req.full_url)
|
||||
exc.__cause__ = e
|
||||
raise exc
|
||||
return response
|
||||
|
||||
|
||||
class Url:
|
||||
"""waybackpy Url object"""
|
||||
|
||||
@ -108,19 +115,6 @@ class Url:
|
||||
encoding = "UTF-8"
|
||||
return response.read().decode(encoding.replace("text/html", "UTF-8", 1))
|
||||
|
||||
def get_response(self, req):
|
||||
"""Get response for the supplied request."""
|
||||
try:
|
||||
response = urlopen(req) #nosec
|
||||
except Exception:
|
||||
try:
|
||||
response = urlopen(req) #nosec
|
||||
except Exception as e:
|
||||
exc = WaybackError("Error while retrieving %s" % req.full_url)
|
||||
exc.__cause__ = e
|
||||
raise exc
|
||||
return response
|
||||
|
||||
def near(self, year=None, month=None, day=None, hour=None, minute=None):
|
||||
""" Return the closest Wayback Machine archive to the time supplied.
|
||||
Supported params are year, month, day, hour and minute.
|
||||
@ -146,7 +140,7 @@ class Url:
|
||||
data = json.loads(response.read().decode("UTF-8"))
|
||||
if not data["archived_snapshots"]:
|
||||
raise WaybackError(
|
||||
"'%s' is not yet archived. Use wayback.Url(url, user_agent).save() "
|
||||
"Can not find archive for '%s' try later or use wayback.Url(url, user_agent).save() "
|
||||
"to create a new archive." % self._clean_url()
|
||||
)
|
||||
archive_url = data["archived_snapshots"]["closest"]["url"]
|
||||
|
Reference in New Issue
Block a user