diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5d3b61d..c682aef 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -5,3 +5,4 @@ ## ACKNOWLEDGEMENTS - mhmdiaa () for . known_urls is based on this gist. + - datashaman () for . _get_response is based on this amazing answer. diff --git a/tests/test_cli.py b/tests/test_cli.py index b971adb..b7c1700 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -208,7 +208,7 @@ def test_known_urls(): args = argparse.Namespace( user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", - url="https://akamhy.github.io", + url="https://www.keybr.com", total=False, version=False, oldest=False, @@ -217,13 +217,13 @@ def test_known_urls(): archive_url=False, newest=False, near=False, - alive=True, - subdomain=True, + alive=False, + subdomain=False, known_urls=True, get=None, ) reply = cli.args_handler(args) - assert "github" in str(reply) + assert "keybr" in str(reply) args = argparse.Namespace( user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ @@ -305,7 +305,7 @@ def test_get(): args = argparse.Namespace( user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", - url="https://pypi.org/user/akamhy/", + url="https://github.com/akamhy", total=False, version=False, oldest=False, @@ -325,7 +325,7 @@ def test_get(): args = argparse.Namespace( user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", - url="https://pypi.org/user/akamhy/", + url="https://github.com/akamhy/waybackpy", total=False, version=False, oldest=False, @@ -345,7 +345,7 @@ def test_get(): args = argparse.Namespace( user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \ (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", - url="https://pypi.org/user/akamhy/", + url="https://akamhy.github.io/waybackpy/", total=False, version=False, oldest=False, diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py index 72b796a..c414568 100644 --- a/tests/test_wrapper.py +++ b/tests/test_wrapper.py @@ -223,7 +223,7 @@ def test_total_archives(): def test_known_urls(): target = waybackpy.Url("akamhy.github.io", user_agent) - assert len(target.known_urls(alive=True, subdomain=True)) > 2 + assert len(target.known_urls(alive=True, subdomain=False)) > 2 target = waybackpy.Url("akamhy.github.io", user_agent) assert len(target.known_urls()) > 3 diff --git a/waybackpy/exceptions.py b/waybackpy/exceptions.py index b08f6d6..d1f6200 100644 --- a/waybackpy/exceptions.py +++ b/waybackpy/exceptions.py @@ -1,3 +1,9 @@ +""" +waybackpy.exceptions +~~~~~~~~~~~~~~~~~~~ +This module contains the set of Waybackpy's exceptions. +""" + class WaybackError(Exception): """ Raised when Wayback Machine API Service is unreachable/down. diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index bb26554..0f9c2b2 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -1,7 +1,9 @@ import re import requests import concurrent.futures +from urllib3.util.retry import Retry from datetime import datetime, timedelta +from requests.adapters import HTTPAdapter from waybackpy.__version__ import __version__ from waybackpy.exceptions import WaybackError, URLError @@ -102,15 +104,15 @@ def _wayback_timestamp(**kwargs): ) -def _get_response(endpoint, params=None, headers=None): +def _get_response(endpoint, params=None, headers=None, retries=5): """ This function is used make get request. We use the requests package to make the requests. - We try twice and if both the times is fails And - raises exceptions we give-up and raise WaybackError. + We try five times and if it fails it raises + WaybackError exception. You can handles WaybackError by importing: from waybackpy.exceptions import WaybackError @@ -121,15 +123,18 @@ def _get_response(endpoint, params=None, headers=None): # handle it """ + # From https://stackoverflow.com/a/35504626 + # By https://stackoverflow.com/users/401467/datashaman + s = requests.Session() + retries = Retry(total=retries, backoff_factor=0.5, status_forcelist=[ 500, 502, 503, 504 ]) + s.mount('https://', HTTPAdapter(max_retries=retries)) + try: - return requests.get(endpoint, params=params, headers=headers) - except Exception: - try: - return requests.get(endpoint, params=params, headers=headers) - except Exception as e: - exc = WaybackError("Error while retrieving %s" % endpoint) - exc.__cause__ = e - raise exc + return s.get(endpoint, params=params, headers=headers) + except Exception as e: + exc = WaybackError("Error while retrieving %s" % endpoint) + exc.__cause__ = e + raise exc class Url: @@ -450,12 +455,13 @@ class Url: ): """ Returns list of URLs known to exist for given domain name - because these URLs were crawled by WayBack Machine bots. - Useful for pen-testers and others. - Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: - https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050 + because these URLs were crawled by WayBack Machine spider. + Useful for pen-testing. """ + # Idea by Mohammed Diaa (https://github.com/mhmdiaa) from: + # https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050 + url_list = [] if subdomain: