retries support for get requests

This commit is contained in:
Akash Mahanty 2021-01-06 01:58:38 +05:30
parent 15ef5816db
commit a2550f17d7
5 changed files with 36 additions and 23 deletions

View File

@ -5,3 +5,4 @@
## ACKNOWLEDGEMENTS
- mhmdiaa (<https://github.com/mhmdiaa>) for <https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050>. known_urls is based on this gist.
- datashaman (<https://stackoverflow.com/users/401467/datashaman>) for <https://stackoverflow.com/a/35504626>. _get_response is based on this amazing answer.

View File

@ -208,7 +208,7 @@ def test_known_urls():
args = argparse.Namespace(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://akamhy.github.io",
url="https://www.keybr.com",
total=False,
version=False,
oldest=False,
@ -217,13 +217,13 @@ def test_known_urls():
archive_url=False,
newest=False,
near=False,
alive=True,
subdomain=True,
alive=False,
subdomain=False,
known_urls=True,
get=None,
)
reply = cli.args_handler(args)
assert "github" in str(reply)
assert "keybr" in str(reply)
args = argparse.Namespace(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
@ -305,7 +305,7 @@ def test_get():
args = argparse.Namespace(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
url="https://github.com/akamhy",
total=False,
version=False,
oldest=False,
@ -325,7 +325,7 @@ def test_get():
args = argparse.Namespace(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
url="https://github.com/akamhy/waybackpy",
total=False,
version=False,
oldest=False,
@ -345,7 +345,7 @@ def test_get():
args = argparse.Namespace(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 \
(KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
url="https://pypi.org/user/akamhy/",
url="https://akamhy.github.io/waybackpy/",
total=False,
version=False,
oldest=False,

View File

@ -223,7 +223,7 @@ def test_total_archives():
def test_known_urls():
target = waybackpy.Url("akamhy.github.io", user_agent)
assert len(target.known_urls(alive=True, subdomain=True)) > 2
assert len(target.known_urls(alive=True, subdomain=False)) > 2
target = waybackpy.Url("akamhy.github.io", user_agent)
assert len(target.known_urls()) > 3

View File

@ -1,3 +1,9 @@
"""
waybackpy.exceptions
~~~~~~~~~~~~~~~~~~~~
This module contains the set of Waybackpy's exceptions.
"""
class WaybackError(Exception):
"""
Raised when Wayback Machine API Service is unreachable/down.

View File

@ -1,7 +1,9 @@
import re
import requests
import concurrent.futures
from urllib3.util.retry import Retry
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from waybackpy.__version__ import __version__
from waybackpy.exceptions import WaybackError, URLError
@ -102,15 +104,15 @@ def _wayback_timestamp(**kwargs):
)
def _get_response(endpoint, params=None, headers=None):
def _get_response(endpoint, params=None, headers=None, retries=5):
"""
This function is used to make GET requests.
We use the requests package to make the
requests.
We try twice and if both the times is fails And
raises exceptions we give-up and raise WaybackError.
We retry up to `retries` times (five by default) and, if the
request still fails, raise a WaybackError exception.
You can handle WaybackError by importing:
from waybackpy.exceptions import WaybackError
@ -121,15 +123,18 @@ def _get_response(endpoint, params=None, headers=None):
# handle it
"""
# From https://stackoverflow.com/a/35504626
# By https://stackoverflow.com/users/401467/datashaman
s = requests.Session()
retries = Retry(total=retries, backoff_factor=0.5, status_forcelist=[ 500, 502, 503, 504 ])
s.mount('https://', HTTPAdapter(max_retries=retries))
try:
return requests.get(endpoint, params=params, headers=headers)
except Exception:
try:
return requests.get(endpoint, params=params, headers=headers)
except Exception as e:
exc = WaybackError("Error while retrieving %s" % endpoint)
exc.__cause__ = e
raise exc
return s.get(endpoint, params=params, headers=headers)
except Exception as e:
exc = WaybackError("Error while retrieving %s" % endpoint)
exc.__cause__ = e
raise exc
class Url:
@ -450,12 +455,13 @@ class Url:
):
"""
Returns list of URLs known to exist for given domain name
because these URLs were crawled by WayBack Machine bots.
Useful for pen-testers and others.
Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
because these URLs were crawled by the Wayback Machine's spider.
Useful for pen-testing.
"""
# Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
# https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
url_list = []
if subdomain: