Don't fetch more pages if >= 2 pages are empty.

This commit is contained in:
Akash Mahanty 2021-01-11 22:43:14 +05:30
parent 5a7bd73565
commit eabf4dc046
3 changed files with 25 additions and 10 deletions

View File

@ -17,27 +17,27 @@ class Cdx:
def __init__( def __init__(
self, self,
url, url,
user_agent=default_user_agent, user_agent=None,
start_timestamp=None, start_timestamp=None,
end_timestamp=None, end_timestamp=None,
filters=[], filters=[],
match_type=None, match_type=None,
gzip=True, gzip=None,
collapses=[], collapses=[],
limit=10000, limit=None,
): ):
self.url = str(url).strip() self.url = str(url).strip()
self.user_agent = str(user_agent) self.user_agent = str(user_agent) if user_agent else default_user_agent
self.start_timestamp = str(start_timestamp) if start_timestamp else None self.start_timestamp = str(start_timestamp) if start_timestamp else None
self.end_timestamp = str(end_timestamp) if end_timestamp else None self.end_timestamp = str(end_timestamp) if end_timestamp else None
self.filters = filters self.filters = filters
_check_filters(self.filters) _check_filters(self.filters)
self.match_type = str(match_type).strip() if match_type else None self.match_type = str(match_type).strip() if match_type else None
_check_match_type(self.match_type, self.url) _check_match_type(self.match_type, self.url)
self.gzip = gzip self.gzip = gzip if gzip else True
self.collapses = collapses self.collapses = collapses
_check_collapses(self.collapses) _check_collapses(self.collapses)
self.limit = limit self.limit = limit if limit else 5000
self.last_api_request_url = None self.last_api_request_url = None
self.use_page = False self.use_page = False
@ -87,7 +87,7 @@ class Cdx:
if use_page == True: if use_page == True:
total_pages = _get_total_pages(self.url, self.user_agent) total_pages = _get_total_pages(self.url, self.user_agent)
blank_pages = 0
for i in range(total_pages): for i in range(total_pages):
payload["page"] = str(i) payload["page"] = str(i)
url, res = _get_response( url, res = _get_response(
@ -95,8 +95,14 @@ class Cdx:
) )
self.last_api_request_url = url self.last_api_request_url = url
text = res.text
if len(text) == 0:
blank_pages += 1
yield res.text if blank_pages >= 2:
break
yield text
else: else:
payload["showResumeKey"] = "true" payload["showResumeKey"] = "true"

View File

@ -12,7 +12,8 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
def _unix_ts_to_wayback_ts(unix_ts): def _unix_ts_to_wayback_ts(unix_ts):
return datetime.utcfromtimestamp(int(unix_ts)).strftime('%Y%m%d%H%M%S') return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
def _add_payload(self, payload): def _add_payload(self, payload):
if self.start_timestamp: if self.start_timestamp:

View File

@ -166,7 +166,15 @@ class Url:
return response.content.decode(encoding.replace("text/html", "UTF-8", 1)) return response.content.decode(encoding.replace("text/html", "UTF-8", 1))
def near(self, year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None): def near(
self,
year=None,
month=None,
day=None,
hour=None,
minute=None,
unix_timestamp=None,
):
""" """
Wayback Machine can have many archives of a webpage, Wayback Machine can have many archives of a webpage,
sometimes we want archive close to a specific time. sometimes we want archive close to a specific time.