don't fetch more pages if >=2 pages are empty
This commit is contained in:
parent
5a7bd73565
commit
eabf4dc046
@ -17,27 +17,27 @@ class Cdx:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
url,
|
url,
|
||||||
user_agent=default_user_agent,
|
user_agent=None,
|
||||||
start_timestamp=None,
|
start_timestamp=None,
|
||||||
end_timestamp=None,
|
end_timestamp=None,
|
||||||
filters=[],
|
filters=[],
|
||||||
match_type=None,
|
match_type=None,
|
||||||
gzip=True,
|
gzip=None,
|
||||||
collapses=[],
|
collapses=[],
|
||||||
limit=10000,
|
limit=None,
|
||||||
):
|
):
|
||||||
self.url = str(url).strip()
|
self.url = str(url).strip()
|
||||||
self.user_agent = str(user_agent)
|
self.user_agent = str(user_agent) if user_agent else default_user_agent
|
||||||
self.start_timestamp = str(start_timestamp) if start_timestamp else None
|
self.start_timestamp = str(start_timestamp) if start_timestamp else None
|
||||||
self.end_timestamp = str(end_timestamp) if end_timestamp else None
|
self.end_timestamp = str(end_timestamp) if end_timestamp else None
|
||||||
self.filters = filters
|
self.filters = filters
|
||||||
_check_filters(self.filters)
|
_check_filters(self.filters)
|
||||||
self.match_type = str(match_type).strip() if match_type else None
|
self.match_type = str(match_type).strip() if match_type else None
|
||||||
_check_match_type(self.match_type, self.url)
|
_check_match_type(self.match_type, self.url)
|
||||||
self.gzip = gzip
|
self.gzip = gzip if gzip else True
|
||||||
self.collapses = collapses
|
self.collapses = collapses
|
||||||
_check_collapses(self.collapses)
|
_check_collapses(self.collapses)
|
||||||
self.limit = limit
|
self.limit = limit if limit else 5000
|
||||||
self.last_api_request_url = None
|
self.last_api_request_url = None
|
||||||
self.use_page = False
|
self.use_page = False
|
||||||
|
|
||||||
@ -87,7 +87,7 @@ class Cdx:
|
|||||||
if use_page == True:
|
if use_page == True:
|
||||||
|
|
||||||
total_pages = _get_total_pages(self.url, self.user_agent)
|
total_pages = _get_total_pages(self.url, self.user_agent)
|
||||||
|
blank_pages = 0
|
||||||
for i in range(total_pages):
|
for i in range(total_pages):
|
||||||
payload["page"] = str(i)
|
payload["page"] = str(i)
|
||||||
url, res = _get_response(
|
url, res = _get_response(
|
||||||
@ -95,8 +95,14 @@ class Cdx:
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.last_api_request_url = url
|
self.last_api_request_url = url
|
||||||
|
text = res.text
|
||||||
|
if len(text) == 0:
|
||||||
|
blank_pages += 1
|
||||||
|
|
||||||
yield res.text
|
if blank_pages >= 2:
|
||||||
|
break
|
||||||
|
|
||||||
|
yield text
|
||||||
else:
|
else:
|
||||||
|
|
||||||
payload["showResumeKey"] = "true"
|
payload["showResumeKey"] = "true"
|
||||||
|
@ -12,7 +12,8 @@ default_user_agent = "waybackpy python package - https://github.com/akamhy/wayba
|
|||||||
|
|
||||||
|
|
||||||
def _unix_ts_to_wayback_ts(unix_ts):
|
def _unix_ts_to_wayback_ts(unix_ts):
|
||||||
return datetime.utcfromtimestamp(int(unix_ts)).strftime('%Y%m%d%H%M%S')
|
return datetime.utcfromtimestamp(int(unix_ts)).strftime("%Y%m%d%H%M%S")
|
||||||
|
|
||||||
|
|
||||||
def _add_payload(self, payload):
|
def _add_payload(self, payload):
|
||||||
if self.start_timestamp:
|
if self.start_timestamp:
|
||||||
|
@ -166,7 +166,15 @@ class Url:
|
|||||||
|
|
||||||
return response.content.decode(encoding.replace("text/html", "UTF-8", 1))
|
return response.content.decode(encoding.replace("text/html", "UTF-8", 1))
|
||||||
|
|
||||||
def near(self, year=None, month=None, day=None, hour=None, minute=None, unix_timestamp=None):
|
def near(
|
||||||
|
self,
|
||||||
|
year=None,
|
||||||
|
month=None,
|
||||||
|
day=None,
|
||||||
|
hour=None,
|
||||||
|
minute=None,
|
||||||
|
unix_timestamp=None,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Wayback Machine can have many archives of a webpage,
|
Wayback Machine can have many archives of a webpage,
|
||||||
sometimes we want archive close to a specific time.
|
sometimes we want archive close to a specific time.
|
||||||
|
Loading…
Reference in New Issue
Block a user