Update wrapper.py

This commit is contained in:
Akash Mahanty
2021-01-02 02:54:42 +05:30
committed by GitHub
parent 287c3cac10
commit 67e34e3997

View File

@@ -29,7 +29,7 @@ def _archive_url_parser(header):
return arch.group(1) return arch.group(1)
raise WaybackError( raise WaybackError(
"No archive URL found in the API response. " "No archive URL found in the API response. "
"This version of waybackpy (%s) is likely out of date. Visit " "This version of waybackpy (%s) is likely out of date or WayBack Machine is malfunctioning. Visit "
"https://github.com/akamhy/waybackpy for the latest version " "https://github.com/akamhy/waybackpy for the latest version "
"of waybackpy.\nHeader:\n%s" % (__version__, str(header)) "of waybackpy.\nHeader:\n%s" % (__version__, str(header))
) )
@@ -97,6 +97,9 @@ class Url:
@property @property
def JSON(self): def JSON(self):
"""
Returns JSON data from 'https://archive.org/wayback/available?url=YOUR-URL'.
"""
endpoint = "https://archive.org/wayback/available" endpoint = "https://archive.org/wayback/available"
headers = {"User-Agent": "%s" % self.user_agent} headers = {"User-Agent": "%s" % self.user_agent}
payload = {"url": "%s" % self._clean_url()} payload = {"url": "%s" % self._clean_url()}
@@ -105,7 +108,13 @@ class Url:
@property @property
def archive_url(self): def archive_url(self):
"""Get URL of archive.""" """
Returns any random archive for the instance.
But if near, oldest, or newest was used before,
then it returns that same archive again.
The archive is cached in self._archive_url.
"""
if self._archive_url: if self._archive_url:
return self._archive_url return self._archive_url
@@ -124,7 +133,15 @@ class Url:
@property @property
def _timestamp(self): def _timestamp(self):
"""Get timestamp of last archive.""" """
Get timestamp of the last fetched archive.
If used before fetching any archive, this
randomly picks an archive.
"""
if self.timestamp:
return self.timestamp
data = self.JSON data = self.JSON
if not data["archived_snapshots"]: if not data["archived_snapshots"]:
@@ -138,7 +155,10 @@ class Url:
return ts return ts
def _clean_url(self): def _clean_url(self):
"""Fix the URL, if possible.""" """
Strip leading and trailing whitespace and
replace " " with "_".
"""
return str(self.url).strip().replace(" ", "_") return str(self.url).strip().replace(" ", "_")
def save(self): def save(self):
@@ -236,7 +256,7 @@ class Url:
# Most efficient method to count number of archives (yet) # Most efficient method to count number of archives (yet)
return response.text.count(",") return response.text.count(",")
def pick_live_urls(self, url): def live_urls_picker(self, url):
try: try:
response_code = requests.get(url).status_code response_code = requests.get(url).status_code
@@ -278,7 +298,7 @@ class Url:
# Remove all deadURLs from url_list if alive=True # Remove all deadURLs from url_list if alive=True
if alive: if alive:
with concurrent.futures.ThreadPoolExecutor() as executor: with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(self.pick_live_urls, url_list) executor.map(self.live_urls_picker, url_list)
url_list = self._alive_url_list url_list = self._alive_url_list
return url_list return url_list