diff --git a/README.md b/README.md index c589dc9..59629c3 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ## Python package & CLI tool that interfaces with the Wayback Machine API. [![pypi](https://img.shields.io/pypi/v/waybackpy.svg)](https://pypi.org/project/waybackpy/) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/akamhy/waybackpy/blob/master/LICENSE) -[![Build Status](https://github.com/akamhy/waybackpy/workflows/CI/badge.svg)](https://github.com/akamhy/waybackpy/actions) +[![Build Status](https://github.com/akamhy/waybackpy/workflows/CI/badge.svg)](https://github.com/akamhy/waybackpy/actions?query=workflow%3ACI) [![codecov](https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg)](https://codecov.io/gh/akamhy/waybackpy) [![contributions welcome](https://img.shields.io/static/v1.svg?label=Contributions&message=Welcome&color=0059b3&style=flat-square)](https://github.com/akamhy/waybackpy/blob/master/CONTRIBUTING.md) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/255459cede9341e39436ec8866d3fb65)](https://www.codacy.com/manual/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade) diff --git a/setup.py b/setup.py index ce47b12..5bb28d6 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setup( author=about["__author__"], author_email=about["__author_email__"], url=about["__url__"], - download_url="https://github.com/akamhy/waybackpy/archive/2.3.1.tar.gz", + download_url="https://github.com/akamhy/waybackpy/archive/2.3.2.tar.gz", keywords=[ "Archive It", "Archive Website", diff --git a/waybackpy/__version__.py b/waybackpy/__version__.py index b83035e..fc1bb3d 100644 --- a/waybackpy/__version__.py +++ b/waybackpy/__version__.py @@ -6,8 +6,8 @@ __description__ = ( "Archive pages and retrieve archived pages easily." ) __url__ = "https://akamhy.github.io/waybackpy/" -__version__ = "2.3.1" +__version__ = "2.3.2" __author__ = "akamhy" -__author_email__ = "akash3pro@gmail.com" +__author_email__ = "akamhy@yahoo.com" __license__ = "MIT" -__copyright__ = "Copyright 2020 akamhy" +__copyright__ = "Copyright 2020-2021 Akash Mahanty et al." diff --git a/waybackpy/cli.py b/waybackpy/cli.py index 044ca4f..9174e39 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -7,10 +7,28 @@ import string import random from waybackpy.wrapper import Url from waybackpy.__version__ import __version__ +from waybackpy.exceptions import WaybackError def _save(obj): - return obj.save() + try: + return obj.save() + except Exception as err: + e = str(err) + url = obj.url + m = re.search(r"Header:\n(.*)", e) + if m: + header = m.group(1) + if "No archive URL found in the API response" in e: + return ( + "\n[waybackpy] Can not save/archive your link.\n[waybackpy] This\ + could happen because either your waybackpy (%s) is likely out of\ + date or Wayback Machine is malfunctioning.\n[waybackpy] Visit\ + https://github.com/akamhy/waybackpy for the latest version of \ + waybackpy.\n[waybackpy] API response Header :\n%s" + % (__version__, header) + ) + return WaybackError(err) def _archive_url(obj): @@ -21,12 +39,33 @@ def _json(obj): return obj.JSON +def handle_not_archived_error(e, obj): + m = re.search(r"archive\sfor\s\'(.*?)\'\stry", str(e)) + if m: + url = m.group(1) + ua = obj.user_agent + if "github.com/akamhy/waybackpy" in ua: + ua = "YOUR_USER_AGENT_HERE" + return ( + "\n[Waybackpy] Can not find archive for '%s'.\n[Waybackpy] You can" + " save the URL using the following command:\n[Waybackpy] waybackpy --" + 'user_agent "%s" --url "%s" --save' % (url, ua, url) + ) + return WaybackError(e) + + def _oldest(obj): - return obj.oldest() + try: + return obj.oldest() + except Exception as e: + return handle_not_archived_error(e, obj) def _newest(obj): - return obj.newest() + try: + return obj.newest() + except Exception as e: + return handle_not_archived_error(e, obj) def _total_archives(obj): @@ -45,7 +84,11 @@ def _near(obj, args): _near_args["hour"] = args.hour if args.minute: _near_args["minute"] = args.minute - return obj.near(**_near_args) + + try: + return obj.near(**_near_args) + except Exception as e: + return handle_not_archived_error(e, obj) def _save_urls_on_file(input_list, live_url_count): diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 1b3597b..16bfddf 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -29,7 +29,7 @@ def _archive_url_parser(header): return arch.group(1) raise WaybackError( "No archive URL found in the API response. " - "This version of waybackpy (%s) is likely out of date. Visit " + "This version of waybackpy (%s) is likely out of date or WayBack Machine is malfunctioning. Visit " "https://github.com/akamhy/waybackpy for the latest version " "of waybackpy.\nHeader:\n%s" % (__version__, str(header)) ) @@ -64,20 +64,26 @@ class Url: self.url = url self.user_agent = user_agent self._url_check() # checks url validity on init. - self.archive_url = self._archive_url() # URL of archive - self.timestamp = self._archive_timestamp() # timestamp for last archive + self._archive_url = None # URL of archive + self.timestamp = None # timestamp for last archive self._alive_url_list = [] def __repr__(self): return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent) def __str__(self): - return "%s" % self.archive_url + if not self._archive_url: + self._archive_url = self.archive_url + return "%s" % self._archive_url def __len__(self): td_max = timedelta( days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999 ) + + if not self.timestamp: + self.timestamp = self._timestamp + if self.timestamp == datetime.max: return td_max.days @@ -91,14 +97,28 @@ class Url: @property def JSON(self): + """ + Returns JSON data from 'https://archive.org/wayback/available?url=YOUR-URL'. + """ endpoint = "https://archive.org/wayback/available" headers = {"User-Agent": "%s" % self.user_agent} payload = {"url": "%s" % self._clean_url()} response = _get_response(endpoint, params=payload, headers=headers) return response.json() - def _archive_url(self): - """Get URL of archive.""" + @property + def archive_url(self): + """ + Returns any random archive for the instance. + But if near, oldest, newest were used before + then it returns the same archive again. + + We cache archive in self._archive_url + """ + + if self._archive_url: + return self._archive_url + data = self.JSON if not data["archived_snapshots"]: @@ -108,25 +128,37 @@ class Url: archive_url = archive_url.replace( "http://web.archive.org/web/", "https://web.archive.org/web/", 1 ) - + self._archive_url = archive_url return archive_url - def _archive_timestamp(self): - """Get timestamp of last archive.""" + @property + def _timestamp(self): + """ + Get timestamp of last fetched archive. + If used before fetching any archive, This + randomly picks archive. + """ + + if self.timestamp: + return self.timestamp + data = self.JSON if not data["archived_snapshots"]: - time = datetime.max + ts = datetime.max else: - time = datetime.strptime( + ts = datetime.strptime( data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" ) - - return time + self.timestamp = ts + return ts def _clean_url(self): - """Fix the URL, if possible.""" + """ + Remove newlines + replace " " with "_" + """ return str(self.url).strip().replace(" ", "_") def save(self): @@ -134,7 +166,7 @@ class Url: request_url = "https://web.archive.org/save/" + self._clean_url() headers = {"User-Agent": "%s" % self.user_agent} response = _get_response(request_url, params=None, headers=headers) - self.archive_url = "https://" + _archive_url_parser(response.headers) + self._archive_url = "https://" + _archive_url_parser(response.headers) self.timestamp = datetime.utcnow() return self @@ -190,7 +222,7 @@ class Url: "http://web.archive.org/web/", "https://web.archive.org/web/", 1 ) - self.archive_url = archive_url + self._archive_url = archive_url self.timestamp = datetime.strptime( data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" ) @@ -224,7 +256,7 @@ class Url: # Most efficient method to count number of archives (yet) return response.text.count(",") - def pick_live_urls(self, url): + def live_urls_picker(self, url): try: response_code = requests.get(url).status_code @@ -266,7 +298,7 @@ class Url: # Remove all deadURLs from url_list if alive=True if alive: with concurrent.futures.ThreadPoolExecutor() as executor: - executor.map(self.pick_live_urls, url_list) + executor.map(self.live_urls_picker, url_list) url_list = self._alive_url_list return url_list