From ca51c14332bd727e04c7471e7a20044b6b00d1bf Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Thu, 26 Nov 2020 13:06:50 +0530 Subject: [PATCH] deleted .travis.yml, link with flake (#41) close #38 --- .travis.yml | 17 ----------------- setup.py | 22 +++++++++++----------- tests/__init__.py | 0 waybackpy/__version__.py | 5 ++++- waybackpy/cli.py | 36 ++++++++++++++++++++++++++++-------- waybackpy/wrapper.py | 40 +++++++++++++++++++--------------------- 6 files changed, 62 insertions(+), 58 deletions(-) delete mode 100644 .travis.yml create mode 100644 tests/__init__.py diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c70b36c..0000000 --- a/.travis.yml +++ /dev/null @@ -1,17 +0,0 @@ -language: python -os: linux -dist: xenial -cache: pip -python: - - 3.8 -before_install: - - python --version - - pip install -U pip - - pip install -U pytest - - pip install codecov - - pip install pytest pytest-cov -script: - - cd tests - - pytest --cov=../waybackpy -after_success: - - python -m codecov diff --git a/setup.py b/setup.py index 96777f2..3e4491c 100644 --- a/setup.py +++ b/setup.py @@ -9,20 +9,20 @@ with open(os.path.join(os.path.dirname(__file__), 'waybackpy', '__version__.py') exec(f.read(), about) setup( - name = about['__title__'], - packages = ['waybackpy'], - version = about['__version__'], - description = about['__description__'], + name=about['__title__'], + packages=['waybackpy'], + version=about['__version__'], + description=about['__description__'], long_description=long_description, long_description_content_type='text/markdown', - license= about['__license__'], - author = about['__author__'], - author_email = about['__author_email__'], - url = about['__url__'], - download_url = 'https://github.com/akamhy/waybackpy/archive/2.2.0.tar.gz', - keywords = ['waybackpy', 'archive', 'archive website', 'wayback machine', 'Internet Archive'], + license=about['__license__'], + author=about['__author__'], + author_email=about['__author_email__'], + url=about['__url__'], + download_url='https://github.com/akamhy/waybackpy/archive/2.2.0.tar.gz', + keywords=['waybackpy', 'archive', 'archive website', 'wayback machine', 'Internet Archive'], install_requires=[], - python_requires= ">=3.4", + python_requires=">=3.4", classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/waybackpy/__version__.py b/waybackpy/__version__.py index 66321c3..bf85a63 100644 --- a/waybackpy/__version__.py +++ b/waybackpy/__version__.py @@ -1,7 +1,10 @@ # -*- coding: utf-8 -*- __title__ = "waybackpy" -__description__ = "A Python package that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily." +__description__ = ( + "A Python package that interfaces with the Internet Archive's Wayback Machine API. " + "Archive pages and retrieve archived pages easily." 
+) __url__ = "https://akamhy.github.io/waybackpy/" __version__ = "2.2.0" __author__ = "akamhy" diff --git a/waybackpy/cli.py b/waybackpy/cli.py index 7f20651..4bff3bd 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -8,24 +8,31 @@ import random from waybackpy.wrapper import Url from waybackpy.__version__ import __version__ + def _save(obj): return (obj.save()) + def _archive_url(obj): return (obj.archive_url) + def _json(obj): return (obj.JSON) + def _oldest(obj): return (obj.oldest()) + def _newest(obj): return (obj.newest()) + def _total_archives(obj): return (obj.total_archives()) + def _near(obj, args): _near_args = {} if args.year: @@ -40,8 +47,9 @@ def _near(obj, args): _near_args["minute"] = args.minute return (obj.near(**_near_args)) + def _save_urls_on_file(input_list, live_url_count): - m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0]) # O(1) + m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0]) if m: domain = m.group(1) else: @@ -50,12 +58,13 @@ def _save_urls_on_file(input_list, live_url_count): uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid) - file_content = "\n".join(input_list) #join with \n + file_content = "\n".join(input_list) file_path = os.path.join(os.getcwd(), file_name) - with open(file_name, "w+") as f: + with open(file_path, "w+") as f: f.write(file_content) return "%s\n\n'%s' saved in current working directory" % (file_content, file_name) + def _known_urls(obj, args): """Abbreviations: sd = subdomain @@ -77,6 +86,7 @@ def _known_urls(obj, args): return text + def _get(obj, args): if args.get.lower() == "url": return (obj.get()) @@ -100,6 +110,7 @@ def _get(obj, args): \n4) newest - get the source code of the newest archive for the supplied url.\ \n5) save - Create a new archive and get the source code of this new archive for the supplied url.") + def args_handler(args): if args.version: return ("waybackpy version %s" % __version__) @@ -130,7 +141,12 @@ def args_handler(args): return _near(obj, args) if args.get: return _get(obj, args) - return ("You only specified the URL. But you also need to specify the operation.\nSee 'waybackpy --help' for help using this tool.") + message = ( + "You only specified the URL. But you also need to specify the operation." + "\nSee 'waybackpy --help' for help using this tool." 
+ ) + return message + def parse_args(argv): parser = argparse.ArgumentParser() @@ -139,7 +155,8 @@ def parse_args(argv): requiredArgs.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur") userAgentArg = parser.add_argument_group('User Agent') - userAgentArg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"") + help_text = "User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"" + userAgentArg.add_argument("--user_agent", "-ua", help=help_text) saveArg = parser.add_argument_group("Create new archive/save URL") saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine") @@ -164,9 +181,10 @@ def parse_args(argv): knownUrlArg = parser.add_argument_group("URLs known and archived to Waybcak Machine for the site.") knownUrlArg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.") - knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help="Use with '--known_urls' to include known URLs for subdomains.") - knownUrlArg.add_argument("--alive", "-a", action='store_true', help="Only include live URLs. Will not inlclude dead links.") - + help_text = "Use with '--known_urls' to include known URLs for subdomains." + knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help=help_text) + help_text = "Only include live URLs. Will not inlclude dead links." + knownUrlArg.add_argument("--alive", "-a", action='store_true', help=help_text) nearArg = parser.add_argument_group('Archive close to time specified') nearArg.add_argument("--near", "-N", action='store_true', help="Archive near specified time") @@ -182,6 +200,7 @@ def parse_args(argv): return parser.parse_args(argv[1:]) + def main(argv=None): if argv is None: argv = sys.argv @@ -189,5 +208,6 @@ def main(argv=None): output = args_handler(args) print(output) + if __name__ == "__main__": sys.exit(main(sys.argv)) diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py index 1f47f4a..3df6648 100644 --- a/waybackpy/wrapper.py +++ b/waybackpy/wrapper.py @@ -60,6 +60,7 @@ def _get_response(req): raise exc return response + class Url: """waybackpy Url object""" @@ -67,9 +68,9 @@ class Url: self.url = url self.user_agent = user_agent self._url_check() # checks url validity on init. 
- self.JSON = self._JSON() # JSON of most recent archive - self.archive_url = self._archive_url() # URL of archive - self.timestamp = self._archive_timestamp() # timestamp for last archive + self.JSON = self._JSON() # JSON of most recent archive + self.archive_url = self._archive_url() # URL of archive + self.timestamp = self._archive_timestamp() # timestamp for last archive self._alive_url_list = [] def __repr__(self): @@ -79,11 +80,13 @@ class Url: return "%s" % self.archive_url def __len__(self): - td_max = timedelta(days=999999999, - hours=23, - minutes=59, - seconds=59, - microseconds=999999) + td_max = timedelta( + days=999999999, + hours=23, + minutes=59, + seconds=59, + microseconds=999999 + ) if self.timestamp == datetime.max: return td_max.days else: @@ -208,14 +211,10 @@ class Url: ) self.archive_url = archive_url - self.timestamp = datetime.strptime(data["archived_snapshots"] - ["closest"] - ["timestamp"], - '%Y%m%d%H%M%S') + self.timestamp = datetime.strptime(data["archived_snapshots"]["closest"]["timestamp"], '%Y%m%d%H%M%S') return self - def oldest(self, year=1994): """Return the oldest Wayback Machine archive for this URL.""" return self.near(year=year) @@ -244,10 +243,11 @@ class Url: try: response_code = requests.get(url).status_code - except Exception as e: - return #we don't care if urls are not opening + except Exception: + return # we don't care if urls are not opening - if response_code >= 400: #200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300 + # 200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300 + if response_code >= 400: return self._alive_url_list.append(url) @@ -266,14 +266,12 @@ class Url: if subdomain: request_url = ( - "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey" - % self._clean_url() + "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey" % self._clean_url() ) else: request_url = ( - "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey" - % self._clean_url() + "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey" % self._clean_url() ) hdr = {"User-Agent": "%s" % self.user_agent} @@ -283,7 +281,7 @@ class Url: data = json.loads(response.read().decode("UTF-8")) url_list = [y[0] for y in data if y[0] != "original"] - #Remove all deadURLs from url_list if alive=True + # Remove all deadURLs from url_list if alive=True if alive: with concurrent.futures.ThreadPoolExecutor() as executor: executor.map(self.pick_live_urls, url_list)
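
Taken together, the hunks above are flake8-driven cleanup (PEP8 spacing around "=", wrapped long lines, two blank lines between top-level definitions) plus one small substantive tweak: _save_urls_on_file now writes to the fully resolved file_path rather than the bare file_name. The public Url API appears unchanged in these hunks. Below is a minimal usage sketch of the methods they touch, assuming waybackpy 2.2.0 is installed; the target URL and user agent are placeholders, not values taken from this patch:

    from waybackpy.wrapper import Url

    # Placeholder values -- substitute your own URL and user agent string.
    target = Url("https://example.com", "waybackpy python package - https://github.com/akamhy/waybackpy")

    print(target.archive_url)       # newest archive URL, resolved during __init__
    print(target.oldest())          # oldest snapshot; calls near(year=1994) under the hood
    print(target.near(year=2015))   # snapshot closest to the requested year
    print(target.total_archives())  # number of archives recorded for the URL

With .travis.yml deleted, the steps it ran (pip install pytest pytest-cov codecov, then cd tests && pytest --cov=../waybackpy) presumably move to a local run or a follow-up CI setup, alongside the flake8 linting the commit title refers to ("link with flake").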