17
									
								
								.travis.yml
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								.travis.yml
									
									
									
									
									
								
							| @@ -1,17 +0,0 @@ | |||||||
| language: python |  | ||||||
| os: linux |  | ||||||
| dist: xenial |  | ||||||
| cache: pip |  | ||||||
| python: |  | ||||||
|   - 3.8 |  | ||||||
| before_install: |  | ||||||
|   - python --version |  | ||||||
|   - pip install -U pip |  | ||||||
|   - pip install -U pytest |  | ||||||
|   - pip install codecov |  | ||||||
|   - pip install pytest pytest-cov |  | ||||||
| script: |  | ||||||
|   - cd tests |  | ||||||
|   - pytest --cov=../waybackpy |  | ||||||
| after_success: |  | ||||||
|   - python -m codecov |  | ||||||
							
								
								
									
										22
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										22
									
								
								setup.py
									
									
									
									
									
								
							| @@ -9,20 +9,20 @@ with open(os.path.join(os.path.dirname(__file__), 'waybackpy', '__version__.py') | |||||||
|     exec(f.read(), about) |     exec(f.read(), about) | ||||||
|  |  | ||||||
| setup( | setup( | ||||||
|     name = about['__title__'], |     name=about['__title__'], | ||||||
|     packages = ['waybackpy'], |     packages=['waybackpy'], | ||||||
|     version = about['__version__'], |     version=about['__version__'], | ||||||
|     description = about['__description__'], |     description=about['__description__'], | ||||||
|     long_description=long_description, |     long_description=long_description, | ||||||
|     long_description_content_type='text/markdown', |     long_description_content_type='text/markdown', | ||||||
|     license= about['__license__'], |     license=about['__license__'], | ||||||
|     author = about['__author__'], |     author=about['__author__'], | ||||||
|     author_email = about['__author_email__'], |     author_email=about['__author_email__'], | ||||||
|     url = about['__url__'], |     url=about['__url__'], | ||||||
|     download_url = 'https://github.com/akamhy/waybackpy/archive/2.2.0.tar.gz', |     download_url='https://github.com/akamhy/waybackpy/archive/2.2.0.tar.gz', | ||||||
|     keywords = ['waybackpy', 'archive', 'archive website', 'wayback machine', 'Internet Archive'], |     keywords=['waybackpy', 'archive', 'archive website', 'wayback machine', 'Internet Archive'], | ||||||
|     install_requires=[], |     install_requires=[], | ||||||
|     python_requires= ">=3.4", |     python_requires=">=3.4", | ||||||
|     classifiers=[ |     classifiers=[ | ||||||
|         'Development Status :: 5 - Production/Stable', |         'Development Status :: 5 - Production/Stable', | ||||||
|         'Intended Audience :: Developers', |         'Intended Audience :: Developers', | ||||||
|   | |||||||
							
								
								
									
										0
									
								
								tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -1,7 +1,10 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| __title__ = "waybackpy" | __title__ = "waybackpy" | ||||||
| __description__ = "A Python package that interfaces with the Internet Archive's Wayback Machine API. Archive pages and retrieve archived pages easily." | __description__ = ( | ||||||
|  |     "A Python package that interfaces with the Internet Archive's Wayback Machine API. " | ||||||
|  |     "Archive pages and retrieve archived pages easily." | ||||||
|  | ) | ||||||
| __url__ = "https://akamhy.github.io/waybackpy/" | __url__ = "https://akamhy.github.io/waybackpy/" | ||||||
| __version__ = "2.2.0" | __version__ = "2.2.0" | ||||||
| __author__ = "akamhy" | __author__ = "akamhy" | ||||||
|   | |||||||
| @@ -8,24 +8,31 @@ import random | |||||||
| from waybackpy.wrapper import Url | from waybackpy.wrapper import Url | ||||||
| from waybackpy.__version__ import __version__ | from waybackpy.__version__ import __version__ | ||||||
|  |  | ||||||
|  |  | ||||||
| def _save(obj): | def _save(obj): | ||||||
|     return (obj.save()) |     return (obj.save()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _archive_url(obj): | def _archive_url(obj): | ||||||
|     return (obj.archive_url) |     return (obj.archive_url) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _json(obj): | def _json(obj): | ||||||
|     return (obj.JSON) |     return (obj.JSON) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _oldest(obj): | def _oldest(obj): | ||||||
|     return (obj.oldest()) |     return (obj.oldest()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _newest(obj): | def _newest(obj): | ||||||
|     return (obj.newest()) |     return (obj.newest()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _total_archives(obj): | def _total_archives(obj): | ||||||
|     return (obj.total_archives()) |     return (obj.total_archives()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _near(obj, args): | def _near(obj, args): | ||||||
|     _near_args = {} |     _near_args = {} | ||||||
|     if args.year: |     if args.year: | ||||||
| @@ -40,8 +47,9 @@ def _near(obj, args): | |||||||
|         _near_args["minute"] = args.minute |         _near_args["minute"] = args.minute | ||||||
|     return (obj.near(**_near_args)) |     return (obj.near(**_near_args)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _save_urls_on_file(input_list, live_url_count): | def _save_urls_on_file(input_list, live_url_count): | ||||||
|     m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0]) # O(1) |     m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0]) | ||||||
|     if m: |     if m: | ||||||
|         domain = m.group(1) |         domain = m.group(1) | ||||||
|     else: |     else: | ||||||
| @@ -50,12 +58,13 @@ def _save_urls_on_file(input_list, live_url_count): | |||||||
|     uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) |     uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) | ||||||
|  |  | ||||||
|     file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid) |     file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid) | ||||||
|     file_content = "\n".join(input_list) #join with \n |     file_content = "\n".join(input_list) | ||||||
|     file_path = os.path.join(os.getcwd(), file_name) |     file_path = os.path.join(os.getcwd(), file_name) | ||||||
|     with open(file_name, "w+") as f: |     with open(file_path, "w+") as f: | ||||||
|         f.write(file_content) |         f.write(file_content) | ||||||
|     return "%s\n\n'%s' saved in current working directory" % (file_content, file_name) |     return "%s\n\n'%s' saved in current working directory" % (file_content, file_name) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _known_urls(obj, args): | def _known_urls(obj, args): | ||||||
|     """Abbreviations: |     """Abbreviations: | ||||||
|     sd = subdomain |     sd = subdomain | ||||||
| @@ -77,6 +86,7 @@ def _known_urls(obj, args): | |||||||
|  |  | ||||||
|     return text |     return text | ||||||
|  |  | ||||||
|  |  | ||||||
| def _get(obj, args): | def _get(obj, args): | ||||||
|     if args.get.lower() == "url": |     if args.get.lower() == "url": | ||||||
|         return (obj.get()) |         return (obj.get()) | ||||||
| @@ -100,6 +110,7 @@ def _get(obj, args): | |||||||
|         \n4) newest - get the source code of the newest archive for the supplied url.\ |         \n4) newest - get the source code of the newest archive for the supplied url.\ | ||||||
|         \n5) save - Create a new archive and get the source code of this new archive for the supplied url.") |         \n5) save - Create a new archive and get the source code of this new archive for the supplied url.") | ||||||
|  |  | ||||||
|  |  | ||||||
| def args_handler(args): | def args_handler(args): | ||||||
|     if args.version: |     if args.version: | ||||||
|         return ("waybackpy version %s" % __version__) |         return ("waybackpy version %s" % __version__) | ||||||
| @@ -130,7 +141,12 @@ def args_handler(args): | |||||||
|         return _near(obj, args) |         return _near(obj, args) | ||||||
|     if args.get: |     if args.get: | ||||||
|         return _get(obj, args) |         return _get(obj, args) | ||||||
|     return ("You only specified the URL. But you also need to specify the operation.\nSee 'waybackpy --help' for help using this tool.") |     message = ( | ||||||
|  |         "You only specified the URL. But you also need to specify the operation." | ||||||
|  |         "\nSee 'waybackpy --help' for help using this tool." | ||||||
|  |     ) | ||||||
|  |     return message | ||||||
|  |  | ||||||
|  |  | ||||||
| def parse_args(argv): | def parse_args(argv): | ||||||
|     parser = argparse.ArgumentParser() |     parser = argparse.ArgumentParser() | ||||||
| @@ -139,7 +155,8 @@ def parse_args(argv): | |||||||
|     requiredArgs.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur") |     requiredArgs.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur") | ||||||
|  |  | ||||||
|     userAgentArg = parser.add_argument_group('User Agent') |     userAgentArg = parser.add_argument_group('User Agent') | ||||||
|     userAgentArg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"") |     help_text = "User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"" | ||||||
|  |     userAgentArg.add_argument("--user_agent", "-ua", help=help_text) | ||||||
|  |  | ||||||
|     saveArg = parser.add_argument_group("Create new archive/save URL") |     saveArg = parser.add_argument_group("Create new archive/save URL") | ||||||
|     saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine") |     saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine") | ||||||
| @@ -164,9 +181,10 @@ def parse_args(argv): | |||||||
|  |  | ||||||
|     knownUrlArg = parser.add_argument_group("URLs known and archived to Waybcak Machine for the site.") |     knownUrlArg = parser.add_argument_group("URLs known and archived to Waybcak Machine for the site.") | ||||||
|     knownUrlArg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.") |     knownUrlArg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.") | ||||||
|     knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help="Use with '--known_urls' to include known URLs for subdomains.") |     help_text = "Use with '--known_urls' to include known URLs for subdomains." | ||||||
|     knownUrlArg.add_argument("--alive", "-a", action='store_true', help="Only include live URLs. Will not inlclude dead links.") |     knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help=help_text) | ||||||
|  |     help_text = "Only include live URLs. Will not inlclude dead links." | ||||||
|  |     knownUrlArg.add_argument("--alive", "-a", action='store_true', help=help_text) | ||||||
|  |  | ||||||
|     nearArg = parser.add_argument_group('Archive close to time specified') |     nearArg = parser.add_argument_group('Archive close to time specified') | ||||||
|     nearArg.add_argument("--near", "-N", action='store_true', help="Archive near specified time") |     nearArg.add_argument("--near", "-N", action='store_true', help="Archive near specified time") | ||||||
| @@ -182,6 +200,7 @@ def parse_args(argv): | |||||||
|  |  | ||||||
|     return parser.parse_args(argv[1:]) |     return parser.parse_args(argv[1:]) | ||||||
|  |  | ||||||
|  |  | ||||||
| def main(argv=None): | def main(argv=None): | ||||||
|     if argv is None: |     if argv is None: | ||||||
|         argv = sys.argv |         argv = sys.argv | ||||||
| @@ -189,5 +208,6 @@ def main(argv=None): | |||||||
|     output = args_handler(args) |     output = args_handler(args) | ||||||
|     print(output) |     print(output) | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     sys.exit(main(sys.argv)) |     sys.exit(main(sys.argv)) | ||||||
|   | |||||||
| @@ -60,6 +60,7 @@ def _get_response(req): | |||||||
|             raise exc |             raise exc | ||||||
|     return response |     return response | ||||||
|  |  | ||||||
|  |  | ||||||
| class Url: | class Url: | ||||||
|     """waybackpy Url object""" |     """waybackpy Url object""" | ||||||
|  |  | ||||||
| @@ -67,9 +68,9 @@ class Url: | |||||||
|         self.url = url |         self.url = url | ||||||
|         self.user_agent = user_agent |         self.user_agent = user_agent | ||||||
|         self._url_check()  # checks url validity on init. |         self._url_check()  # checks url validity on init. | ||||||
|         self.JSON = self._JSON() # JSON of most recent archive |         self.JSON = self._JSON()  # JSON of most recent archive | ||||||
|         self.archive_url = self._archive_url() # URL of archive |         self.archive_url = self._archive_url()  # URL of archive | ||||||
|         self.timestamp = self._archive_timestamp() # timestamp for last archive |         self.timestamp = self._archive_timestamp()  # timestamp for last archive | ||||||
|         self._alive_url_list = [] |         self._alive_url_list = [] | ||||||
|  |  | ||||||
|     def __repr__(self): |     def __repr__(self): | ||||||
| @@ -79,11 +80,13 @@ class Url: | |||||||
|         return "%s" % self.archive_url |         return "%s" % self.archive_url | ||||||
|  |  | ||||||
|     def __len__(self): |     def __len__(self): | ||||||
|         td_max = timedelta(days=999999999, |         td_max = timedelta( | ||||||
|                              hours=23, |             days=999999999, | ||||||
|                              minutes=59, |             hours=23, | ||||||
|                              seconds=59, |             minutes=59, | ||||||
|                              microseconds=999999) |             seconds=59, | ||||||
|  |             microseconds=999999 | ||||||
|  |         ) | ||||||
|         if self.timestamp == datetime.max: |         if self.timestamp == datetime.max: | ||||||
|             return td_max.days |             return td_max.days | ||||||
|         else: |         else: | ||||||
| @@ -208,14 +211,10 @@ class Url: | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         self.archive_url = archive_url |         self.archive_url = archive_url | ||||||
|         self.timestamp = datetime.strptime(data["archived_snapshots"] |         self.timestamp = datetime.strptime(data["archived_snapshots"]["closest"]["timestamp"], '%Y%m%d%H%M%S') | ||||||
|                                  ["closest"] |  | ||||||
|                                  ["timestamp"], |  | ||||||
|                                  '%Y%m%d%H%M%S') |  | ||||||
|  |  | ||||||
|         return self |         return self | ||||||
|  |  | ||||||
|  |  | ||||||
|     def oldest(self, year=1994): |     def oldest(self, year=1994): | ||||||
|         """Return the oldest Wayback Machine archive for this URL.""" |         """Return the oldest Wayback Machine archive for this URL.""" | ||||||
|         return self.near(year=year) |         return self.near(year=year) | ||||||
| @@ -244,10 +243,11 @@ class Url: | |||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             response_code = requests.get(url).status_code |             response_code = requests.get(url).status_code | ||||||
|         except Exception as e: |         except Exception: | ||||||
|             return #we don't care if urls are not opening |             return  # we don't care if urls are not opening | ||||||
|  |  | ||||||
|         if response_code >= 400: #200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300 |         # 200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300 | ||||||
|  |         if response_code >= 400: | ||||||
|             return |             return | ||||||
|  |  | ||||||
|         self._alive_url_list.append(url) |         self._alive_url_list.append(url) | ||||||
| @@ -266,14 +266,12 @@ class Url: | |||||||
|  |  | ||||||
|         if subdomain: |         if subdomain: | ||||||
|             request_url = ( |             request_url = ( | ||||||
|             "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey" |                 "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey" % self._clean_url() | ||||||
|             % self._clean_url() |  | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|         else: |         else: | ||||||
|             request_url = ( |             request_url = ( | ||||||
|             "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey" |                 "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey" % self._clean_url() | ||||||
|             % self._clean_url() |  | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|         hdr = {"User-Agent": "%s" % self.user_agent} |         hdr = {"User-Agent": "%s" % self.user_agent} | ||||||
| @@ -283,7 +281,7 @@ class Url: | |||||||
|         data = json.loads(response.read().decode("UTF-8")) |         data = json.loads(response.read().decode("UTF-8")) | ||||||
|         url_list = [y[0] for y in data if y[0] != "original"] |         url_list = [y[0] for y in data if y[0] != "original"] | ||||||
|  |  | ||||||
|         #Remove all deadURLs from url_list if alive=True |         # Remove all deadURLs from url_list if alive=True | ||||||
|         if alive: |         if alive: | ||||||
|             with concurrent.futures.ThreadPoolExecutor() as executor: |             with concurrent.futures.ThreadPoolExecutor() as executor: | ||||||
|                 executor.map(self.pick_live_urls, url_list) |                 executor.map(self.pick_live_urls, url_list) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user