From 58cd9c28e7c5709368142e7b202c6562248e0dab Mon Sep 17 00:00:00 2001 From: Akash Mahanty Date: Thu, 26 Nov 2020 06:15:42 +0530 Subject: [PATCH] Threading enabled checking for URLs --- README.md | 4 ---- index.rst | 44 +++++++++++++++++++------------------------- requirements.txt | 1 + waybackpy/cli.py | 34 +++++++++++++++++++--------------- waybackpy/wrapper.py | 29 ++++++++++++++++++----------- 5 files changed, 57 insertions(+), 55 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index 09c8cb5..21095a6 100644 --- a/README.md +++ b/README.md @@ -428,10 +428,6 @@ pytest --cov=../waybackpy python -m codecov #For reporting coverage on Codecov ``` -## Dependency - -None, just pre-installed [python standard libraries](https://docs.python.org/3/library/). - ## Packaging 1. Increment version. diff --git a/index.rst b/index.rst index 597b4b9..f6322bd 100644 --- a/index.rst +++ b/index.rst @@ -112,7 +112,7 @@ Capturing aka Saving an url using save() https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus Try this out in your browser @ -https://repl.it/@akamhy/WaybackPySaveExample\ +https://repl.it/@akamhy/WaybackPySaveExample\ Retrieving the archive for an URL using archive\_url ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -133,7 +133,7 @@ Retrieving the archive for an URL using archive\_url https://web.archive.org/web/20201016153320/https://www.google.com/ Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyArchiveUrl\ +https://repl.it/@akamhy/WaybackPyArchiveUrl\ Retrieving the oldest archive for an URL using oldest() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -154,7 +154,7 @@ Retrieving the oldest archive for an URL using oldest() http://web.archive.org/web/19981111184551/http://google.com:80/ Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyOldestExample\ +https://repl.it/@akamhy/WaybackPyOldestExample\ Retrieving the newest archive for an 
URL using newest() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -175,7 +175,7 @@ Retrieving the newest archive for an URL using newest() https://web.archive.org/web/20201016150543/https://www.facebook.com/ Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyNewestExample\ +https://repl.it/@akamhy/WaybackPyNewestExample\ Retrieving the JSON reponse for the avaliblity API request ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -195,7 +195,7 @@ Retrieving the JSON reponse for the avaliblity API request {'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'available': True, 'url': 'http://web.archive.org/web/20201016150543/https://www.facebook.com/', 'timestamp': '20201016150543', 'status': '200'}}} -Try this out in your browser @ https://repl.it/@akamhy/WaybackPyJSON\ +Try this out in your browser @ https://repl.it/@akamhy/WaybackPyJSON\ Retrieving archive close to a specified year, month, day, hour, and minute using near() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -251,7 +251,7 @@ The package doesn't support second argument yet. 
You are encourged to create a PR ;) Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyNearExample\ +https://repl.it/@akamhy/WaybackPyNearExample\ Get the content of webpage using get() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -283,7 +283,7 @@ Get the content of webpage using get() print(google_oldest_archive_source) Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyGetExample#main.py\ +https://repl.it/@akamhy/WaybackPyGetExample#main.py\ Count total archives for an URL using total\_archives() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -306,7 +306,7 @@ Count total archives for an URL using total\_archives() 2516 Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyTotalArchivesExample\ +https://repl.it/@akamhy/WaybackPyTotalArchivesExample\ List of URLs that Wayback Machine knows and has archived for a domain name ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -335,7 +335,7 @@ List of URLs that Wayback Machine knows and has archived for a domain name 'https://akamhy.github.io/waybackpy/assets/css/style.css?v=f881705d00bf47b5bf0c58808efe29eecba2226c'] Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyKnownURLsToWayBackMachineExample#main.py\ +https://repl.it/@akamhy/WaybackPyKnownURLsToWayBackMachineExample#main.py\ With the Command-line interface ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -349,7 +349,7 @@ Save https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_media Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyBashSave\ +https://repl.it/@akamhy/WaybackPyBashSave\ Get archive URL ^^^^^^^^^^^^^^^ @@ -360,7 +360,7 @@ Get archive URL https://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyBashArchiveUrl\ +https://repl.it/@akamhy/WaybackPyBashArchiveUrl\ Oldest archive ^^^^^^^^^^^^^^ @@ -371,7 +371,7 @@ Oldest archive 
https://web.archive.org/web/20040803000845/http://en.wikipedia.org:80/wiki/SpaceX Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyBashOldest\ +https://repl.it/@akamhy/WaybackPyBashOldest\ Newest archive ^^^^^^^^^^^^^^ @@ -382,7 +382,7 @@ Newest archive https://web.archive.org/web/20200606044708/https://en.wikipedia.org/wiki/YouTube Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyBashNewest\ +https://repl.it/@akamhy/WaybackPyBashNewest\ Get JSON data of avaialblity API ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -396,7 +396,7 @@ Get JSON data of avaialblity API {'archived_snapshots': {'closest': {'timestamp': '20201007132458', 'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX'}}, 'url': 'https://en.wikipedia.org/wiki/SpaceX'} Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyBashJSON\ +https://repl.it/@akamhy/WaybackPyBashJSON\ Total number of archives ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -407,7 +407,7 @@ Total number of archives 853 Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyBashTotal\ +https://repl.it/@akamhy/WaybackPyBashTotal\ Archive near time ^^^^^^^^^^^^^^^^^ @@ -418,7 +418,7 @@ Archive near time https://web.archive.org/web/20120512142515/https://www.facebook.com/ Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyBashNear\ +https://repl.it/@akamhy/WaybackPyBashNear\ Get the source code ^^^^^^^^^^^^^^^^^^^ @@ -431,7 +431,7 @@ Get the source code waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on wayback machine then print the source code of this archive. 
Try this out in your browser @ -https://repl.it/@akamhy/WaybackPyBashGet\ +https://repl.it/@akamhy/WaybackPyBashGet\ Fetch all the URLs that the Wayback Machine knows for a domain ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -464,7 +464,7 @@ Fetch all the URLs that the Wayback Machine knows for a domain # Prints all known URLs under akamhy.github.io including subdomain which are not dead links and still alive. Try this out in your browser @ -https://repl.it/@akamhy/WaybackpyKnownUrlsFromWaybackMachine#main.sh\ +https://repl.it/@akamhy/WaybackpyKnownUrlsFromWaybackMachine#main.sh\ Tests ----- @@ -482,12 +482,6 @@ To run tests locally: pytest --cov=../waybackpy python -m codecov #For reporting coverage on Codecov -Dependency ----------- - -None, just pre-installed `python standard -libraries `__. - Packaging --------- @@ -528,4 +522,4 @@ for details. :target: https://github.com/akamhy/waybackpy/graphs/commit-activity .. |Repo size| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square .. 
|License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg - :target: https://github.com/akamhy/waybackpy/blob/master/LICENSE \ No newline at end of file + :target: https://github.com/akamhy/waybackpy/blob/master/LICENSE diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..989b995 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +requests==2.24.0 diff --git a/waybackpy/cli.py b/waybackpy/cli.py index bf0f0b7..7f20651 100644 --- a/waybackpy/cli.py +++ b/waybackpy/cli.py @@ -3,6 +3,8 @@ import sys import os import re import argparse +import string +import random from waybackpy.wrapper import Url from waybackpy.__version__ import __version__ @@ -38,6 +40,22 @@ def _near(obj, args): _near_args["minute"] = args.minute return (obj.near(**_near_args)) +def _save_urls_on_file(input_list, live_url_count): + m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0]) # O(1) + if m: + domain = m.group(1) + else: + domain = "domain-unknown" + + uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) + + file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid) + file_content = "\n".join(input_list) #join with \n + file_path = os.path.join(os.getcwd(), file_name) + with open(file_name, "w+") as f: + f.write(file_content) + return "%s\n\n'%s' saved in current working directory" % (file_content, file_name) + def _known_urls(obj, args): """Abbreviations: sd = subdomain @@ -53,21 +71,7 @@ def _known_urls(obj, args): total_urls = len(url_list) if total_urls > 0: - m = re.search('https?://([A-Za-z_0-9.-]+).*', url_list[0]) - if m: - domain = m.group(1) - else: - domain = "domain-unknown" - - dir_path = os.path.abspath(os.getcwd()) - file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls) - text = "\n".join(url_list) + "\n" - with open(file_name, "a+") as f: - f.write(text) - text = text + "%d URLs found and saved in ./%s-%d-urls.txt" % ( - total_urls, domain, total_urls - ) - + text = 
def pick_live_urls(self, url, timeout=10):
    """Append ``url`` to ``self._alive_url_list`` if it answers below HTTP 400.

    Intended as the per-URL worker for the thread pool in ``known_urls``:
    it records live URLs as a side effect and returns nothing.

    Args:
        url: The URL to probe with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single unresponsive host could block a worker
            thread indefinitely.

    Returns:
        None. The URL is appended to ``self._alive_url_list`` when the
        request succeeds with a status code lower than 400.
    """
    try:
        response_code = requests.get(url, timeout=timeout).status_code
    except Exception:
        # Deliberate best effort: the list may contain malformed or
        # unreachable URLs, and any failure simply means "not alive".
        return

    # 2xx is OK and 3xx is a redirect; use 300 instead of 400 here to
    # treat redirects as dead too.
    if response_code >= 400:
        return

    self._alive_url_list.append(url)