Threading enabled checking for URLs
This commit is contained in:
		| @@ -428,10 +428,6 @@ pytest --cov=../waybackpy | ||||
| python -m codecov #For reporting coverage on Codecov | ||||
| ``` | ||||
|  | ||||
| ## Dependency | ||||
|  | ||||
| None, just pre-installed [python standard libraries](https://docs.python.org/3/library/). | ||||
|  | ||||
| ## Packaging | ||||
|  | ||||
| 1. Increment version. | ||||
|   | ||||
| @@ -482,12 +482,6 @@ To run tests locally: | ||||
|     pytest --cov=../waybackpy | ||||
|     python -m codecov #For reporting coverage on Codecov | ||||
|  | ||||
| Dependency | ||||
| ---------- | ||||
|  | ||||
| None, just pre-installed `python standard | ||||
| libraries <https://docs.python.org/3/library/>`__. | ||||
|  | ||||
| Packaging | ||||
| --------- | ||||
|  | ||||
|   | ||||
							
								
								
									
										1
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
| requests==2.24.0 | ||||
| @@ -3,6 +3,8 @@ import sys | ||||
| import os | ||||
| import re | ||||
| import argparse | ||||
| import string | ||||
| import random | ||||
| from waybackpy.wrapper import Url | ||||
| from waybackpy.__version__ import __version__ | ||||
|  | ||||
| @@ -38,6 +40,22 @@ def _near(obj, args): | ||||
|         _near_args["minute"] = args.minute | ||||
|     return (obj.near(**_near_args)) | ||||
|  | ||||
| def _save_urls_on_file(input_list, live_url_count): | ||||
|     m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0]) # O(1) | ||||
|     if m: | ||||
|         domain = m.group(1) | ||||
|     else: | ||||
|         domain = "domain-unknown" | ||||
|  | ||||
|     uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) | ||||
|  | ||||
|     file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid) | ||||
|     file_content = "\n".join(input_list) #join with \n | ||||
|     file_path = os.path.join(os.getcwd(), file_name) | ||||
|     with open(file_name, "w+") as f: | ||||
|         f.write(file_content) | ||||
|     return "%s\n\n'%s' saved in current working directory" % (file_content, file_name) | ||||
|  | ||||
| def _known_urls(obj, args): | ||||
|     """Abbreviations: | ||||
|     sd = subdomain | ||||
| @@ -53,21 +71,7 @@ def _known_urls(obj, args): | ||||
|     total_urls = len(url_list) | ||||
|  | ||||
|     if total_urls > 0: | ||||
|         m = re.search('https?://([A-Za-z_0-9.-]+).*', url_list[0]) | ||||
|         if m: | ||||
|             domain = m.group(1) | ||||
|         else: | ||||
|             domain = "domain-unknown" | ||||
|  | ||||
|         dir_path = os.path.abspath(os.getcwd()) | ||||
|         file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls) | ||||
|         text = "\n".join(url_list) + "\n" | ||||
|         with open(file_name, "a+") as f: | ||||
|             f.write(text) | ||||
|         text =  text + "%d URLs found and saved in ./%s-%d-urls.txt" % ( | ||||
|             total_urls, domain, total_urls | ||||
|             ) | ||||
|  | ||||
|         text = _save_urls_on_file(url_list, total_urls) | ||||
|     else: | ||||
|         text = "No known URLs found. Please try a diffrent domain!" | ||||
|  | ||||
|   | ||||
| @@ -6,6 +6,8 @@ from datetime import datetime, timedelta | ||||
| from waybackpy.exceptions import WaybackError | ||||
| from waybackpy.__version__ import __version__ | ||||
| from urllib.request import Request, urlopen | ||||
| import requests | ||||
| import concurrent.futures | ||||
| from urllib.error import URLError | ||||
|  | ||||
|  | ||||
| @@ -68,6 +70,7 @@ class Url: | ||||
|         self.JSON = self._JSON() # JSON of most recent archive | ||||
|         self.archive_url = self._archive_url() # URL of archive | ||||
|         self.timestamp = self._archive_timestamp() # timestamp for last archive | ||||
|         self._alive_url_list = [] | ||||
|  | ||||
|     def __repr__(self): | ||||
|         return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent) | ||||
| @@ -237,6 +240,18 @@ class Url: | ||||
|         # Most efficient method to count number of archives (yet) | ||||
|         return str(response.read()).count(",") | ||||
|  | ||||
|     def pick_live_urls(self, url): | ||||
|  | ||||
|         try: | ||||
|             response_code = requests.get(url).status_code | ||||
|         except Exception as e: | ||||
|             return #we don't care if urls are not opening | ||||
|  | ||||
|         if response_code >= 400: #200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300 | ||||
|             return | ||||
|  | ||||
|         self._alive_url_list.append(url) | ||||
|  | ||||
|     def known_urls(self, alive=False, subdomain=False): | ||||
|         """Returns list of URLs known to exist for given domain name | ||||
|         because these URLs were crawled by WayBack Machine bots. | ||||
| @@ -270,16 +285,8 @@ class Url: | ||||
|  | ||||
|         #Remove all deadURLs from url_list if alive=True | ||||
|         if alive: | ||||
|             tmp_url_list = [] | ||||
|             for url in url_list: | ||||
|  | ||||
|                 try: | ||||
|                     urlopen(url) # nosec | ||||
|                 except: | ||||
|                     continue | ||||
|  | ||||
|                 tmp_url_list.append(url) | ||||
|  | ||||
|             url_list = tmp_url_list | ||||
|             with concurrent.futures.ThreadPoolExecutor() as executor: | ||||
|                 executor.map(self.pick_live_urls, url_list) | ||||
|             url_list = self._alive_url_list | ||||
|  | ||||
|         return url_list | ||||
|   | ||||
		Reference in New Issue
	
	Block a user