Enable threaded checking of URLs

Akash Mahanty 2020-11-26 06:15:42 +05:30
parent 5088305a58
commit 58cd9c28e7
5 changed files with 57 additions and 55 deletions
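In essence, the commit swaps a sequential urlopen() loop for concurrent.futures.ThreadPoolExecutor.map() plus a requests-based status check. Below is a minimal standalone sketch of that pattern, not the committed code itself; the helper name check_alive and the example URLs are illustrative.

```python
import concurrent.futures
import requests

def check_alive(url, timeout=10):
    """Return the URL if it answers with a non-error status code, else None."""
    try:
        # Status codes below 400 (2xx success, 3xx redirects) count as alive.
        return url if requests.get(url, timeout=timeout).status_code < 400 else None
    except requests.RequestException:
        return None  # unreachable URLs are simply dropped

urls = ["https://example.com", "https://example.com/missing-page"]
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    alive = [u for u in executor.map(check_alive, urls) if u is not None]
print(alive)
```

Collecting results from executor.map, as in the sketch, avoids sharing state between threads; the committed pick_live_urls method instead appends live URLs to self._alive_url_list, which works because list.append is thread-safe under CPython's GIL.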

README.md

@@ -428,10 +428,6 @@ pytest --cov=../waybackpy
 python -m codecov #For reporting coverage on Codecov
 ```
-## Dependency
-None, just pre-installed [python standard libraries](https://docs.python.org/3/library/).
 ## Packaging
 1. Increment version.

index.rst

@@ -482,12 +482,6 @@ To run tests locally:
 pytest --cov=../waybackpy
 python -m codecov #For reporting coverage on Codecov
-Dependency
-----------
-None, just pre-installed `python standard
-libraries <https://docs.python.org/3/library/>`__.
 Packaging
 ---------

requirements.txt (new file)

@@ -0,0 +1 @@
+requests==2.24.0

waybackpy/cli.py

@@ -3,6 +3,8 @@ import sys
 import os
 import re
 import argparse
+import string
+import random
 from waybackpy.wrapper import Url
 from waybackpy.__version__ import __version__
@@ -38,6 +40,22 @@ def _near(obj, args):
         _near_args["minute"] = args.minute
     return (obj.near(**_near_args))

+def _save_urls_on_file(input_list, live_url_count):
+    m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0])  # O(1)
+    if m:
+        domain = m.group(1)
+    else:
+        domain = "domain-unknown"
+    uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6))
+    file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid)
+    file_content = "\n".join(input_list)  # join with \n
+    file_path = os.path.join(os.getcwd(), file_name)
+    with open(file_name, "w+") as f:
+        f.write(file_content)
+    return "%s\n\n'%s' saved in current working directory" % (file_content, file_name)
+
 def _known_urls(obj, args):
     """Abbreviations:
     sd = subdomain
@@ -53,21 +71,7 @@ def _known_urls(obj, args):
     total_urls = len(url_list)
     if total_urls > 0:
-        m = re.search('https?://([A-Za-z_0-9.-]+).*', url_list[0])
-        if m:
-            domain = m.group(1)
-        else:
-            domain = "domain-unknown"
-        dir_path = os.path.abspath(os.getcwd())
-        file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls)
-        text = "\n".join(url_list) + "\n"
-        with open(file_name, "a+") as f:
-            f.write(text)
-        text = text + "%d URLs found and saved in ./%s-%d-urls.txt" % (
-            total_urls, domain, total_urls
-        )
+        text = _save_urls_on_file(url_list, total_urls)
     else:
         text = "No known URLs found. Please try a diffrent domain!"

waybackpy/wrapper.py

@@ -6,6 +6,8 @@ from datetime import datetime, timedelta
 from waybackpy.exceptions import WaybackError
 from waybackpy.__version__ import __version__
 from urllib.request import Request, urlopen
+import requests
+import concurrent.futures
 from urllib.error import URLError
@@ -68,6 +70,7 @@ class Url:
         self.JSON = self._JSON()  # JSON of most recent archive
         self.archive_url = self._archive_url()  # URL of archive
         self.timestamp = self._archive_timestamp()  # timestamp for last archive
+        self._alive_url_list = []

     def __repr__(self):
         return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
@@ -237,6 +240,18 @@ class Url:
         # Most efficient method to count number of archives (yet)
         return str(response.read()).count(",")

+    def pick_live_urls(self, url):
+        try:
+            response_code = requests.get(url).status_code
+        except Exception as e:
+            return  # we don't care if urls are not opening
+
+        if response_code >= 400:  # 200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300
+            return
+
+        self._alive_url_list.append(url)
+
     def known_urls(self, alive=False, subdomain=False):
         """Returns list of URLs known to exist for given domain name
         because these URLs were crawled by WayBack Machine bots.
@@ -270,16 +285,8 @@ class Url:
         #Remove all deadURLs from url_list if alive=True
         if alive:
-            tmp_url_list = []
-            for url in url_list:
-                try:
-                    urlopen(url) # nosec
-                except:
-                    continue
-                tmp_url_list.append(url)
-            url_list = tmp_url_list
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                executor.map(self.pick_live_urls, url_list)
+            url_list = self._alive_url_list

         return url_list
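For reference, a usage sketch of the changed code path; the constructor signature Url(url, user_agent) and known_urls(alive=...) are as shown in the diff above, while the target URL and user agent string below are placeholders.

```python
from waybackpy.wrapper import Url

# alive=True now filters the Wayback-known URLs through the threaded liveness check.
target = Url("https://example.com", "my-user-agent/1.0 (placeholder)")
live_urls = target.known_urls(alive=True)
print(len(live_urls), "archived URLs are currently reachable")
```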