Threading enabled checking for URLs

Akash Mahanty 2020-11-26 06:15:42 +05:30
parent 5088305a58
commit 58cd9c28e7
5 changed files with 57 additions and 55 deletions

View File

@@ -428,10 +428,6 @@ pytest --cov=../waybackpy
python -m codecov #For reporting coverage on Codecov
```
## Dependency
None, just pre-installed [python standard libraries](https://docs.python.org/3/library/).
## Packaging
1. Increment version.

View File

@@ -112,7 +112,7 @@ Capturing aka Saving an url using save()
https://web.archive.org/web/20201016171808/https://en.wikipedia.org/wiki/Multivariable_calculus
Try this out in your browser @
https://repl.it/@akamhy/WaybackPySaveExample\
Retrieving the archive for an URL using archive\_url
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -133,7 +133,7 @@ Retrieving the archive for an URL using archive\_url
https://web.archive.org/web/20201016153320/https://www.google.com/
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyArchiveUrl\
Retrieving the oldest archive for an URL using oldest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -154,7 +154,7 @@ Retrieving the oldest archive for an URL using oldest()
http://web.archive.org/web/19981111184551/http://google.com:80/
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyOldestExample\
Retrieving the newest archive for an URL using newest()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -175,7 +175,7 @@ Retrieving the newest archive for an URL using newest()
https://web.archive.org/web/20201016150543/https://www.facebook.com/
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyNewestExample\
Retrieving the JSON response for the availability API request
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -195,7 +195,7 @@ Retrieving the JSON response for the availability API request
{'url': 'https://www.facebook.com/', 'archived_snapshots': {'closest': {'available': True, 'url': 'http://web.archive.org/web/20201016150543/https://www.facebook.com/', 'timestamp': '20201016150543', 'status': '200'}}}
Try this out in your browser @ https://repl.it/@akamhy/WaybackPyJSON\
Retrieving archive close to a specified year, month, day, hour, and minute using near()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -251,7 +251,7 @@ The package doesn't support the second argument yet. You are encouraged to
create a PR ;)
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyNearExample\
Get the content of webpage using get()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -283,7 +283,7 @@ Get the content of webpage using get()
print(google_oldest_archive_source)
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyGetExample#main.py\
Count total archives for an URL using total\_archives()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -306,7 +306,7 @@ Count total archives for an URL using total\_archives()
2516
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyTotalArchivesExample\
List of URLs that Wayback Machine knows and has archived for a domain name
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -335,7 +335,7 @@ List of URLs that Wayback Machine knows and has archived for a domain name
'https://akamhy.github.io/waybackpy/assets/css/style.css?v=f881705d00bf47b5bf0c58808efe29eecba2226c']
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyKnownURLsToWayBackMachineExample#main.py\
With the Command-line interface
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -349,7 +349,7 @@ Save
https://web.archive.org/web/20200719062108/https://en.wikipedia.org/wiki/Social_media
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashSave\
Get archive URL
^^^^^^^^^^^^^^^
@@ -360,7 +360,7 @@ Get archive URL
https://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashArchiveUrl\
Oldest archive
^^^^^^^^^^^^^^
@@ -371,7 +371,7 @@ Oldest archive
https://web.archive.org/web/20040803000845/http://en.wikipedia.org:80/wiki/SpaceX
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashOldest\
Newest archive
^^^^^^^^^^^^^^
@@ -382,7 +382,7 @@ Newest archive
https://web.archive.org/web/20200606044708/https://en.wikipedia.org/wiki/YouTube
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashNewest\
Get JSON data of the availability API
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -396,7 +396,7 @@ Get JSON data of the availability API
{'archived_snapshots': {'closest': {'timestamp': '20201007132458', 'status': '200', 'available': True, 'url': 'http://web.archive.org/web/20201007132458/https://en.wikipedia.org/wiki/SpaceX'}}, 'url': 'https://en.wikipedia.org/wiki/SpaceX'}
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashJSON\
Total number of archives
^^^^^^^^^^^^^^^^^^^^^^^^
@@ -407,7 +407,7 @@ Total number of archives
853
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashTotal\
Archive near time
^^^^^^^^^^^^^^^^^
@@ -418,7 +418,7 @@ Archive near time
https://web.archive.org/web/20120512142515/https://www.facebook.com/
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashNear\
Get the source code
^^^^^^^^^^^^^^^^^^^
@@ -431,7 +431,7 @@ Get the source code
waybackpy --url google.com --user_agent "my-unique-user-agent" --get save # Save a new archive on the Wayback Machine, then print the source code of this archive.
Try this out in your browser @
https://repl.it/@akamhy/WaybackPyBashGet\
Fetch all the URLs that the Wayback Machine knows for a domain
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -464,7 +464,7 @@ Fetch all the URLs that the Wayback Machine knows for a domain
# Prints all known URLs under akamhy.github.io, including subdomains, which are not dead links and are still alive.
Try this out in your browser @
https://repl.it/@akamhy/WaybackpyKnownUrlsFromWaybackMachine#main.sh\
Tests
-----
@@ -482,12 +482,6 @@ To run tests locally:
pytest --cov=../waybackpy
python -m codecov #For reporting coverage on Codecov
Dependency
----------
None, just pre-installed `python standard
libraries <https://docs.python.org/3/library/>`__.
Packaging
---------
@@ -528,4 +522,4 @@ for details.
   :target: https://github.com/akamhy/waybackpy/graphs/commit-activity
.. |Repo size| image:: https://img.shields.io/github/repo-size/akamhy/waybackpy.svg?label=Repo%20size&style=flat-square
.. |License: MIT| image:: https://img.shields.io/badge/License-MIT-yellow.svg
   :target: https://github.com/akamhy/waybackpy/blob/master/LICENSE

requirements.txt Normal file
View File

@@ -0,0 +1 @@
requests==2.24.0

View File

@@ -3,6 +3,8 @@ import sys
import os
import re
import argparse
import string
import random
from waybackpy.wrapper import Url
from waybackpy.__version__ import __version__
@@ -38,6 +40,22 @@ def _near(obj, args):
        _near_args["minute"] = args.minute
    return (obj.near(**_near_args))

def _save_urls_on_file(input_list, live_url_count):
    m = re.search('https?://([A-Za-z_0-9.-]+).*', input_list[0])  # take the domain from the first URL only
    if m:
        domain = m.group(1)
    else:
        domain = "domain-unknown"
    uid = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6))
    file_name = "%s-%d-urls-%s.txt" % (domain, live_url_count, uid)
    file_content = "\n".join(input_list)  # one URL per line
    file_path = os.path.join(os.getcwd(), file_name)
    with open(file_name, "w+") as f:
        f.write(file_content)
    return "%s\n\n'%s' saved in current working directory" % (file_content, file_name)

def _known_urls(obj, args):
    """Abbreviations:
    sd = subdomain
@@ -53,21 +71,7 @@ def _known_urls(obj, args):
     total_urls = len(url_list)
     if total_urls > 0:
-        m = re.search('https?://([A-Za-z_0-9.-]+).*', url_list[0])
-        if m:
-            domain = m.group(1)
-        else:
-            domain = "domain-unknown"
-        dir_path = os.path.abspath(os.getcwd())
-        file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls)
-        text = "\n".join(url_list) + "\n"
-        with open(file_name, "a+") as f:
-            f.write(text)
-        text = text + "%d URLs found and saved in ./%s-%d-urls.txt" % (
-            total_urls, domain, total_urls
-        )
+        text = _save_urls_on_file(url_list, total_urls)
     else:
         text = "No known URLs found. Please try a different domain!"

View File

@@ -6,6 +6,8 @@ from datetime import datetime, timedelta
from waybackpy.exceptions import WaybackError
from waybackpy.__version__ import __version__
from urllib.request import Request, urlopen
import requests
import concurrent.futures
from urllib.error import URLError
@@ -68,6 +70,7 @@ class Url:
        self.JSON = self._JSON()  # JSON of most recent archive
        self.archive_url = self._archive_url()  # URL of archive
        self.timestamp = self._archive_timestamp()  # timestamp for last archive
        self._alive_url_list = []

    def __repr__(self):
        return "waybackpy.Url(url=%s, user_agent=%s)" % (self.url, self.user_agent)
@@ -237,6 +240,18 @@ class Url:
        # Most efficient method to count number of archives (yet)
        return str(response.read()).count(",")

    def pick_live_urls(self, url):
        try:
            response_code = requests.get(url).status_code
        except Exception:
            return  # we don't care if urls are not opening
        if response_code >= 400:  # 200s are OK and 300s are usually redirects; if you don't want redirects replace 400 with 300
            return
        self._alive_url_list.append(url)

    def known_urls(self, alive=False, subdomain=False):
        """Returns list of URLs known to exist for given domain name
        because these URLs were crawled by WayBack Machine bots.
@@ -270,16 +285,8 @@
         # Remove all dead URLs from url_list if alive=True
         if alive:
-            tmp_url_list = []
-            for url in url_list:
-                try:
-                    urlopen(url)  # nosec
-                except:
-                    continue
-                tmp_url_list.append(url)
-            url_list = tmp_url_list
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                executor.map(self.pick_live_urls, url_list)
+            url_list = self._alive_url_list

         return url_list
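Below is a minimal, self-contained sketch (not taken from the repository) of the pattern the new `known_urls(alive=True)` branch relies on: `ThreadPoolExecutor.map` fans the `requests`-based liveness check out across worker threads, and each worker appends URLs that answer with a status code below 400 to a shared list. The example URLs are placeholders.

```python
import concurrent.futures

import requests

alive_urls = []  # plays the role of Url._alive_url_list in wrapper.py


def pick_live_url(url):
    """Keep url only if it responds with a status code below 400."""
    try:
        status = requests.get(url).status_code
    except Exception:
        return  # unreachable URLs are silently skipped
    if status < 400:  # 2xx is fine, 3xx is usually a redirect
        alive_urls.append(url)


candidate_urls = [
    "https://example.com/",         # placeholder: expected to be alive
    "https://example.com/missing",  # placeholder: likely 404, so filtered out
]

# The executor overlaps the network waits, so checking N URLs no longer costs
# N sequential round trips the way the removed urlopen() loop did.
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(pick_live_url, candidate_urls)

print(alive_urls)
```

`executor.map` is used purely for its side effect here; the per-URL return values are discarded, and the shared list carries the surviving URLs, mirroring how `pick_live_urls` populates `self._alive_url_list`.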