refactoring, try to reduce code complexity

Akash Mahanty 2021-01-04 00:14:38 +05:30
parent 62e5217b9e
commit 5dec4927cd
4 changed files with 158 additions and 155 deletions

tests/test_cli.py

@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import sys
 import os
 import pytest
@@ -286,7 +285,7 @@ def test_get():
         alive=False,
         subdomain=False,
         known_urls=False,
-        get="BullShit",
+        get="foobar",
     )
     reply = cli.args_handler(args)
     assert "get the source code of the" in str(reply)

tests/test_wrapper.py

@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import sys
 import pytest
 import random
@@ -12,14 +11,15 @@ import waybackpy.wrapper as waybackpy  # noqa: E402
 user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"
 
-def test_clean_url():
+def test_cleaned_url():
     """No API use"""
     test_url = " https://en.wikipedia.org/wiki/Network security "
     answer = "https://en.wikipedia.org/wiki/Network_security"
     target = waybackpy.Url(test_url, user_agent)
-    test_result = target._clean_url()
+    test_result = target._cleaned_url()
     assert answer == test_result
 
 def test_dunders():
     """No API use"""
     url = "https://en.wikipedia.org/wiki/Network_security"
@@ -28,19 +28,23 @@ def test_dunders():
     assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target)
     assert "en.wikipedia.org" in str(target)
 
 def test_url_check():
     """No API Use"""
     broken_url = "http://wwwgooglecom/"
     with pytest.raises(Exception):
         waybackpy.Url(broken_url, user_agent)
 
 def test_archive_url_parser():
     """No API Use"""
     perfect_header = """
    {'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
    """
-    archive = waybackpy._archive_url_parser(perfect_header)
+    archive = waybackpy._archive_url_parser(
+        perfect_header, "https://www.scribbr.com/citing-sources/et-al/"
+    )
     assert "web.archive.org/web/20210102094009" in archive
 
     # The below header should result in Exception
@@ -49,7 +53,9 @@ def test_archive_url_parser():
     """
     with pytest.raises(Exception):
-        waybackpy._archive_url_parser(no_archive_header)
+        waybackpy._archive_url_parser(
+            no_archive_header, "https://www.scribbr.com/citing-sources/et-al/"
+        )
 
 def test_save():
@@ -173,9 +179,11 @@ def test_get_response():
 def test_total_archives():
-    target = waybackpy.Url(" https://google.com ", user_agent)
-    assert target.total_archives() > 500000
-    target = waybackpy.Url(" https://outlook.com ", user_agent)
-    assert target.total_archives() > 80000
+    user_agent = (
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
+    )
     target = waybackpy.Url(
         " https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent

waybackpy/cli.py

@@ -1,13 +1,12 @@
-# -*- coding: utf-8 -*-
-import sys
 import os
 import re
-import argparse
-import string
+import sys
 import random
+import string
+import argparse
 from waybackpy.wrapper import Url
-from waybackpy.__version__ import __version__
 from waybackpy.exceptions import WaybackError
+from waybackpy.__version__ import __version__
 
 def _save(obj):
@@ -38,7 +37,7 @@ def _json(obj):
     return obj.JSON
 
-def handle_not_archived_error(e, obj):
+def no_archive_handler(e, obj):
     m = re.search(r"archive\sfor\s\'(.*?)\'\stry", str(e))
     if m:
         url = m.group(1)
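
The renamed helper's regex in isolation: it recovers the original URL from a WaybackError message so the CLI can suggest creating a first archive. The sample message below mirrors the error text raised by the wrapper:

import re

# Message shape raised by Url.oldest()/newest()/near() when nothing is archived.
error_text = "Can not find archive for 'https://example.com' try later or use save()"

m = re.search(r"archive\sfor\s\'(.*?)\'\stry", error_text)
if m:
    print(m.group(1))  # -> https://example.com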
@@ -57,14 +56,14 @@ def _oldest(obj):
     try:
         return obj.oldest()
     except Exception as e:
-        return handle_not_archived_error(e, obj)
+        return no_archive_handler(e, obj)
 
 def _newest(obj):
     try:
         return obj.newest()
     except Exception as e:
-        return handle_not_archived_error(e, obj)
+        return no_archive_handler(e, obj)
 
 def _total_archives(obj):
@@ -83,15 +82,15 @@ def _near(obj, args):
     try:
         return obj.near(**_near_args)
     except Exception as e:
-        return handle_not_archived_error(e, obj)
+        return no_archive_handler(e, obj)
 
 def _save_urls_on_file(input_list, live_url_count):
     m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
+    domain = "domain-unknown"
     if m:
         domain = m.group(1)
-    else:
-        domain = "domain-unknown"
 
     uid = "".join(
         random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
@@ -106,44 +105,39 @@ def _save_urls_on_file(input_list, live_url_count):
 def _known_urls(obj, args):
-    """
-    Known urls for a domain.
+    """Abbreviations:
+    sd = subdomain
+    al = alive
     """
 
-    # sd = subdomain
     sd = False
+    al = False
     if args.subdomain:
         sd = True
 
-    # al = alive
-    al = False
     if args.alive:
         al = True
 
     url_list = obj.known_urls(alive=al, subdomain=sd)
     total_urls = len(url_list)
 
     if total_urls > 0:
-        text = _save_urls_on_file(url_list, total_urls)
-    else:
-        text = "No known URLs found. Please try a diffrent domain!"
+        return _save_urls_on_file(url_list, total_urls)
 
-    return text
+    return "No known URLs found. Please try a diffrent domain!"
 
 def _get(obj, args):
     if args.get.lower() == "url":
         return obj.get()
     if args.get.lower() == "archive_url":
         return obj.get(obj.archive_url)
     if args.get.lower() == "oldest":
         return obj.get(obj.oldest())
     if args.get.lower() == "latest" or args.get.lower() == "newest":
         return obj.get(obj.newest())
     if args.get.lower() == "save":
         return obj.get(obj.save())
     return "Use get as \"--get 'source'\", 'source' can be one of the followings: \
 \n1) url - get the source code of the url specified using --url/-u.\
 \n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\
@@ -167,39 +161,48 @@ def args_handler(args):
     obj = Url(args.url, args.user_agent)
 
     if args.save:
-        return _save(obj)
-    if args.archive_url:
-        return _archive_url(obj)
-    if args.json:
-        return _json(obj)
-    if args.oldest:
-        return _oldest(obj)
-    if args.newest:
-        return _newest(obj)
-    if args.known_urls:
-        return _known_urls(obj, args)
-    if args.total:
-        return _total_archives(obj)
-    if args.near:
+        output = _save(obj)
+    elif args.archive_url:
+        output = _archive_url(obj)
+    elif args.json:
+        output = _json(obj)
+    elif args.oldest:
+        output = _oldest(obj)
+    elif args.newest:
+        output = _newest(obj)
+    elif args.known_urls:
+        output = _known_urls(obj, args)
+    elif args.total:
+        output = _total_archives(obj)
+    elif args.near:
         return _near(obj, args)
-    if args.get:
-        return _get(obj, args)
-
-    return (
-        "You only specified the URL. But you also need to specify the operation."
-        "\nSee 'waybackpy --help' for help using this tool."
-    )
+    elif args.get:
+        output = _get(obj, args)
+    else:
+        output = (
+            "You only specified the URL. But you also need to specify the operation."
+            "\nSee 'waybackpy --help' for help using this tool."
+        )
+    return output
 
+def add_requiredArgs(requiredArgs):
+    requiredArgs.add_argument(
+        "--url", "-u", help="URL on which Wayback machine operations would occur"
+    )
+
 def add_userAgentArg(userAgentArg):
     help_text = 'User agent, default user_agent is "waybackpy python package - https://github.com/akamhy/waybackpy"'
     userAgentArg.add_argument("--user_agent", "-ua", help=help_text)
 
 def add_saveArg(saveArg):
     saveArg.add_argument(
         "--save", "-s", action="store_true", help="Save the URL on the Wayback machine"
     )
 
 def add_auArg(auArg):
     auArg.add_argument(
         "--archive_url",
@@ -208,6 +211,7 @@ def add_auArg(auArg):
         help="Get the latest archive URL, alias for --newest",
     )
+
 def add_jsonArg(jsonArg):
     jsonArg.add_argument(
         "--json",
@@ -216,6 +220,7 @@ def add_jsonArg(jsonArg):
         help="JSON data of the availability API request",
     )
+
 def add_oldestArg(oldestArg):
     oldestArg.add_argument(
         "--oldest",
@@ -224,6 +229,7 @@ def add_oldestArg(oldestArg):
         help="Oldest archive for the specified URL",
     )
+
 def add_newestArg(newestArg):
     newestArg.add_argument(
         "--newest",
@@ -232,6 +238,7 @@ def add_newestArg(newestArg):
         help="Newest archive for the specified URL",
     )
+
 def add_totalArg(totalArg):
     totalArg.add_argument(
         "--total",
@@ -240,6 +247,7 @@ def add_totalArg(totalArg):
         help="Total number of archives for the specified URL",
     )
+
 def add_getArg(getArg):
     getArg.add_argument(
         "--get",
@@ -247,6 +255,7 @@ def add_getArg(getArg):
         help="Prints the source code of the supplied url. Use '--get help' for extended usage",
     )
+
 def add_knownUrlArg(knownUrlArg):
     knownUrlArg.add_argument(
         "--known_urls", "-ku", action="store_true", help="URLs known for the domain."
@@ -257,6 +266,12 @@ def add_knownUrlArg(knownUrlArg):
     knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)
 
+
+def add_nearArg(nearArg):
+    nearArg.add_argument(
+        "--near", "-N", action="store_true", help="Archive near specified time"
+    )
+
 def add_nearArgs(nearArgs):
     nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer")
     nearArgs.add_argument("--month", "-M", type=int, help="Month in integer")
@@ -264,64 +279,35 @@ def add_nearArgs(nearArgs):
     nearArgs.add_argument("--hour", "-H", type=int, help="Hour in intege")
     nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
 
 def parse_args(argv):
     parser = argparse.ArgumentParser()
-
-    requiredArgs = parser.add_argument_group("URL argument (required)")
-    requiredArgs.add_argument(
-        "--url", "-u", help="URL on which Wayback machine operations would occur"
-    )
-
-    userAgentArg = parser.add_argument_group("User Agent")
-    add_userAgentArg(userAgentArg)
-
-    saveArg = parser.add_argument_group("Create new archive/save URL")
-    add_saveArg(saveArg)
-
-    auArg = parser.add_argument_group("Get the latest Archive")
-    add_auArg(auArg)
-
-    jsonArg = parser.add_argument_group("Get the JSON data")
-    add_jsonArg(jsonArg)
-
-    oldestArg = parser.add_argument_group("Oldest archive")
-    add_oldestArg(oldestArg)
-
-    newestArg = parser.add_argument_group("Newest archive")
-    add_newestArg(newestArg)
-
-    totalArg = parser.add_argument_group("Total number of archives")
-    add_totalArg(totalArg)
-
-    getArg = parser.add_argument_group("Get source code")
-    add_getArg(getArg)
-
-    knownUrlArg = parser.add_argument_group(
-        "URLs known and archived to Waybcak Machine for the site."
-    )
-    add_knownUrlArg(knownUrlArg)
-
-    nearArg = parser.add_argument_group("Archive close to time specified")
-    nearArg.add_argument(
-        "--near", "-N", action="store_true", help="Archive near specified time"
-    )
-    #The following is adding supplementary args used with near.
-    nearArgs = parser.add_argument_group("Arguments that are used only with --near")
-    add_nearArgs(nearArgs)
+    add_requiredArgs(parser.add_argument_group("URL argument (required)"))
+    add_userAgentArg(parser.add_argument_group("User Agent"))
+    add_saveArg(parser.add_argument_group("Create new archive/save URL"))
+    add_auArg(parser.add_argument_group("Get the latest Archive"))
+    add_jsonArg(parser.add_argument_group("Get the JSON data"))
+    add_oldestArg(parser.add_argument_group("Oldest archive"))
+    add_newestArg(parser.add_argument_group("Newest archive"))
+    add_totalArg(parser.add_argument_group("Total number of archives"))
+    add_getArg(parser.add_argument_group("Get source code"))
+    add_knownUrlArg(
+        parser.add_argument_group(
+            "URLs known and archived to Waybcak Machine for the site."
+        )
+    )
+    add_nearArg(parser.add_argument_group("Archive close to time specified"))
+    add_nearArgs(parser.add_argument_group("Arguments that are used only with --near"))
     parser.add_argument(
         "--version", "-v", action="store_true", help="Waybackpy version"
     )
     return parser.parse_args(argv[1:])
 
 def main(argv=None):
     if argv is None:
         argv = sys.argv
-    args = parse_args(argv)
-    output = args_handler(args)
-    print(output)
+    print(args_handler(parse_args(argv)))
 
 if __name__ == "__main__":
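
With the condensed main(), the CLI can also be driven programmatically by passing a fake argv; index 0 stands in for the program name, exactly as in sys.argv. A sketch (this particular call hits the CDX API, so it needs network access):

from waybackpy.cli import main

# Equivalent to running: waybackpy --url https://example.com --total
main(["waybackpy", "--url", "https://example.com", "--total"])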

waybackpy/wrapper.py

@@ -1,59 +1,62 @@
-# -*- coding: utf-8 -*-
 import re
-from datetime import datetime, timedelta
-from waybackpy.exceptions import WaybackError, URLError
-from waybackpy.__version__ import __version__
 import requests
 import concurrent.futures
+from datetime import datetime, timedelta
+from waybackpy.__version__ import __version__
+from waybackpy.exceptions import WaybackError, URLError
 
-default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
+default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"
 
-def _archive_url_parser(header):
+def _archive_url_parser(header, url):
     """
-    The wayback machine's save API doesn't
-    return JSON response, we are required
-    to read the header of the API response
-    and look for the archive URL.
     This method has some regexen (or regexes)
     that search for archive url in header.
     This method is used when you try to
     save a webpage on wayback machine.
+    The wayback machine's save API doesn't
+    return JSON response, we are required
+    to read the header of the API response
+    and look for the archive URL.
 
     Two cases are possible:
     1) Either we find the archive url in
     the header.
-    2) We didn't find the archive url in
+    2) Or we didn't find the archive url in
     API header.
 
-    If we found the archive we return it.
-    And if we couldn't find it we raise
-    WaybackError with a standard Error message.
+    If we found the archive URL we return it.
+    And if we couldn't find it, we raise
+    WaybackError with an error message.
     """
 
     # Regex1
-    arch = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
-    if arch:
-        return "web.archive.org" + arch.group(1)
+    m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
+    if m:
+        return "web.archive.org" + m.group(1)
 
     # Regex2
-    arch = re.search(
+    m = re.search(
         r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
     )
-    if arch:
-        return arch.group(1)
+    if m:
+        return m.group(1)
 
     # Regex3
-    arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
-    if arch:
-        return arch.group(1)
+    m = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
+    if m:
+        return m.group(1)
 
     raise WaybackError(
         "No archive URL found in the API response. "
-        "This version of waybackpy (%s) is likely out of date or WayBack Machine is malfunctioning. Visit "
-        "https://github.com/akamhy/waybackpy for the latest version "
-        "of waybackpy.\nHeader:\n%s" % (__version__, str(header))
+        "If '%s' can be accessed via your web browser then either "
+        "this version of waybackpy (%s) is out of date or WayBack Machine is malfunctioning. Visit "
+        "'https://github.com/akamhy/waybackpy' for the latest version "
+        "of waybackpy.\nHeader:\n%s" % (url, __version__, str(header))
     )
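
Regex1 of the parser against a minimal header-like string; the real input is a requests response-headers mapping rendered with str(), and the sample below is fabricated for illustration:

import re

header = "Content-Location: /web/20210102094009/https://www.scribbr.com/citing-sources/et-al/"

m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", header)
if m:
    print("web.archive.org" + m.group(1))
# -> web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/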
@@ -79,6 +82,7 @@ def _wayback_timestamp(**kwargs):
     Return format is string.
     """
+
     return "".join(
         str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
     )
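
What the zfill(2) join produces: each component is zero-padded to at least two digits and concatenated in Wayback Machine timestamp order:

kwargs = {"year": 2021, "month": 1, "day": 4, "hour": 0, "minute": 14}

timestamp = "".join(
    str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
)
print(timestamp)  # -> 202101040014 (zfill never truncates, so the year keeps 4 digits)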
@@ -104,26 +108,25 @@ def _get_response(endpoint, params=None, headers=None):
     """
     try:
-        response = requests.get(endpoint, params=params, headers=headers)
+        return requests.get(endpoint, params=params, headers=headers)
     except Exception:
         try:
-            response = requests.get(endpoint, params=params, headers=headers)  # nosec
+            return requests.get(endpoint, params=params, headers=headers)
         except Exception as e:
             exc = WaybackError("Error while retrieving %s" % endpoint)
             exc.__cause__ = e
             raise exc
-    return response
 
 class Url:
     """
-    waybackpy Url object <class 'waybackpy.wrapper.Url'>
+    waybackpy Url object, Type : <class 'waybackpy.wrapper.Url'>
     """
 
-    def __init__(self, url, user_agent=default_UA):
+    def __init__(self, url, user_agent=default_user_agent):
         self.url = url
         self.user_agent = user_agent
-        self._url_check()  # checks url validity on init.
+        self._url_check()
         self._archive_url = None
         self.timestamp = None
         self._JSON = None
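
The retry-once-then-chain pattern from _get_response, restated standalone; attaching the second failure as __cause__ makes the traceback show both the WaybackError and the underlying requests exception:

import requests


class WaybackError(Exception):
    pass


def get_with_one_retry(endpoint, params=None, headers=None):
    try:
        return requests.get(endpoint, params=params, headers=headers)
    except Exception:
        try:
            # One immediate retry before giving up.
            return requests.get(endpoint, params=params, headers=headers)
        except Exception as e:
            exc = WaybackError("Error while retrieving %s" % endpoint)
            exc.__cause__ = e  # shown as "the direct cause of" in the traceback
            raise exc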
@@ -144,6 +147,7 @@ class Url:
         sets self._archive_url, we now set self._archive_url to self.archive_url
         and return it.
         """
+
         if not self._archive_url:
             self._archive_url = self.archive_url
         return "%s" % self._archive_url
@@ -159,8 +163,7 @@ class Url:
         if self.timestamp == datetime.max:
             return td_max.days
 
-        diff = datetime.utcnow() - self.timestamp
-        return diff.days
+        return (datetime.utcnow() - self.timestamp).days
 
     def _url_check(self):
         """
@@ -170,6 +173,7 @@ class Url:
         If you known any others, please create a PR on the github repo.
         """
+
         if "." not in self.url:
             raise URLError("'%s' is not a vaild URL." % self.url)
@@ -184,7 +188,7 @@ class Url:
         endpoint = "https://archive.org/wayback/available"
         headers = {"User-Agent": "%s" % self.user_agent}
-        payload = {"url": "%s" % self._clean_url()}
+        payload = {"url": "%s" % self._cleaned_url()}
         response = _get_response(endpoint, params=payload, headers=headers)
         return response.json()
@@ -236,7 +240,7 @@ class Url:
         self.timestamp = ts
         return ts
 
-    def _clean_url(self):
+    def _cleaned_url(self):
         """
         Remove newlines
         replace " " with "_"
@@ -245,10 +249,10 @@ class Url:
     def save(self):
         """Create a new Wayback Machine archive for this URL."""
-        request_url = "https://web.archive.org/save/" + self._clean_url()
+        request_url = "https://web.archive.org/save/" + self._cleaned_url()
         headers = {"User-Agent": "%s" % self.user_agent}
         response = _get_response(request_url, params=None, headers=headers)
-        self._archive_url = "https://" + _archive_url_parser(response.headers)
+        self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
         self.timestamp = datetime.utcnow()
         return self
@@ -258,7 +262,7 @@ class Url:
         """
         if not url:
-            url = self._clean_url()
+            url = self._cleaned_url()
 
         if not user_agent:
             user_agent = self.user_agent
@@ -307,14 +311,14 @@ class Url:
         endpoint = "https://archive.org/wayback/available"
         headers = {"User-Agent": "%s" % self.user_agent}
-        payload = {"url": "%s" % self._clean_url(), "timestamp": timestamp}
+        payload = {"url": "%s" % self._cleaned_url(), "timestamp": timestamp}
         response = _get_response(endpoint, params=payload, headers=headers)
         data = response.json()
         if not data["archived_snapshots"]:
             raise WaybackError(
                 "Can not find archive for '%s' try later or use wayback.Url(url, user_agent).save() "
-                "to create a new archive." % self._clean_url()
+                "to create a new archive." % self._cleaned_url()
             )
         archive_url = data["archived_snapshots"]["closest"]["url"]
         archive_url = archive_url.replace(
@@ -362,18 +366,24 @@ class Url:
         Return type in integer.
         """
-        endpoint = "https://web.archive.org/cdx/search/cdx"
-        headers = {
-            "User-Agent": "%s" % self.user_agent,
-            "output": "json",
-            "fl": "statuscode",
-        }
-        payload = {"url": "%s" % self._clean_url()}
-        response = _get_response(endpoint, params=payload, headers=headers)
-
-        # Most efficient method to count number of archives (yet)
-        return response.text.count(",")
+        total_pages_url = (
+            "https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true"
+            % self._cleaned_url()
+        )
+        headers = {"User-Agent": "%s" % self.user_agent}
+        total_pages = int(
+            (_get_response(total_pages_url, headers=headers).text).strip()
+        )
+        archive_count = 0
+        for i in range(total_pages):
+            page_url = "https://web.archive.org/cdx/search/cdx?url=%s&page=%s" % (
+                self._cleaned_url(),
+                str(i),
+            )
+            count = str(_get_response(page_url, headers=headers).text).count("\n")
+            archive_count = archive_count + count
+        return archive_count
 
     def live_urls_picker(self, url):
         """
@@ -384,7 +394,7 @@ class Url:
         try:
             response_code = requests.get(url).status_code
         except Exception:
             return  # we don't care if Exception
         # 200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300
         if response_code >= 400:
@@ -406,12 +416,12 @@ class Url:
         if subdomain:
             request_url = (
                 "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
-                % self._clean_url()
+                % self._cleaned_url()
             )
         else:
             request_url = (
                 "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
-                % self._clean_url()
+                % self._cleaned_url()
             )
         headers = {"User-Agent": "%s" % self.user_agent}