refactoring, try to reduce code complexity

commit 5dec4927cd
parent 62e5217b9e
tests/test_cli.py

@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import sys
 import os
 import pytest
@@ -286,7 +285,7 @@ def test_get():
         alive=False,
         subdomain=False,
         known_urls=False,
-        get="BullShit",
+        get="foobar",
     )
     reply = cli.args_handler(args)
     assert "get the source code of the" in str(reply)
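
A standalone sketch of what the test_get hunk above exercises: an unrecognized --get value falls through to the usage text returned by _get. The argument object below is a stand-in, and any flags args_handler checks before the shown hunk (e.g. version) are assumed to exist and be false.

from types import SimpleNamespace

import waybackpy.cli as cli

args = SimpleNamespace(
    url="https://example.com",
    user_agent="Mozilla/5.0",
    version=False,  # assumed flag, checked before the hunk shown above
    save=False, archive_url=False, json=False, oldest=False, newest=False,
    known_urls=False, total=False, near=False,
    alive=False, subdomain=False,
    get="foobar",  # not one of url/archive_url/oldest/newest/save
)
reply = cli.args_handler(args)
assert "get the source code of the" in str(reply)
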

tests/test_wrapper.py

@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import sys
 import pytest
 import random
@@ -12,14 +11,15 @@ import waybackpy.wrapper as waybackpy # noqa: E402
 user_agent = "Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/20.0"


-def test_clean_url():
+def test_cleaned_url():
     """No API use"""
     test_url = " https://en.wikipedia.org/wiki/Network security "
     answer = "https://en.wikipedia.org/wiki/Network_security"
     target = waybackpy.Url(test_url, user_agent)
-    test_result = target._clean_url()
+    test_result = target._cleaned_url()
     assert answer == test_result


 def test_dunders():
     """No API use"""
     url = "https://en.wikipedia.org/wiki/Network_security"
@@ -28,19 +28,23 @@ def test_dunders():
     assert "waybackpy.Url(url=%s, user_agent=%s)" % (url, user_agent) == repr(target)
     assert "en.wikipedia.org" in str(target)


 def test_url_check():
     """No API Use"""
     broken_url = "http://wwwgooglecom/"
     with pytest.raises(Exception):
         waybackpy.Url(broken_url, user_agent)


 def test_archive_url_parser():
     """No API Use"""
     perfect_header = """
     {'Server': 'nginx/1.15.8', 'Date': 'Sat, 02 Jan 2021 09:40:25 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Archive-Orig-Server': 'nginx', 'X-Archive-Orig-Date': 'Sat, 02 Jan 2021 09:40:09 GMT', 'X-Archive-Orig-Transfer-Encoding': 'chunked', 'X-Archive-Orig-Connection': 'keep-alive', 'X-Archive-Orig-Vary': 'Accept-Encoding', 'X-Archive-Orig-Last-Modified': 'Fri, 01 Jan 2021 12:19:00 GMT', 'X-Archive-Orig-Strict-Transport-Security': 'max-age=31536000, max-age=0;', 'X-Archive-Guessed-Content-Type': 'text/html', 'X-Archive-Guessed-Charset': 'utf-8', 'Memento-Datetime': 'Sat, 02 Jan 2021 09:40:09 GMT', 'Link': '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", <https://web.archive.org/web/timemap/link/https://www.scribbr.com/citing-sources/et-al/>; rel="timemap"; type="application/link-format", <https://web.archive.org/web/https://www.scribbr.com/citing-sources/et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/20201126185327/https://www.scribbr.com/citing-sources/et-al/>; rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT", <https://web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/>; rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"', 'Content-Security-Policy': "default-src 'self' 'unsafe-eval' 'unsafe-inline' data: blob: archive.org web.archive.org analytics.archive.org pragma.archivelab.org", 'X-Archive-Src': 'spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz', 'Server-Timing': 'captures_list;dur=112.646325, exclusion.robots;dur=0.172010, exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, esindex;dur=0.014647, LoadShardBlock;dur=82.205012, PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, load_resource;dur=26.520179', 'X-App-Server': 'wwwb-app200', 'X-ts': '200', 'X-location': 'All', 'X-Cache-Key': 'httpsweb.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/IN', 'X-RL': '0', 'X-Page-Cache': 'MISS', 'X-Archive-Screenname': '0', 'Content-Encoding': 'gzip'}
     """
-    archive = waybackpy._archive_url_parser(perfect_header)
+    archive = waybackpy._archive_url_parser(
+        perfect_header, "https://www.scribbr.com/citing-sources/et-al/"
+    )
     assert "web.archive.org/web/20210102094009" in archive

     # The below header should result in Exception
@@ -49,7 +53,9 @@ def test_archive_url_parser():
     """
     with pytest.raises(Exception):
-        waybackpy._archive_url_parser(no_archive_header)
+        waybackpy._archive_url_parser(
+            no_archive_header, "https://www.scribbr.com/citing-sources/et-al/"
+        )


 def test_save():
@@ -173,9 +179,11 @@ def test_get_response():


 def test_total_archives():
-    target = waybackpy.Url(" https://google.com ", user_agent)
-    assert target.total_archives() > 500000
+    user_agent = (
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
+    )
+    target = waybackpy.Url(" https://outlook.com ", user_agent)
+    assert target.total_archives() > 80000

     target = waybackpy.Url(
         " https://gaha.e4i3n.m5iai3kip6ied.cima/gahh2718gs/ahkst63t7gad8 ", user_agent
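
The test rename follows the wrapper-side rename of _clean_url to _cleaned_url. Going by that method's docstring later in this commit ("Remove newlines / replace " " with "_""), the transformation is roughly the standalone sketch below; cleaned_url here is a hypothetical free function, not waybackpy API.

def cleaned_url(url):
    # Trim surrounding whitespace, drop newlines, and turn inner spaces
    # into underscores -- enough to satisfy the assertion in the test.
    return str(url).strip().replace("\n", "").replace(" ", "_")

assert (
    cleaned_url(" https://en.wikipedia.org/wiki/Network security ")
    == "https://en.wikipedia.org/wiki/Network_security"
)
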

waybackpy/cli.py (164 changed lines)

@@ -1,13 +1,12 @@
-# -*- coding: utf-8 -*-
-import sys
 import os
 import re
-import argparse
-import string
+import sys
 import random
+import string
+import argparse
 from waybackpy.wrapper import Url
-from waybackpy.__version__ import __version__
 from waybackpy.exceptions import WaybackError
+from waybackpy.__version__ import __version__


 def _save(obj):
@@ -38,7 +37,7 @@ def _json(obj):
     return obj.JSON


-def handle_not_archived_error(e, obj):
+def no_archive_handler(e, obj):
     m = re.search(r"archive\sfor\s\'(.*?)\'\stry", str(e))
     if m:
         url = m.group(1)
@@ -57,14 +56,14 @@ def _oldest(obj):
     try:
         return obj.oldest()
     except Exception as e:
-        return handle_not_archived_error(e, obj)
+        return no_archive_handler(e, obj)


 def _newest(obj):
     try:
         return obj.newest()
     except Exception as e:
-        return handle_not_archived_error(e, obj)
+        return no_archive_handler(e, obj)


 def _total_archives(obj):
@@ -83,15 +82,15 @@ def _near(obj, args):
     try:
         return obj.near(**_near_args)
     except Exception as e:
-        return handle_not_archived_error(e, obj)
+        return no_archive_handler(e, obj)


 def _save_urls_on_file(input_list, live_url_count):
     m = re.search("https?://([A-Za-z_0-9.-]+).*", input_list[0])
+    domain = "domain-unknown"
     if m:
         domain = m.group(1)
-    else:
-        domain = "domain-unknown"

     uid = "".join(
         random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
@@ -106,44 +105,39 @@ def _save_urls_on_file(input_list, live_url_count):


 def _known_urls(obj, args):
-    """Abbreviations:
-    sd = subdomain
-    al = alive
     """
+    Known urls for a domain.
+    """
+    # sd = subdomain
     sd = False
-    al = False
     if args.subdomain:
         sd = True

+    # al = alive
+    al = False
     if args.alive:
         al = True

     url_list = obj.known_urls(alive=al, subdomain=sd)
     total_urls = len(url_list)

     if total_urls > 0:
-        text = _save_urls_on_file(url_list, total_urls)
-    else:
-        text = "No known URLs found. Please try a diffrent domain!"
+        return _save_urls_on_file(url_list, total_urls)

-    return text
+    return "No known URLs found. Please try a diffrent domain!"


 def _get(obj, args):
     if args.get.lower() == "url":
         return obj.get()

     if args.get.lower() == "archive_url":
         return obj.get(obj.archive_url)

     if args.get.lower() == "oldest":
         return obj.get(obj.oldest())

     if args.get.lower() == "latest" or args.get.lower() == "newest":
         return obj.get(obj.newest())

     if args.get.lower() == "save":
         return obj.get(obj.save())

     return "Use get as \"--get 'source'\", 'source' can be one of the followings: \
 \n1) url - get the source code of the url specified using --url/-u.\
 \n2) archive_url - get the source code of the newest archive for the supplied url, alias of newest.\
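
The _save_urls_on_file hunk above hoists the domain-unknown default ahead of the regex match. A sketch of the file naming it drives; only the domain regex and the uid generator are verbatim from the diff, and the "<domain>-urls-<uid>.txt" format is an assumption.

import re
import random
import string


def output_filename(first_url):
    # Take the host part of the first URL, or fall back to a default.
    m = re.search("https?://([A-Za-z_0-9.-]+).*", first_url)
    domain = "domain-unknown"
    if m:
        domain = m.group(1)
    # Six random lowercase/digit characters keep repeated runs from
    # overwriting each other's output files.
    uid = "".join(
        random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )
    return "%s-urls-%s.txt" % (domain, uid)


print(output_filename("https://www.scribbr.com/citing-sources/et-al/"))
# e.g. www.scribbr.com-urls-3f9a1c.txt
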
@@ -167,39 +161,48 @@ def args_handler(args):
     obj = Url(args.url, args.user_agent)

     if args.save:
-        return _save(obj)
-    if args.archive_url:
-        return _archive_url(obj)
-    if args.json:
-        return _json(obj)
-    if args.oldest:
-        return _oldest(obj)
-    if args.newest:
-        return _newest(obj)
-    if args.known_urls:
-        return _known_urls(obj, args)
-    if args.total:
-        return _total_archives(obj)
-    if args.near:
+        output = _save(obj)
+    elif args.archive_url:
+        output = _archive_url(obj)
+    elif args.json:
+        output = _json(obj)
+    elif args.oldest:
+        output = _oldest(obj)
+    elif args.newest:
+        output = _newest(obj)
+    elif args.known_urls:
+        output = _known_urls(obj, args)
+    elif args.total:
+        output = _total_archives(obj)
+    elif args.near:
         return _near(obj, args)
-    if args.get:
-        return _get(obj, args)
-    return (
-        "You only specified the URL. But you also need to specify the operation."
-        "\nSee 'waybackpy --help' for help using this tool."
-    )
+    elif args.get:
+        output = _get(obj, args)
+    else:
+        output = (
+            "You only specified the URL. But you also need to specify the operation."
+            "\nSee 'waybackpy --help' for help using this tool."
+        )
+    return output
+
+
+def add_requiredArgs(requiredArgs):
+    requiredArgs.add_argument(
+        "--url", "-u", help="URL on which Wayback machine operations would occur"
+    )


 def add_userAgentArg(userAgentArg):
     help_text = 'User agent, default user_agent is "waybackpy python package - https://github.com/akamhy/waybackpy"'
     userAgentArg.add_argument("--user_agent", "-ua", help=help_text)


 def add_saveArg(saveArg):
     saveArg.add_argument(
         "--save", "-s", action="store_true", help="Save the URL on the Wayback machine"
     )


 def add_auArg(auArg):
     auArg.add_argument(
         "--archive_url",
@@ -208,6 +211,7 @@ def add_auArg(auArg):
         help="Get the latest archive URL, alias for --newest",
     )

+
 def add_jsonArg(jsonArg):
     jsonArg.add_argument(
         "--json",
@@ -216,6 +220,7 @@ def add_jsonArg(jsonArg):
         help="JSON data of the availability API request",
     )

+
 def add_oldestArg(oldestArg):
     oldestArg.add_argument(
         "--oldest",
@@ -224,6 +229,7 @@ def add_oldestArg(oldestArg):
         help="Oldest archive for the specified URL",
     )

+
 def add_newestArg(newestArg):
     newestArg.add_argument(
         "--newest",
@@ -232,6 +238,7 @@ def add_newestArg(newestArg):
         help="Newest archive for the specified URL",
     )

+
 def add_totalArg(totalArg):
     totalArg.add_argument(
         "--total",
@@ -240,6 +247,7 @@ def add_totalArg(totalArg):
         help="Total number of archives for the specified URL",
     )

+
 def add_getArg(getArg):
     getArg.add_argument(
         "--get",
@@ -247,6 +255,7 @@ def add_getArg(getArg):
         help="Prints the source code of the supplied url. Use '--get help' for extended usage",
     )

+
 def add_knownUrlArg(knownUrlArg):
     knownUrlArg.add_argument(
         "--known_urls", "-ku", action="store_true", help="URLs known for the domain."
@@ -257,6 +266,12 @@ def add_knownUrlArg(knownUrlArg):
     knownUrlArg.add_argument("--alive", "-a", action="store_true", help=help_text)


+def add_nearArg(nearArg):
+    nearArg.add_argument(
+        "--near", "-N", action="store_true", help="Archive near specified time"
+    )
+
+
 def add_nearArgs(nearArgs):
     nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer")
     nearArgs.add_argument("--month", "-M", type=int, help="Month in integer")
@@ -264,64 +279,35 @@ def add_nearArgs(nearArgs):
     nearArgs.add_argument("--hour", "-H", type=int, help="Hour in intege")
     nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")


 def parse_args(argv):
     parser = argparse.ArgumentParser()
-
-    requiredArgs = parser.add_argument_group("URL argument (required)")
-    requiredArgs.add_argument(
-        "--url", "-u", help="URL on which Wayback machine operations would occur"
-    )
-
-    userAgentArg = parser.add_argument_group("User Agent")
-    add_userAgentArg(userAgentArg)
-
-    saveArg = parser.add_argument_group("Create new archive/save URL")
-    add_saveArg(saveArg)
-
-    auArg = parser.add_argument_group("Get the latest Archive")
-    add_auArg(auArg)
-
-    jsonArg = parser.add_argument_group("Get the JSON data")
-    add_jsonArg(jsonArg)
-
-    oldestArg = parser.add_argument_group("Oldest archive")
-    add_oldestArg(oldestArg)
-
-    newestArg = parser.add_argument_group("Newest archive")
-    add_newestArg(newestArg)
-
-    totalArg = parser.add_argument_group("Total number of archives")
-    add_totalArg(totalArg)
-
-    getArg = parser.add_argument_group("Get source code")
-    add_getArg(getArg)
-
-    knownUrlArg = parser.add_argument_group(
-        "URLs known and archived to Waybcak Machine for the site."
-    )
-    add_knownUrlArg(knownUrlArg)
-
-    nearArg = parser.add_argument_group("Archive close to time specified")
-    nearArg.add_argument(
-        "--near", "-N", action="store_true", help="Archive near specified time"
-    )
-    #The following is adding supplementary args used with near.
-    nearArgs = parser.add_argument_group("Arguments that are used only with --near")
-    add_nearArgs(nearArgs)
-
+    add_requiredArgs(parser.add_argument_group("URL argument (required)"))
+    add_userAgentArg(parser.add_argument_group("User Agent"))
+    add_saveArg(parser.add_argument_group("Create new archive/save URL"))
+    add_auArg(parser.add_argument_group("Get the latest Archive"))
+    add_jsonArg(parser.add_argument_group("Get the JSON data"))
+    add_oldestArg(parser.add_argument_group("Oldest archive"))
+    add_newestArg(parser.add_argument_group("Newest archive"))
+    add_totalArg(parser.add_argument_group("Total number of archives"))
+    add_getArg(parser.add_argument_group("Get source code"))
+    add_knownUrlArg(
+        parser.add_argument_group(
+            "URLs known and archived to Waybcak Machine for the site."
+        )
+    )
+    add_nearArg(parser.add_argument_group("Archive close to time specified"))
+    add_nearArgs(parser.add_argument_group("Arguments that are used only with --near"))

     parser.add_argument(
         "--version", "-v", action="store_true", help="Waybackpy version"
     )
     return parser.parse_args(argv[1:])


 def main(argv=None):
     if argv is None:
         argv = sys.argv
-    args = parse_args(argv)
-    output = args_handler(args)
-    print(output)
+    print(args_handler(parse_args(argv)))


 if __name__ == "__main__":
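
parse_args now threads each parser.add_argument_group(...) directly into its add_*Arg helper, and main pipes parse_args into args_handler and prints the one return value. A minimal runnable sketch of the grouping pattern, reusing add_saveArg verbatim from the diff; the rest is illustrative.

import argparse


def add_saveArg(saveArg):
    # Copied from the diff above: the helper attaches its flag to whatever
    # argument group it is handed.
    saveArg.add_argument(
        "--save", "-s", action="store_true", help="Save the URL on the Wayback machine"
    )


def parse_args(argv):
    parser = argparse.ArgumentParser()
    # The group is created and passed to the helper in a single expression,
    # the pattern the refactored parse_args applies to every option group.
    add_saveArg(parser.add_argument_group("Create new archive/save URL"))
    return parser.parse_args(argv[1:])


print(parse_args(["waybackpy", "--save"]).save)  # True
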
waybackpy/wrapper.py

@@ -1,59 +1,62 @@
-# -*- coding: utf-8 -*-
 import re
-from datetime import datetime, timedelta
-from waybackpy.exceptions import WaybackError, URLError
-from waybackpy.__version__ import __version__
 import requests
 import concurrent.futures
+from datetime import datetime, timedelta
+from waybackpy.__version__ import __version__
+from waybackpy.exceptions import WaybackError, URLError


-default_UA = "waybackpy python package - https://github.com/akamhy/waybackpy"
+default_user_agent = "waybackpy python package - https://github.com/akamhy/waybackpy"


-def _archive_url_parser(header):
+def _archive_url_parser(header, url):
     """
+    The wayback machine's save API doesn't
+    return JSON response, we are required
+    to read the header of the API response
+    and look for the archive URL.
+
     This method has some regexen (or regexes)
     that search for archive url in header.

     This method is used when you try to
     save a webpage on wayback machine.

-    The wayback machine's save API doesn't
-    return JSON response, we are required
-    to read the header of the API response
-    and look for the archive URL.
-
     Two cases are possible:
     1) Either we find the archive url in
     the header.

-    2) We didn't find the archive url in
+    2) Or we didn't find the archive url in
     API header.

-    If we found the archive we return it.
+    If we found the archive URL we return it.

-    And if we couldn't find it we raise
-    WaybackError with a standard Error message.
+    And if we couldn't find it, we raise
+    WaybackError with an error message.
     """

     # Regex1
-    arch = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
-    if arch:
-        return "web.archive.org" + arch.group(1)
+    m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
+    if m:
+        return "web.archive.org" + m.group(1)

     # Regex2
-    arch = re.search(
+    m = re.search(
         r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>", str(header)
     )
-    if arch:
-        return arch.group(1)
+    if m:
+        return m.group(1)

     # Regex3
-    arch = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
-    if arch:
-        return arch.group(1)
+    m = re.search(r"X-Cache-Key:\shttps(.*)[A-Z]{2}", str(header))
+    if m:
+        return m.group(1)

     raise WaybackError(
         "No archive URL found in the API response. "
-        "This version of waybackpy (%s) is likely out of date or WayBack Machine is malfunctioning. Visit "
-        "https://github.com/akamhy/waybackpy for the latest version "
-        "of waybackpy.\nHeader:\n%s" % (__version__, str(header))
+        "If '%s' can be accessed via your web browser then either "
+        "this version of waybackpy (%s) is out of date or WayBack Machine is malfunctioning. Visit "
+        "'https://github.com/akamhy/waybackpy' for the latest version "
+        "of waybackpy.\nHeader:\n%s" % (url, __version__, str(header))
     )
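
_archive_url_parser now also receives the requested url, so the WaybackError message can point at it. Its first regex pulls the archive path out of the save API's Content-Location response header; Regex2 and Regex3 are fallbacks over the memento Link and X-Cache-Key headers. A self-contained run of Regex1 against a made-up header string (illustrative, not real API output):

import re

header = "Content-Location: /web/20210102094009/https://www.scribbr.com/citing-sources/et-al/"

# Regex1 from the diff: grab the /web/<14-digit timestamp>/<url> path and
# prefix the archive host.
m = re.search(r"Content-Location: (/web/[0-9]{14}/.*)", str(header))
if m:
    print("web.archive.org" + m.group(1))
    # web.archive.org/web/20210102094009/https://www.scribbr.com/citing-sources/et-al/
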
@@ -79,6 +82,7 @@ def _wayback_timestamp(**kwargs):

     Return format is string.
     """
+
     return "".join(
         str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
     )
@@ -104,26 +108,25 @@ def _get_response(endpoint, params=None, headers=None):
     """

     try:
-        response = requests.get(endpoint, params=params, headers=headers)
+        return requests.get(endpoint, params=params, headers=headers)
     except Exception:
         try:
-            response = requests.get(endpoint, params=params, headers=headers) # nosec
+            return requests.get(endpoint, params=params, headers=headers)
         except Exception as e:
             exc = WaybackError("Error while retrieving %s" % endpoint)
             exc.__cause__ = e
             raise exc
-    return response


 class Url:
     """
-    waybackpy Url object <class 'waybackpy.wrapper.Url'>
+    waybackpy Url object, Type : <class 'waybackpy.wrapper.Url'>
     """

-    def __init__(self, url, user_agent=default_UA):
+    def __init__(self, url, user_agent=default_user_agent):
         self.url = url
         self.user_agent = user_agent
-        self._url_check()  # checks url validity on init.
+        self._url_check()
         self._archive_url = None
         self.timestamp = None
         self._JSON = None
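
_get_response now returns straight from each requests.get call, dropping the response variable and the trailing return. Its retry-once shape, reduced to a standalone sketch with a plain RuntimeError standing in for WaybackError:

import requests


def get_with_one_retry(endpoint, params=None, headers=None):
    try:
        return requests.get(endpoint, params=params, headers=headers)
    except Exception:
        try:
            # One immediate retry before giving up.
            return requests.get(endpoint, params=params, headers=headers)
        except Exception as e:
            exc = RuntimeError("Error while retrieving %s" % endpoint)
            exc.__cause__ = e  # keep the original failure in the traceback
            raise exc
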
@@ -144,6 +147,7 @@ class Url:
         sets self._archive_url, we now set self._archive_url to self.archive_url
         and return it.
         """
+
         if not self._archive_url:
             self._archive_url = self.archive_url
         return "%s" % self._archive_url
@@ -159,8 +163,7 @@ class Url:
         if self.timestamp == datetime.max:
             return td_max.days

-        diff = datetime.utcnow() - self.timestamp
-        return diff.days
+        return (datetime.utcnow() - self.timestamp).days

     def _url_check(self):
         """
@@ -170,6 +173,7 @@ class Url:

         If you known any others, please create a PR on the github repo.
         """
+
         if "." not in self.url:
             raise URLError("'%s' is not a vaild URL." % self.url)

@@ -184,7 +188,7 @@ class Url:

         endpoint = "https://archive.org/wayback/available"
         headers = {"User-Agent": "%s" % self.user_agent}
-        payload = {"url": "%s" % self._clean_url()}
+        payload = {"url": "%s" % self._cleaned_url()}
         response = _get_response(endpoint, params=payload, headers=headers)
         return response.json()

@@ -236,7 +240,7 @@ class Url:
         self.timestamp = ts
         return ts

-    def _clean_url(self):
+    def _cleaned_url(self):
         """
         Remove newlines
         replace " " with "_"
@@ -245,10 +249,10 @@ class Url:

     def save(self):
         """Create a new Wayback Machine archive for this URL."""
-        request_url = "https://web.archive.org/save/" + self._clean_url()
+        request_url = "https://web.archive.org/save/" + self._cleaned_url()
         headers = {"User-Agent": "%s" % self.user_agent}
         response = _get_response(request_url, params=None, headers=headers)
-        self._archive_url = "https://" + _archive_url_parser(response.headers)
+        self._archive_url = "https://" + _archive_url_parser(response.headers, self.url)
         self.timestamp = datetime.utcnow()
         return self

@@ -258,7 +262,7 @@ class Url:
         """

         if not url:
-            url = self._clean_url()
+            url = self._cleaned_url()

         if not user_agent:
             user_agent = self.user_agent
@@ -307,14 +311,14 @@ class Url:

         endpoint = "https://archive.org/wayback/available"
         headers = {"User-Agent": "%s" % self.user_agent}
-        payload = {"url": "%s" % self._clean_url(), "timestamp": timestamp}
+        payload = {"url": "%s" % self._cleaned_url(), "timestamp": timestamp}
         response = _get_response(endpoint, params=payload, headers=headers)
         data = response.json()

         if not data["archived_snapshots"]:
             raise WaybackError(
                 "Can not find archive for '%s' try later or use wayback.Url(url, user_agent).save() "
-                "to create a new archive." % self._clean_url()
+                "to create a new archive." % self._cleaned_url()
             )
         archive_url = data["archived_snapshots"]["closest"]["url"]
         archive_url = archive_url.replace(
@@ -362,18 +366,24 @@ class Url:

         Return type in integer.
         """
-
-        endpoint = "https://web.archive.org/cdx/search/cdx"
-        headers = {
-            "User-Agent": "%s" % self.user_agent,
-            "output": "json",
-            "fl": "statuscode",
-        }
-        payload = {"url": "%s" % self._clean_url()}
-        response = _get_response(endpoint, params=payload, headers=headers)
-
-        # Most efficient method to count number of archives (yet)
-        return response.text.count(",")
+        total_pages_url = (
+            "https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true"
+            % self._cleaned_url()
+        )
+        headers = {"User-Agent": "%s" % self.user_agent}
+        total_pages = int(
+            (_get_response(total_pages_url, headers=headers).text).strip()
+        )
+
+        archive_count = 0
+        for i in range(total_pages):
+            page_url = "https://web.archive.org/cdx/search/cdx?url=%s&page=%s" % (
+                self._cleaned_url(),
+                str(i),
+            )
+            count = str(_get_response(page_url, headers=headers).text).count("\n")
+            archive_count = archive_count + count
+        return archive_count

     def live_urls_picker(self, url):
         """
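
total_archives switches from counting commas in one CDX response to paging: ask the CDX API for the page count with showNumPages=true, then sum the result lines of each page. A standalone sketch mirroring the new method, with requests.get standing in for waybackpy's _get_response (hits the live API when run):

import requests


def total_archives(url, user_agent="Mozilla/5.0"):
    headers = {"User-Agent": user_agent}
    # showNumPages=true makes the CDX endpoint return just the page count.
    total_pages = int(
        requests.get(
            "https://web.archive.org/cdx/search/cdx?url=%s&showNumPages=true" % url,
            headers=headers,
        ).text.strip()
    )
    # Each page lists captures as plain-text lines, so summing newline
    # counts approximates the total number of archives.
    archive_count = 0
    for i in range(total_pages):
        page_url = "https://web.archive.org/cdx/search/cdx?url=%s&page=%s" % (url, i)
        archive_count += requests.get(page_url, headers=headers).text.count("\n")
    return archive_count
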
@@ -384,7 +394,7 @@
         try:
             response_code = requests.get(url).status_code
         except Exception:
             return  # we don't care if Exception

         # 200s are OK and 300s are usually redirects, if you don't want redirects replace 400 with 300
         if response_code >= 400:
@@ -406,12 +416,12 @@
         if subdomain:
             request_url = (
                 "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
-                % self._clean_url()
+                % self._cleaned_url()
             )
         else:
             request_url = (
                 "http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
-                % self._clean_url()
+                % self._cleaned_url()
             )

         headers = {"User-Agent": "%s" % self.user_agent}