YouTube_captions_search_engine/website/search.py

#!/usr/bin/python3

import sys, time, fcntl, os, zipfile, webvtt
from io import StringIO

# Directory whose `.zip` archives are searched.
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

# Command line: <clientId> <message>.
clientId, message = sys.argv[1], sys.argv[2]

# A message starting with 'search-only-captions ' restricts the search to `.vtt` entries.
searchOnlyCaptions = message.startswith('search-only-captions ')
# Drop everything up to and including the first space; a message without any space is kept whole.
message = message.split(' ', 1)[-1]

# Per-client file that `write` appends the progress and the results to.
clientFilePath = 'users/' + clientId + '.txt'

def write(s):
    """Append `s` to the client's output file while holding an exclusive lock.

    When the file already has content (earlier output that has not been
    consumed yet), a newline is written first as a separator. An empty file
    means that `websocket.php` read it; either way we do not wait for it and
    simply append what we want to output.

    The process exits through `sys.exit` if locking, reading or writing fails.
    The lock and the file handle are released on every path.
    """
    with open(clientFilePath, 'r+') as f:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            try:
                # Reading the whole file moves the in-file cursor to the end, so the writes below append.
                if f.read() != '':
                    f.write("\n")
                f.write(s)
                f.flush()
            finally:
                # Release the lock even if reading or writing failed.
                fcntl.flock(f, fcntl.LOCK_UN)
        except Exception as e:
            sys.exit(e)

# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
# Every `.zip` found directly in `path` is searched and each matching entry is reported once as `<zip name>/<entry name>`.
files = [file for file in os.listdir(path) if file.endswith('.zip')]
# Zip entries are read as bytes, so the query is encoded once and compared bytes against bytes.
# (`str(line)` on a bytes line would yield its `b'...'` representation, in which non-ASCII text is escaped and never matches.)
# `surrogateescape` keeps a query that is not valid UTF-8 on the command line from raising here.
messageBytes = message.encode('utf-8', 'surrogateescape')
for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex + 1} / {len(files)}')
    # The context manager closes the archive as soon as it has been searched.
    with zipfile.ZipFile(path + file) as archive:
        for fileInZip in archive.namelist():
            endsWithVtt = fileInZip.endswith('.vtt')
            if searchOnlyCaptions and not endsWithVtt:
                continue
            toWrite = f'{file}/{fileInZip}'
            with archive.open(fileInZip) as f:
                if endsWithVtt:
                    # Captions are matched against the concatenated cue texts, so that a query can span several cues.
                    content = StringIO(f.read().decode('utf-8'))
                    wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)])
                    if message in wholeCaption:
                        write(toWrite)
                # Other entries are streamed line by line and the scan stops at the first matching line.
                elif any(messageBytes in line for line in f):
                    write(toWrite)

# Wait until the output file has been emptied by its reader (`websocket.php`, according to the comment in `write`), then remove it.
# NOTE(review): this waits for as long as the file keeps some content; confirm that the reader always ends up emptying it.
with open(clientFilePath) as f:
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            # Rewind before every check: after a first `read()` the cursor stays at the end of the file,
            # so without this any later `read()` returns '' even though the file still has content.
            f.seek(0)
            if f.read() == '':
                # Leaving the `with` block closes the file, which also releases the lock.
                os.remove(clientFilePath)
                break
            fcntl.flock(f, fcntl.LOCK_UN)
            time.sleep(1)
        except Exception as e:
            sys.exit(e)