# YouTube_captions_search_engine/website/search.py

#!/usr/bin/python3

import sys, time, fcntl, os, zipfile, webvtt, re
from io import StringIO

# Directory containing the channel `.zip` archives that get searched.
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

# First argument: the client identifier, which names the file results are written to.
clientId = sys.argv[1]
clientFilePath = f'users/{clientId}.txt'

# Second argument: `<search mode> <path regex> <message>`, separated by single spaces.
# Everything after the second space belongs to the message, which may itself contain spaces.
pathSearchMessageParts = sys.argv[2].split(' ')
searchOnlyCaptions = 'search-only-captions' == pathSearchMessageParts[0]
pathSearch = pathSearchMessageParts[1]
message = ' '.join(pathSearchMessageParts[2:])

# Both the path filter and the message are compiled as regular expressions.
pathSearchRegex = re.compile(pathSearch)
messageRegex = re.compile(message)

# A path search made of exactly 24 characters among letters, digits, `-` and `_` is treated as a channel id.
isPathSearchAChannelId = re.compile(r'[a-zA-Z0-9-_]{24}').fullmatch(pathSearch)

def write(s):
    """Append `s` to the client's output file while holding an exclusive lock.

    A newline is written first when the file already has content, so that each
    call ends up on its own line. Any exception raised while locking, reading
    or writing terminates the script through `sys.exit`.
    """
    with open(clientFilePath, 'r+') as clientFile:
        try:
            fcntl.flock(clientFile, fcntl.LOCK_EX)
            # An empty file means that `websocket.php` has read it. Either way we do not wait for it: we just append.
            pending = clientFile.read()
            # Reading to the end moved the in-file cursor, so the writes below are appended.
            if pending:
                clientFile.write('\n')
            clientFile.write(s)
            clientFile.flush()
            fcntl.flock(clientFile, fcntl.LOCK_UN)
        except Exception as error:
            sys.exit(error)

def cleanCaption(caption):
    """Return `caption` flattened to a single line, each newline becoming one space."""
    lines = caption.split('\n')
    return ' '.join(lines)

# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
# Select the archives to search: a single channel archive when the path search is a channel id, otherwise every `.zip` in `path`.
if isPathSearchAChannelId:
    file = pathSearch + '.zip'
    if os.path.isfile(path + file):
        files = [file]
    else:
        # No archive exists for this channel id: there is nothing to search.
        # `files` must still be defined, otherwise the search loop below raises `NameError`.
        files = []
        write('progress:0 / 0')
else:
    files = [file for file in os.listdir(path) if file.endswith('.zip')]

# Search every selected archive, reporting progress before and after each one.
# Output lines are either `<archive>/<file>|<timestamp>|...` for captions or `<archive>/<file>` for other files.
for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex} / {len(files)}')
    # The context manager closes the archive once it has been searched.
    # The handle is deliberately not named `zip`, so that the builtin stays usable below.
    with zipfile.ZipFile(path + file) as archive:
        for fileInZip in archive.namelist():
            endsWithVtt = fileInZip.endswith('.vtt')
            if searchOnlyCaptions and not endsWithVtt:
                continue
            toWrite = f'{file}/{fileInZip}'
            if not pathSearchRegex.search(toWrite):
                continue
            with archive.open(fileInZip) as f:
                if endsWithVtt:
                    content = f.read().decode('utf-8')
                    # Parse the captions a single time and reuse them for every match, instead of re-parsing the file per match.
                    captions = list(webvtt.read_buffer(StringIO(content)))
                    texts = [cleanCaption(caption.text) for caption in captions]
                    # Captions are joined so that a message spanning several captions can still match.
                    wholeCaption = ' '.join(texts)
                    messagePositions = [m.start() for m in messageRegex.finditer(wholeCaption)]
                    if messagePositions:
                        timestamps = []
                        for messagePosition in messagePositions:
                            # Walk the captions, consuming each text plus its joining space, until the one holding the match start is reached.
                            for caption, text in zip(captions, texts):
                                if messagePosition <= len(text):
                                    timestamps.append(str(int(caption.start_in_seconds)))
                                    break
                                messagePosition -= len(text) + 1
                        write(f'{toWrite}|{"|".join(timestamps)}')
                else:
                    # Iterate lazily so that reading stops at the first matching line.
                    for line in f:
                        # Lines are bytes: decode them rather than matching against their `b'...'` representation,
                        # which escapes non-ASCII characters, quotes and backslashes.
                        if message in line.decode('utf-8', errors='replace'):
                            write(toWrite)
                            break
    write(f'progress:{fileIndex + 1} / {len(files)}')

# Wait until the output file is empty, which means that `websocket.php` has read it, then delete it.
with open(clientFilePath) as f:
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            # Rewind before each check: after a first read the cursor sits at the end of the file,
            # so any further read would return '' even though the file still holds unread output.
            f.seek(0)
            if f.read() == '':
                os.remove(clientFilePath)
                break
            # Release the lock while sleeping so that the reader can take it and empty the file.
            fcntl.flock(f, fcntl.LOCK_UN)
            time.sleep(1)
        except Exception as e:
            sys.exit(e)