#!/usr/bin/python3
"""Search the channel archives for a message and stream results to a client file.

Usage: <script> <clientId> <message>

`message` starts with a command word followed by a space (the command
`search-only-captions` restricts the search to `.vtt` caption files); the
rest of the string is the text searched for.

Results and progress lines are appended to `users/<clientId>.txt`, which is
shared with `websocket.php` and protected by `flock`.
"""

import fcntl
import os
import sys
import time
import zipfile
from io import StringIO

import webvtt

path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

clientId = sys.argv[1]
message = sys.argv[2]

searchOnlyCaptions = message.startswith('search-only-captions ')
# Drop the leading command word; what remains is the text to search for.
message = message[message.find(' ') + 1:]

clientFilePath = f'users/{clientId}.txt'


def write(s):
    """Append `s` as a new line of the client file, under an exclusive lock.

    The process exits (via `sys.exit`) if locking or writing fails. The file
    is always closed, which also releases the lock on the error path.
    """
    with open(clientFilePath, 'r+') as f:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            # If the output file is empty, then it means that `websocket.php`
            # read it. Anyway we don't wait for it and we append what we want
            # to output.
            existing = f.read()
            # We are appending content, as reading moved the in-file cursor
            # to the end of the file.
            if existing != '':
                f.write("\n")
            f.write(s)
            f.flush()
            fcntl.flock(f, fcntl.LOCK_UN)
        except Exception as e:
            sys.exit(e)


# As `zipgrep` doesn't support arguments to stop on first match for each file,
# we proceed manually to keep a good theoretical complexity.
files = [file for file in os.listdir(path) if file.endswith('.zip')]

for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex + 1} / {len(files)}')
    # `with` closes the archive once it has been scanned.
    with zipfile.ZipFile(path + file) as archive:
        for fileInZip in archive.namelist():
            endsWithVtt = fileInZip.endswith('.vtt')
            if searchOnlyCaptions and not endsWithVtt:
                continue
            with archive.open(fileInZip) as f:
                toWrite = f'{file}/{fileInZip}'
                if endsWithVtt:
                    content = f.read().decode('utf-8')
                    # Parse once and reuse the captions for both the search
                    # and the timestamp lookup.
                    captions = list(webvtt.read_buffer(StringIO(content)))
                    wholeCaption = ' '.join(
                        [caption.text for caption in captions])
                    messagePosition = wholeCaption.find(message)
                    if messagePosition != -1:
                        # Walk the captions, consuming the offset, to find
                        # the caption in which the match starts.
                        for caption in captions:
                            text = caption.text
                            if messagePosition <= len(text):
                                write(f'{toWrite}|{int(caption.start_in_seconds)}')
                                break
                            # `+ 1` accounts for the joining space.
                            messagePosition -= len(text) + 1
                else:
                    # Archive members are read as bytes: decode each line so
                    # the comparison is done on text and not on the `b'...'`
                    # representation of the bytes object.
                    for line in f:
                        if message in line.decode('utf-8', errors='replace'):
                            write(toWrite)
                            break

# Wait until `websocket.php` has consumed (emptied) the client file, then
# remove it.
with open(clientFilePath) as f:
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            # Rewind before each check: otherwise, after the first read the
            # cursor stays at the end of the file and every later read
            # returns '' even though the file has not been consumed.
            f.seek(0)
            if f.read() == '':
                os.remove(clientFilePath)
                break
            fcntl.flock(f, fcntl.LOCK_UN)
            time.sleep(1)
        except Exception as e:
            sys.exit(e)