#!/usr/bin/python3
"""Search the channel archives for a message and report matches to a client file.

Usage: <script> <clientId> <message>

`message` is expected to be a command word followed by a space and the text
to look for. When the command word is `search-only-captions`, only `.vtt`
members of the archives are searched; otherwise every member is searched.

Results are appended, one per line, to `users/<clientId>.txt`:
  - `progress:<i> / <total>` before each archive is scanned,
  - `<archive>/<member>` for each member containing the searched text.

Once the search is over, the script waits until the client file is empty
and then removes it.
"""

import fcntl
import os
import sys
import time
import zipfile
from io import StringIO

import webvtt

path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

clientId = sys.argv[1]
message = sys.argv[2]

searchOnlyCaptions = message.startswith('search-only-captions ')
# Drop the leading command word; if there is no space, `find` returns -1 and
# the whole message is kept.
message = message[message.find(' ') + 1:]
# Zip members are read as bytes, so non-caption files are searched with the
# byte form of the message. `surrogateescape` round-trips command-line
# arguments that were not valid UTF-8.
messageBytes = message.encode('utf-8', errors='surrogateescape')

# NOTE(review): `clientId` is used unvalidated in a file path; confirm that
# the caller only passes trusted identifiers.
clientFilePath = f'users/{clientId}.txt'


def write(s):
    """Append `s` to the client file under an exclusive lock.

    A newline separator is written first when the file already has content.
    Any failure terminates the script through `sys.exit`.
    """
    try:
        with open(clientFilePath, 'r+') as f:
            fcntl.flock(f, fcntl.LOCK_EX)
            # If the output file is empty, then it means that `websocket.php`
            # read it. Anyway we don't wait for it and we append what we want
            # to output.
            # Reading moves the in-file cursor to the end, so the writes
            # below append.
            if f.read() != '':
                f.write("\n")
            f.write(s)
            # Flush before unlocking so that the reader never sees a partial
            # write. On the error path the lock is released when the file is
            # closed.
            f.flush()
            fcntl.flock(f, fcntl.LOCK_UN)
    except Exception as e:
        sys.exit(e)


def _memberMatches(archive, fileInZip, isCaption):
    """Return whether the archive member `fileInZip` contains the message."""
    with archive.open(fileInZip) as f:
        if isCaption:
            content = StringIO(f.read().decode('utf-8'))
            # Captions are joined so that a message spanning several cues is
            # still found.
            wholeCaption = ' '.join(caption.text for caption in webvtt.read_buffer(content))
            return message in wholeCaption
        # Compare bytes with bytes and stop at the first matching line.
        return any(messageBytes in line for line in f)


# As `zipgrep` doesn't support arguments to stop on first match for each file,
# we proceed manually to keep a good theoretical complexity.
files = [file for file in os.listdir(path) if file.endswith('.zip')]

for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex + 1} / {len(files)}')
    with zipfile.ZipFile(os.path.join(path, file)) as archive:
        for fileInZip in archive.namelist():
            endsWithVtt = fileInZip.endswith('.vtt')
            if searchOnlyCaptions and not endsWithVtt:
                continue
            if _memberMatches(archive, fileInZip, endsWithVtt):
                write(f'{file}/{fileInZip}')

# Wait until the client file is empty, then remove it.
with open(clientFilePath) as f:
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            # Check the size instead of calling `f.read()`: after a first
            # read the cursor sits at the end of the file, so later reads
            # would return '' even when the content is still there.
            if os.fstat(f.fileno()).st_size == 0:
                os.remove(clientFilePath)
                break
            fcntl.flock(f, fcntl.LOCK_UN)
            time.sleep(1)
        except Exception as e:
            sys.exit(e)