YouTube_captions_search_engine/website/search.py
2024-10-16 18:11:21 +02:00

97 lines
3.5 KiB
Python
Executable File

#!/usr/bin/python3
import sys, time, fcntl, os, zipfile, webvtt, re
from io import StringIO
# Directory containing the `.zip` archives to search; a channel's archive is looked up as `<channel id>.zip`.
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'
# First command-line argument: client identifier, used below to name the client's output file.
clientId = sys.argv[1]
# Second command-line argument, space-separated: `<mode> <path search> <message...>`.
pathSearchMessageParts = sys.argv[2].split(' ')
pathSearch = pathSearchMessageParts[1]
# The message may itself contain spaces, so every remaining part is joined back together.
message = ' '.join(pathSearchMessageParts[2:])
# Both user-provided strings are compiled as regular expressions (an invalid pattern raises `re.error` here).
pathSearchRegex = re.compile(pathSearch)
messageRegex = re.compile(message)
# Truthy when the path search is exactly 24 characters among letters, digits, `-` and `_`; it is then treated as a channel id.
isPathSearchAChannelId = re.fullmatch(r'[a-zA-Z0-9-_]{24}', pathSearch)
# Mode flag: when set, only `.vtt` entries of the archives are examined.
searchOnlyCaptions = pathSearchMessageParts[0] == 'search-only-captions'
# Per-client output file, relative to the current working directory; `write` appends results and progress to it.
clientFilePath = f'users/{clientId}.txt'
def write(s):
    """Append the message `s` to the client's output file under an exclusive lock.

    Messages are separated by a newline: one is inserted before `s` only when
    the file already holds content. The file must already exist, as it is
    opened in `r+` mode. Any error raised while locking or writing terminates
    the script through `sys.exit`.
    """
    with open(clientFilePath, 'r+') as clientFile:
        try:
            fcntl.flock(clientFile, fcntl.LOCK_EX)
            # An empty file means `websocket.php` already consumed the previous output. Either way we do not wait for it and simply append.
            pendingOutput = clientFile.read()
            # Reading moved the in-file cursor to the end, so the writes below append.
            if pendingOutput:
                clientFile.write('\n')
            clientFile.write(s)
            clientFile.flush()
            fcntl.flock(clientFile, fcntl.LOCK_UN)
        except Exception as error:
            sys.exit(error)
def cleanCaption(caption):
    """Return `caption` with every newline character replaced by a single space."""
    lines = caption.split('\n')
    return ' '.join(lines)
# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
# Select the archives to scan: the single `<channel id>.zip` when the path search is a channel id, otherwise every `.zip` in `path`.
if isPathSearchAChannelId:
    file = pathSearch + '.zip'
    if os.path.isfile(path + file):
        files = [file]
    else:
        # Unknown channel: there is nothing to scan. `files` must still be bound, otherwise iterating over it afterwards raises `NameError`.
        files = []
        write('progress:0 / 0')
else:
    files = [file for file in os.listdir(path) if file.endswith('.zip')]
# Scan every selected archive and report to the client, through `write`:
# - `progress:<done> / <total>` before each archive and once everything is processed;
# - `<archive>/<entry>|<second>|<second>...` for each `.vtt` entry whose caption text matches `messageRegex`,
#   with the start time (in whole seconds) of the caption in which each match begins;
# - `<archive>/<entry>` for each other entry having a line containing `message` as a plain substring.
for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex} / {len(files)}')
    # The context manager closes the archive once scanned, instead of leaking one open handle per archive.
    with zipfile.ZipFile(path + file) as channelZip:
        for fileInZip in channelZip.namelist():
            endsWithVtt = fileInZip.endswith('.vtt')
            if searchOnlyCaptions and not endsWithVtt:
                continue
            toWrite = f'{file}/{fileInZip}'
            if not pathSearchRegex.search(toWrite):
                continue
            with channelZip.open(fileInZip) as f:
                if endsWithVtt:
                    content = f.read().decode('utf-8')
                    # Parse the captions once and keep what is needed, rather than re-parsing the whole file for every match.
                    captions = [(cleanCaption(caption.text), caption.start_in_seconds) for caption in webvtt.read_buffer(StringIO(content))]
                    # Captions are joined with a single space so that a match can span consecutive captions.
                    wholeCaption = ' '.join(text for text, _ in captions)
                    messagePositions = [m.start() for m in messageRegex.finditer(wholeCaption)]
                    if messagePositions:
                        timestamps = []
                        for messagePosition in messagePositions:
                            # Walk the captions, consuming each caption's length plus its joining space, until reaching the caption containing the match start.
                            for text, startInSeconds in captions:
                                if messagePosition <= len(text):
                                    timestamps.append(str(int(startInSeconds)))
                                    break
                                messagePosition -= len(text) + 1
                        write(f'{toWrite}|{"|".join(timestamps)}')
                else:
                    # Entries are read as bytes: decode each line instead of using `str(line)`, which yields the `b'...'` representation
                    # (escaped non-ASCII characters, doubled backslashes, surrounding quotes) and so misses or falsely reports matches.
                    # Undecodable bytes are replaced rather than aborting the whole search. Stop at the first matching line.
                    for line in f:
                        if message in line.decode('utf-8', errors='replace'):
                            write(toWrite)
                            break
# Final progress report: every archive has been processed. `len(files)` is used rather than the loop index so that this is also valid when there was nothing to scan.
write(f'progress:{len(files)} / {len(files)}')
# Wait until the output file has been emptied (meaning `websocket.php` read it), then delete it.
# The file is checked once per second under an exclusive lock; any error terminates the script through `sys.exit`.
with open(clientFilePath) as f:
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            # Rewind before each check: the previous read left the cursor at the end of the file, so without this
            # every later read returns '' and the file gets removed after one second even though it was never consumed.
            f.seek(0)
            if f.read() == '':
                # Removed while still holding the lock, which is released when the file is closed on leaving the `with` block.
                os.remove(clientFilePath)
                break
            fcntl.flock(f, fcntl.LOCK_UN)
            time.sleep(1)
        except Exception as e:
            sys.exit(e)