83 lines
3.2 KiB
Python
Executable File
83 lines
3.2 KiB
Python
Executable File
#!/usr/bin/python3
|
|
|
|
import sys, time, fcntl, os, zipfile, webvtt, re
|
|
from io import StringIO
|
|
|
|
# Directory holding one `.zip` archive per channel.
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

# Command line: `<clientId> "<mode> <pathSearch> <message words...>"`.
clientId = sys.argv[1]
pathSearchMessageParts = sys.argv[2].split(' ')

# The first token selects the mode, the second is the pattern applied to
# `<zip>/<member>` paths and everything after it is the searched message.
searchOnlyCaptions = pathSearchMessageParts[0] == 'search-only-captions'
pathSearch = pathSearchMessageParts[1]
message = ' '.join(pathSearchMessageParts[2:])

# Per-client output file that results and progress are appended to.
clientFilePath = 'users/' + clientId + '.txt'
|
|
|
|
def write(s):
    """Append `s` to the client output file while holding an exclusive lock.

    A newline separator is written first when the file still has content.
    Any exception raised while locking, reading or writing ends the script
    through `sys.exit`.
    """
    with open(clientFilePath, 'r+') as outputFile:
        try:
            fcntl.flock(outputFile, fcntl.LOCK_EX)
            # If the output file is empty, then `websocket.php` has read it.
            # Either way we do not wait for it: we append our output.
            pending = outputFile.read()
            # Reading moved the in-file cursor to the end, so the writes below
            # append to the existing content.
            if pending:
                outputFile.write("\n")
            outputFile.write(s)
            outputFile.flush()
            fcntl.flock(outputFile, fcntl.LOCK_UN)
        except Exception as error:
            sys.exit(error)
|
|
|
|
def cleanCaption(caption):
    """Return `caption` with every newline replaced by a single space."""
    lines = caption.split('\n')
    return ' '.join(lines)
|
|
|
|
# As `zipgrep` doesn't support arguments to stop on first match for each file,
# we proceed manually to keep a good theoretical complexity.
files = [file for file in os.listdir(path) if file.endswith('.zip')]

# Compile both patterns once instead of on every archive member; an invalid
# pattern therefore fails before any archive is opened.
pathSearchRegex = re.compile(pathSearch)
messageRegex = re.compile(message)

for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex + 1} / {len(files)}')
    # The context manager closes the archive once all its members are scanned.
    with zipfile.ZipFile(path + file) as archive:
        for fileInZip in archive.namelist():
            endsWithVtt = fileInZip.endswith('.vtt')
            if searchOnlyCaptions and not endsWithVtt:
                continue
            toWrite = f'{file}/{fileInZip}'
            if not pathSearchRegex.search(toWrite):
                continue
            with archive.open(fileInZip) as f:
                if endsWithVtt:
                    content = f.read().decode('utf-8')
                    # Parse the captions only once and keep what is needed to
                    # map a match offset back to a caption start time.
                    captions = [
                        (cleanCaption(caption.text), caption.start_in_seconds)
                        for caption in webvtt.read_buffer(StringIO(content))
                    ]
                    # Captions are joined with a single space so that a message
                    # spanning several captions can still be matched.
                    wholeCaption = ' '.join(text for text, _ in captions)
                    messagePositions = [m.start() for m in messageRegex.finditer(wholeCaption)]
                    if messagePositions:
                        timestamps = []
                        for messagePosition in messagePositions:
                            # Walk the captions, consuming each caption length
                            # plus its joining space, until the offset falls
                            # inside the current caption.
                            for text, startInSeconds in captions:
                                if messagePosition <= len(text):
                                    timestamps.append(str(int(startInSeconds)))
                                    break
                                messagePosition -= len(text) + 1
                        write(f'{toWrite}|{"|".join(timestamps)}')
                else:
                    # Members are read as bytes: decode each line before the
                    # plain substring test. `str(line)` would produce the
                    # `b'...'` representation, in which non-ASCII characters,
                    # quotes and backslashes are escaped and never match.
                    for line in f:
                        if message in line.decode('utf-8', errors='replace'):
                            write(toWrite)
                            break
|
|
|
|
# Wait until the output file is empty, which means `websocket.php` has read
# everything, and then delete it.
with open(clientFilePath) as f:
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            try:
                # Rewind before each check: the previous `read()` left the
                # cursor at the end of the file, so without this every later
                # check would see '' and delete a file that was never consumed.
                f.seek(0)
                if f.read() == '':
                    os.remove(clientFilePath)
                    break
            finally:
                # Release the lock on every path so the reader is never
                # blocked while we sleep.
                fcntl.flock(f, fcntl.LOCK_UN)
        except Exception as e:
            sys.exit(e)
        time.sleep(1)
|