2023-02-07 17:25:17 +01:00
|
|
|
#!/usr/bin/python3
|
|
|
|
|
2023-02-14 02:56:11 +01:00
|
|
|
import sys, time, fcntl, os, zipfile, webvtt, re
|
2023-02-14 01:32:36 +01:00
|
|
|
from io import StringIO
|
2023-02-07 20:15:36 +01:00
|
|
|
|
|
|
|
# Directory that is scanned for `.zip` archives to search in.
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

# Command line: <clientId> <message>.
clientId, message = sys.argv[1], sys.argv[2]

# The leading word of the message selects the search mode; everything after the first space is the text to look for
# (a message without any space is kept unchanged).
searchOnlyCaptions = message.startswith('search-only-captions ')
message = message.split(' ', 1)[-1]

# File through which results are handed over to the client identified by `clientId`.
clientFilePath = 'users/' + clientId + '.txt'
|
|
|
|
|
|
|
|
def write(s):
    """Append `s` to the client output file while holding an exclusive lock.

    A newline is written before `s` when the file already has content, so that
    pending entries stay one per line.

    The file handle is closed and the lock released on every path, including
    failures. Any exception raised while locking, reading or writing terminates
    the script through `sys.exit`.
    """
    with open(clientFilePath, 'r+') as f:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            try:
                # If the output file is empty, then it means that `websocket.php` read it. Anyway we don't wait for it and we append what we want to output.
                # Reading up to the end also moves the in-file cursor there, so the writes below are appending content.
                if f.read() != '':
                    f.write("\n")
                f.write(s)
                # Flush before unlocking so that a reader never gets the lock on a partially written entry.
                f.flush()
            finally:
                fcntl.flock(f, fcntl.LOCK_UN)
        except Exception as e:
            sys.exit(e)
|
2023-02-07 17:25:17 +01:00
|
|
|
|
2023-02-07 20:15:36 +01:00
|
|
|
# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.


def _captionTimestamps(captions, positions):
    """Map offsets in the space-joined caption text to caption start times.

    `captions` are the parsed cues (each exposing `text` and `start_in_seconds`)
    and `positions` are character offsets into
    `' '.join(caption.text for caption in captions)`.
    Returns the start time, as a whole-second string, of the cue containing each
    offset; an offset landing on a joining space is attributed to the cue before it.
    """
    timestamps = []
    for position in positions:
        for caption in captions:
            textLength = len(caption.text)
            if position <= textLength:
                timestamps.append(str(int(caption.start_in_seconds)))
                break
            # Skip this cue and the single space joining it to the next one.
            position -= textLength + 1
    return timestamps


files = [file for file in os.listdir(path) if file.endswith('.zip')]

# The query is searched literally (as in the non-caption branch), so it is escaped before being used as a pattern;
# otherwise a query such as `c++` or `(` raises `re.error`. The lookahead also reports overlapping occurrences.
messagePattern = re.compile(f'(?={re.escape(message)})')
# Entries that aren't captions are scanned as raw bytes, so the query is encoded once here.
# Testing against `str(line)` would instead search the `repr` of the bytes (`b'...\n'`), which gives false
# positives (for instance `n` or `b`) and never matches non-ASCII text.
messageBytes = message.encode('utf-8', errors='surrogateescape')

for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex + 1} / {len(files)}')
    # The context manager closes the archive once all its entries have been scanned.
    with zipfile.ZipFile(path + file) as archive:
        for fileInZip in archive.namelist():
            endsWithVtt = fileInZip.endswith('.vtt')
            if searchOnlyCaptions and not endsWithVtt:
                continue
            toWrite = f'{file}/{fileInZip}'
            with archive.open(fileInZip) as f:
                if endsWithVtt:
                    content = f.read().decode('utf-8')
                    # Parse the captions once and reuse them for both the search and the timestamps lookup.
                    captions = list(webvtt.read_buffer(StringIO(content)))
                    wholeCaption = ' '.join(caption.text for caption in captions)
                    messagePositions = [m.start() for m in messagePattern.finditer(wholeCaption)]
                    if messagePositions:
                        timestamps = _captionTimestamps(captions, messagePositions)
                        write(f'{toWrite}|{"|".join(timestamps)}')
                # Stop at the first matching line of the entry.
                elif any(messageBytes in line for line in f):
                    write(toWrite)
|
2023-02-07 17:25:17 +01:00
|
|
|
|
2023-02-07 20:15:36 +01:00
|
|
|
# Wait until the output file has been emptied (which means that `websocket.php` read it), then delete it.
with open(clientFilePath) as f:
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            try:
                # Rewind before each check: the previous `read` left the cursor at the end of the file, so without
                # this every later `read` returns '' and the file would be removed while still holding unread output.
                f.seek(0)
                consumed = f.read() == ''
                if consumed:
                    os.remove(clientFilePath)
            finally:
                fcntl.flock(f, fcntl.LOCK_UN)
        except Exception as e:
            sys.exit(e)
        if consumed:
            break
        # Poll again in a second, without holding the lock meanwhile.
        time.sleep(1)
|