Fix #31: List all occurrences of the search query within video captions

This commit is contained in:
Benjamin Loison 2023-02-14 02:56:11 +01:00
parent 8d34cf33ae
commit e166fdb4e5
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 21 additions and 16 deletions

View File

@ -42,10 +42,10 @@ Progress: <span id="progress"></span> channels
document.getElementById('progress').innerHTML = line.replace('progress:', ''); document.getElementById('progress').innerHTML = line.replace('progress:', '');
} else { } else {
var channelsDom = document.getElementById('channels'); var channelsDom = document.getElementById('channels');
var timestamp = null; var timestamp = [];
if (line.includes('|')) {
const lineParts = line.split('|'); const lineParts = line.split('|');
timestamp = parseInt(lineParts[1]); if (lineParts.length > 0) {
timestamps = lineParts.slice(1).map(linePart => parseInt(linePart));
line = lineParts[0]; line = lineParts[0];
} }
const channelFileParts = line.split('/'); const channelFileParts = line.split('/');
@ -66,10 +66,11 @@ Progress: <span id="progress"></span> channels
var channelFileDom = document.createElement('li'); var channelFileDom = document.createElement('li');
var a = createA(channelFile, `${channelHref}/${channelFile}`); var a = createA(channelFile, `${channelHref}/${channelFile}`);
channelFileDom.appendChild(a); channelFileDom.appendChild(a);
if (timestamp != null) { const id = channelFileParts[2];
for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) {
const space = document.createTextNode('\u00A0'); const space = document.createTextNode('\u00A0');
channelFileDom.appendChild(space); channelFileDom.appendChild(space);
const id = channelFileParts[2]; const timestamp = timestamps[timestampsIndex];
var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`); var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
channelFileDom.appendChild(a); channelFileDom.appendChild(a);
} }

View File

@ -1,6 +1,6 @@
#!/usr/bin/python3 #!/usr/bin/python3
import sys, time, fcntl, os, zipfile, webvtt import sys, time, fcntl, os, zipfile, webvtt, re
from io import StringIO from io import StringIO
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/' path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'
@ -44,15 +44,19 @@ for fileIndex, file in enumerate(files):
content = f.read().decode('utf-8') content = f.read().decode('utf-8')
stringIOf = StringIO(content) stringIOf = StringIO(content)
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
messagePosition = wholeCaption.find(message) messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
if messagePosition != -1: if messagePositions != []:
timestamps = []
for messagePosition in messagePositions:
stringIOf = StringIO(content) stringIOf = StringIO(content)
for caption in webvtt.read_buffer(stringIOf): for caption in webvtt.read_buffer(stringIOf):
text = caption.text text = caption.text
if messagePosition <= len(text): if messagePosition <= len(text):
write(f'{toWrite}|{int(caption.start_in_seconds)}') timestamp = str(int(caption.start_in_seconds))
timestamps += [timestamp]
break break
messagePosition -= len(text) + 1 messagePosition -= len(text) + 1
write(f'{toWrite}|{"|".join(timestamps)}')
else: else:
for line in f.readlines(): for line in f.readlines():
if message in str(line): if message in str(line):