Fix #31: List all occurrences of the search term within video captions

This commit is contained in:
Benjamin Loison 2023-02-14 02:56:11 +01:00
parent 8d34cf33ae
commit e166fdb4e5
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 21 additions and 16 deletions

View File

@ -42,10 +42,10 @@ Progress: <span id="progress"></span> channels
document.getElementById('progress').innerHTML = line.replace('progress:', '');
} else {
var channelsDom = document.getElementById('channels');
var timestamp = null;
if (line.includes('|')) {
var timestamp = [];
const lineParts = line.split('|');
timestamp = parseInt(lineParts[1]);
if (lineParts.length > 0) {
timestamps = lineParts.slice(1).map(linePart => parseInt(linePart));
line = lineParts[0];
}
const channelFileParts = line.split('/');
@ -66,10 +66,11 @@ Progress: <span id="progress"></span> channels
var channelFileDom = document.createElement('li');
var a = createA(channelFile, `${channelHref}/${channelFile}`);
channelFileDom.appendChild(a);
if (timestamp != null) {
const id = channelFileParts[2];
for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) {
const space = document.createTextNode('\u00A0');
channelFileDom.appendChild(space);
const id = channelFileParts[2];
const timestamp = timestamps[timestampsIndex];
var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
channelFileDom.appendChild(a);
}

View File

@ -1,6 +1,6 @@
#!/usr/bin/python3
import sys, time, fcntl, os, zipfile, webvtt
import sys, time, fcntl, os, zipfile, webvtt, re
from io import StringIO
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'
@ -44,15 +44,19 @@ for fileIndex, file in enumerate(files):
content = f.read().decode('utf-8')
stringIOf = StringIO(content)
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
messagePosition = wholeCaption.find(message)
if messagePosition != -1:
messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
if messagePositions != []:
timestamps = []
for messagePosition in messagePositions:
stringIOf = StringIO(content)
for caption in webvtt.read_buffer(stringIOf):
text = caption.text
if messagePosition <= len(text):
write(f'{toWrite}|{int(caption.start_in_seconds)}')
timestamp = str(int(caption.start_in_seconds))
timestamps += [timestamp]
break
messagePosition -= len(text) + 1
write(f'{toWrite}|{"|".join(timestamps)}')
else:
for line in f.readlines():
if message in str(line):