Fix #31: List all occurrences of search within video captions
This commit is contained in:
parent
8d34cf33ae
commit
e166fdb4e5
@ -42,10 +42,10 @@ Progress: <span id="progress"></span> channels
|
||||
document.getElementById('progress').innerHTML = line.replace('progress:', '');
|
||||
} else {
|
||||
var channelsDom = document.getElementById('channels');
|
||||
var timestamp = null;
|
||||
if (line.includes('|')) {
|
||||
const lineParts = line.split('|');
|
||||
timestamp = parseInt(lineParts[1]);
|
||||
var timestamp = [];
|
||||
const lineParts = line.split('|');
|
||||
if (lineParts.length > 0) {
|
||||
timestamps = lineParts.slice(1).map(linePart => parseInt(linePart));
|
||||
line = lineParts[0];
|
||||
}
|
||||
const channelFileParts = line.split('/');
|
||||
@ -66,10 +66,11 @@ Progress: <span id="progress"></span> channels
|
||||
var channelFileDom = document.createElement('li');
|
||||
var a = createA(channelFile, `${channelHref}/${channelFile}`);
|
||||
channelFileDom.appendChild(a);
|
||||
if (timestamp != null) {
|
||||
const id = channelFileParts[2];
|
||||
for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) {
|
||||
const space = document.createTextNode('\u00A0');
|
||||
channelFileDom.appendChild(space);
|
||||
const id = channelFileParts[2];
|
||||
const timestamp = timestamps[timestampsIndex];
|
||||
var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
|
||||
channelFileDom.appendChild(a);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import sys, time, fcntl, os, zipfile, webvtt
|
||||
import sys, time, fcntl, os, zipfile, webvtt, re
|
||||
from io import StringIO
|
||||
|
||||
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'
|
||||
@ -44,15 +44,19 @@ for fileIndex, file in enumerate(files):
|
||||
content = f.read().decode('utf-8')
|
||||
stringIOf = StringIO(content)
|
||||
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
|
||||
messagePosition = wholeCaption.find(message)
|
||||
if messagePosition != -1:
|
||||
stringIOf = StringIO(content)
|
||||
for caption in webvtt.read_buffer(stringIOf):
|
||||
text = caption.text
|
||||
if messagePosition <= len(text):
|
||||
write(f'{toWrite}|{int(caption.start_in_seconds)}')
|
||||
break
|
||||
messagePosition -= len(text) + 1
|
||||
messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
|
||||
if messagePositions != []:
|
||||
timestamps = []
|
||||
for messagePosition in messagePositions:
|
||||
stringIOf = StringIO(content)
|
||||
for caption in webvtt.read_buffer(stringIOf):
|
||||
text = caption.text
|
||||
if messagePosition <= len(text):
|
||||
timestamp = str(int(caption.start_in_seconds))
|
||||
timestamps += [timestamp]
|
||||
break
|
||||
messagePosition -= len(text) + 1
|
||||
write(f'{toWrite}|{"|".join(timestamps)}')
|
||||
else:
|
||||
for line in f.readlines():
|
||||
if message in str(line):
|
||||
|
Loading…
Reference in New Issue
Block a user