Fix #31: Make a website with a search engine notably based on the captions extracted

This commit is contained in:
Benjamin Loison 2023-02-14 02:00:23 +01:00
parent 09f7675bf7
commit 8d34cf33ae
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 26 additions and 5 deletions

View File

@@ -17,7 +17,7 @@ Access raw data with: <?php echoUrl('channels/'); ?>.
<input type="submit" id="search-only-captions" value="Search only captions">
</form>
Progress: <span id="progress"></span>
Progress: <span id="progress"></span> channels
<ul id="channels">
</ul>
@@ -42,6 +42,12 @@ Progress: <span id="progress"></span>
document.getElementById('progress').innerHTML = line.replace('progress:', '');
} else {
var channelsDom = document.getElementById('channels');
var timestamp = null;
if (line.includes('|')) {
const lineParts = line.split('|');
timestamp = parseInt(lineParts[1]);
line = lineParts[0];
}
const channelFileParts = line.split('/');
const channel = channelFileParts[0];
const channelFile = channelFileParts.slice(1).join('/');
@@ -60,6 +66,13 @@ Progress: <span id="progress"></span>
var channelFileDom = document.createElement('li');
var a = createA(channelFile, `${channelHref}/${channelFile}`);
channelFileDom.appendChild(a);
if (timestamp != null) {
const space = document.createTextNode('\u00A0');
channelFileDom.appendChild(space);
const id = channelFileParts[2];
var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
channelFileDom.appendChild(a);
}
channelFilesDom.appendChild(channelFileDom);
}
}

View File

@@ -41,10 +41,18 @@ for fileIndex, file in enumerate(files):
with zip.open(fileInZip) as f:
toWrite = f'{file}/{fileInZip}'
if endsWithVtt:
content = StringIO(f.read().decode('utf-8'))
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)])
if message in wholeCaption:
write(toWrite)
content = f.read().decode('utf-8')
stringIOf = StringIO(content)
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
messagePosition = wholeCaption.find(message)
if messagePosition != -1:
stringIOf = StringIO(content)
for caption in webvtt.read_buffer(stringIOf):
text = caption.text
if messagePosition <= len(text):
write(f'{toWrite}|{int(caption.start_in_seconds)}')
break
messagePosition -= len(text) + 1
else:
for line in f.readlines():
if message in str(line):