Fix #31: Make a website with a search engine based notably on the extracted captions
This commit is contained in:
parent
09f7675bf7
commit
8d34cf33ae
@@ -17,7 +17,7 @@ Access raw data with: <?php echoUrl('channels/'); ?>.
|
||||
<input type="submit" id="search-only-captions" value="Search only captions">
|
||||
</form>
|
||||
|
||||
Progress: <span id="progress"></span>
|
||||
Progress: <span id="progress"></span> channels
|
||||
|
||||
<ul id="channels">
|
||||
</ul>
|
||||
@@ -42,6 +42,12 @@ Progress: <span id="progress"></span>
|
||||
document.getElementById('progress').innerHTML = line.replace('progress:', '');
|
||||
} else {
|
||||
var channelsDom = document.getElementById('channels');
|
||||
var timestamp = null;
|
||||
if (line.includes('|')) {
|
||||
const lineParts = line.split('|');
|
||||
timestamp = parseInt(lineParts[1]);
|
||||
line = lineParts[0];
|
||||
}
|
||||
const channelFileParts = line.split('/');
|
||||
const channel = channelFileParts[0];
|
||||
const channelFile = channelFileParts.slice(1).join('/');
|
||||
@@ -60,6 +66,13 @@ Progress: <span id="progress"></span>
|
||||
var channelFileDom = document.createElement('li');
|
||||
var a = createA(channelFile, `${channelHref}/${channelFile}`);
|
||||
channelFileDom.appendChild(a);
|
||||
if (timestamp != null) {
|
||||
const space = document.createTextNode('\u00A0');
|
||||
channelFileDom.appendChild(space);
|
||||
const id = channelFileParts[2];
|
||||
var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
|
||||
channelFileDom.appendChild(a);
|
||||
}
|
||||
channelFilesDom.appendChild(channelFileDom);
|
||||
}
|
||||
}
|
||||
|
@@ -41,10 +41,18 @@ for fileIndex, file in enumerate(files):
|
||||
with zip.open(fileInZip) as f:
|
||||
toWrite = f'{file}/{fileInZip}'
|
||||
if endsWithVtt:
|
||||
content = StringIO(f.read().decode('utf-8'))
|
||||
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)])
|
||||
if message in wholeCaption:
|
||||
write(toWrite)
|
||||
content = f.read().decode('utf-8')
|
||||
stringIOf = StringIO(content)
|
||||
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
|
||||
messagePosition = wholeCaption.find(message)
|
||||
if messagePosition != -1:
|
||||
stringIOf = StringIO(content)
|
||||
for caption in webvtt.read_buffer(stringIOf):
|
||||
text = caption.text
|
||||
if messagePosition <= len(text):
|
||||
write(f'{toWrite}|{int(caption.start_in_seconds)}')
|
||||
break
|
||||
messagePosition -= len(text) + 1
|
||||
else:
|
||||
for line in f.readlines():
|
||||
if message in str(line):
|
||||
|
Loading…
x
Reference in New Issue
Block a user