Fix #31: Make a website with a search engine notably based on the captions extracted
This commit is contained in:
parent
09f7675bf7
commit
8d34cf33ae
@ -17,7 +17,7 @@ Access raw data with: <?php echoUrl('channels/'); ?>.
|
|||||||
<input type="submit" id="search-only-captions" value="Search only captions">
|
<input type="submit" id="search-only-captions" value="Search only captions">
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
Progress: <span id="progress"></span>
|
Progress: <span id="progress"></span> channels
|
||||||
|
|
||||||
<ul id="channels">
|
<ul id="channels">
|
||||||
</ul>
|
</ul>
|
||||||
@ -42,6 +42,12 @@ Progress: <span id="progress"></span>
|
|||||||
document.getElementById('progress').innerHTML = line.replace('progress:', '');
|
document.getElementById('progress').innerHTML = line.replace('progress:', '');
|
||||||
} else {
|
} else {
|
||||||
var channelsDom = document.getElementById('channels');
|
var channelsDom = document.getElementById('channels');
|
||||||
|
var timestamp = null;
|
||||||
|
if (line.includes('|')) {
|
||||||
|
const lineParts = line.split('|');
|
||||||
|
timestamp = parseInt(lineParts[1]);
|
||||||
|
line = lineParts[0];
|
||||||
|
}
|
||||||
const channelFileParts = line.split('/');
|
const channelFileParts = line.split('/');
|
||||||
const channel = channelFileParts[0];
|
const channel = channelFileParts[0];
|
||||||
const channelFile = channelFileParts.slice(1).join('/');
|
const channelFile = channelFileParts.slice(1).join('/');
|
||||||
@ -60,6 +66,13 @@ Progress: <span id="progress"></span>
|
|||||||
var channelFileDom = document.createElement('li');
|
var channelFileDom = document.createElement('li');
|
||||||
var a = createA(channelFile, `${channelHref}/${channelFile}`);
|
var a = createA(channelFile, `${channelHref}/${channelFile}`);
|
||||||
channelFileDom.appendChild(a);
|
channelFileDom.appendChild(a);
|
||||||
|
if (timestamp != null) {
|
||||||
|
const space = document.createTextNode('\u00A0');
|
||||||
|
channelFileDom.appendChild(space);
|
||||||
|
const id = channelFileParts[2];
|
||||||
|
var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
|
||||||
|
channelFileDom.appendChild(a);
|
||||||
|
}
|
||||||
channelFilesDom.appendChild(channelFileDom);
|
channelFilesDom.appendChild(channelFileDom);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -41,10 +41,18 @@ for fileIndex, file in enumerate(files):
|
|||||||
with zip.open(fileInZip) as f:
|
with zip.open(fileInZip) as f:
|
||||||
toWrite = f'{file}/{fileInZip}'
|
toWrite = f'{file}/{fileInZip}'
|
||||||
if endsWithVtt:
|
if endsWithVtt:
|
||||||
content = StringIO(f.read().decode('utf-8'))
|
content = f.read().decode('utf-8')
|
||||||
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)])
|
stringIOf = StringIO(content)
|
||||||
if message in wholeCaption:
|
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
|
||||||
write(toWrite)
|
messagePosition = wholeCaption.find(message)
|
||||||
|
if messagePosition != -1:
|
||||||
|
stringIOf = StringIO(content)
|
||||||
|
for caption in webvtt.read_buffer(stringIOf):
|
||||||
|
text = caption.text
|
||||||
|
if messagePosition <= len(text):
|
||||||
|
write(f'{toWrite}|{int(caption.start_in_seconds)}')
|
||||||
|
break
|
||||||
|
messagePosition -= len(text) + 1
|
||||||
else:
|
else:
|
||||||
for line in f.readlines():
|
for line in f.readlines():
|
||||||
if message in str(line):
|
if message in str(line):
|
||||||
|
Loading…
Reference in New Issue
Block a user