Fix #31: Make a website with a search engine notably based on the captions extracted

2023-02-14 02:00:23 +01:00
parent 09f7675bf7
commit 8d34cf33ae
2 changed files with 26 additions and 5 deletions
@@ -17,7 +17,7 @@ Access raw data with: <?php echoUrl('channels/'); ?>.
    <input type="submit" id="search-only-captions" value="Search only captions">
 </form>
-Progress: <span id="progress"></span>
+Progress: <span id="progress"></span> channels
 <ul id="channels">
 </ul>
@@ -42,6 +42,12 @@ Progress: <span id="progress"></span>
            document.getElementById('progress').innerHTML = line.replace('progress:', '');
        } else {
            var channelsDom = document.getElementById('channels');
            // A result line may carry an optional "|<timestamp>" suffix. When a '|'
            // is present, the part after it is parsed as an integer timestamp and
            // the part before it is kept as the path; otherwise timestamp stays null.
            // NOTE(review): only the first two '|'-separated parts are used, and
            // parseInt is called without an explicit radix.
            var timestamp = null;
            if (line.includes('|')) {
                const lineParts = line.split('|');
                timestamp = parseInt(lineParts[1]);
                line = lineParts[0];
            }
            // The first '/'-separated component is the channel; the remaining
            // components, re-joined with '/', form the file path.
            const channelFileParts = line.split('/');
            const channel = channelFileParts[0];
            const channelFile = channelFileParts.slice(1).join('/');
@@ -60,6 +66,13 @@ Progress: <span id="progress"></span>
            // Build the <li> for this result: a link to the file itself, optionally
            // followed by a link to the video at the parsed timestamp.
            var channelFileDom = document.createElement('li');
            var a = createA(channelFile, `${channelHref}/${channelFile}`);
            channelFileDom.appendChild(a);
            if (timestamp != null) {
                // Non-breaking space separating the file link from the timestamp link.
                const space = document.createTextNode('\u00A0');
                channelFileDom.appendChild(space);
                // The third '/'-separated component of the line is used as the video id.
                // NOTE(review): assumes the path always has at least three components
                // when a timestamp is present — confirm against the producer of these lines.
                const id = channelFileParts[2];
                // NOTE(review): `a` is redeclared with `var` here; it shadows nothing
                // (same function scope) but reuses the name of the file link above.
                var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
                channelFileDom.appendChild(a);
            }
            channelFilesDom.appendChild(channelFileDom);
        }
    }
@@ -41,10 +41,18 @@ for fileIndex, file in enumerate(files):
        with zip.open(fileInZip) as f:
            toWrite = f'{file}/{fileInZip}'
            if endsWithVtt:
-                content = StringIO(f.read().decode('utf-8'))
+                content = f.read().decode('utf-8')
-                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)])
+                stringIOf = StringIO(content)
-                if message in wholeCaption:
+                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
-                    write(toWrite)
+                messagePosition = wholeCaption.find(message)
                # messagePosition is the offset of `message` in the space-joined
                # caption text (-1 when absent). On a hit, walk the captions again
                # to find the one that offset falls into.
                if messagePosition != -1:
                    # A new StringIO over the same decoded content is created for this second pass.
                    stringIOf = StringIO(content)
                    for caption in webvtt.read_buffer(stringIOf):
                        text = caption.text
                        # At this point messagePosition is relative to the start of
                        # the current caption's text.
                        # NOTE(review): `<=` also accepts an offset equal to len(text),
                        # i.e. the joining space after this caption — confirm intended.
                        if messagePosition <= len(text):
                            # Emit "<path>|<caption start, whole seconds>" and stop at the first hit.
                            write(f'{toWrite}|{int(caption.start_in_seconds)}')
                            break
                        # Skip this caption's text plus the single joining space.
                        messagePosition -= len(text) + 1
            else:
                for line in f.readlines():
                    if message in str(line):