Fix #31: Make a website with a search engine based notably on the extracted captions

2023-02-14 02:00:23 +01:00
parent 09f7675bf7
commit 8d34cf33ae
2 changed files with 26 additions and 5 deletions
--- a/website/index.php
+++ b/website/index.php
@@ -17,7 +17,7 @@ Access raw data with: <?php echoUrl('channels/'); ?>.
    <input type="submit" id="search-only-captions" value="Search only captions">
 </form>

-Progress: <span id="progress"></span>
+Progress: <span id="progress"></span> channels

 <ul id="channels">
 </ul>
@@ -42,6 +42,12 @@ Progress: <span id="progress"></span>
            document.getElementById('progress').innerHTML = line.replace('progress:', '');
        } else {
            var channelsDom = document.getElementById('channels');
+            var timestamp = null;
+            if (line.includes('|')) {
+                const lineParts = line.split('|');
+                timestamp = parseInt(lineParts[1]);
+                line = lineParts[0];
+            }
            const channelFileParts = line.split('/');
            const channel = channelFileParts[0];
            const channelFile = channelFileParts.slice(1).join('/');
@@ -60,6 +66,13 @@ Progress: <span id="progress"></span>
            var channelFileDom = document.createElement('li');
            var a = createA(channelFile, `${channelHref}/${channelFile}`);
            channelFileDom.appendChild(a);
+            if (timestamp != null) {
+                const space = document.createTextNode('\u00A0');
+                channelFileDom.appendChild(space);
+                const id = channelFileParts[2];
+                var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
+                channelFileDom.appendChild(a);
+            }
            channelFilesDom.appendChild(channelFileDom);
        }
    }
--- a/website/search.py
+++ b/website/search.py
@@ -41,10 +41,18 @@ for fileIndex, file in enumerate(files):
        with zip.open(fileInZip) as f:
            toWrite = f'{file}/{fileInZip}'
            if endsWithVtt:
-                content = StringIO(f.read().decode('utf-8'))
-                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)])
-                if message in wholeCaption:
-                    write(toWrite)
+                content = f.read().decode('utf-8')
+                stringIOf = StringIO(content)
+                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
+                messagePosition = wholeCaption.find(message)
+                if messagePosition != -1:
+                    stringIOf = StringIO(content)
+                    for caption in webvtt.read_buffer(stringIOf):
+                        text = caption.text
+                        if messagePosition <= len(text):
+                            write(f'{toWrite}|{int(caption.start_in_seconds)}')
+                            break
+                        messagePosition -= len(text) + 1
            else:
                for line in f.readlines():
                    if message in str(line):