From 21ad878be89e1d2950e66e791b0dd0185012ed0c Mon Sep 17 00:00:00 2001
From: Benjamin Loison <Benjamin_Loison@users.noreply.gitea.lemnoslife.com>
Date: Tue, 14 Feb 2023 02:00:23 +0100
Subject: [PATCH] Fix #31: Make a website with a search engine notably based on
 the captions extracted

---
 website/index.php | 15 ++++++++++++++-
 website/search.py | 16 ++++++++++++----
 2 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/website/index.php b/website/index.php
index 141f6b0..6c79161 100644
--- a/website/index.php
+++ b/website/index.php
@@ -17,7 +17,7 @@ Access raw data with: <?php echoUrl('channels/'); ?>.
     <input type="submit" id="search-only-captions" value="Search only captions">
 </form>
 
-Progress: <span id="progress"></span>
+Progress: <span id="progress"></span> channels
 
 <ul id="channels">
 </ul>
@@ -42,6 +42,12 @@ Progress: <span id="progress"></span>
             document.getElementById('progress').innerHTML = line.replace('progress:', '');
         } else {
             var channelsDom = document.getElementById('channels');
+            var timestamp = null;
+            const separatorIndex = line.lastIndexOf('|'); // the timestamp, when present, follows the last '|'
+            if (separatorIndex !== -1) {
+                timestamp = parseInt(line.slice(separatorIndex + 1), 10);
+                line = line.slice(0, separatorIndex);
+            }
             const channelFileParts = line.split('/');
             const channel = channelFileParts[0];
             const channelFile = channelFileParts.slice(1).join('/');
@@ -60,6 +66,13 @@ Progress: <span id="progress"></span>
             var channelFileDom = document.createElement('li');
             var a = createA(channelFile, `${channelHref}/${channelFile}`);
             channelFileDom.appendChild(a);
+            if (timestamp !== null && !Number.isNaN(timestamp)) {
+                // A non-breaking space separates the file link from the timestamp link.
+                channelFileDom.appendChild(document.createTextNode('\u00A0'));
+                const id = channelFileParts[2];
+                const timestampA = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
+                channelFileDom.appendChild(timestampA);
+            }
             channelFilesDom.appendChild(channelFileDom);
         }
     }
diff --git a/website/search.py b/website/search.py
index c0ca211..d65cd0c 100755
--- a/website/search.py
+++ b/website/search.py
@@ -41,10 +41,18 @@ for fileIndex, file in enumerate(files):
         with zip.open(fileInZip) as f:
             toWrite = f'{file}/{fileInZip}'
             if endsWithVtt:
-                content = StringIO(f.read().decode('utf-8'))
-                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)])
-                if message in wholeCaption:
-                    write(toWrite)
+                content = f.read().decode('utf-8')
+                # Parse once; the same captions serve the text search and the timestamp lookup below.
+                captions = list(webvtt.read_buffer(StringIO(content)))
+                wholeCaption = ' '.join([caption.text for caption in captions])
+                messagePosition = wholeCaption.find(message)
+                if messagePosition != -1:
+                    for caption in captions:
+                        text = caption.text
+                        if messagePosition <= len(text):
+                            write(f'{toWrite}|{int(caption.start_in_seconds)}')
+                            break
+                        messagePosition -= len(text) + 1
             else:
                 for line in f.readlines():
                     if message in str(line):