From e166fdb4e59c061995014733cc1a91251f7cf676 Mon Sep 17 00:00:00 2001
From: Benjamin Loison <Benjamin_Loison@users.noreply.gitea.lemnoslife.com>
Date: Tue, 14 Feb 2023 02:56:11 +0100
Subject: [PATCH] Fix #31: List all occurrences of search within video captions

---
 website/index.php | 13 +++++++------
 website/search.py | 24 ++++++++++++++----------
 2 files changed, 21 insertions(+), 16 deletions(-)
diff --git a/website/index.php b/website/index.php
index 6c79161..21eb0b6 100644
--- a/website/index.php
+++ b/website/index.php
@@ -42,10 +42,10 @@ Progress: <span id="progress"></span> channels
             document.getElementById('progress').innerHTML = line.replace('progress:', '');
         } else {
             var channelsDom = document.getElementById('channels');
-            var timestamp = null;
-            if (line.includes('|')) {
-                const lineParts = line.split('|');
-                timestamp = parseInt(lineParts[1]);
+            var timestamps = []; // second offsets of every match, taken from the '|'-separated tail of the line
+            const lineParts = line.split('|');
+            if (lineParts.length > 1) {
+                timestamps = lineParts.slice(1).map(linePart => parseInt(linePart, 10));
                 line = lineParts[0];
             }
             const channelFileParts = line.split('/');
@@ -66,10 +66,11 @@ Progress: <span id="progress"></span> channels
             var channelFileDom = document.createElement('li');
             var a = createA(channelFile, `${channelHref}/${channelFile}`);
             channelFileDom.appendChild(a);
-            if (timestamp != null) {
+            const id = channelFileParts[2]; // used as the v= parameter of every timestamp link built below
+            for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) { // one link per entry of timestamps
                 const space = document.createTextNode('\u00A0');
                 channelFileDom.appendChild(space);
-                const id = channelFileParts[2];
+                const timestamp = timestamps[timestampsIndex];
                 var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
                 channelFileDom.appendChild(a);
             }
diff --git a/website/search.py b/website/search.py
index d65cd0c..710720c 100755
--- a/website/search.py
+++ b/website/search.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-import sys, time, fcntl, os, zipfile, webvtt
+import sys, time, fcntl, os, zipfile, webvtt, re
 from io import StringIO
 
 path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'
@@ -44,15 +44,19 @@ for fileIndex, file in enumerate(files):
                 content = f.read().decode('utf-8')
                 stringIOf = StringIO(content)
                 wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
-                messagePosition = wholeCaption.find(message)
-                if messagePosition != -1:
-                    stringIOf = StringIO(content)
-                    for caption in webvtt.read_buffer(stringIOf):
-                        text = caption.text
-                        if messagePosition <= len(text):
-                            write(f'{toWrite}|{int(caption.start_in_seconds)}')
-                            break
-                        messagePosition -= len(text) + 1
+                # Escape the query so it is matched literally; the zero-width lookahead also reports overlapping matches.
+                messagePositions = [m.start() for m in re.finditer(f'(?={re.escape(message)})', wholeCaption)]
+                if messagePositions:
+                    captions = list(webvtt.read_buffer(StringIO(content)))  # parse once, reuse for every match
+                    timestamps = []
+                    for messagePosition in messagePositions:
+                        for caption in captions:
+                            text = caption.text
+                            if messagePosition <= len(text):
+                                timestamps.append(str(int(caption.start_in_seconds)))
+                                break
+                            messagePosition -= len(text) + 1  # +1 for the ' ' separator used to build wholeCaption
+                    write(f'{toWrite}|{"|".join(timestamps)}')
             else:
                 for line in f.readlines():
                     if message in str(line):