Fix #31: List all occurrences of search within video captions

2023-02-14 02:56:11 +01:00
parent 8d34cf33ae
commit e166fdb4e5
2 changed files with 21 additions and 16 deletions
@@ -42,10 +42,10 @@ Progress: <span id="progress"></span> channels
            document.getElementById('progress').innerHTML = line.replace('progress:', '');
        } else {
            var channelsDom = document.getElementById('channels');
-            var timestamp = null;
-            if (line.includes('|')) {
-                const lineParts = line.split('|');
-                timestamp = parseInt(lineParts[1]);
+            var timestamps = []; // seconds offsets taken from the '|'-separated fields after the path
+            const lineParts = line.split('|');
+            if (lineParts.length > 0) {
+                timestamps = lineParts.slice(1).map(linePart => parseInt(linePart, 10)); // explicit radix: always decimal
                line = lineParts[0];
            }
            const channelFileParts = line.split('/');
@@ -66,10 +66,11 @@ Progress: <span id="progress"></span> channels
            var channelFileDom = document.createElement('li');
            var a = createA(channelFile, `${channelHref}/${channelFile}`);
            channelFileDom.appendChild(a);
-            if (timestamp != null) {
+            const id = channelFileParts[2];
+            for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) {
                const space = document.createTextNode('\u00A0');
                channelFileDom.appendChild(space);
-                const id = channelFileParts[2];
+                const timestamp = timestamps[timestampsIndex];
                var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
                channelFileDom.appendChild(a);
            }
@@ -1,6 +1,6 @@
 #!/usr/bin/python3

-import sys, time, fcntl, os, zipfile, webvtt
+import sys, time, fcntl, os, zipfile, webvtt, re
 from io import StringIO

 path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'
@@ -44,15 +44,19 @@ for fileIndex, file in enumerate(files):
                content = f.read().decode('utf-8')
                stringIOf = StringIO(content)
                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
-                messagePosition = wholeCaption.find(message)
-                if messagePosition != -1:
-                    stringIOf = StringIO(content)
-                    for caption in webvtt.read_buffer(stringIOf):
-                        text = caption.text
-                        if messagePosition <= len(text):
-                            write(f'{toWrite}|{int(caption.start_in_seconds)}')
-                            break
-                        messagePosition -= len(text) + 1
+                messagePositions = [m.start() for m in re.finditer(f'(?={re.escape(message)})', wholeCaption)]
+                if messagePositions:  # lookahead above also reports overlapping occurrences
+                    timestamps = []
+                    for messagePosition in messagePositions:
+                        stringIOf = StringIO(content)
+                        for caption in webvtt.read_buffer(stringIOf):
+                            text = caption.text
+                            if messagePosition <= len(text):
+                                timestamp = str(int(caption.start_in_seconds))
+                                timestamps.append(timestamp)
+                                break
+                            messagePosition -= len(text) + 1  # +1 for the ' ' separator used to build wholeCaption
+                    write(f'{toWrite}|{"|".join(timestamps)}')
            else:
                for line in f.readlines():
                    if message in str(line):