From 3bba97e90cd15289f5a5f76bfe608444d13e176b Mon Sep 17 00:00:00 2001
From: Benjamin Loison <Benjamin_Loison@users.noreply.gitea.lemnoslife.com>
Date: Fri, 24 Feb 2023 14:46:00 +0100
Subject: [PATCH] Make `search.py` search across displayed captions.

Otherwise `Linux, is in millions of computers` doesn't match the manually written (not automatically generated) caption of [`o8NPllzkFhE`](https://www.youtube.com/watch?v=o8NPllzkFhE). Not to be confused with the search across captions, which already worked with, for instance, `is in millions of computers, it`.
---
 website/search.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/website/search.py b/website/search.py
index 710720c..6d81556 100755
--- a/website/search.py
+++ b/website/search.py
@@ -29,6 +29,9 @@ def write(s):
     except Exception as e:
         sys.exit(e)
 
+def cleanCaption(caption):
+    return caption.replace('\n', ' ')
+
 # As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
 files = [file for file in os.listdir(path) if file.endswith('.zip')]
 for fileIndex, file in enumerate(files):
@@ -43,14 +46,14 @@ for fileIndex, file in enumerate(files):
             if endsWithVtt:
                 content = f.read().decode('utf-8')
                 stringIOf = StringIO(content)
-                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
+                wholeCaption = ' '.join([cleanCaption(caption.text) for caption in webvtt.read_buffer(stringIOf)])
                 messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
                 if messagePositions != []:
                     timestamps = []
                     for messagePosition in messagePositions:
                         stringIOf = StringIO(content)
                         for caption in webvtt.read_buffer(stringIOf):
-                            text = caption.text
+                            text = cleanCaption(caption.text)
                             if messagePosition <= len(text):
                                 timestamp = str(int(caption.start_in_seconds))
                                 timestamps += [timestamp]