From cba2535d970b30052e2d517ea822ccb6d04f2084 Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Fri, 24 Feb 2023 14:46:00 +0100 Subject: [PATCH] Make `search.py` search across displayed captions. Otherwise `Linux, is in millions of computers` doesn't match the not automatically generated caption of [`o8NPllzkFhE`](https://www.youtube.com/watch?v=o8NPllzkFhE). Not to be confused with the search across captions, which already worked with, for instance, `is in millions of computers, it`. --- website/search.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/website/search.py b/website/search.py index 710720c..6d81556 100755 --- a/website/search.py +++ b/website/search.py @@ -29,6 +29,9 @@ def write(s): except Exception as e: sys.exit(e) +def cleanCaption(caption): + return caption.replace('\n', ' ') + # As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity. files = [file for file in os.listdir(path) if file.endswith('.zip')] for fileIndex, file in enumerate(files): @@ -43,14 +46,14 @@ for fileIndex, file in enumerate(files): if endsWithVtt: content = f.read().decode('utf-8') stringIOf = StringIO(content) - wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) + wholeCaption = ' '.join([cleanCaption(caption.text) for caption in webvtt.read_buffer(stringIOf)]) messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)] if messagePositions != []: timestamps = [] for messagePosition in messagePositions: stringIOf = StringIO(content) for caption in webvtt.read_buffer(stringIOf): - text = caption.text + text = cleanCaption(caption.text) if messagePosition <= len(text): timestamp = str(int(caption.start_in_seconds)) timestamps += [timestamp]