Make search.py search across displayed captions.

Otherwise `Linux, is in millions of computers` doesn't match the caption of [`o8NPllzkFhE`](https://www.youtube.com/watch?v=o8NPllzkFhE) that is not automatically generated. Not to be confused with the search across captions, which already worked, for instance with `is in millions of computers, it`.
This commit is contained in:
Benjamin Loison 2023-02-24 14:46:00 +01:00
parent 0278b77667
commit cba2535d97

View File

@ -29,6 +29,9 @@ def write(s):
except Exception as e: except Exception as e:
sys.exit(e) sys.exit(e)
def cleanCaption(caption):
    """Return `caption` with every newline replaced by a single space.

    A caption's text may span several lines; flattening it lets a search
    message match across those line breaks. Each newline is swapped for
    exactly one space, so the length of the text is unchanged.
    """
    return caption.replace('\n', ' ')
# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity. # As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
files = [file for file in os.listdir(path) if file.endswith('.zip')] files = [file for file in os.listdir(path) if file.endswith('.zip')]
for fileIndex, file in enumerate(files): for fileIndex, file in enumerate(files):
@ -43,14 +46,14 @@ for fileIndex, file in enumerate(files):
if endsWithVtt: if endsWithVtt:
content = f.read().decode('utf-8') content = f.read().decode('utf-8')
stringIOf = StringIO(content) stringIOf = StringIO(content)
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) wholeCaption = ' '.join([cleanCaption(caption.text) for caption in webvtt.read_buffer(stringIOf)])
messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)] messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
if messagePositions != []: if messagePositions != []:
timestamps = [] timestamps = []
for messagePosition in messagePositions: for messagePosition in messagePositions:
stringIOf = StringIO(content) stringIOf = StringIO(content)
for caption in webvtt.read_buffer(stringIOf): for caption in webvtt.read_buffer(stringIOf):
text = caption.text text = cleanCaption(caption.text)
if messagePosition <= len(text): if messagePosition <= len(text):
timestamp = str(int(caption.start_in_seconds)) timestamp = str(int(caption.start_in_seconds))
timestamps += [timestamp] timestamps += [timestamp]