Make search.py
search across displayed captions.
Otherwise `Linux, is in millions of computers` doesn't match the caption of [`o8NPllzkFhE`](https://www.youtube.com/watch?v=o8NPllzkFhE), which is not automatically generated. Not to be confused with the search across captions, which already worked, for instance with `is in millions of computers, it`.
This commit is contained in:
parent
9d433ba2f3
commit
3bba97e90c
@ -29,6 +29,9 @@ def write(s):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
sys.exit(e)
|
sys.exit(e)
|
||||||
|
|
||||||
|
def cleanCaption(caption):
|
||||||
|
return caption.replace('\n', ' ')
|
||||||
|
|
||||||
# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
|
# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
|
||||||
files = [file for file in os.listdir(path) if file.endswith('.zip')]
|
files = [file for file in os.listdir(path) if file.endswith('.zip')]
|
||||||
for fileIndex, file in enumerate(files):
|
for fileIndex, file in enumerate(files):
|
||||||
@ -43,14 +46,14 @@ for fileIndex, file in enumerate(files):
|
|||||||
if endsWithVtt:
|
if endsWithVtt:
|
||||||
content = f.read().decode('utf-8')
|
content = f.read().decode('utf-8')
|
||||||
stringIOf = StringIO(content)
|
stringIOf = StringIO(content)
|
||||||
wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
|
wholeCaption = ' '.join([cleanCaption(caption.text) for caption in webvtt.read_buffer(stringIOf)])
|
||||||
messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
|
messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
|
||||||
if messagePositions != []:
|
if messagePositions != []:
|
||||||
timestamps = []
|
timestamps = []
|
||||||
for messagePosition in messagePositions:
|
for messagePosition in messagePositions:
|
||||||
stringIOf = StringIO(content)
|
stringIOf = StringIO(content)
|
||||||
for caption in webvtt.read_buffer(stringIOf):
|
for caption in webvtt.read_buffer(stringIOf):
|
||||||
text = caption.text
|
text = cleanCaption(caption.text)
|
||||||
if messagePosition <= len(text):
|
if messagePosition <= len(text):
|
||||||
timestamp = str(int(caption.start_in_seconds))
|
timestamp = str(int(caption.start_in_seconds))
|
||||||
timestamps += [timestamp]
|
timestamps += [timestamp]
|
||||||
|
Loading…
Reference in New Issue
Block a user