Make search.py search across displayed captions.
				
					
				
			Otherwise `Linux, is in millions of computers` doesn't match the non-automatically-generated caption of [`o8NPllzkFhE`](https://www.youtube.com/watch?v=o8NPllzkFhE). Not to be confused with the search across captions, which already worked with, for instance, `is in millions of computers, it`.
This commit is contained in:
		| @@ -29,6 +29,9 @@ def write(s): | ||||
|     except Exception as e: | ||||
|         sys.exit(e) | ||||
|  | ||||
| def cleanCaption(caption): | ||||
|     return caption.replace('\n', ' ') | ||||
|  | ||||
| # As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity. | ||||
| files = [file for file in os.listdir(path) if file.endswith('.zip')] | ||||
| for fileIndex, file in enumerate(files): | ||||
| @@ -43,14 +46,14 @@ for fileIndex, file in enumerate(files): | ||||
|             if endsWithVtt: | ||||
|                 content = f.read().decode('utf-8') | ||||
|                 stringIOf = StringIO(content) | ||||
|                 wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) | ||||
|                 wholeCaption = ' '.join([cleanCaption(caption.text) for caption in webvtt.read_buffer(stringIOf)]) | ||||
|                 messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)] | ||||
|                 if messagePositions != []: | ||||
|                     timestamps = [] | ||||
|                     for messagePosition in messagePositions: | ||||
|                         stringIOf = StringIO(content) | ||||
|                         for caption in webvtt.read_buffer(stringIOf): | ||||
|                             text = caption.text | ||||
|                             text = cleanCaption(caption.text) | ||||
|                             if messagePosition <= len(text): | ||||
|                                 timestamp = str(int(caption.start_in_seconds)) | ||||
|                                 timestamps += [timestamp] | ||||
|   | ||||
		Reference in New Issue
	
	Block a user