Fix #31: List all occurrences of the search term within video captions
This commit is contained in:
		| @@ -42,10 +42,10 @@ Progress: <span id="progress"></span> channels | ||||
|             document.getElementById('progress').innerHTML = line.replace('progress:', ''); | ||||
|         } else { | ||||
|             var channelsDom = document.getElementById('channels'); | ||||
|             var timestamp = null; | ||||
|             if (line.includes('|')) { | ||||
|                 const lineParts = line.split('|'); | ||||
|                 timestamp = parseInt(lineParts[1]); | ||||
|             var timestamp = []; | ||||
|             const lineParts = line.split('|'); | ||||
|             if (lineParts.length > 0) { | ||||
|                 timestamps = lineParts.slice(1).map(linePart => parseInt(linePart)); | ||||
|                 line = lineParts[0]; | ||||
|             } | ||||
|             const channelFileParts = line.split('/'); | ||||
| @@ -66,10 +66,11 @@ Progress: <span id="progress"></span> channels | ||||
|             var channelFileDom = document.createElement('li'); | ||||
|             var a = createA(channelFile, `${channelHref}/${channelFile}`); | ||||
|             channelFileDom.appendChild(a); | ||||
|             if (timestamp != null) { | ||||
|             const id = channelFileParts[2]; | ||||
|             for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) { | ||||
|                 const space = document.createTextNode('\u00A0'); | ||||
|                 channelFileDom.appendChild(space); | ||||
|                 const id = channelFileParts[2]; | ||||
|                 const timestamp = timestamps[timestampsIndex]; | ||||
|                 var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`); | ||||
|                 channelFileDom.appendChild(a); | ||||
|             } | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| #!/usr/bin/python3 | ||||
|  | ||||
| import sys, time, fcntl, os, zipfile, webvtt | ||||
| import sys, time, fcntl, os, zipfile, webvtt, re | ||||
| from io import StringIO | ||||
|  | ||||
| path = '/mnt/HDD0/YouTube_captions_search_engine/channels/' | ||||
| @@ -44,15 +44,19 @@ for fileIndex, file in enumerate(files): | ||||
|                 content = f.read().decode('utf-8') | ||||
|                 stringIOf = StringIO(content) | ||||
|                 wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) | ||||
|                 messagePosition = wholeCaption.find(message) | ||||
|                 if messagePosition != -1: | ||||
|                     stringIOf = StringIO(content) | ||||
|                     for caption in webvtt.read_buffer(stringIOf): | ||||
|                         text = caption.text | ||||
|                         if messagePosition <= len(text): | ||||
|                             write(f'{toWrite}|{int(caption.start_in_seconds)}') | ||||
|                             break | ||||
|                         messagePosition -= len(text) + 1 | ||||
|                 messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)] | ||||
|                 if messagePositions != []: | ||||
|                     timestamps = [] | ||||
|                     for messagePosition in messagePositions: | ||||
|                         stringIOf = StringIO(content) | ||||
|                         for caption in webvtt.read_buffer(stringIOf): | ||||
|                             text = caption.text | ||||
|                             if messagePosition <= len(text): | ||||
|                                 timestamp = str(int(caption.start_in_seconds)) | ||||
|                                 timestamps += [timestamp] | ||||
|                                 break | ||||
|                             messagePosition -= len(text) + 1 | ||||
|                     write(f'{toWrite}|{"|".join(timestamps)}') | ||||
|             else: | ||||
|                 for line in f.readlines(): | ||||
|                     if message in str(line): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user