Fix #31: List all occurrences of the search term within video captions
This commit is contained in:
		| @@ -42,10 +42,10 @@ Progress: <span id="progress"></span> channels | |||||||
|             document.getElementById('progress').innerHTML = line.replace('progress:', ''); |             document.getElementById('progress').innerHTML = line.replace('progress:', ''); | ||||||
|         } else { |         } else { | ||||||
|             var channelsDom = document.getElementById('channels'); |             var channelsDom = document.getElementById('channels'); | ||||||
|             var timestamp = null; |             var timestamp = []; | ||||||
|             if (line.includes('|')) { |             const lineParts = line.split('|'); | ||||||
|                 const lineParts = line.split('|'); |             if (lineParts.length > 0) { | ||||||
|                 timestamp = parseInt(lineParts[1]); |                 timestamps = lineParts.slice(1).map(linePart => parseInt(linePart)); | ||||||
|                 line = lineParts[0]; |                 line = lineParts[0]; | ||||||
|             } |             } | ||||||
|             const channelFileParts = line.split('/'); |             const channelFileParts = line.split('/'); | ||||||
| @@ -66,10 +66,11 @@ Progress: <span id="progress"></span> channels | |||||||
|             var channelFileDom = document.createElement('li'); |             var channelFileDom = document.createElement('li'); | ||||||
|             var a = createA(channelFile, `${channelHref}/${channelFile}`); |             var a = createA(channelFile, `${channelHref}/${channelFile}`); | ||||||
|             channelFileDom.appendChild(a); |             channelFileDom.appendChild(a); | ||||||
|             if (timestamp != null) { |             const id = channelFileParts[2]; | ||||||
|  |             for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) { | ||||||
|                 const space = document.createTextNode('\u00A0'); |                 const space = document.createTextNode('\u00A0'); | ||||||
|                 channelFileDom.appendChild(space); |                 channelFileDom.appendChild(space); | ||||||
|                 const id = channelFileParts[2]; |                 const timestamp = timestamps[timestampsIndex]; | ||||||
|                 var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`); |                 var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`); | ||||||
|                 channelFileDom.appendChild(a); |                 channelFileDom.appendChild(a); | ||||||
|             } |             } | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| #!/usr/bin/python3 | #!/usr/bin/python3 | ||||||
|  |  | ||||||
| import sys, time, fcntl, os, zipfile, webvtt | import sys, time, fcntl, os, zipfile, webvtt, re | ||||||
| from io import StringIO | from io import StringIO | ||||||
|  |  | ||||||
| path = '/mnt/HDD0/YouTube_captions_search_engine/channels/' | path = '/mnt/HDD0/YouTube_captions_search_engine/channels/' | ||||||
| @@ -44,15 +44,19 @@ for fileIndex, file in enumerate(files): | |||||||
|                 content = f.read().decode('utf-8') |                 content = f.read().decode('utf-8') | ||||||
|                 stringIOf = StringIO(content) |                 stringIOf = StringIO(content) | ||||||
|                 wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) |                 wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) | ||||||
|                 messagePosition = wholeCaption.find(message) |                 messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)] | ||||||
|                 if messagePosition != -1: |                 if messagePositions != []: | ||||||
|                     stringIOf = StringIO(content) |                     timestamps = [] | ||||||
|                     for caption in webvtt.read_buffer(stringIOf): |                     for messagePosition in messagePositions: | ||||||
|                         text = caption.text |                         stringIOf = StringIO(content) | ||||||
|                         if messagePosition <= len(text): |                         for caption in webvtt.read_buffer(stringIOf): | ||||||
|                             write(f'{toWrite}|{int(caption.start_in_seconds)}') |                             text = caption.text | ||||||
|                             break |                             if messagePosition <= len(text): | ||||||
|                         messagePosition -= len(text) + 1 |                                 timestamp = str(int(caption.start_in_seconds)) | ||||||
|  |                                 timestamps += [timestamp] | ||||||
|  |                                 break | ||||||
|  |                             messagePosition -= len(text) + 1 | ||||||
|  |                     write(f'{toWrite}|{"|".join(timestamps)}') | ||||||
|             else: |             else: | ||||||
|                 for line in f.readlines(): |                 for line in f.readlines(): | ||||||
|                     if message in str(line): |                     if message in str(line): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user