From e166fdb4e59c061995014733cc1a91251f7cf676 Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Tue, 14 Feb 2023 02:56:11 +0100 Subject: [PATCH] Fix #31: List all occurrences of search within video captions --- website/index.php | 13 +++++++------ website/search.py | 24 ++++++++++++++---------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/website/index.php b/website/index.php index 6c79161..21eb0b6 100644 --- a/website/index.php +++ b/website/index.php @@ -42,10 +42,10 @@ Progress: channels document.getElementById('progress').innerHTML = line.replace('progress:', ''); } else { var channelsDom = document.getElementById('channels'); - var timestamp = null; - if (line.includes('|')) { - const lineParts = line.split('|'); - timestamp = parseInt(lineParts[1]); + var timestamps = []; + const lineParts = line.split('|'); + if (lineParts.length > 1) { + timestamps = lineParts.slice(1).map(linePart => parseInt(linePart, 10)); line = lineParts[0]; } const channelFileParts = line.split('/'); @@ -66,10 +66,11 @@ Progress: channels var channelFileDom = document.createElement('li'); var a = createA(channelFile, `${channelHref}/${channelFile}`); channelFileDom.appendChild(a); - if (timestamp != null) { + const id = channelFileParts[2]; + for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) { const space = document.createTextNode('\u00A0'); channelFileDom.appendChild(space); - const id = channelFileParts[2]; + const timestamp = timestamps[timestampsIndex]; var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`); channelFileDom.appendChild(a); } diff --git a/website/search.py b/website/search.py index d65cd0c..710720c 100755 --- a/website/search.py +++ b/website/search.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -import sys, time, fcntl, os, zipfile, webvtt +import sys, time, fcntl, os, zipfile, webvtt, re from io import StringIO path = '/mnt/HDD0/YouTube_captions_search_engine/channels/' @@ -44,15 +44,19 
@@ for fileIndex, file in enumerate(files): content = f.read().decode('utf-8') stringIOf = StringIO(content) wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) - messagePosition = wholeCaption.find(message) - if messagePosition != -1: - stringIOf = StringIO(content) - for caption in webvtt.read_buffer(stringIOf): - text = caption.text - if messagePosition <= len(text): - write(f'{toWrite}|{int(caption.start_in_seconds)}') - break - messagePosition -= len(text) + 1 + messagePositions = [m.start() for m in re.finditer(f'(?={re.escape(message)})', wholeCaption)] + if messagePositions != []: + timestamps = [] + for messagePosition in messagePositions: + stringIOf = StringIO(content) + for caption in webvtt.read_buffer(stringIOf): + text = caption.text + if messagePosition <= len(text): + timestamp = str(int(caption.start_in_seconds)) + timestamps += [timestamp] + break + messagePosition -= len(text) + 1 + write(f'{toWrite}|{"|".join(timestamps)}') else: for line in f.readlines(): if message in str(line):