From 21ad878be89e1d2950e66e791b0dd0185012ed0c Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Tue, 14 Feb 2023 02:00:23 +0100 Subject: [PATCH] Fix #31: Make a website with a search engine notably based on the captions extracted --- website/index.php | 15 ++++++++++++++- website/search.py | 16 ++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/website/index.php b/website/index.php index 141f6b0..6c79161 100644 --- a/website/index.php +++ b/website/index.php @@ -17,7 +17,7 @@ Access raw data with: . -Progress: +Progress: channels @@ -42,6 +42,12 @@ Progress: document.getElementById('progress').innerHTML = line.replace('progress:', ''); } else { var channelsDom = document.getElementById('channels'); + var timestamp = null; + if (line.includes('|')) { + const lineParts = line.split('|'); + timestamp = parseInt(lineParts[1]); + line = lineParts[0]; + } const channelFileParts = line.split('/'); const channel = channelFileParts[0]; const channelFile = channelFileParts.slice(1).join('/'); @@ -60,6 +66,13 @@ Progress: var channelFileDom = document.createElement('li'); var a = createA(channelFile, `${channelHref}/${channelFile}`); channelFileDom.appendChild(a); + if (timestamp != null) { + const space = document.createTextNode('\u00A0'); + channelFileDom.appendChild(space); + const id = channelFileParts[2]; + var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`); + channelFileDom.appendChild(a); + } channelFilesDom.appendChild(channelFileDom); } } diff --git a/website/search.py b/website/search.py index c0ca211..d65cd0c 100755 --- a/website/search.py +++ b/website/search.py @@ -41,10 +41,18 @@ for fileIndex, file in enumerate(files): with zip.open(fileInZip) as f: toWrite = f'{file}/{fileInZip}' if endsWithVtt: - content = StringIO(f.read().decode('utf-8')) - wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)]) - if message in wholeCaption: - write(toWrite) + content = f.read().decode('utf-8') + stringIOf = StringIO(content) + wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)]) + messagePosition = wholeCaption.find(message) + if messagePosition != -1: + stringIOf = StringIO(content) + for caption in webvtt.read_buffer(stringIOf): + text = caption.text + if messagePosition <= len(text): + write(f'{toWrite}|{int(caption.start_in_seconds)}') + break + messagePosition -= len(text) + 1 else: for line in f.readlines(): if message in str(line):