From 09f7675bf73e1291ed84d72d4a5ff1b50a001151 Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Tue, 14 Feb 2023 01:32:36 +0100 Subject: [PATCH] #31: Make search within captions not limited by line wrapping --- README.md | 6 ++++++ website/search.py | 24 ++++++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 2abdc09..5052d0f 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,12 @@ make ./youtubeCaptionsSearchEngine -h ``` +If you plan to use the front-end website, also run: + +```sh +pip install webvtt-py +``` + Except if you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api). Except if you provide the argument `--no-keys`, you have to provide at least one [YouTube Data API v3 key](https://developers.google.com/youtube/v3/getting-started) in `keys.txt`. diff --git a/website/search.py b/website/search.py index f506884..c0ca211 100755 --- a/website/search.py +++ b/website/search.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 -import sys, time, fcntl, os, zipfile +import sys, time, fcntl, os, zipfile, webvtt +from io import StringIO path = '/mnt/HDD0/YouTube_captions_search_engine/channels/' @@ -34,14 +35,21 @@ for fileIndex, file in enumerate(files): write(f'progress:{fileIndex + 1} / {len(files)}') zip = zipfile.ZipFile(path + file) for fileInZip in zip.namelist(): - if searchOnlyCaptions and not fileInZip.endswith('.vtt'): + endsWithVtt = fileInZip.endswith('.vtt') + if searchOnlyCaptions and not endsWithVtt: continue - f = zip.open(fileInZip) - for line in f.readlines(): - if message in str(line): - write(f'{file}/{fileInZip}') - break - f.close() + with zip.open(fileInZip) as f: + toWrite = f'{file}/{fileInZip}' + if endsWithVtt: + content = StringIO(f.read().decode('utf-8')) + wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(content)]) + if message in wholeCaption: + write(toWrite) + else: + for line in f.readlines(): + if message in str(line): + write(toWrite) + break f = open(clientFilePath) while True: