From 3c4664a4b18dd23c85c7b134fef97802f0d1e5ed Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Fri, 10 Feb 2023 20:03:08 +0100
Subject: [PATCH] Fix #13: Add captions extraction

I was about to commit the following in addition:

```c++
// Videos that have automatically generated captions but are set to `Off` by default aren't retrieved with `--sub-langs '.*orig'`.
// My workaround is to first call the YouTube Data API v3 Captions: list endpoint with `part=snippet` and retrieve the language that has `"trackKind": "asr"` (automatic speech recognition) in `snippet`.
/*json data = getJson(threadId, "captions?part=snippet&videoId=" + videoId, true, channelToTreat),
     items = data["items"];
for(const auto& item : items)
{
    json snippet = item["snippet"];
    if(snippet["trackKind"] == "asr")
    {
        string language = snippet["language"];
        cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '" + language + "-orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
        exec(threadId, cmd);
        // As there should be a single automatic speech recognition track, there is no need to go through the remaining tracks.
        break;
    }
}*/
```

Instead of:

```c++
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
exec(threadId, cmd);
```

But I realized that the GitHub comment I was about to add to https://github.com/yt-dlp/yt-dlp/issues/2655 was wrong. The comment I was replying to:

> `yt-dlp --cookies cookies.txt --sub-langs 'en.*,.*orig' --write-auto-subs https://www.youtube.com/watch?v=tQqDBySHYlc` work as expected. Many thanks again.
>
> ```
> 'subtitleslangs': ['en.*','.*orig'],
> 'writeautomaticsub': True,
> ```
>
> Work as expected too. Thank you
>
> Very sorry for the video sample. I even not watched it.

The reply I was about to add:

Thank you for this workaround. However, note that videos that have automatically generated subtitles but are set to `Off` by default aren't retrieved with your method (example of such a video: [`mozyXsZJnQ4`](https://www.youtube.com/watch?v=mozyXsZJnQ4)). My workaround is to first call the [YouTube Data API v3](https://developers.google.com/youtube/v3) [Captions: list](https://developers.google.com/youtube/v3/docs/captions/list) endpoint with [`part=snippet`](https://developers.google.com/youtube/v3/docs/captions/list#part) and retrieve the [`language`](https://developers.google.com/youtube/v3/docs/captions#snippet.language) that has [`"trackKind": "asr"`](https://developers.google.com/youtube/v3/docs/captions#snippet.trackKind) (automatic speech recognition) in [`snippet`](https://developers.google.com/youtube/v3/docs/captions#snippet). A standalone sketch of this lookup follows the README diff below.
---
 README.md |  3 ++-
 main.cpp  | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 99ddc3b..2abdc09 100644
--- a/README.md
+++ b/README.md
@@ -15,8 +15,9 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
 Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
 
 ```sh
-sudo apt install nlohmann-json3-dev
+sudo apt install nlohmann-json3-dev yt-dlp
 make
+./youtubeCaptionsSearchEngine -h
 ```
 
 Except if you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
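As an aside for reviewers, the commented-out `Captions: list` lookup drafted in the commit message can be exercised on its own. The following is a minimal sketch, not part of the patch: it assumes nlohmann/json (which the project already depends on) and replaces the `getJson` call with an illustrative, trimmed-down response shaped like the documented `captions?part=snippet` output.

```c++
// Standalone illustration of the `Captions: list` lookup drafted above.
// The sample response below is hypothetical; a real one comes from
// `captions?part=snippet&videoId=...`.
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;
using namespace std;

int main()
{
    json data = json::parse(R"({"items": [
        {"snippet": {"language": "en", "trackKind": "standard"}},
        {"snippet": {"language": "en", "trackKind": "asr"}}
    ]})");
    for(const auto& item : data["items"])
    {
        json snippet = item["snippet"];
        // `asr` marks the automatic speech recognition track.
        if(snippet["trackKind"] == "asr")
        {
            // `yt-dlp` names the automatic track `<language>-orig`.
            cout << snippet["language"].get<string>() + "-orig" << endl;
            // There should be a single such track, so stop here.
            break;
        }
    }
    return 0;
}
```

Matching on the `asr` track kind rather than on a hard-coded language is what keeps the lookup language-agnostic across channels.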
diff --git a/main.cpp b/main.cpp
index d51318f..a2db4c9 100644
--- a/main.cpp
+++ b/main.cpp
@@ -25,7 +25,7 @@ void createDirectory(string path),
      deleteDirectory(string path),
      addChannelToTreat(unsigned short threadId, string channelId);
 string getHttps(string url),
-       exec(string cmd),
+       exec(unsigned short threadId, string cmd),
        join(vector<string> parts, string delimiter);
 size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
 bool doesFileExist(string filePath),
@@ -57,7 +57,9 @@ string CHANNELS_DIRECTORY = "channels/",
        KEYS_FILE_PATH = "keys.txt",
        UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
        apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line.
-       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API"; // Can be "https://yt.lemnoslife.com" for instance.
+       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
+       CAPTIONS_DIRECTORY = "captions/",
+       DEBUG_DIRECTORY = "debug/";
 bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
 
 int main(int argc, char *argv[])
@@ -177,13 +179,15 @@ void treatChannels(unsigned short threadId)
 
         string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
         createDirectory(channelToTreatDirectory);
+        createDirectory(DEBUG_DIRECTORY);
+        createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
 
         treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
 
         // Note that compressing the French most subscribers channel took 4 minutes and 42 seconds.
         PRINT("Starting compression...")
         // As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli.
-        exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
+        exec(threadId, "cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
 
         PRINT("Compression finished, started deleting initial directory...")
         deleteDirectory(channelToTreatDirectory);
@@ -559,6 +563,46 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
             }
         }
     }
+    // Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
+    string playlistToTreat = "UU" + channelToTreat.substr(2);
+    pageToken = "";
+    while(true)
+    {
+        json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
+        if(data.contains("error"))
+        {
+            EXIT_WITH_ERROR("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
+        }
+        json items = data["items"];
+        for(const auto& item : items)
+        {
+            string videoId = item["contentDetails"]["videoId"];
+            // Could proceed as follows by verifying `!isChannel` but as we don't know how to manage unlisted videos, we don't proceed this way.
+            //treatChannelOrVideo(threadId, false, videoId, channelToTreat);
+
+            string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
+            createDirectory(channelCaptionsToTreatDirectory);
+
+            // Firstly download all not automatically generated captions.
+            // The underscore in `-o` argument is used to not end up with hidden files.
+            string cmdCommonPrefix = "yt-dlp --skip-download ",
+                   cmdCommonPostfix = " '" + videoId + "' -o '" + channelCaptionsToTreatDirectory + "_'";
+            string cmd = cmdCommonPrefix + "--all-subs" + cmdCommonPostfix;
+            exec(threadId, cmd);
+
+            // Secondly download the automatically generated captions.
+            cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
+            exec(threadId, cmd);
+        }
+        if(data.contains("nextPageToken"))
+        {
+            pageToken = data["nextPageToken"];
+        }
+        else
+        {
+            break;
+        }
+    }
 }
 
 // This function verifies that the given hasn't already been treated.
@@ -609,8 +653,13 @@ string join(vector<string> parts, string delimiter)
     return result;
 }
 
-string exec(string cmd)
+string exec(unsigned short threadId, string cmd)
 {
+    ostringstream toString;
+    toString << threadId;
+    string threadIdStr = toString.str(), debugCommonFilePath = DEBUG_DIRECTORY + threadIdStr;
+    cmd += " >> " + debugCommonFilePath + ".out";
+    cmd += " 2>> " + debugCommonFilePath + ".err";
     array<char, 128> buffer;
     string result;
     unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
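The hunk above is cut off before the end of `exec`. For context, here is a self-contained sketch of the per-thread logging pattern the modified function follows: the command's stdout/stderr are appended to `debug/<threadId>.out`/`.err` while the process reads whatever still reaches the pipe. The body below assumes the classic `popen`/`unique_ptr` read loop; the project's actual remaining lines may differ.

```c++
// Sketch (assumed, not the project's exact code) of `exec` after this patch.
#include <array>
#include <cstdio>
#include <filesystem>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>

using namespace std;

string DEBUG_DIRECTORY = "debug/"; // Same constant as the one added to `main.cpp`.

string exec(unsigned short threadId, string cmd)
{
    ostringstream toString;
    toString << threadId;
    string debugCommonFilePath = DEBUG_DIRECTORY + toString.str();
    // `>>` appends, so all commands run by a given thread share one `.out`/`.err` pair.
    cmd += " >> " + debugCommonFilePath + ".out";
    cmd += " 2>> " + debugCommonFilePath + ".err";

    array<char, 128> buffer;
    string result;
    unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
    if(!pipe)
    {
        return "";
    }
    // With stdout redirected above, this loop usually reads nothing,
    // but it keeps `exec` returning a string as it did before the patch.
    while(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
    {
        result += buffer.data();
    }
    return result;
}

int main()
{
    filesystem::create_directory(DEBUG_DIRECTORY); // The patch also creates it per run.
    exec(0, "echo hello"); // `hello` ends up in `debug/0.out`.
    return 0;
}
```

One consequence of the redirection worth noting: the string returned by `exec` is now normally empty, since the subprocess's output goes to the debug files instead of the pipe; presumably the return type is kept so existing call sites keep compiling.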