Fix #13: Add captions extraction

In addition, I was about to commit: ```c++ // Videos with automatically generated captions that are set to `Off` by default aren't retrieved with `--sub-langs '.*orig'`. // My workaround is to first call the YouTube Data API v3 Captions: list endpoint with `part=snippet` and retrieve the language that has `"trackKind": "asr"` (automatic speech recognition) in `snippet`. /*json data = getJson(threadId, "captions?part=snippet&videoId=" + videoId, true, channelToTreat), items = data["items"]; for(const auto& item : items) { json snippet = item["snippet"]; if(snippet["trackKind"] == "asr") { string language = snippet["language"]; cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '" + language + "-orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix; exec(threadId, cmd); // As there should be a single automatic speech recognized track, there is no need to go through all tracks. break; } }*/ ``` Instead of: ```c++ cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix; exec(threadId, cmd); ``` But I realized that, like the GitHub comment I was about to add to https://github.com/yt-dlp/yt-dlp/issues/2655, I was wrong: > `yt-dlp --cookies cookies.txt --sub-langs 'en.*,.*orig' --write-auto-subs https://www.youtube.com/watch?v=tQqDBySHYlc` work as expected. Many thanks again. > > ``` > 'subtitleslangs': ['en.*','.*orig'], > 'writeautomaticsub': True, > ``` > > Work as expected too. Thank you > > Very sorry for the video sample. I even not watched it. Thank you for this workaround. However note that videos having automatically generated subtitles but being set to `Off` by default aren't retrieved with your method (example of such video: [`mozyXsZJnQ4`](https://www.youtube.com/watch?v=mozyXsZJnQ4)). 
My workaround is to first call [YouTube Data API v3](https://developers.google.com/youtube/v3) [Captions: list](https://developers.google.com/youtube/v3/docs/captions/list) endpoint with [`part=snippet`](https://developers.google.com/youtube/v3/docs/captions/list#part) and retrieve the [`language`](https://developers.google.com/youtube/v3/docs/captions#snippet.language) that has [`"trackKind": "asr"`](https://developers.google.com/youtube/v3/docs/captions#snippet.trackKind) (automatic speech recognition) in [`snippet`](https://developers.google.com/youtube/v3/docs/captions#snippet).
2023-02-10 20:03:08 +01:00
parent 7fcc8b09fa
commit 3c4664a4b1
2 changed files with 55 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -15,8 +15,9 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
 Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
 ```sh
-sudo apt install nlohmann-json3-dev
+sudo apt install nlohmann-json3-dev yt-dlp
 make
 ./youtubeCaptionsSearchEngine -h
 ```
 Except if you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
--- a/main.cpp
+++ b/main.cpp
@@ -25,7 +25,7 @@ void createDirectory(string path),
     deleteDirectory(string path),
     addChannelToTreat(unsigned short threadId, string channelId);
 string getHttps(string url),
-       exec(string cmd),
+       exec(unsigned short threadId, string cmd),
       join(vector<string> parts, string delimiter);
 size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
 bool doesFileExist(string filePath),
@@ -57,7 +57,9 @@ string CHANNELS_DIRECTORY = "channels/",
       KEYS_FILE_PATH = "keys.txt",
       UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
       apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line.
-       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API"; // Can be "https://yt.lemnoslife.com" for instance.
+       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
       CAPTIONS_DIRECTORY = "captions/",
       DEBUG_DIRECTORY = "debug/";
 bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
 int main(int argc, char *argv[])
@@ -177,13 +179,15 @@ void treatChannels(unsigned short threadId)
        string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
        createDirectory(channelToTreatDirectory);
        createDirectory(DEBUG_DIRECTORY);
        createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
        treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
        // Note that compressing the French most subscribers channel took 4 minutes and 42 seconds.
        PRINT("Starting compression...")
        // As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
-        exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
+        exec(threadId, "cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
        PRINT("Compression finished, started deleting initial directory...")
        deleteDirectory(channelToTreatDirectory);
@@ -559,6 +563,46 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
            }
        }
    }
    // Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
    string playlistToTreat = "UU" + channelToTreat.substr(2);
    pageToken = "";
    while(true)
    {
        json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
        if(data.contains("error"))
        {
            EXIT_WITH_ERROR("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
        }
        json items = data["items"];
        for(const auto& item : items)
        {
            string videoId = item["contentDetails"]["videoId"];
            // Could proceed as follows by verifying `!isChannel` but as we don't know how to manage unlisted videos, we don't proceed this way.
            //treatChannelOrVideo(threadId, false, videoId, channelToTreat);
            string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
            createDirectory(channelCaptionsToTreatDirectory);
            // Firstly download all not automatically generated captions.
            // The underscore in `-o` argument is used to not end up with hidden files.
            string cmdCommonPrefix = "yt-dlp --skip-download ",
                   cmdCommonPostfix = " '" + videoId + "' -o '" + channelCaptionsToTreatDirectory + "_'";
            string cmd = cmdCommonPrefix + "--all-subs" + cmdCommonPostfix;
            exec(threadId, cmd);
            // Secondly download the automatically generated captions.
            cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
            exec(threadId, cmd);
        }
        if(data.contains("nextPageToken"))
        {
            pageToken = data["nextPageToken"];
        }
        else
        {
            break;
        }
    }
 }
 // This function verifies that the given hasn't already been treated.
@@ -609,8 +653,13 @@ string join(vector<string> parts, string delimiter)
    return result;
 }
-string exec(string cmd)
+string exec(unsigned short threadId, string cmd)
 {
    ostringstream toString;
    toString << threadId;
    string threadIdStr = toString.str(), debugCommonFilePath = DEBUG_DIRECTORY + threadIdStr;
    cmd += " >> " + debugCommonFilePath + ".out";
    cmd += " 2>> " + debugCommonFilePath + ".err";
    array<char, 128> buffer;
    string result;
    unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);