From 3c4664a4b18dd23c85c7b134fef97802f0d1e5ed Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Fri, 10 Feb 2023 20:03:08 +0100
Subject: [PATCH] Fix #13: Add captions extraction

I was about to commit the following in addition:

```c++
// Videos that have automatically generated captions but are set to `Off` by default aren't retrieved with `--sub-langs '.*orig'`.
// My workaround is to first call the YouTube Data API v3 Captions: list endpoint with `part=snippet` and retrieve the language that has `"trackKind": "asr"` (automatic speech recognition) in `snippet`.
/*json data = getJson(threadId, "captions?part=snippet&videoId=" + videoId, true, channelToTreat),
     items = data["items"];
for(const auto& item : items)
{
    json snippet = item["snippet"];
    if(snippet["trackKind"] == "asr")
    {
        string language = snippet["language"];
        cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '" + language + "-orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
        exec(threadId, cmd);
        // As there should be a single automatic speech recognition track, there is no need to go through the remaining tracks.
        break;
    }
}*/
```

Instead of:

```c++
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
exec(threadId, cmd);
```

But I realized that the GitHub comment I was about to add to https://github.com/yt-dlp/yt-dlp/issues/2655 was wrong. The comment I was replying to:

> `yt-dlp --cookies cookies.txt --sub-langs 'en.*,.*orig' --write-auto-subs https://www.youtube.com/watch?v=tQqDBySHYlc` work as expected. Many thanks again.
>
> ```
> 'subtitleslangs': ['en.*','.*orig'],
> 'writeautomaticsub': True,
> ```
>
> Work as expected too. Thank you
>
> Very sorry for the video sample. I even not watched it.

The reply I was about to add:

Thank you for this workaround. However, note that videos that have automatically generated subtitles but are set to `Off` by default aren't retrieved with your method (example of such a video: [`mozyXsZJnQ4`](https://www.youtube.com/watch?v=mozyXsZJnQ4)). My workaround is to first call the [YouTube Data API v3](https://developers.google.com/youtube/v3) [Captions: list](https://developers.google.com/youtube/v3/docs/captions/list) endpoint with [`part=snippet`](https://developers.google.com/youtube/v3/docs/captions/list#part) and retrieve the [`language`](https://developers.google.com/youtube/v3/docs/captions#snippet.language) that has [`"trackKind": "asr"`](https://developers.google.com/youtube/v3/docs/captions#snippet.trackKind) (automatic speech recognition) in [`snippet`](https://developers.google.com/youtube/v3/docs/captions#snippet). A standalone sketch of this lookup follows the README diff below.
---
 README.md |  3 ++-
 main.cpp  | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 99ddc3b..2abdc09 100644
--- a/README.md
+++ b/README.md
@@ -15,8 +15,9 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
 Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
 
 ```sh
-sudo apt install nlohmann-json3-dev
+sudo apt install nlohmann-json3-dev yt-dlp
 make
+./youtubeCaptionsSearchEngine -h
 ```
 
 Except if you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
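As an aside for reviewers, the commented-out `Captions: list` lookup drafted in the commit message can be exercised on its own. The following is a minimal sketch, not part of the patch: it assumes nlohmann/json (which the project already depends on) and replaces the `getJson` call with an illustrative, trimmed-down response shaped like the documented `captions?part=snippet` output.

```c++
// Standalone illustration of the `Captions: list` lookup drafted above.
// The sample response below is hypothetical; a real one comes from
// `captions?part=snippet&videoId=...`.
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;
using namespace std;

int main()
{
    json data = json::parse(R"({"items": [
        {"snippet": {"language": "en", "trackKind": "standard"}},
        {"snippet": {"language": "en", "trackKind": "asr"}}
    ]})");
    for(const auto& item : data["items"])
    {
        json snippet = item["snippet"];
        // `asr` marks the automatic speech recognition track.
        if(snippet["trackKind"] == "asr")
        {
            // `yt-dlp` names the automatic track `<language>-orig`.
            cout << snippet["language"].get<string>() + "-orig" << endl;
            // There should be a single such track, so stop here.
            break;
        }
    }
    return 0;
}
```

Matching on the `asr` track kind rather than on a hard-coded language is what keeps the lookup language-agnostic across channels.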
diff --git a/main.cpp b/main.cpp
index d51318f..a2db4c9 100644
--- a/main.cpp
+++ b/main.cpp
@@ -25,7 +25,7 @@ void createDirectory(string path),
      deleteDirectory(string path),
      addChannelToTreat(unsigned short threadId, string channelId);
 string getHttps(string url),
-       exec(string cmd),
+       exec(unsigned short threadId, string cmd),
        join(vector<string> parts, string delimiter);
 size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
 bool doesFileExist(string filePath),
@@ -57,7 +57,9 @@ string CHANNELS_DIRECTORY = "channels/",
        KEYS_FILE_PATH = "keys.txt",
        UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
        apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line.
-       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API"; // Can be "https://yt.lemnoslife.com" for instance.
+       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
+       CAPTIONS_DIRECTORY = "captions/",
+       DEBUG_DIRECTORY = "debug/";
 bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
 
 int main(int argc, char *argv[])
@@ -177,13 +179,15 @@ void treatChannels(unsigned short threadId)
 
         string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
         createDirectory(channelToTreatDirectory);
+        createDirectory(DEBUG_DIRECTORY);
+        createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
 
         treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
 
         // Note that compressing the French most subscribers channel took 4 minutes and 42 seconds.
         PRINT("Starting compression...")
         // As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli.
-        exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
+        exec(threadId, "cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
 
         PRINT("Compression finished, started deleting initial directory...")
         deleteDirectory(channelToTreatDirectory);
@@ -559,6 +563,46 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
             }
         }
     }
+    // Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
+    string playlistToTreat = "UU" + channelToTreat.substr(2);
+    pageToken = "";
+    while(true)
+    {
+        json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
+        if(data.contains("error"))
+        {
+            EXIT_WITH_ERROR("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
+        }
+        json items = data["items"];
+        for(const auto& item : items)
+        {
+            string videoId = item["contentDetails"]["videoId"];
+            // Could proceed as follows by verifying `!isChannel` but as we don't know how to manage unlisted videos, we don't proceed this way.
+            //treatChannelOrVideo(threadId, false, videoId, channelToTreat);
+
+            string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
+            createDirectory(channelCaptionsToTreatDirectory);
+
+            // Firstly download all not automatically generated captions.
+            // The underscore in `-o` argument is used to not end up with hidden files.
+            string cmdCommonPrefix = "yt-dlp --skip-download ",
+                   cmdCommonPostfix = " '" + videoId + "' -o '" + channelCaptionsToTreatDirectory + "_'";
+            string cmd = cmdCommonPrefix + "--all-subs" + cmdCommonPostfix;
+            exec(threadId, cmd);
+
+            // Secondly download the automatically generated captions.
+            cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
+            exec(threadId, cmd);
+        }
+        if(data.contains("nextPageToken"))
+        {
+            pageToken = data["nextPageToken"];
+        }
+        else
+        {
+            break;
+        }
+    }
 }
 
 // This function verifies that the given hasn't already been treated.
@@ -609,8 +653,13 @@ string join(vector<string> parts, string delimiter)
     return result;
 }
 
-string exec(string cmd)
+string exec(unsigned short threadId, string cmd)
 {
+    ostringstream toString;
+    toString << threadId;
+    string threadIdStr = toString.str(), debugCommonFilePath = DEBUG_DIRECTORY + threadIdStr;
+    cmd += " >> " + debugCommonFilePath + ".out";
+    cmd += " 2>> " + debugCommonFilePath + ".err";
     array<char, 128> buffer;
     string result;
     unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
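The hunk above is cut off before the end of `exec`. For context, here is a self-contained sketch of the per-thread logging pattern the modified function follows: the command's stdout/stderr are appended to `debug/<threadId>.out`/`.err` while the process reads whatever still reaches the pipe. The body below assumes the classic `popen`/`unique_ptr` read loop; the project's actual remaining lines may differ.

```c++
// Sketch (assumed, not the project's exact code) of `exec` after this patch.
#include <array>
#include <cstdio>
#include <filesystem>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>

using namespace std;

string DEBUG_DIRECTORY = "debug/"; // Same constant as the one added to `main.cpp`.

string exec(unsigned short threadId, string cmd)
{
    ostringstream toString;
    toString << threadId;
    string debugCommonFilePath = DEBUG_DIRECTORY + toString.str();
    // `>>` appends, so all commands run by a given thread share one `.out`/`.err` pair.
    cmd += " >> " + debugCommonFilePath + ".out";
    cmd += " 2>> " + debugCommonFilePath + ".err";

    array<char, 128> buffer;
    string result;
    unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
    if(!pipe)
    {
        return "";
    }
    // With stdout redirected above, this loop usually reads nothing,
    // but it keeps `exec` returning a string as it did before the patch.
    while(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
    {
        result += buffer.data();
    }
    return result;
}

int main()
{
    filesystem::create_directory(DEBUG_DIRECTORY); // The patch also creates it per run.
    exec(0, "echo hello"); // `hello` ends up in `debug/0.out`.
    return 0;
}
```

One consequence of the redirection worth noting: the string returned by `exec` is now normally empty, since the subprocess's output goes to the debug files instead of the pipe; presumably the return type is kept so existing call sites keep compiling.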