Fix #13: Add captions extraction

I was about to commit in addition:

```c++
// Videos whose automatically generated captions are set to `Off` by default aren't retrieved with `--sub-langs '.*orig'`.
// My workaround is to first call the YouTube Data API v3 Captions: list endpoint with `part=snippet` and retrieve the language of the track that has `"trackKind": "asr"` (automatic speech recognition) in its `snippet`.
/*json data = getJson(threadId, "captions?part=snippet&videoId=" + videoId, true, channelToTreat),
     items = data["items"];
for(const auto& item : items)
{
    json snippet = item["snippet"];
    if(snippet["trackKind"] == "asr")
    {
        string language = snippet["language"];
        cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '" + language + "-orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
        exec(threadId, cmd);
        // As there should be a single automatic speech recognized track, there is no need to go through all tracks.
        break;
    }
}*/
```

Instead of:

```c++
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
exec(threadId, cmd);
```

But I realized that I was wrong, as was the GitHub comment that I was about to add to
https://github.com/yt-dlp/yt-dlp/issues/2655:

> `yt-dlp --cookies cookies.txt --sub-langs 'en.*,.*orig' --write-auto-subs https://www.youtube.com/watch?v=tQqDBySHYlc` work as expected. Many thanks again.
>
> ```
> 'subtitleslangs': ['en.*','.*orig'],
> 'writeautomaticsub': True,
> ```
>
> Work as expected too. Thank you
>
> Very sorry for the video sample. I even not watched it.

Thank you for this workaround. However, note that videos whose automatically generated subtitles are set to `Off` by default aren't retrieved with your method (example of such a video: [`mozyXsZJnQ4`](https://www.youtube.com/watch?v=mozyXsZJnQ4)). My workaround is to first call the [YouTube Data API v3](https://developers.google.com/youtube/v3) [Captions: list](https://developers.google.com/youtube/v3/docs/captions/list) endpoint with [`part=snippet`](https://developers.google.com/youtube/v3/docs/captions/list#part) and retrieve the [`language`](https://developers.google.com/youtube/v3/docs/captions#snippet.language) of the track that has [`"trackKind": "asr"`](https://developers.google.com/youtube/v3/docs/captions#snippet.trackKind) (automatic speech recognition) in its [`snippet`](https://developers.google.com/youtube/v3/docs/captions#snippet).
This commit is contained in:
Benjamin Loison 2023-02-10 20:03:08 +01:00
parent 9b792015fa
commit 04c59eb025
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 55 additions and 5 deletions

View File

@ -15,8 +15,9 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm. Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
```sh ```sh
sudo apt install nlohmann-json3-dev sudo apt install nlohmann-json3-dev yt-dlp
make make
./youtubeCaptionsSearchEngine -h
``` ```
Except if you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api). Except if you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).

View File

@ -25,7 +25,7 @@ void createDirectory(string path),
deleteDirectory(string path), deleteDirectory(string path),
addChannelToTreat(unsigned short threadId, string channelId); addChannelToTreat(unsigned short threadId, string channelId);
string getHttps(string url), string getHttps(string url),
exec(string cmd), exec(unsigned short threadId, string cmd),
join(vector<string> parts, string delimiter); join(vector<string> parts, string delimiter);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp); size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
bool doesFileExist(string filePath), bool doesFileExist(string filePath),
@ -57,7 +57,9 @@ string CHANNELS_DIRECTORY = "channels/",
KEYS_FILE_PATH = "keys.txt", KEYS_FILE_PATH = "keys.txt",
UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt", UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line. apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line.
YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API"; // Can be "https://yt.lemnoslife.com" for instance. YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
CAPTIONS_DIRECTORY = "captions/",
DEBUG_DIRECTORY = "debug/";
bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false; bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
int main(int argc, char *argv[]) int main(int argc, char *argv[])
@ -177,13 +179,15 @@ void treatChannels(unsigned short threadId)
string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/"; string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
createDirectory(channelToTreatDirectory); createDirectory(channelToTreatDirectory);
createDirectory(DEBUG_DIRECTORY);
createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat); treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
// Note that compressing the French most subscribers channel took 4 minutes and 42 seconds. // Note that compressing the French most subscribers channel took 4 minutes and 42 seconds.
PRINT("Starting compression...") PRINT("Starting compression...")
// As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli. // As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli.
exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@"); exec(threadId, "cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
PRINT("Compression finished, started deleting initial directory...") PRINT("Compression finished, started deleting initial directory...")
deleteDirectory(channelToTreatDirectory); deleteDirectory(channelToTreatDirectory);
@ -559,6 +563,46 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
} }
} }
} }
// Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
string playlistToTreat = "UU" + channelToTreat.substr(2);
pageToken = "";
while(true)
{
json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
if(data.contains("error"))
{
EXIT_WITH_ERROR("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
}
json items = data["items"];
for(const auto& item : items)
{
string videoId = item["contentDetails"]["videoId"];
// Could proceed as follows by verifying `!isChannel` but as we don't know how to manage unlisted videos, we don't proceed this way.
//treatChannelOrVideo(threadId, false, videoId, channelToTreat);
string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
createDirectory(channelCaptionsToTreatDirectory);
// Firstly download all not automatically generated captions.
// The underscore in `-o` argument is used to not end up with hidden files.
string cmdCommonPrefix = "yt-dlp --skip-download ",
cmdCommonPostfix = " '" + videoId + "' -o '" + channelCaptionsToTreatDirectory + "_'";
string cmd = cmdCommonPrefix + "--all-subs" + cmdCommonPostfix;
exec(threadId, cmd);
// Secondly download the automatically generated captions.
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
exec(threadId, cmd);
}
if(data.contains("nextPageToken"))
{
pageToken = data["nextPageToken"];
}
else
{
break;
}
}
} }
// This function verifies that the given hasn't already been treated. // This function verifies that the given hasn't already been treated.
@ -609,8 +653,13 @@ string join(vector<string> parts, string delimiter)
return result; return result;
} }
string exec(string cmd) string exec(unsigned short threadId, string cmd)
{ {
ostringstream toString;
toString << threadId;
string threadIdStr = toString.str(), debugCommonFilePath = DEBUG_DIRECTORY + threadIdStr;
cmd += " >> " + debugCommonFilePath + ".out";
cmd += " 2>> " + debugCommonFilePath + ".err";
array<char, 128> buffer; array<char, 128> buffer;
string result; string result;
unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose); unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);