Fix #13: Add captions extraction
I was about to commit in addition:

```c++
// Videos with automatically generated captions that are set to `Off` by default aren't retrieved with `--sub-langs '.*orig'`.
// My workaround is to first call YouTube Data API v3 Captions: list endpoint with `part=snippet` and retrieve the language that has `"trackKind": "asr"` (automatic speech recognition) in `snippet`.
/*json data = getJson(threadId, "captions?part=snippet&videoId=" + videoId, true, channelToTreat), items = data["items"];
for(const auto& item : items)
{
    json snippet = item["snippet"];
    if(snippet["trackKind"] == "asr")
    {
        string language = snippet["language"];
        cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '" + language + "-orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
        exec(threadId, cmd);
        // As there should be a single automatic speech recognized track, there is no need to go through all tracks.
        break;
    }
}*/
```

Instead of:

```c++
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
exec(threadId, cmd);
```

But I realized that, as with the GitHub comment I was about to add to https://github.com/yt-dlp/yt-dlp/issues/2655, I was wrong:

> `yt-dlp --cookies cookies.txt --sub-langs 'en.*,.*orig' --write-auto-subs https://www.youtube.com/watch?v=tQqDBySHYlc` work as expected. Many thanks again.
>
> ```
> 'subtitleslangs': ['en.*','.*orig'],
> 'writeautomaticsub': True,
> ```
>
> Work as expected too. Thank you
>
> Very sorry for the video sample. I even not watched it.

Thank you for this workaround. However note that videos having automatically generated subtitles but being set to `Off` by default aren't retrieved with your method (example of such video: [`mozyXsZJnQ4`](https://www.youtube.com/watch?v=mozyXsZJnQ4)).
My workaround is to first call [YouTube Data API v3](https://developers.google.com/youtube/v3) [Captions: list](https://developers.google.com/youtube/v3/docs/captions/list) endpoint with [`part=snippet`](https://developers.google.com/youtube/v3/docs/captions/list#part) and retrieve the [`language`](https://developers.google.com/youtube/v3/docs/captions#snippet.language) that has [`"trackKind": "asr"`](https://developers.google.com/youtube/v3/docs/captions#snippet.trackKind) (automatic speech recognition) in [`snippet`](https://developers.google.com/youtube/v3/docs/captions#snippet).
This commit is contained in:
parent
9b792015fa
commit
04c59eb025
@ -15,8 +15,9 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
|
|||||||
Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
|
Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
sudo apt install nlohmann-json3-dev
|
sudo apt install nlohmann-json3-dev yt-dlp
|
||||||
make
|
make
|
||||||
|
./youtubeCaptionsSearchEngine -h
|
||||||
```
|
```
|
||||||
|
|
||||||
Unless you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
|
Unless you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
|
||||||
|
57
main.cpp
57
main.cpp
@ -25,7 +25,7 @@ void createDirectory(string path),
|
|||||||
deleteDirectory(string path),
|
deleteDirectory(string path),
|
||||||
addChannelToTreat(unsigned short threadId, string channelId);
|
addChannelToTreat(unsigned short threadId, string channelId);
|
||||||
string getHttps(string url),
|
string getHttps(string url),
|
||||||
exec(string cmd),
|
exec(unsigned short threadId, string cmd),
|
||||||
join(vector<string> parts, string delimiter);
|
join(vector<string> parts, string delimiter);
|
||||||
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
|
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
|
||||||
bool doesFileExist(string filePath),
|
bool doesFileExist(string filePath),
|
||||||
@ -57,7 +57,9 @@ string CHANNELS_DIRECTORY = "channels/",
|
|||||||
KEYS_FILE_PATH = "keys.txt",
|
KEYS_FILE_PATH = "keys.txt",
|
||||||
UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
|
UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
|
||||||
apiKey = "", // Will initially be filled with the first line of `KEYS_FILE_PATH`.
|
apiKey = "", // Will initially be filled with the first line of `KEYS_FILE_PATH`.
|
||||||
YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API"; // Can be "https://yt.lemnoslife.com" for instance.
|
YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
|
||||||
|
CAPTIONS_DIRECTORY = "captions/",
|
||||||
|
DEBUG_DIRECTORY = "debug/";
|
||||||
bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
|
bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
|
||||||
|
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
@ -177,13 +179,15 @@ void treatChannels(unsigned short threadId)
|
|||||||
|
|
||||||
string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
|
string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
|
||||||
createDirectory(channelToTreatDirectory);
|
createDirectory(channelToTreatDirectory);
|
||||||
|
createDirectory(DEBUG_DIRECTORY);
|
||||||
|
createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
|
||||||
|
|
||||||
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
|
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
|
||||||
|
|
||||||
// Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
|
// Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
|
||||||
PRINT("Starting compression...")
|
PRINT("Starting compression...")
|
||||||
// As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
|
// As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
|
||||||
exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
|
exec(threadId, "cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
|
||||||
|
|
||||||
PRINT("Compression finished, started deleting initial directory...")
|
PRINT("Compression finished, started deleting initial directory...")
|
||||||
deleteDirectory(channelToTreatDirectory);
|
deleteDirectory(channelToTreatDirectory);
|
||||||
@ -559,6 +563,46 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
|
||||||
|
string playlistToTreat = "UU" + channelToTreat.substr(2);
|
||||||
|
pageToken = "";
|
||||||
|
while(true)
|
||||||
|
{
|
||||||
|
json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
|
||||||
|
if(data.contains("error"))
|
||||||
|
{
|
||||||
|
EXIT_WITH_ERROR("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
|
||||||
|
}
|
||||||
|
json items = data["items"];
|
||||||
|
for(const auto& item : items)
|
||||||
|
{
|
||||||
|
string videoId = item["contentDetails"]["videoId"];
|
||||||
|
// Could proceed as follows by verifying `!isChannel` but as we don't know how to manage unlisted videos, we don't proceed this way.
|
||||||
|
//treatChannelOrVideo(threadId, false, videoId, channelToTreat);
|
||||||
|
|
||||||
|
string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
|
||||||
|
createDirectory(channelCaptionsToTreatDirectory);
|
||||||
|
|
||||||
|
// Firstly download all captions that aren't automatically generated.
|
||||||
|
// The underscore in `-o` argument is used to not end up with hidden files.
|
||||||
|
string cmdCommonPrefix = "yt-dlp --skip-download ",
|
||||||
|
cmdCommonPostfix = " '" + videoId + "' -o '" + channelCaptionsToTreatDirectory + "_'";
|
||||||
|
string cmd = cmdCommonPrefix + "--all-subs" + cmdCommonPostfix;
|
||||||
|
exec(threadId, cmd);
|
||||||
|
|
||||||
|
// Secondly download the automatically generated captions.
|
||||||
|
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
|
||||||
|
exec(threadId, cmd);
|
||||||
|
}
|
||||||
|
if(data.contains("nextPageToken"))
|
||||||
|
{
|
||||||
|
pageToken = data["nextPageToken"];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function verifies that the given channel hasn't already been treated.
|
// This function verifies that the given channel hasn't already been treated.
|
||||||
@ -609,8 +653,13 @@ string join(vector<string> parts, string delimiter)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
string exec(string cmd)
|
string exec(unsigned short threadId, string cmd)
|
||||||
{
|
{
|
||||||
|
ostringstream toString;
|
||||||
|
toString << threadId;
|
||||||
|
string threadIdStr = toString.str(), debugCommonFilePath = DEBUG_DIRECTORY + threadIdStr;
|
||||||
|
cmd += " >> " + debugCommonFilePath + ".out";
|
||||||
|
cmd += " 2>> " + debugCommonFilePath + ".err";
|
||||||
array<char, 128> buffer;
|
array<char, 128> buffer;
|
||||||
string result;
|
string result;
|
||||||
unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
|
unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
|
||||||
|
Loading…
Reference in New Issue
Block a user