Fix #13: Add captions extraction
I was about to commit in addition: ```c++ // Videos with automatically generated captions that are set to `Off` by default aren't retrieved with `--sub-langs '.*orig'`. // My workaround is to first call YouTube Data API v3 Captions: list endpoint with `part=snippet` and retrieve the language that has `"trackKind": "asr"` (automatic speech recognition) in `snippet`. /*json data = getJson(threadId, "captions?part=snippet&videoId=" + videoId, true, channelToTreat), items = data["items"]; for(const auto& item : items) { json snippet = item["snippet"]; if(snippet["trackKind"] == "asr") { string language = snippet["language"]; cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '" + language + "-orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix; exec(threadId, cmd); // As there should be a single automatic speech recognized track, there is no need to go through all tracks. break; } }*/ ``` Instead of: ```c++ cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix; exec(threadId, cmd); ``` But I realized that I was wrong, as was the GitHub comment I was about to add to https://github.com/yt-dlp/yt-dlp/issues/2655: > `yt-dlp --cookies cookies.txt --sub-langs 'en.*,.*orig' --write-auto-subs https://www.youtube.com/watch?v=tQqDBySHYlc` work as expected. Many thanks again. > > ``` > 'subtitleslangs': ['en.*','.*orig'], > 'writeautomaticsub': True, > ``` > > Work as expected too. Thank you > > Very sorry for the video sample. I even not watched it. Thank you for this workaround. However, note that videos having automatically generated subtitles that are set to `Off` by default aren't retrieved with your method (example of such a video: [`mozyXsZJnQ4`](https://www.youtube.com/watch?v=mozyXsZJnQ4)). 
My workaround is to first call [YouTube Data API v3](https://developers.google.com/youtube/v3) [Captions: list](https://developers.google.com/youtube/v3/docs/captions/list) endpoint with [`part=snippet`](https://developers.google.com/youtube/v3/docs/captions/list#part) and retrieve the [`language`](https://developers.google.com/youtube/v3/docs/captions#snippet.language) that has [`"trackKind": "asr"`](https://developers.google.com/youtube/v3/docs/captions#snippet.trackKind) (automatic speech recognition) in [`snippet`](https://developers.google.com/youtube/v3/docs/captions#snippet).
This commit is contained in:
parent
9b792015fa
commit
04c59eb025
@ -15,8 +15,9 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
|
||||
Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
|
||||
|
||||
```sh
|
||||
sudo apt install nlohmann-json3-dev
|
||||
sudo apt install nlohmann-json3-dev yt-dlp
|
||||
make
|
||||
./youtubeCaptionsSearchEngine -h
|
||||
```
|
||||
|
||||
Unless you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
|
||||
|
57
main.cpp
57
main.cpp
@ -25,7 +25,7 @@ void createDirectory(string path),
|
||||
deleteDirectory(string path),
|
||||
addChannelToTreat(unsigned short threadId, string channelId);
|
||||
string getHttps(string url),
|
||||
exec(string cmd),
|
||||
exec(unsigned short threadId, string cmd),
|
||||
join(vector<string> parts, string delimiter);
|
||||
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
|
||||
bool doesFileExist(string filePath),
|
||||
@ -57,7 +57,9 @@ string CHANNELS_DIRECTORY = "channels/",
|
||||
KEYS_FILE_PATH = "keys.txt",
|
||||
UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
|
||||
apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line.
|
||||
YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API"; // Can be "https://yt.lemnoslife.com" for instance.
|
||||
YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
|
||||
CAPTIONS_DIRECTORY = "captions/",
|
||||
DEBUG_DIRECTORY = "debug/";
|
||||
bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
@ -177,13 +179,15 @@ void treatChannels(unsigned short threadId)
|
||||
|
||||
string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
|
||||
createDirectory(channelToTreatDirectory);
|
||||
createDirectory(DEBUG_DIRECTORY);
|
||||
createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
|
||||
|
||||
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
|
||||
|
||||
// Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
|
||||
PRINT("Starting compression...")
|
||||
// As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
|
||||
exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
|
||||
exec(threadId, "cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
|
||||
|
||||
PRINT("Compression finished, started deleting initial directory...")
|
||||
deleteDirectory(channelToTreatDirectory);
|
||||
@ -559,6 +563,46 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
|
||||
}
|
||||
}
|
||||
}
|
||||
// Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
|
||||
string playlistToTreat = "UU" + channelToTreat.substr(2);
|
||||
pageToken = "";
|
||||
while(true)
|
||||
{
|
||||
json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
|
||||
if(data.contains("error"))
|
||||
{
|
||||
EXIT_WITH_ERROR("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
|
||||
}
|
||||
json items = data["items"];
|
||||
for(const auto& item : items)
|
||||
{
|
||||
string videoId = item["contentDetails"]["videoId"];
|
||||
// Could proceed as follows by verifying `!isChannel` but as we don't know how to manage unlisted videos, we don't proceed this way.
|
||||
//treatChannelOrVideo(threadId, false, videoId, channelToTreat);
|
||||
|
||||
string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
|
||||
createDirectory(channelCaptionsToTreatDirectory);
|
||||
|
||||
// Firstly download all not automatically generated captions.
|
||||
// The underscore in `-o` argument is used to not end up with hidden files.
|
||||
string cmdCommonPrefix = "yt-dlp --skip-download ",
|
||||
cmdCommonPostfix = " '" + videoId + "' -o '" + channelCaptionsToTreatDirectory + "_'";
|
||||
string cmd = cmdCommonPrefix + "--all-subs" + cmdCommonPostfix;
|
||||
exec(threadId, cmd);
|
||||
|
||||
// Secondly download the automatically generated captions.
|
||||
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
|
||||
exec(threadId, cmd);
|
||||
}
|
||||
if(data.contains("nextPageToken"))
|
||||
{
|
||||
pageToken = data["nextPageToken"];
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function verifies that the given channel hasn't already been treated.
|
||||
@ -609,8 +653,13 @@ string join(vector<string> parts, string delimiter)
|
||||
return result;
|
||||
}
|
||||
|
||||
string exec(string cmd)
|
||||
string exec(unsigned short threadId, string cmd)
|
||||
{
|
||||
ostringstream toString;
|
||||
toString << threadId;
|
||||
string threadIdStr = toString.str(), debugCommonFilePath = DEBUG_DIRECTORY + threadIdStr;
|
||||
cmd += " >> " + debugCommonFilePath + ".out";
|
||||
cmd += " 2>> " + debugCommonFilePath + ".err";
|
||||
array<char, 128> buffer;
|
||||
string result;
|
||||
unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
|
||||
|
Loading…
Reference in New Issue
Block a user