From 7456685f2bbed7b3fdcbf675a4c933ba79a939da Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Sun, 15 Jan 2023 02:19:31 +0100 Subject: [PATCH] #11: Add a first iteration for the `CHANNELS` retrieval --- Makefile | 2 +- README.md | 7 ++++- main.cpp | 87 ++++++++++++++++++++++++++++++++++++++----------------- 3 files changed, 68 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 07cac50..bf7a88b 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ .PHONY: main main: - g++ main.cpp -g -std=c++17 -lcurl -lpthread -o main + g++ main.cpp -g -std=c++17 -lcurl -lpthread -o youtubeCaptionsSearchEngine diff --git a/README.md b/README.md index 0392478..133f552 100644 --- a/README.md +++ b/README.md @@ -15,5 +15,10 @@ Have to proceed with a breadth-first search approach as treating all *child* cha ```sh sudo apt install nlohmann-json3-dev make -./main +``` + +Unless you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api). 
+ +```sh +./youtubeCaptionsSearchEngine ``` diff --git a/main.cpp b/main.cpp index b82ec3e..3e2a655 100644 --- a/main.cpp +++ b/main.cpp @@ -16,13 +16,14 @@ enum getJsonBehavior { normal, retryOnCommentsDisabled, returnErrorIfPlaylistNot set setFromVector(vector vec); vector getFileContent(string filePath); -json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior = normal); +json getJson(unsigned short threadId, string url, bool usingYouTubeDataApiV3, string directoryPath, getJsonBehavior behavior = normal); void createDirectory(string path), print(ostringstream* toPrint), treatComment(unsigned short threadId, json comment, string channelId), treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat), treatChannels(unsigned short threadId), - deleteDirectory(string path); + deleteDirectory(string path), + addChannelToTreat(unsigned short threadId, string channelId); string getHttps(string url), exec(string cmd); size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp); @@ -196,7 +197,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str ostringstream toString; toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken; string url = toString.str(); - json data = getJson(threadId, url, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled); + json data = getJson(threadId, url, true, channelToTreat, pageToken == "" ? 
normal : retryOnCommentsDisabled); bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled"; if(doesRelyingOnCommentThreadsIsEnough) { @@ -213,7 +214,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str string pageToken = ""; while(true) { - json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat), + json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, true, channelToTreat), items = data["items"]; for(const auto& item : items) { @@ -251,7 +252,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str else { PRINT(threadId, "Comments disabled channel, treating differently...") - json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, channelToTreat); + json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, true, channelToTreat); // YouTube Data API v3 Videos: list endpoint returns `videoCount` as a string and not an integer... unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str()); PRINT(threadId, "The channel has about " << videoCount << " videos.") @@ -263,7 +264,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str while(true) { // `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them. 
- json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, channelToTreat, returnErrorIfPlaylistNotFound); + json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound); if(data.contains("error")) { PRINT(threadId, "Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!") @@ -300,6 +301,51 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str } } } + if(isChannel) + { + string pageToken = ""; + while(true) + { + json data = getJson(threadId, "channels?part=channels&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id), + items = data["items"]; + for(const auto& item : items) + { + for(const auto& channel : item["channels"]["channels"]) + { + PRINT(threadId, channel) + addChannelToTreat(threadId, channel["channelId"]); + } + } + if(!data["nextPageToken"].is_null()) + { + pageToken = data["nextPageToken"]; + } + else + { + break; + } + } + } +} + +// This function verifies that the given channel hasn't already been treated or queued before adding it to the channels to treat. 
+void addChannelToTreat(unsigned short threadId, string channelId) +{ + channelsAlreadyTreatedAndToTreatMutex.lock(); + if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end()) + { + unsigned int channelsToTreatIndex = channelsToTreat.empty() ? 0 : channelsToTreat.rbegin()->first + 1; // `end()` must not be dereferenced (undefined behavior); use the greatest key instead (assumes `channelsToTreat` is an ordered `map`). + channelsToTreat[channelsToTreatIndex] = channelId; + channelsToTreatRev[channelId] = channelsToTreatIndex; + + channelsAlreadyTreatedAndToTreatMutex.unlock(); + + writeFile(threadId, CHANNELS_FILE_PATH, "a", "\n" + channelId); + } + else + { + channelsAlreadyTreatedAndToTreatMutex.unlock(); + } } void treatComment(unsigned short threadId, json comment, string channelId) @@ -309,21 +355,7 @@ void treatComment(unsigned short threadId, json comment, string channelId) if(snippet.contains("authorChannelId")) { string channelId = snippet["authorChannelId"]["value"]; - channelsAlreadyTreatedAndToTreatMutex.lock(); - if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end()) - { - unsigned int channelsToTreatIndex = channelsToTreat.end()->first + 1; - channelsToTreat[channelsToTreatIndex] = channelId; - channelsToTreatRev[channelId] = channelsToTreatIndex; - - channelsAlreadyTreatedAndToTreatMutex.unlock(); - - writeFile(threadId, CHANNELS_FILE_PATH, "a", "\n" + channelId); - } - else - { - channelsAlreadyTreatedAndToTreatMutex.unlock(); - } + addChannelToTreat(threadId, channelId); } commentsCount++; commentsPerSecondCount++; @@ -405,10 +437,13 @@ vector getFileContent(string filePath) return lines; } -json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior) +json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string directoryPath, getJsonBehavior behavior) { - string finalUrl = USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ? 
"https://yt.lemnoslife.com/noKey/" + url : - "https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey, + string finalUrl = usingYoutubeDataApiv3 ? + (USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ? + "https://yt.lemnoslife.com/noKey/" + url : + "https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey) : + YOUTUBE_OPERATIONAL_API_INSTANCE_URL + "/" + url, content = getHttps(finalUrl); json data; try @@ -433,12 +468,12 @@ json getJson(unsigned short threadId, string url, string directoryPath, getJsonB PRINT(threadId, "No more quota on " << apiKey << " switching to " << keys[0] << ".") apiKey = keys[0]; quotaMutex.unlock(); - return getJson(threadId, url, directoryPath); + return getJson(threadId, url, true, directoryPath); } PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !") if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled) { - return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, directoryPath); + return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, directoryPath); } }