From 923c14a77b887d375d01671e17883a8012d17d71 Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Mon, 2 Jan 2023 19:46:32 +0100 Subject: [PATCH] #2: Add data logging --- channelsToTreat.txt => channels.txt | 0 main.cpp | 76 ++++++++++++++++++----------- 2 files changed, 48 insertions(+), 28 deletions(-) rename channelsToTreat.txt => channels.txt (100%) diff --git a/channelsToTreat.txt b/channels.txt similarity index 100% rename from channelsToTreat.txt rename to channels.txt diff --git a/main.cpp b/main.cpp index ef12b7f..602fc2f 100644 --- a/main.cpp +++ b/main.cpp @@ -9,10 +9,10 @@ using namespace std; using json = nlohmann::json; vector getFileContent(string filePath); -json getJson(string url); +json getJson(string url, string directoryPath); void createDirectory(string path), print(ostringstream* toPrint), - treatComment(json comment); + treatComment(json comment, string channelId); string getHttps(string url); size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp); bool doesFileExist(string filePath), @@ -27,20 +27,26 @@ ostringstream toPrint; set channelsAlreadyTreated, channelsToTreat; -unsigned int commentsCount = 0; +unsigned int commentsCount = 0, + requestsPerChannel = 0; +string CHANNELS_DIRECTORY = "channels/", + CHANNELS_FILE_PATH = "channels.txt"; int main() { - string channelsToTreatFilePath = "channelsToTreat.txt"; - vector channelsToTreatVec = getFileContent(channelsToTreatFilePath); - channelsToTreat = set(channelsToTreatVec.begin(), channelsToTreatVec.end()); + // The starting set should be written to `CHANNELS_FILE_PATH`. + // To resume this algorithm after a shutdown, just restart it after having deleted the last channel folder in `CHANNELS_DIRECTORY` being treated. + // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat` or `channelsAlreadyTreated` otherwise before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set. 
+ vector channelsVec = getFileContent(CHANNELS_FILE_PATH); + channelsToTreat = set(channelsVec.begin(), channelsVec.end()); - string channelsDirectory = "channels/"; - createDirectory(channelsDirectory); + createDirectory(CHANNELS_DIRECTORY); - for(const auto& entry : filesystem::directory_iterator(channelsDirectory)) + for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY)) { - channelsAlreadyTreated.insert(entry.path().filename()); + string channelId = entry.path().filename(); + channelsToTreat.erase(channelId); + channelsAlreadyTreated.insert(channelId); } PRINT(channelsToTreat.size() << " channel(s) to treat") @@ -49,12 +55,18 @@ int main() while(!channelsToTreat.empty()) { string channelToTreat = *channelsToTreat.begin(); + PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")") + channelsAlreadyTreated.insert(channelToTreat); + + string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/"; + createDirectory(channelToTreatDirectory); + string pageToken = ""; while(true) { - json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken); + json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken, channelToTreat); bool doesRelyingOnCommentThreadsIsEnough = data["error"]["errors"][0]["reason"] != "commentsDisabled"; if(doesRelyingOnCommentThreadsIsEnough) { @@ -63,7 +75,7 @@ int main() { json comment = item["snippet"]["topLevelComment"]; string commentId = comment["id"]; - treatComment(comment); + treatComment(comment, channelToTreat); if(item.contains("replies")) { json replies = item["replies"]["comments"]; @@ -72,11 +84,11 @@ int main() string pageToken = ""; while(true) { - json data = getJson("comments?part=snippet&parentId=" + commentId + 
"&maxResults=100&pageToken=" + pageToken); + json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat); json items = data["items"]; for(const auto& item : items) { - treatComment(item); + treatComment(item, channelToTreat); } if(data.contains("nextPageToken")) { @@ -92,7 +104,7 @@ int main() { for(const auto& reply : replies) { - treatComment(reply); + treatComment(reply, channelToTreat); } } } @@ -115,29 +127,27 @@ int main() PRINT(commentsCount) commentsCount = 0; + requestsPerChannel = 0; channelsToTreat.erase(channelToTreat); - channelsAlreadyTreated.insert(channelToTreat); - - string channelToTreatDirectory = channelsDirectory + channelToTreat + "/"; - createDirectory(channelToTreatDirectory); - - string toWrite = (doesFileExist(channelsToTreatFilePath) ? "\n" : "") + channelToTreat; - writeFile(channelsToTreatFilePath, "a", toWrite); } return 0; } -void treatComment(json comment) +void treatComment(json comment, string channelId) { json snippet = comment["snippet"]; // The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`). 
if(snippet.contains("authorChannelId")) { string channelId = snippet["authorChannelId"]["value"]; - if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end()) + if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end() && find(channelsToTreat.begin(), channelsToTreat.end(), channelId) == channelsToTreat.end()) + { channelsToTreat.insert(channelId); + + writeFile(CHANNELS_FILE_PATH, "a", "\n" + channelId); + } } commentsCount++; } @@ -151,6 +161,10 @@ bool writeFile(string filePath, string option, string toWrite) fclose(file); return true; } + else + { + PRINT("writeFile error: " << strerror(errno)) + } return false; } @@ -184,15 +198,21 @@ vector getFileContent(string filePath) return lines; } -json getJson(string url) +json getJson(string url, string directoryPath) { #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE - url = "https://yt.lemnoslife.com/noKey/" + url; + string finalUrl = "https://yt.lemnoslife.com/noKey/" + url; #else - url = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY; + string finalUrl = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY; #endif - string content = getHttps(url); + string content = getHttps(finalUrl); json data = json::parse(content); + + ostringstream toString; + toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json"; + requestsPerChannel++; + writeFile(toString.str(), "w", url + "\n" + content); + return data; }