From 95a9421ad0469a09335afeddb2983e31dc00bc36 Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Thu, 22 Dec 2022 05:20:32 +0100
Subject: [PATCH] Add `main.cpp`, `Makefile` and `channelsToTreat.txt`

Note that running this algorithm ends up at channel [`UC-99odscxh1xxTyxHyXuRrg`](https://www.youtube.com/channel/UC-99odscxh1xxTyxHyXuRrg), more precisely at the video [`Tq5aPNzfYcg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg), and more precisely at the comment [`Ugx-TlSq6SNCbOX04mx4AaABAg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg&lc=Ugx-TlSq6SNCbOX04mx4AaABAg), [which doesn't have any author](https://yt.lemnoslife.com/noKey/comments?part=snippet&id=Ugx-TlSq6SNCbOX04mx4AaABAg)...
---
 Makefile            |   4 ++
 README.md           |  12 +++-
 channelsToTreat.txt |   1 +
 main.cpp            | 172 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 187 insertions(+), 2 deletions(-)
 create mode 100644 Makefile
 create mode 100644 channelsToTreat.txt
 create mode 100644 main.cpp

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..d35fd21
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+.PHONY: main
+
+main:
+	g++ main.cpp -g -lcurl -o main
diff --git a/README.md b/README.md
index 148f4a6..0392478 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,15 @@ For a given channel, there are two ways to list comments users published on it:
 2. A simpler approach consists of using the YouTube Data API v3 CommentThreads: list endpoint with `allThreadsRelatedToChannelId`. The main upside of this method, in addition to being simpler, is that for channels with many videos we save much time by working with 100 comments at a time instead of one video at a time, a video which may not have a single comment. Note that this approach doesn't list all videos, etc., so we don't retrieve some information. Note also that this approach doesn't work for channels that have comments enabled on some videos but not on the whole channel.
 
 So when possible we will proceed with 2. and use 1. as a fallback approach.
 
-We can multi-thread this process by channel or we can multi-thread per videos of a given channel.
-As would like to proceed channel per channel, the question is **how much time does it take to retrieve all comments from the biggest YouTube channel? If the answer is a long period of time, then multi-threading per videos of a given channel may make sense.**
+We can multi-thread this process per channel, or multi-thread per video of a given channel (losing the optimization of CommentThreads: list with `allThreadsRelatedToChannelId`). In any case we shouldn't do anything hybrid in terms of multi-threading, as it would be too complex.
+As we would like to proceed channel by channel, the question is **how much time does it take to retrieve all comments from the biggest YouTube channel? If the answer is a long period of time, then multi-threading per video of a given channel may make sense.** There are two possibilities following our methods:
+1. Here the complexity is linear in the number of the channel's comments, more precisely this number divided by 100 - we could guess that the channel with the most subscribers ([T-Series](https://www.youtube.com/@tseries)) also has the most comments.
+2. Here the complexity is linear in the number of the channel's videos - as far as I know [RoelVandePaar](https://www.youtube.com/@RoelVandePaar) has the most videos, [2,026,566 according to SocialBlade](https://socialblade.com/youtube/c/roelvandepaar). However, due to the YouTube Data API v3 PlaylistItems: list endpoint limit, we can only retrieve the latest 20,000 videos of a channel [as far as I know](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/wiki#user-content-concerning-20-000-videos-limit-for-youtube-data-api-v3-playlistitems-list-endpoint). We have to proceed with a breadth-first search approach, as treating all *child* channels might take a time equivalent to treating the whole original tree.
+
+```sh
+sudo apt install nlohmann-json3-dev
+make
+./main
+```
 
diff --git a/channelsToTreat.txt b/channelsToTreat.txt
new file mode 100644
index 0000000..2c63c27
--- /dev/null
+++ b/channelsToTreat.txt
@@ -0,0 +1 @@
+UCt5USYpzzMCYhkirVQGHwKQ
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000..fed8441
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,172 @@
+#include <cstdlib>
+#include <curl/curl.h>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <nlohmann/json.hpp>
+using namespace std;
+using json = nlohmann::json;
+
+vector<string> getFileContent(string filePath);
+json getJson(string url);
+void print(ostringstream* toPrint),
+     treatComment(json comment);
+string getHttps(string url);
+size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
+
+#define API_KEY "AIzaSy..."
+
+// Note that this printing approach is only safe in a mono-thread context.
+#define PRINT(x) { toPrint << x; print(&toPrint); }
+ostringstream toPrint;
+
+set<string> channelsToTreat,
+            channelsAlreadyTreated;
+unsigned int commentsCount = 0;
+
+int main()
+{
+	vector<string> channelsToTreatVec = getFileContent("channelsToTreat.txt");
+	channelsToTreat = set<string>(channelsToTreatVec.begin(), channelsToTreatVec.end());
+
+	PRINT(channelsToTreat.size() << " channel(s) to treat")
+
+	while(!channelsToTreat.empty())
+	{
+		string channelToTreat = *channelsToTreat.begin();
+		PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")
+
+		string pageToken = "";
+		while(true)
+		{
+			json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken);
+			bool isRelyingOnCommentThreadsEnough = data["error"]["errors"][0]["reason"] != "commentsDisabled";
+			if(isRelyingOnCommentThreadsEnough)
+			{
+				json items = data["items"];
+				for(const auto& item : items)
+				{
+					json comment = item["snippet"]["topLevelComment"];
+					string commentId = comment["id"];
+					treatComment(comment);
+					if(item.contains("replies"))
+					{
+						json replies = item["replies"]["comments"];
+						// CommentThreads: list may only return a subset of a thread's replies, so when the subset could be incomplete, fetch them all with Comments: list.
+						if(replies.size() >= 5)
+						{
+							string pageToken = "";
+							while(true)
+							{
+								json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken);
+								json items = data["items"];
+								for(const auto& item : items)
+								{
+									treatComment(item);
+								}
+								if(data.contains("nextPageToken"))
+								{
+									pageToken = data["nextPageToken"];
+								}
+								else
+								{
+									break;
+								}
+							}
+						}
+						else
+						{
+							for(const auto& reply : replies)
+							{
+								treatComment(reply);
+							}
+						}
+					}
+				}
+				if(data.contains("nextPageToken"))
+				{
+					pageToken = data["nextPageToken"];
+				}
+				else
+				{
+					break;
+				}
+			}
+			else
+			{
+				PRINT("Comments disabled channel!")
+				exit(1);
+			}
+		}
+
+		PRINT(commentsCount)
+		commentsCount = 0;
+		channelsToTreat.erase(channelToTreat);
+		channelsAlreadyTreated.insert(channelToTreat);
+	}
+
+	return 0;
+}
+
+void treatComment(json comment)
+{
+	PRINT("id: " << comment["id"])
+	json snippet = comment["snippet"];
+	PRINT("snippet: " << snippet)
+	if(snippet.contains("videoId"))
+		PRINT("videoId: " << snippet["videoId"])
snippet["videoId"]) + json authorChannelId = snippet["authorChannelId"]; + PRINT("authorChannelId: " << authorChannelId) + string channelId = comment["snippet"]["authorChannelId"]["value"]; + PRINT("channelId: " << channelId) + commentsCount++; + //PRINT(channelId) + if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end()) + channelsToTreat.insert(channelId); +} + +vector getFileContent(string filePath) +{ + vector lines; + ifstream infile(filePath.c_str()); + string line; + while(getline(infile, line)) + lines.push_back(line); + return lines; +} + +json getJson(string url) +{ + url = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY; + string content = getHttps(url); + json data = json::parse(content); + return data; +} + +void print(ostringstream* toPrint) +{ + cout << toPrint->str() << endl; + toPrint->str(""); +} + +string getHttps(string url) +{ + CURL* curl = curl_easy_init(); + string got; + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &got); + curl_easy_perform(curl); + curl_easy_cleanup(curl); + return got; +} + +size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp) +{ + ((string*)userp)->append((char*)contents, size * nmemb); + return size * nmemb; +}