#include #include #include #include #include #include #include using namespace std; using json = nlohmann::json; vector getFileContent(string filePath); json getJson(string url, string directoryPath); void createDirectory(string path), print(ostringstream* toPrint), treatComment(json comment, string channelId); string getHttps(string url); size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp); bool doesFileExist(string filePath), writeFile(string filePath, string option, string toWrite); #define USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE #define API_KEY "AIzaSy..." // Note that this printing approach is only safe in a mono-thread context. #define PRINT(x) toPrint << x; print(&toPrint); ostringstream toPrint; set channelsAlreadyTreated, channelsToTreat; unsigned int commentsCount = 0, requestsPerChannel = 0; string CHANNELS_DIRECTORY = "channels/", CHANNELS_FILE_PATH = "channels.txt"; int main() { // The starting set should be written to `CHANNELS_FILE_PATH`. // To resume this algorithm after a shutdown, just restart it after having deleted the last channel folder in `CHANNELS_DIRECTORY` being treated. // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat` or `channelsToTreat` otherwise before continuing, as if `CHANNELS_FILE_PATH` was containing a **treated** starting set. vector channelsVec = getFileContent(CHANNELS_FILE_PATH); channelsToTreat = set(channelsVec.begin(), channelsVec.end()); createDirectory(CHANNELS_DIRECTORY); for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY)) { string channelId = entry.path().filename(); channelsToTreat.erase(channelId); channelsAlreadyTreated.insert(channelId); } PRINT(channelsToTreat.size() << " channel(s) to treat") PRINT(channelsAlreadyTreated.size() << " channel(s) already treated") while(!channelsToTreat.empty()) { string channelToTreat = *channelsToTreat.begin(); PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")") channelsAlreadyTreated.insert(channelToTreat); string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/"; createDirectory(channelToTreatDirectory); string pageToken = ""; while(true) { json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken, channelToTreat); bool doesRelyingOnCommentThreadsIsEnough = data["error"]["errors"][0]["reason"] != "commentsDisabled"; if(doesRelyingOnCommentThreadsIsEnough) { json items = data["items"]; for(const auto& item : items) { json comment = item["snippet"]["topLevelComment"]; string commentId = comment["id"]; treatComment(comment, channelToTreat); if(item.contains("replies")) { json replies = item["replies"]["comments"]; if(replies.size() >= 5) { string pageToken = ""; while(true) { json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat); json items = data["items"]; for(const auto& item : items) { treatComment(item, channelToTreat); } if(data.contains("nextPageToken")) { pageToken = data["nextPageToken"]; } else { break; } } } else { for(const auto& reply : replies) { treatComment(reply, channelToTreat); } } } } if(data.contains("nextPageToken")) { pageToken = data["nextPageToken"]; } else { break; } } else { PRINT("Comments disabled channel!") exit(1); } } PRINT(commentsCount) commentsCount = 0; requestsPerChannel = 0; channelsToTreat.erase(channelToTreat); } return 0; } void treatComment(json comment, string channelId) { json snippet = comment["snippet"]; // The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`). if(snippet.contains("authorChannelId")) { string channelId = snippet["authorChannelId"]["value"]; if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end() && find(channelsToTreat.begin(), channelsToTreat.end(), channelId) == channelsToTreat.end()) { channelsToTreat.insert(channelId); writeFile(CHANNELS_FILE_PATH, "a", "\n" + channelId); } } commentsCount++; } bool writeFile(string filePath, string option, string toWrite) { FILE* file = fopen(filePath.c_str(), option.c_str()); if(file != NULL) { fputs(toWrite.c_str(), file); fclose(file); return true; } else { PRINT("writeFile error: " << strerror(errno)) } return false; } bool doesFileExist(string filePath) { struct stat buffer; return stat(filePath.c_str(), &buffer) == 0; } void createDirectory(string path) { mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); } string getDate() { auto t = time(nullptr); auto tm = *localtime(&t); ostringstream toString; toString << put_time(&tm, "%d-%m-%Y %H-%M-%S"); return toString.str(); } vector getFileContent(string filePath) { vector lines; ifstream infile(filePath.c_str()); string line; while(getline(infile, line)) lines.push_back(line); return lines; } json getJson(string url, string directoryPath) { #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE string finalUrl = "https://yt.lemnoslife.com/noKey/" + url; #else string finalUrl = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY; #endif string content = getHttps(finalUrl); json data = json::parse(content); ostringstream toString; toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json"; requestsPerChannel++; writeFile(toString.str(), "w", url + "\n" + content); return data; } void print(ostringstream* toPrint) { cout << getDate() << ": " << toPrint->str() << endl; toPrint->str(""); } string getHttps(string url) { CURL* curl = curl_easy_init(); string got; curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1); curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 1); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &got); curl_easy_perform(curl); curl_easy_cleanup(curl); return got; } size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp) { ((string*)userp)->append((char*)contents, size * nmemb); return size * nmemb; }