From b3779fe49a592b421706e920a374ed094e399913 Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Sun, 8 Jan 2023 15:43:27 +0100 Subject: [PATCH] Fix #20: YouTube Data API v3 rarely but suddenly returns a `commentsDisabled` error, which causes an unwanted method switch Also modified the compression command, as I got `sh: 1: zip: Argument list too long` when compressing the 248,868 JSON files of the most-subscribed French channel. --- main.cpp | 22 ++++++++++++++++------ removeChannelsBeingTreated.py | 10 +++++++++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/main.cpp b/main.cpp index ba10767..3a8b84b 100644 --- a/main.cpp +++ b/main.cpp @@ -13,7 +13,7 @@ using namespace chrono; using json = nlohmann::json; vector getFileContent(string filePath); -json getJson(unsigned short threadId, string url, string directoryPath); +json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled = false); void createDirectory(string path), print(ostringstream* toPrint), treatComment(unsigned short threadId, json comment, string channelId), @@ -49,13 +49,15 @@ int main() // To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated. // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat` or `channelsToTreat` otherwise before continuing, as if `CHANNELS_FILE_PATH` was containing a **treated** starting set. vector channelsVec = getFileContent(CHANNELS_FILE_PATH); + // Note that using `set`s makes the search faster but we lose the `channels.txt` lines order. 
channelsToTreat = set(channelsVec.begin(), channelsVec.end()); createDirectory(CHANNELS_DIRECTORY); for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY)) { - string channelId = entry.path().filename(); + string fileName = entry.path().filename(), + channelId = fileName.substr(0, fileName.length() - 4); channelsToTreat.erase(channelId); channelsAlreadyTreated.insert(channelId); } @@ -112,9 +114,14 @@ void treatChannels(unsigned short threadId) treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat); + // Note that compressing the French most subscribers channel took 4 minutes and 42 seconds. + PRINT(threadId, "Starting compression...") // As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli. - exec("cd " + channelToTreatDirectory + " && zip -r ../" + channelToTreat + ".zip *"); + exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@"); + + PRINT(threadId, "Compression finished, started deleting initial directory...") deleteDirectory(channelToTreatDirectory); + PRINT(threadId, "Deleting directory finished.") PRINT(threadId, commentsCount << " comments were found for this channel.") commentsCount = 0; @@ -132,7 +139,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str ostringstream toString; toString << "commentThreads?part=snippet,replies&" << (isChannel ? 
"allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken; string url = toString.str(); - json data = getJson(threadId, url, channelToTreat); + json data = getJson(threadId, url, channelToTreat, pageToken != ""); bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled"; if(doesRelyingOnCommentThreadsIsEnough) { @@ -323,7 +330,7 @@ vector getFileContent(string filePath) return lines; } -json getJson(unsigned short threadId, string url, string directoryPath) +json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled) { #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE string finalUrl = "https://yt.lemnoslife.com/noKey/" + url; @@ -345,7 +352,10 @@ json getJson(unsigned short threadId, string url, string directoryPath) if(data.contains("error")) { PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !") - return getJson(threadId, url, directoryPath); + if(data["error"]["errors"][0]["reason"] != "commentsDisabled" || retryOnCommentsDisabled) + { + return getJson(threadId, url, directoryPath); + } } ostringstream toString; diff --git a/removeChannelsBeingTreated.py b/removeChannelsBeingTreated.py index 9f1df53..c80a196 100755 --- a/removeChannelsBeingTreated.py +++ b/removeChannelsBeingTreated.py @@ -1,6 +1,6 @@ #!/usr/bin/python3 -import shutil +import shutil, os infix = ': Treating channel ' path = 'channels/' @@ -18,8 +18,16 @@ with open('nohup.out') as f: for threadId in threads: channelId = threads[threadId] print(threadId, channelId) + # There are three cases: + # - `channelId`/ exists + # - `channelId`/ and `channelId`.zip exist + # - `channelId`.zip exists + # To manage every case, we need to use two `try`/`except`. try: shutil.rmtree(path + channelId) + except: + pass + try: os.remove(path + channelId + ".zip") except: pass