From b3779fe49a592b421706e920a374ed094e399913 Mon Sep 17 00:00:00 2001
From: Benjamin Loison <Benjamin_Loison@users.noreply.gitea.lemnoslife.com>
Date: Sun, 8 Jan 2023 15:43:27 +0100
Subject: [PATCH] Fix #20: YouTube Data API v3 rarely but suddenly returns a
 `commentsDisabled` error, which causes an unwanted method switch

Also modified the compression command, as I got `sh: 1: zip: Argument list too long` when compressing the 248,868 JSON files of the most-subscribed French channel.
---
 main.cpp                      | 22 ++++++++++++++++------
 removeChannelsBeingTreated.py | 10 +++++++++-
 2 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/main.cpp b/main.cpp
index ba10767..3a8b84b 100644
--- a/main.cpp
+++ b/main.cpp
@@ -13,7 +13,7 @@ using namespace chrono;
 using json = nlohmann::json;
 
 vector<string> getFileContent(string filePath);
-json getJson(unsigned short threadId, string url, string directoryPath);
+json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled = false); // When `retryOnCommentsDisabled` is true, a `commentsDisabled` error also triggers a retry.
 void createDirectory(string path),
      print(ostringstream* toPrint),
      treatComment(unsigned short threadId, json comment, string channelId),
@@ -49,13 +49,15 @@ int main()
     // To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated.
     // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat` or `channelsToTreat` otherwise before continuing, as if `CHANNELS_FILE_PATH` was containing a **treated** starting set.
     vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
+    // Note that using `set`s makes lookups faster, but the line order of `channels.txt` is lost.
     channelsToTreat = set(channelsVec.begin(), channelsVec.end());
 
     createDirectory(CHANNELS_DIRECTORY);
 
     for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
     {
-        string channelId = entry.path().filename();
+        string fileName = entry.path().filename(),
+               channelId = fileName.substr(0, fileName.length() - 4); // Drops the last 4 characters, presumably the `.zip` suffix of a compressed channel - TODO confirm that only `.zip` entries remain in this directory.
         channelsToTreat.erase(channelId);
         channelsAlreadyTreated.insert(channelId);
     }
@@ -112,9 +114,14 @@ void treatChannels(unsigned short threadId)
 
         treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
 
+        // Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
+        PRINT(threadId, "Starting compression...")
         // As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli.
-        exec("cd " + channelToTreatDirectory + " && zip -r ../" + channelToTreat + ".zip *");
+        exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@"); // `-@` makes `zip` read the file names from stdin, which avoids the `Argument list too long` error triggered by the `*` expansion.
+
+        PRINT(threadId, "Compression finished, started deleting initial directory...")
         deleteDirectory(channelToTreatDirectory);
+        PRINT(threadId, "Deleting directory finished.")
 
         PRINT(threadId, commentsCount << " comments were found for this channel.")
         commentsCount = 0;
@@ -132,7 +139,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
         ostringstream toString;
         toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
         string url = toString.str();
-        json data = getJson(threadId, url, channelToTreat);
+        json data = getJson(threadId, url, channelToTreat, pageToken != ""); // With a non-empty `pageToken` (i.e. not the first page), `getJson` also retries on a `commentsDisabled` error.
         bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
         if(doesRelyingOnCommentThreadsIsEnough)
         {
@@ -323,7 +330,7 @@ vector<string> getFileContent(string filePath)
     return lines;
 }
 
-json getJson(unsigned short threadId, string url, string directoryPath)
+json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled)
 {
 #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
     string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
@@ -345,7 +352,10 @@ json getJson(unsigned short threadId, string url, string directoryPath)
     if(data.contains("error"))
     {
         PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
-        return getJson(threadId, url, directoryPath);
+        if(data["error"]["errors"][0]["reason"] != "commentsDisabled" || retryOnCommentsDisabled) // A `commentsDisabled` error is only retried when the caller asked for it; any other error is always retried.
+        {
+            return getJson(threadId, url, directoryPath, retryOnCommentsDisabled); // Forward the flag, otherwise the retry falls back to the default `false` and a second `commentsDisabled` answer is returned to the caller.
+        }
     }
 
     ostringstream toString;
diff --git a/removeChannelsBeingTreated.py b/removeChannelsBeingTreated.py
index 9f1df53..c80a196 100755
--- a/removeChannelsBeingTreated.py
+++ b/removeChannelsBeingTreated.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-import shutil
+import shutil, os
 
 infix = ': Treating channel '
 path = 'channels/'
@@ -18,8 +18,16 @@ with open('nohup.out') as f:
     for threadId in threads:
         channelId = threads[threadId]
         print(threadId, channelId)
+        # There are three cases:
+        # - `channelId`/ exists
+        # - `channelId`/ and `channelId`.zip exist
+        # - `channelId`.zip exists
+        # To handle every case, each removal gets its own `try`/`except`, so that a missing directory does not skip the `.zip` removal.
         try:
             shutil.rmtree(path + channelId)
+        except:
+            pass
+        try:
             os.remove(path + channelId + ".zip")
         except:
             pass