Fix #20: YouTube Data API v3 rarely, but suddenly, returns a `commentsDisabled` error, which triggers an unwanted method switch

Also modified the compression command, as I got `sh: 1: zip: Argument list too long` when compressing the 248,868 JSON files of the most-subscribed French channel.
2023-01-08 15:43:27 +01:00
parent ba37d6a111
commit 7e35a6473a
2 changed files with 25 additions and 7 deletions
--- a/main.cpp
+++ b/main.cpp
@@ -13,7 +13,7 @@ using namespace chrono;
 using json = nlohmann::json;

 vector<string> getFileContent(string filePath);
-json getJson(unsigned short threadId, string url, string directoryPath);
+json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled = false);
 void createDirectory(string path),
     print(ostringstream* toPrint),
     treatComment(unsigned short threadId, json comment, string channelId),
@@ -49,13 +49,15 @@ int main()
    // To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated.
    // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
    vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
+    // Note that using `set`s makes the search faster but we lose the `channels.txt` lines order.
    channelsToTreat = set(channelsVec.begin(), channelsVec.end());

    createDirectory(CHANNELS_DIRECTORY);

    for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
    {
-        string channelId = entry.path().filename();
+        string fileName = entry.path().filename(),
+               channelId = fileName.substr(0, fileName.length() - 4);
        channelsToTreat.erase(channelId);
        channelsAlreadyTreated.insert(channelId);
    }
@@ -112,9 +114,14 @@ void treatChannels(unsigned short threadId)

        treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);

+        // Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
+        PRINT(threadId, "Starting compression...")
        // As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
-        exec("cd " + channelToTreatDirectory + " && zip -r ../" + channelToTreat + ".zip *");
+        exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
+
+        PRINT(threadId, "Compression finished, started deleting initial directory...")
        deleteDirectory(channelToTreatDirectory);
+        PRINT(threadId, "Deleting directory finished.")

        PRINT(threadId, commentsCount << " comments were found for this channel.")
        commentsCount = 0;
@@ -132,7 +139,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
        ostringstream toString;
        toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
        string url = toString.str();
-        json data = getJson(threadId, url, channelToTreat);
+        json data = getJson(threadId, url, channelToTreat, pageToken != "");
        bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
        if(doesRelyingOnCommentThreadsIsEnough)
        {
@@ -323,7 +330,7 @@ vector<string> getFileContent(string filePath)
    return lines;
 }

-json getJson(unsigned short threadId, string url, string directoryPath)
+json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled)
 {
 #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
    string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
@@ -345,7 +352,10 @@ json getJson(unsigned short threadId, string url, string directoryPath)
    if(data.contains("error"))
    {
        PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
-        return getJson(threadId, url, directoryPath);
+        if(data["error"]["errors"][0]["reason"] != "commentsDisabled" || retryOnCommentsDisabled)
+        {
+            return getJson(threadId, url, directoryPath);
+        }
    }

    ostringstream toString;
--- a/removeChannelsBeingTreated.py
+++ b/removeChannelsBeingTreated.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3

-import shutil
+import shutil, os

 infix = ': Treating channel '
 path = 'channels/'
@@ -18,8 +18,16 @@ with open('nohup.out') as f:
    for threadId in threads:
        channelId = threads[threadId]
        print(threadId, channelId)
+        # There are three cases:
+        # - `channelId`/ exists
+        # - `channelId`/ and `channelId`.zip exist
+        # - `channelId`.zip exists
+        # To manage every case, we need to use two `try`/`except`.
        try:
            shutil.rmtree(path + channelId)
+        except:
+            pass
+        try:
            os.remove(path + channelId + ".zip")
        except:
            pass