From 923c14a77b887d375d01671e17883a8012d17d71 Mon Sep 17 00:00:00 2001
From: Benjamin Loison <Benjamin_Loison@users.noreply.gitea.lemnoslife.com>
Date: Mon, 2 Jan 2023 19:46:32 +0100
Subject: [PATCH] #2: Add data logging

---
 channelsToTreat.txt => channels.txt |  0
 main.cpp                            | 76 ++++++++++++++++++-----------
 2 files changed, 48 insertions(+), 28 deletions(-)
 rename channelsToTreat.txt => channels.txt (100%)
diff --git a/channelsToTreat.txt b/channels.txt
similarity index 100%
rename from channelsToTreat.txt
rename to channels.txt
diff --git a/main.cpp b/main.cpp
index ef12b7f..602fc2f 100644
--- a/main.cpp
+++ b/main.cpp
@@ -9,10 +9,10 @@ using namespace std;
 using json = nlohmann::json;
 
 vector<string> getFileContent(string filePath);
-json getJson(string url);
+json getJson(string url, string directoryPath);
 void createDirectory(string path),
      print(ostringstream* toPrint),
-     treatComment(json comment);
+     treatComment(json comment, string channelId);
 string getHttps(string url);
 size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
 bool doesFileExist(string filePath),
@@ -27,20 +27,26 @@ ostringstream toPrint;
 
 set<string> channelsAlreadyTreated,
     channelsToTreat;
-unsigned int commentsCount = 0;
+unsigned int commentsCount = 0,
+             requestsPerChannel = 0;
+string CHANNELS_DIRECTORY = "channels/",
+       CHANNELS_FILE_PATH = "channels.txt";
 
 int main()
 {
-    string channelsToTreatFilePath = "channelsToTreat.txt";
-    vector<string> channelsToTreatVec = getFileContent(channelsToTreatFilePath);
-    channelsToTreat = set(channelsToTreatVec.begin(), channelsToTreatVec.end());
+    // The starting set should be written to `CHANNELS_FILE_PATH`.
+    // To resume this algorithm after a shutdown, delete the folder in `CHANNELS_DIRECTORY` of the channel that was being treated when the shutdown happened, then restart the program.
+    // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
+    vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
+    channelsToTreat = set(channelsVec.begin(), channelsVec.end());
 
-    string channelsDirectory = "channels/";
-    createDirectory(channelsDirectory);
+    createDirectory(CHANNELS_DIRECTORY);
 
-    for(const auto& entry : filesystem::directory_iterator(channelsDirectory))
+    for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
     {
-        channelsAlreadyTreated.insert(entry.path().filename());
+        string channelId = entry.path().filename();
+        channelsToTreat.erase(channelId);
+        channelsAlreadyTreated.insert(channelId);
     }
 
     PRINT(channelsToTreat.size() << " channel(s) to treat")
@@ -49,12 +55,18 @@ int main()
     while(!channelsToTreat.empty())
     {
         string channelToTreat = *channelsToTreat.begin();
+
         PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")
 
+        channelsAlreadyTreated.insert(channelToTreat);
+
+        string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
+        createDirectory(channelToTreatDirectory);
+
         string pageToken = "";
         while(true)
         {
-            json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken);
+            json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken, channelToTreat);
             bool doesRelyingOnCommentThreadsIsEnough = data["error"]["errors"][0]["reason"] != "commentsDisabled";
             if(doesRelyingOnCommentThreadsIsEnough)
             {
@@ -63,7 +75,7 @@ int main()
                 {
                     json comment = item["snippet"]["topLevelComment"];
                     string commentId = comment["id"];
-                    treatComment(comment);
+                    treatComment(comment, channelToTreat);
                     if(item.contains("replies"))
                     {
                         json replies = item["replies"]["comments"];
@@ -72,11 +84,11 @@ int main()
                             string pageToken = "";
                             while(true)
                             {
-                                json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken);
+                                json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat);
                                 json items = data["items"];
                                 for(const auto& item : items)
                                 {
-                                    treatComment(item);
+                                    treatComment(item, channelToTreat);
                                 }
                                 if(data.contains("nextPageToken"))
                                 {
@@ -92,7 +104,7 @@ int main()
                         {
                             for(const auto& reply : replies)
                             {
-                                treatComment(reply);
+                                treatComment(reply, channelToTreat);
                             }
                         }
                     }
@@ -115,29 +127,27 @@ int main()
 
         PRINT(commentsCount)
         commentsCount = 0;
+        requestsPerChannel = 0;
 
         channelsToTreat.erase(channelToTreat);
-        channelsAlreadyTreated.insert(channelToTreat);
-
-        string channelToTreatDirectory = channelsDirectory + channelToTreat + "/";
-        createDirectory(channelToTreatDirectory);
-
-        string toWrite = (doesFileExist(channelsToTreatFilePath) ? "\n" : "") + channelToTreat;
-        writeFile(channelsToTreatFilePath, "a", toWrite);
     }
 
     return 0;
 }
 
-void treatComment(json comment)
+void treatComment(json comment, string channelId)
 {
     json snippet = comment["snippet"];
     // The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
     if(snippet.contains("authorChannelId"))
     {
         string channelId = snippet["authorChannelId"]["value"];
-        if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end())
+        if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end() && find(channelsToTreat.begin(), channelsToTreat.end(), channelId) == channelsToTreat.end())
+        {
             channelsToTreat.insert(channelId);
+
+            writeFile(CHANNELS_FILE_PATH, "a", "\n" + channelId);
+        }
     }
     commentsCount++;
 }
@@ -151,6 +161,10 @@ bool writeFile(string filePath, string option, string toWrite)
         fclose(file);
         return true;
     }
+    else
+    {
+        PRINT("writeFile error: " << strerror(errno))
+    }
     return false;
 }
 
@@ -184,15 +198,21 @@ vector<string> getFileContent(string filePath)
     return lines;
 }
 
-json getJson(string url)
+json getJson(string url, string directoryPath)
 {
 #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
-    url = "https://yt.lemnoslife.com/noKey/" + url;
+    string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
 #else
-    url = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY;
+    string finalUrl = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY;
 #endif
-    string content = getHttps(url);
+    string content = getHttps(finalUrl);
     json data = json::parse(content);
+
+    ostringstream toString;
+    toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";
+    requestsPerChannel++;
+    writeFile(toString.str(), "w", url + "\n" + content);
+
     return data;
 }