#2: Add data logging

This commit is contained in:
Benjamin Loison 2023-01-02 19:46:32 +01:00
parent 73a9dea023
commit 923c14a77b
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 48 additions and 28 deletions

View File

@ -9,10 +9,10 @@ using namespace std;
using json = nlohmann::json; using json = nlohmann::json;
vector<string> getFileContent(string filePath); vector<string> getFileContent(string filePath);
json getJson(string url); json getJson(string url, string directoryPath);
void createDirectory(string path), void createDirectory(string path),
print(ostringstream* toPrint), print(ostringstream* toPrint),
treatComment(json comment); treatComment(json comment, string channelId);
string getHttps(string url); string getHttps(string url);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp); size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
bool doesFileExist(string filePath), bool doesFileExist(string filePath),
@ -27,20 +27,26 @@ ostringstream toPrint;
set<string> channelsAlreadyTreated, set<string> channelsAlreadyTreated,
channelsToTreat; channelsToTreat;
unsigned int commentsCount = 0; unsigned int commentsCount = 0,
requestsPerChannel = 0;
string CHANNELS_DIRECTORY = "channels/",
CHANNELS_FILE_PATH = "channels.txt";
int main() int main()
{ {
string channelsToTreatFilePath = "channelsToTreat.txt"; // The starting set should be written to `CHANNELS_FILE_PATH`.
vector<string> channelsToTreatVec = getFileContent(channelsToTreatFilePath); // To resume this algorithm after a shutdown, delete the folder in `CHANNELS_DIRECTORY` of the channel that was being treated when the shutdown happened, then restart the program.
channelsToTreat = set(channelsToTreatVec.begin(), channelsToTreatVec.end()); // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
channelsToTreat = set(channelsVec.begin(), channelsVec.end());
string channelsDirectory = "channels/"; createDirectory(CHANNELS_DIRECTORY);
createDirectory(channelsDirectory);
for(const auto& entry : filesystem::directory_iterator(channelsDirectory)) for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
{ {
channelsAlreadyTreated.insert(entry.path().filename()); string channelId = entry.path().filename();
channelsToTreat.erase(channelId);
channelsAlreadyTreated.insert(channelId);
} }
PRINT(channelsToTreat.size() << " channel(s) to treat") PRINT(channelsToTreat.size() << " channel(s) to treat")
@ -49,12 +55,18 @@ int main()
while(!channelsToTreat.empty()) while(!channelsToTreat.empty())
{ {
string channelToTreat = *channelsToTreat.begin(); string channelToTreat = *channelsToTreat.begin();
PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")") PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")
channelsAlreadyTreated.insert(channelToTreat);
string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
createDirectory(channelToTreatDirectory);
string pageToken = ""; string pageToken = "";
while(true) while(true)
{ {
json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken); json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken, channelToTreat);
bool doesRelyingOnCommentThreadsIsEnough = data["error"]["errors"][0]["reason"] != "commentsDisabled"; bool doesRelyingOnCommentThreadsIsEnough = data["error"]["errors"][0]["reason"] != "commentsDisabled";
if(doesRelyingOnCommentThreadsIsEnough) if(doesRelyingOnCommentThreadsIsEnough)
{ {
@ -63,7 +75,7 @@ int main()
{ {
json comment = item["snippet"]["topLevelComment"]; json comment = item["snippet"]["topLevelComment"];
string commentId = comment["id"]; string commentId = comment["id"];
treatComment(comment); treatComment(comment, channelToTreat);
if(item.contains("replies")) if(item.contains("replies"))
{ {
json replies = item["replies"]["comments"]; json replies = item["replies"]["comments"];
@ -72,11 +84,11 @@ int main()
string pageToken = ""; string pageToken = "";
while(true) while(true)
{ {
json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken); json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat);
json items = data["items"]; json items = data["items"];
for(const auto& item : items) for(const auto& item : items)
{ {
treatComment(item); treatComment(item, channelToTreat);
} }
if(data.contains("nextPageToken")) if(data.contains("nextPageToken"))
{ {
@ -92,7 +104,7 @@ int main()
{ {
for(const auto& reply : replies) for(const auto& reply : replies)
{ {
treatComment(reply); treatComment(reply, channelToTreat);
} }
} }
} }
@ -115,29 +127,27 @@ int main()
PRINT(commentsCount) PRINT(commentsCount)
commentsCount = 0; commentsCount = 0;
requestsPerChannel = 0;
channelsToTreat.erase(channelToTreat); channelsToTreat.erase(channelToTreat);
channelsAlreadyTreated.insert(channelToTreat);
string channelToTreatDirectory = channelsDirectory + channelToTreat + "/";
createDirectory(channelToTreatDirectory);
string toWrite = (doesFileExist(channelsToTreatFilePath) ? "\n" : "") + channelToTreat;
writeFile(channelsToTreatFilePath, "a", toWrite);
} }
return 0; return 0;
} }
void treatComment(json comment) void treatComment(json comment, string channelId)
{ {
json snippet = comment["snippet"]; json snippet = comment["snippet"];
// The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`). // The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
if(snippet.contains("authorChannelId")) if(snippet.contains("authorChannelId"))
{ {
string channelId = snippet["authorChannelId"]["value"]; string channelId = snippet["authorChannelId"]["value"];
if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end()) if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end() && find(channelsToTreat.begin(), channelsToTreat.end(), channelId) == channelsToTreat.end())
{
channelsToTreat.insert(channelId); channelsToTreat.insert(channelId);
writeFile(CHANNELS_FILE_PATH, "a", "\n" + channelId);
}
} }
commentsCount++; commentsCount++;
} }
@ -151,6 +161,10 @@ bool writeFile(string filePath, string option, string toWrite)
fclose(file); fclose(file);
return true; return true;
} }
else
{
PRINT("writeFile error: " << strerror(errno))
}
return false; return false;
} }
@ -184,15 +198,21 @@ vector<string> getFileContent(string filePath)
return lines; return lines;
} }
json getJson(string url) json getJson(string url, string directoryPath)
{ {
#ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
url = "https://yt.lemnoslife.com/noKey/" + url; string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
#else #else
url = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY; string finalUrl = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY;
#endif #endif
string content = getHttps(url); string content = getHttps(finalUrl);
json data = json::parse(content); json data = json::parse(content);
ostringstream toString;
toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";
requestsPerChannel++;
writeFile(toString.str(), "w", url + "\n" + content);
return data; return data;
} }