YouTube_captions_search_engine/main.cpp

#include <iostream>
#include <fstream>
#include <set>
#include <curl/curl.h>
#include <nlohmann/json.hpp>
using namespace std;
using json = nlohmann::json;

// Forward declarations of the helpers defined below `main`.
vector<string> getFileContent(string filePath);
json getJson(string url);
void print(ostringstream* toPrint),
     treatComment(json comment);
string getHttps(string url);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);

// Key appended as the `key` query parameter to every request URL built by `getJson`.
// NOTE(review): the key is compiled into the program; consider reading it from the environment or from a file kept out of version control.
#define API_KEY "AIzaSy..."

// Note that this printing approach is only safe in a mono-thread context.
// `PRINT(x)` streams `x` into the shared `toPrint` buffer, then `print` writes the buffer out and empties it.
// NOTE(review): the macro expands to two statements, so it must not be used as the unbraced body of an `if`, `else` or loop.
#define PRINT(x) toPrint << x; print(&toPrint);
ostringstream toPrint;

// Channel ids still waiting to be processed, and channel ids whose processing is finished.
set<string> channelsToTreat,
            channelsAlreadyTreated;
// Number of comments handled for the channel currently being processed; `main` resets it after each channel.
unsigned int commentsCount = 0;

int main()
{
	vector<string> channelsToTreatVec = getFileContent("channelsToTreat.txt");
	channelsToTreat = set(channelsToTreatVec.begin(), channelsToTreatVec.end());
	
	PRINT(channelsToTreat.size() << " channel(s) to treat")
	
	while(!channelsToTreat.empty())
	{
		string channelToTreat = *channelsToTreat.begin();
		PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")
		
		string pageToken = "";
		while(true)
		{
			json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken);
			bool doesRelyingOnCommentThreadsIsEnough = data["error"]["errors"][0]["reason"] != "commentsDisabled";
			if(doesRelyingOnCommentThreadsIsEnough)
			{
				json items = data["items"];
				for(const auto& item : items)
				{
					json comment = item["snippet"]["topLevelComment"];
					string commentId = comment["id"];
					treatComment(comment);
					if(item.contains("replies"))
					{
						json replies = item["replies"]["comments"];
						if(replies.size() >= 5)
						{
							string pageToken = "";
							while(true)
							{
								json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken);
								json items = data["items"];
								for(const auto& item : items)
								{
									treatComment(item);
								}
								if(data.contains("nextPageToken"))
								{
									pageToken = data["nextPageToken"];
								}
								else
								{
									break;
								}
							}
						}
						else
						{
							for(const auto& reply : replies)
							{
								treatComment(reply);
							}
						}
					}
				}
				if(data.contains("nextPageToken"))
				{
					pageToken = data["nextPageToken"];
				}
				else
				{
					break;
				}
			}
			else
			{
				PRINT("Comments disabled channel!")
				exit(1);
			}
		}
		
		PRINT(commentsCount)
		commentsCount = 0;
		channelsToTreat.erase(channelToTreat);
		channelsAlreadyTreated.insert(channelToTreat);
	}

	return 0;
}

// Handles one comment resource: when it names an author channel that has not
// been treated yet, that channel id is added to `channelsToTreat`; in every
// case `commentsCount` is incremented.
//
// Only `comment["snippet"]["authorChannelId"]["value"]` is read from `comment`.
void treatComment(json comment)
{
	// `comment` is this function's own copy, so referring into it avoids a second copy of the snippet.
	json& snippet = comment["snippet"];
	// The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
	if(snippet.contains("authorChannelId"))
	{
		string channelId = snippet["authorChannelId"]["value"];
		// Use the set's own ordered lookup instead of a linear scan over every treated channel.
		if(channelsAlreadyTreated.count(channelId) == 0)
			channelsToTreat.insert(channelId);
	}
	commentsCount++;
}

// Returns the current local time formatted as `DD-MM-YYYY HH-MM-SS`.
string getDate()
{
    const time_t now = time(nullptr);
    // Copy the broken-down time out of the buffer `localtime` returns.
    const struct tm localNow = *localtime(&now);
    ostringstream formatted;
    formatted << put_time(&localNow, "%d-%m-%Y %H-%M-%S");
    return formatted.str();
}

// Reads the file at `filePath` and returns its lines in order, split on '\n'
// (the newline itself is not kept). A file that cannot be opened yields an
// empty vector.
vector<string> getFileContent(string filePath)
{
    ifstream input(filePath);
    vector<string> content;
    for(string current; getline(input, current);)
        content.push_back(current);
    return content;
}

// Requests `url` from the YouTube v3 API and returns the parsed JSON response.
//
// `url` is the endpoint followed by its query string; the API base URL is
// prepended and `&key=` plus `API_KEY` appended, so `url` must already contain
// a query part. Whatever `json::parse` throws on an unparsable response
// propagates to the caller.
json getJson(string url)
{
	const string fullUrl = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY;
	return json::parse(getHttps(fullUrl));
}

// Writes the text accumulated in `toPrint` to standard output as a single line
// prefixed with the current date, then resets the stream's content to empty so
// it can be filled again.
void print(ostringstream* toPrint)
{
	const string timestamp = getDate();
	const string message = toPrint->str();
	cout << timestamp << ": " << message << endl;
	toPrint->str("");
}

string getHttps(string url)
{
    CURL* curl = curl_easy_init();
    string got;
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 1);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &got);
    curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    return got;
}

// Write callback handed to libcurl: appends the `size * nmemb` bytes found at
// `contents` to the `string` that `userp` points to, and returns that byte
// count to signal that everything was consumed.
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp)
{
    const size_t byteCount = size * nmemb;
    string* const sink = static_cast<string*>(userp);
    sink->append(static_cast<const char*>(contents), byteCount);
    return byteCount;
}
Add `main.cpp`, `Makefile` and `channelsToTreat.txt` Note that running this algorithm end up with channel [`UC-99odscxh1xxTyxHyXuRrg`](https://www.youtube.com/channel/UC-99odscxh1xxTyxHyXuRrg) and more precisely the video [`Tq5aPNzfYcg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg) and more precisely the comment [`Ugx-TlSq6SNCbOX04mx4AaABAg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg&lc=Ugx-TlSq6SNCbOX04mx4AaABAg) [which doesn't have any author](https://yt.lemnoslife.com/noKey/comments?part=snippet&id=Ugx-TlSq6SNCbOX04mx4AaABAg)... 2022-12-22 05:20:32 +01:00			`#include <iostream>`
			`#include <fstream>`
			`#include <set>`
			`#include <curl/curl.h>`
			`#include <nlohmann/json.hpp>`
			`using namespace std;`
			`using json = nlohmann::json;`

			`vector<string> getFileContent(string filePath);`
			`json getJson(string url);`
			`void print(ostringstream* toPrint),`
			`treatComment(json comment);`
			`string getHttps(string url);`
			`size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);`

			`#define API_KEY "AIzaSy..."`

			`// Note that this printing approach is only safe in a mono-thread context.`
			`#define PRINT(x) toPrint << x; print(&toPrint);`
			`ostringstream toPrint;`

			`set<string> channelsToTreat,`
			`channelsAlreadyTreated;`
			`unsigned int commentsCount = 0;`

			`int main()`
			`{`
			`vector<string> channelsToTreatVec = getFileContent("channelsToTreat.txt");`
			`channelsToTreat = set(channelsToTreatVec.begin(), channelsToTreatVec.end());`

			`PRINT(channelsToTreat.size() << " channel(s) to treat")`

			`while(!channelsToTreat.empty())`
			`{`
			`string channelToTreat = *channelsToTreat.begin();`
			`PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")`

			`string pageToken = "";`
			`while(true)`
			`{`
			`json data = getJson("commentThreads?part=snippet,replies&allThreadsRelatedToChannelId=" + channelToTreat + "&maxResults=100&pageToken=" + pageToken);`
			`bool doesRelyingOnCommentThreadsIsEnough = data["error"]["errors"][0]["reason"] != "commentsDisabled";`
			`if(doesRelyingOnCommentThreadsIsEnough)`
			`{`
			`json items = data["items"];`
			`for(const auto& item : items)`
			`{`
			`json comment = item["snippet"]["topLevelComment"];`
			`string commentId = comment["id"];`
			`treatComment(comment);`
			`if(item.contains("replies"))`
			`{`
			`json replies = item["replies"]["comments"];`
			`if(replies.size() >= 5)`
			`{`
			`string pageToken = "";`
			`while(true)`
			`{`
			`json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken);`
			`json items = data["items"];`
			`for(const auto& item : items)`
			`{`
			`treatComment(item);`
			`}`
			`if(data.contains("nextPageToken"))`
			`{`
			`pageToken = data["nextPageToken"];`
			`}`
			`else`
			`{`
			`break;`
			`}`
			`}`
			`}`
			`else`
			`{`
			`for(const auto& reply : replies)`
			`{`
			`treatComment(reply);`
			`}`
			`}`
			`}`
			`}`
			`if(data.contains("nextPageToken"))`
			`{`
			`pageToken = data["nextPageToken"];`
			`}`
			`else`
			`{`
			`break;`
			`}`
			`}`
			`else`
			`{`
			`PRINT("Comments disabled channel!")`
			`exit(1);`
			`}`
			`}`

			`PRINT(commentsCount)`
			`commentsCount = 0;`
			`channelsToTreat.erase(channelToTreat);`
			`channelsAlreadyTreated.insert(channelToTreat);`
			`}`

			`return 0;`
			`}`

			`void treatComment(json comment)`
			`{`
			`json snippet = comment["snippet"];`
Add resilience to missing `authorChannelId` in `main.cpp` 2022-12-22 05:41:38 +01:00			// The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
			`if(snippet.contains("authorChannelId"))`
			`{`
			`string channelId = snippet["authorChannelId"]["value"];`
			`if(find(channelsAlreadyTreated.begin(), channelsAlreadyTreated.end(), channelId) == channelsAlreadyTreated.end())`
			`channelsToTreat.insert(channelId);`
			`}`
Add `main.cpp`, `Makefile` and `channelsToTreat.txt` Note that running this algorithm end up with channel [`UC-99odscxh1xxTyxHyXuRrg`](https://www.youtube.com/channel/UC-99odscxh1xxTyxHyXuRrg) and more precisely the video [`Tq5aPNzfYcg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg) and more precisely the comment [`Ugx-TlSq6SNCbOX04mx4AaABAg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg&lc=Ugx-TlSq6SNCbOX04mx4AaABAg) [which doesn't have any author](https://yt.lemnoslife.com/noKey/comments?part=snippet&id=Ugx-TlSq6SNCbOX04mx4AaABAg)... 2022-12-22 05:20:32 +01:00			`commentsCount++;`
			`}`

Add time to logging 2022-12-22 05:47:16 +01:00			`string getDate()`
			`{`
			`auto t = time(nullptr);`
			`auto tm = *localtime(&t);`
			`ostringstream toString;`
			`toString << put_time(&tm, "%d-%m-%Y %H-%M-%S");`
			`return toString.str();`
			`}`

Add `main.cpp`, `Makefile` and `channelsToTreat.txt` Note that running this algorithm end up with channel [`UC-99odscxh1xxTyxHyXuRrg`](https://www.youtube.com/channel/UC-99odscxh1xxTyxHyXuRrg) and more precisely the video [`Tq5aPNzfYcg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg) and more precisely the comment [`Ugx-TlSq6SNCbOX04mx4AaABAg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg&lc=Ugx-TlSq6SNCbOX04mx4AaABAg) [which doesn't have any author](https://yt.lemnoslife.com/noKey/comments?part=snippet&id=Ugx-TlSq6SNCbOX04mx4AaABAg)... 2022-12-22 05:20:32 +01:00			`vector<string> getFileContent(string filePath)`
			`{`
			`vector<string> lines;`
			`ifstream infile(filePath.c_str());`
			`string line;`
			`while(getline(infile, line))`
			`lines.push_back(line);`
			`return lines;`
			`}`

			`json getJson(string url)`
			`{`
			`url = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY;`
			`string content = getHttps(url);`
			`json data = json::parse(content);`
			`return data;`
			`}`

			`void print(ostringstream* toPrint)`
			`{`
Add time to logging 2022-12-22 05:47:16 +01:00			`cout << getDate() << ": " << toPrint->str() << endl;`
Add `main.cpp`, `Makefile` and `channelsToTreat.txt` Note that running this algorithm end up with channel [`UC-99odscxh1xxTyxHyXuRrg`](https://www.youtube.com/channel/UC-99odscxh1xxTyxHyXuRrg) and more precisely the video [`Tq5aPNzfYcg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg) and more precisely the comment [`Ugx-TlSq6SNCbOX04mx4AaABAg`](https://www.youtube.com/watch?v=Tq5aPNzfYcg&lc=Ugx-TlSq6SNCbOX04mx4AaABAg) [which doesn't have any author](https://yt.lemnoslife.com/noKey/comments?part=snippet&id=Ugx-TlSq6SNCbOX04mx4AaABAg)... 2022-12-22 05:20:32 +01:00			`toPrint->str("");`
			`}`

			`string getHttps(string url)`
			`{`
			`CURL* curl = curl_easy_init();`
			`string got;`
			`curl_easy_setopt(curl, CURLOPT_URL, url.c_str());`
			`curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1);`
			`curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 1);`
			`curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);`
			`curl_easy_setopt(curl, CURLOPT_WRITEDATA, &got);`
			`curl_easy_perform(curl);`
			`curl_easy_cleanup(curl);`
			`return got;`
			`}`

			`size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp)`
			`{`
			`((string*)userp)->append((char*)contents, size * nmemb);`
			`return size * nmemb;`
			`}`