YouTube_captions_search_engine/main.cpp
Benjamin Loison ad3e90fe92 Fix #8: Support comments disabled channels
Tested with `UCWIdqSQekeGmUWlSFeCiEnA`, for which the 36 comments of the only comments-enabled video `3F8dFt8LsXY` were treated correctly.

Note that this commit doesn't support comments-disabled channels with 20,000 videos or more.
2023-01-03 02:56:07 +01:00

289 lines
10 KiB
C++

#include <iostream>
#include <fstream>
#include <sstream>
#include <set>
#include <sys/stat.h>
#include <curl/curl.h>
#include <nlohmann/json.hpp>
using namespace std;
using json = nlohmann::json;
// Forward declarations of the helpers defined after `main`, one per line and grouped by role.

// File system helpers.
vector<string> getFileContent(string filePath);
bool doesFileExist(string filePath);
bool writeFile(string filePath, string option, string toWrite);
void createDirectory(string path);

// Network helpers.
json getJson(string url, string directoryPath);
string getHttps(string url);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);

// Logging helper used by the `PRINT` macro.
void print(ostringstream* toPrint);

// Crawling helpers.
void treatComment(json comment, string channelId);
void treatChannelOrVideo(bool isChannel, string id, string channelToTreat);
// When defined, `getJson` sends its requests to `https://yt.lemnoslife.com/noKey/` instead of `https://www.googleapis.com/youtube/v3/`.
#define USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
// Key appended to the requests by `getJson`, only when `USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE` isn't defined.
#define API_KEY "AIzaSy..."
// Logs `x` (any `<<` chain) on the standard output, prefixed with the current date, through the shared `toPrint` buffer.
// Note that this printing approach is only safe in a mono-thread context.
// The braces make the macro expand to a single statement, so that both instructions stay together even when `PRINT` is the body of an unbraced `if`, `else` or loop.
// Call sites are written without a trailing semicolon.
#define PRINT(x) { toPrint << x; print(&toPrint); }
// Buffer shared by every `PRINT`, emptied by `print` once the message is displayed.
ostringstream toPrint;
// Channels that have a folder in `CHANNELS_DIRECTORY` or that have already been picked by the loop of `main`.
set<string> channelsAlreadyTreated;
// Channels listed in `CHANNELS_FILE_PATH` or discovered through comments that still have to be treated.
set<string> channelsToTreat;
// Number of comments passed to `treatComment` for the channel being treated, `main` resets it after each channel.
unsigned int commentsCount = 0;
// Number of requests made by `getJson` for the channel being treated, used to name the logged responses, `main` resets it after each channel.
unsigned int requestsPerChannel = 0;
// Folder containing a sub-folder per channel.
string CHANNELS_DIRECTORY = "channels/";
// File containing a channel id per line.
string CHANNELS_FILE_PATH = "channels.txt";
// Entry point: loads the channels listed in `CHANNELS_FILE_PATH`, considers as treated those having an entry in `CHANNELS_DIRECTORY` and treats the other ones, as well as the channels discovered meanwhile, until none remains.
int main()
{
    // The starting set should be written to `CHANNELS_FILE_PATH`.
    // To resume this algorithm after a shutdown, just restart it after having deleted the last channel folder in `CHANNELS_DIRECTORY` being treated.
    // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` was containing a **treated** starting set.
    const vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
    channelsToTreat = set<string>(channelsVec.begin(), channelsVec.end());
    // `treatComment` appends `"\n" + channelId` to `CHANNELS_FILE_PATH`, so a starting file ending with a newline ends up containing a blank line.
    // An empty id isn't a channel, so it must not be treated.
    channelsToTreat.erase("");

    createDirectory(CHANNELS_DIRECTORY);

    for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
    {
        const string channelId = entry.path().filename().string();
        channelsToTreat.erase(channelId);
        channelsAlreadyTreated.insert(channelId);
    }

    PRINT(channelsToTreat.size() << " channel(s) to treat")
    PRINT(channelsAlreadyTreated.size() << " channel(s) already treated")

    while(!channelsToTreat.empty())
    {
        // Copied as `channelsToTreat` is modified while the channel is being treated (cf `treatComment`) and the element is erased below.
        const string channelToTreat = *channelsToTreat.begin();

        PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")

        // Marked as treated first, so that `treatComment` doesn't consider this channel as a new one.
        channelsAlreadyTreated.insert(channelToTreat);

        const string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
        createDirectory(channelToTreatDirectory);

        treatChannelOrVideo(true, channelToTreat, channelToTreat);

        PRINT(commentsCount)

        // The counters are per channel.
        commentsCount = 0;
        requestsPerChannel = 0;

        channelsToTreat.erase(channelToTreat);
    }

    return 0;
}
void treatChannelOrVideo(bool isChannel, string id, string channelToTreat)
{
string pageToken = "";
while(true)
{
ostringstream toString;
toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
string url = toString.str();
json data = getJson(url, channelToTreat);
bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
if(doesRelyingOnCommentThreadsIsEnough)
{
json items = data["items"];
for(const auto& item : items)
{
json comment = item["snippet"]["topLevelComment"];
string commentId = comment["id"];
treatComment(comment, channelToTreat);
if(item.contains("replies"))
{
json replies = item["replies"]["comments"];
if(replies.size() >= 5)
{
string pageToken = "";
while(true)
{
json data = getJson("comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat),
items = data["items"];
for(const auto& item : items)
{
treatComment(item, channelToTreat);
}
if(data.contains("nextPageToken"))
{
pageToken = data["nextPageToken"];
}
else
{
break;
}
}
}
else
{
for(const auto& reply : replies)
{
treatComment(reply, channelToTreat);
}
}
}
}
if(data.contains("nextPageToken"))
{
pageToken = data["nextPageToken"];
}
else
{
break;
}
}
else
{
PRINT("Comments disabled channel, treating differently...")
json data = getJson("channels?part=statistics&id=" + channelToTreat, channelToTreat);
// YouTube Data API v3 Videos: list endpoint returns `videoCount` as a string and not an integer...
unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str());
PRINT("The channel has about " << videoCount << " videos.")
// `UC-3A9g4U1PpLaeAuD4jSP_w` has a `videoCount` of 2, while its `uploads` playlist contains 3 videos. So we use a strict inequality here.
if(videoCount < 20000)
{
string playlistToTreat = "UU" + channelToTreat.substr(2),
pageToken = "";
while(true)
{
// `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them.
json data = getJson("playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, channelToTreat),
items = data["items"];
for(const auto& item : items)
{
string videoId = item["contentDetails"]["videoId"];
// To keep the same amount of logs for each channel, I comment the following `PRINT`.
//PRINT("Treating video " << videoId)
treatChannelOrVideo(false, videoId, channelToTreat);
}
if(data.contains("nextPageToken"))
{
pageToken = data["nextPageToken"];
}
else
{
break;
}
}
break;
}
else
{
PRINT("The videos count of the channel exceeds the supported 20,000 limit!")
exit(1);
}
}
}
}
// Counts `comment` and, if its author is a channel unknown so far, adds this channel to `channelsToTreat` and appends it to `CHANNELS_FILE_PATH`.
// `comment`: the comment to treat, its `snippet` entry is the only one read.
// `channelId`: unused for the moment.
void treatComment(json comment, string channelId)
{
    const json& snippet = comment["snippet"];
    // The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
    if(snippet.contains("authorChannelId"))
    {
        // Not named `channelId`, so as not to shadow the parameter.
        const string authorChannelId = snippet.at("authorChannelId").at("value").get<string>();
        // The lookup functions of `set` are logarithmic, while the generic `find` algorithm goes through the whole set.
        // `insert` tells with `second` whether the channel was missing from `channelsToTreat`.
        if(channelsAlreadyTreated.count(authorChannelId) == 0 && channelsToTreat.insert(authorChannelId).second)
        {
            writeFile(CHANNELS_FILE_PATH, "a", "\n" + authorChannelId);
        }
    }
    commentsCount++;
}
bool writeFile(string filePath, string option, string toWrite)
{
FILE* file = fopen(filePath.c_str(), option.c_str());
if(file != NULL)
{
fputs(toWrite.c_str(), file);
fclose(file);
return true;
}
else
{
PRINT("writeFile error: " << strerror(errno))
}
return false;
}
// Returns whether `stat` manages to retrieve information about `filePath`.
bool doesFileExist(string filePath)
{
    struct stat fileInformation;
    const int statResult = stat(filePath.c_str(), &fileInformation);
    // `stat` returns 0 on success.
    const bool isFound = statResult == 0;
    return isFound;
}
// Creates the directory `path`.
// The result of `mkdir` is ignored: nothing is reported if the directory already exists or can't be created.
void createDirectory(string path)
{
    // Read, write and execute for the owner and the group, read and execute for the others.
    const mode_t permissions = S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH;
    mkdir(path.c_str(), permissions);
}
// Returns the current local date and time formatted as `%d-%m-%Y %H-%M-%S`.
string getDate()
{
    const time_t now = time(nullptr);
    // Copied as `localtime` returns a pointer to a structure that it owns.
    const struct tm localNow = *localtime(&now);
    ostringstream formatted;
    formatted << put_time(&localNow, "%d-%m-%Y %H-%M-%S");
    return formatted.str();
}
// Returns the lines of the file `filePath`, without their line feed and with the empty ones kept.
// Nothing is returned if the file can't be read.
vector<string> getFileContent(string filePath)
{
    vector<string> content;
    ifstream input(filePath.c_str());
    for(string currentLine; getline(input, currentLine);)
    {
        content.push_back(currentLine);
    }
    return content;
}
// Requests `url`, relative to the API endpoint, logs the response and returns it parsed.
// `url`: the endpoint and its parameters, for instance `channels?part=statistics&id=...`.
// `directoryPath`: the folder of `CHANNELS_DIRECTORY` in which the response is logged as `requestsPerChannel` followed by `.json`.
// The logged file contains `url` (so without the key, if any) on its first line, followed by the raw response.
// `json::parse` throws if the response isn't valid JSON.
json getJson(string url, string directoryPath)
{
    #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
        const string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
    #else
        const string finalUrl = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY;
    #endif
    const string content = getHttps(finalUrl);

    ostringstream toString;
    toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";
    requestsPerChannel++;
    // The response is logged before being parsed, so that it can be investigated even if it makes the parsing throw.
    writeFile(toString.str(), "w", url + "\n" + content);

    return json::parse(content);
}
// Displays the content of `toPrint` on the standard output, prefixed with the current date, then empties `toPrint`.
void print(ostringstream* toPrint)
{
    const string line = getDate() + ": " + toPrint->str();
    // `endl` also flushes the standard output.
    cout << line << endl;
    toPrint->str("");
}
// Returns the body of the response to a GET request to `url`, with the verification of the certificate and of the host name enabled.
// An empty string is returned if the curl handle can't be created.
// The result of `curl_easy_perform` isn't checked: if the transfer fails, what has been received so far (possibly nothing) is returned.
string getHttps(string url)
{
    string got;
    CURL* curl = curl_easy_init();
    if(curl == nullptr)
    {
        return got;
    }
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    // These options expect a `long`, and the value documented to verify that the certificate matches the host name is 2.
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 2L);
    // `writeCallback` appends every received chunk to `got`.
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &got);
    curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    return got;
}
// Appends the `size * nmemb` bytes pointed by `contents` to the `string` pointed by `userp`.
// Returns the number of bytes appended, that is `size * nmemb`.
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp)
{
    const size_t receivedBytes = size * nmemb;
    string* const accumulated = static_cast<string*>(userp);
    const char* const chunk = static_cast<const char*>(contents);
    accumulated->append(chunk, receivedBytes);
    return receivedBytes;
}