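// Crawls YouTube comments through the YouTube Data API v3: starting from the
// channel ids listed in `channels.txt`, it lists every comment of every channel
// it meets and discovers new channels through the comment authors.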

#include <iostream>
#include <fstream>
#include <sstream>
#include <set>
#include <mutex>
#include <thread>
#include <sys/stat.h>
#include <unistd.h>
#include <curl/curl.h>
#include <nlohmann/json.hpp>
// The following headers are used directly below, so we include them explicitly instead of relying on transitive inclusions.
#include <array>
#include <atomic>
#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <memory>
#include <vector>

using namespace std;
using namespace chrono;
using json = nlohmann::json;
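
// `getJson` retries failed requests by default; these behaviors alter that handling for the `commentsDisabled` and `playlistNotFound` error reasons (cf `getJson`).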
enum getJsonBehavior { normal, retryOnCommentsDisabled, returnErrorIfPlaylistNotFound };

vector<string> getFileContent(string filePath);
json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior = normal);
void createDirectory(string path);
void print(ostringstream* toPrint);
void treatComment(unsigned short threadId, json comment, string channelId);
void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat);
void treatChannels(unsigned short threadId);
void deleteDirectory(string path);
string getHttps(string url);
string exec(string cmd);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
bool doesFileExist(string filePath);
bool writeFile(unsigned short threadId, string filePath, string option, string toWrite);
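
// `USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE` routes requests through the yt.lemnoslife.com no-key service instead of calling the official endpoint with `API_KEY` (cf `getJson`).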
#define USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
#define API_KEY "AIzaSy..."
#define THREADS_NUMBER 10

// Note that `PRINT` deliberately expands to a plain block rather than the usual `do { ... } while(0)`, as its call sites don't end with a semicolon.
#define PRINT(threadId, x) { ostringstream toPrint; toPrint << threadId << ": " << x; print(&toPrint); }
#define DEFAULT_THREAD_ID 0

mutex printMutex,
      channelsAlreadyTreatedAndToTreatMutex;
set<string> channelsAlreadyTreated,
            channelsToTreat;
// These counters are written by several threads at once, so they are atomic. Note that `commentsCount` and `requestsPerChannel` are conceptually per-channel but shared by all threads, so their values are only exact when `THREADS_NUMBER` is 1.
atomic<unsigned int> commentsCount(0),
                     commentsPerSecondCount(0),
                     requestsPerChannel(0);
string CHANNELS_DIRECTORY = "channels/",
       CHANNELS_FILE_PATH = "channels.txt";

int main()
{
    // libcurl requires `curl_global_init` to be called once before any thread uses `curl_easy_init`, as the implicit initialization it otherwise performs isn't thread-safe.
    curl_global_init(CURL_GLOBAL_ALL);

    // The starting set should be written to `CHANNELS_FILE_PATH`.
    // To resume this algorithm after a shutdown, just restart it after having deleted in `CHANNELS_DIRECTORY` the folders of the channels that were being treated.
    // On a restart, `CHANNELS_FILE_PATH` is read, and every channel found in `CHANNELS_DIRECTORY` is added to `channelsAlreadyTreated`, while every other one is added to `channelsToTreat`, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
    vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
    // Note that using `set`s makes the search faster but we lose the order of the lines of `channels.txt`.
    channelsToTreat = set(channelsVec.begin(), channelsVec.end());

    createDirectory(CHANNELS_DIRECTORY);

    for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
    {
        // A treated channel is stored as `<channelId>.zip`, so strip the 4-character extension to recover its id.
        string fileName = entry.path().filename(),
               channelId = fileName.substr(0, fileName.length() - 4);
        channelsToTreat.erase(channelId);
        channelsAlreadyTreated.insert(channelId);
    }

    PRINT(DEFAULT_THREAD_ID, channelsToTreat.size() << " channel(s) to treat")
    PRINT(DEFAULT_THREAD_ID, channelsAlreadyTreated.size() << " channel(s) already treated")

    thread threads[THREADS_NUMBER];
    for(unsigned short threadsIndex = 0; threadsIndex < THREADS_NUMBER; threadsIndex++)
    {
        threads[threadsIndex] = thread(treatChannels, threadsIndex + 1);
    }

    while(true)
    {
        PRINT(DEFAULT_THREAD_ID, "Comments per second: " << commentsPerSecondCount)
        commentsPerSecondCount = 0;
        sleep(1);
    }

    // The following is dead code, as we assume below that YouTube is never completely treated.
    for(unsigned short threadsIndex = 0; threadsIndex < THREADS_NUMBER; threadsIndex++)
    {
        threads[threadsIndex].join();
    }

    return 0;
}
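
// Worker loop run by each thread: atomically pop a channel from `channelsToTreat`, crawl it, compress the per-request JSON logs into `<channelId>.zip` and delete the uncompressed directory.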
void treatChannels(unsigned short threadId)
{
    // For the moment we assume that YouTube has never been treated completely, otherwise we would have to pay attention to how to proceed if the starting set involves starvation for some threads.
    while(true)
    {
        channelsAlreadyTreatedAndToTreatMutex.lock();
        if(channelsToTreat.empty())
        {
            channelsAlreadyTreatedAndToTreatMutex.unlock();
            sleep(1);
            continue;
        }

        string channelToTreat = *channelsToTreat.begin();

        PRINT(threadId, "Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")

        channelsToTreat.erase(channelToTreat);
        channelsAlreadyTreated.insert(channelToTreat);

        channelsAlreadyTreatedAndToTreatMutex.unlock();

        string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
        createDirectory(channelToTreatDirectory);

        treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);

        // Note that compressing the most subscribed French channel took 4 minutes and 42 seconds.
        PRINT(threadId, "Starting compression...")
        // As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
        exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");

        PRINT(threadId, "Compression finished, started deleting initial directory...")
        deleteDirectory(channelToTreatDirectory);
        PRINT(threadId, "Deleting directory finished.")

        PRINT(threadId, commentsCount << " comments were found for this channel.")
        commentsCount = 0;
        requestsPerChannel = 0;
    }
}
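
// Lists the comment threads of a whole channel (`isChannel`) or of a single video, following `nextPageToken` pagination. When comments are disabled at the channel level, falls back to enumerating the channel's videos through its `uploads` playlist and treating each video individually.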
void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat)
{
    string pageToken = "";
    while(true)
    {
        ostringstream toString;
        toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
        string url = toString.str();
        json data = getJson(threadId, url, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled);
        bool relyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
        if(relyingOnCommentThreadsIsEnough)
        {
            json items = data["items"];
            for(const auto& item : items)
            {
                json comment = item["snippet"]["topLevelComment"];
                string commentId = comment["id"];
                treatComment(threadId, comment, channelToTreat);
                if(item.contains("replies"))
                {
                    // `commentThreads` embeds at most 5 replies per thread, so when there are more we have to page through the `comments` endpoint to get all of them.
                    if(item["snippet"]["totalReplyCount"] > 5)
                    {
                        string pageToken = "";
                        while(true)
                        {
                            json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat),
                                 items = data["items"];
                            for(const auto& item : items)
                            {
                                treatComment(threadId, item, channelToTreat);
                            }
                            if(data.contains("nextPageToken"))
                            {
                                pageToken = data["nextPageToken"];
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        json replies = item["replies"]["comments"];
                        for(const auto& reply : replies)
                        {
                            treatComment(threadId, reply, channelToTreat);
                        }
                    }
                }
            }
            if(data.contains("nextPageToken"))
            {
                pageToken = data["nextPageToken"];
            }
            else
            {
                break;
            }
        }
        else
        {
            PRINT(threadId, "Comments disabled at the channel level, treating differently...")
            json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, channelToTreat);
            // YouTube Data API v3 Channels: list endpoint returns `videoCount` as a string and not an integer...
            unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str());
            PRINT(threadId, "The channel has about " << videoCount << " videos.")
            // `UC-3A9g4U1PpLaeAuD4jSP_w` has a `videoCount` of 2, while its `uploads` playlist contains 3 videos. So we use a strict inequality here.
            if(0 < videoCount && videoCount < 20000)
            {
                // A channel's `uploads` playlist id is its channel id with the leading `UC` replaced by `UU`.
                string playlistToTreat = "UU" + channelToTreat.substr(2),
                       pageToken = "";
                while(true)
                {
                    // `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them.
                    json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, channelToTreat, returnErrorIfPlaylistNotFound);
                    if(data.contains("error"))
                    {
                        PRINT(threadId, "Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
                        exit(1);
                    }
                    json items = data["items"];
                    for(const auto& item : items)
                    {
                        string videoId = item["contentDetails"]["videoId"];
                        // To keep the amount of logging identical for each channel, the following `PRINT` is commented out.
                        //PRINT("Treating video " << videoId)
                        treatChannelOrVideo(threadId, false, videoId, channelToTreat);
                    }
                    if(data.contains("nextPageToken"))
                    {
                        pageToken = data["nextPageToken"];
                    }
                    else
                    {
                        break;
                    }
                }
                break;
            }
            else if(videoCount == 0)
            {
                PRINT(threadId, "Skip listing comments on videos, as there shouldn't be any according to `channels?part=statistics`.")
                break;
            }
            else //if(videoCount >= 20000)
            {
                PRINT(threadId, "The channel's video count exceeds the supported 20,000 limit!")
                exit(1);
            }
        }
    }
}
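
// Registers the comment author's channel in `channelsToTreat` if it hasn't been met yet, which is how the crawl discovers new channels.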
void treatComment(unsigned short threadId, json comment, string channelId)
{
    json snippet = comment["snippet"];
    // The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
    if(snippet.contains("authorChannelId"))
    {
        string authorChannelId = snippet["authorChannelId"]["value"];
        channelsAlreadyTreatedAndToTreatMutex.lock();
        if(channelsAlreadyTreated.find(authorChannelId) == channelsAlreadyTreated.end() && channelsToTreat.find(authorChannelId) == channelsToTreat.end())
        {
            channelsToTreat.insert(authorChannelId);
            channelsAlreadyTreatedAndToTreatMutex.unlock();

            writeFile(threadId, CHANNELS_FILE_PATH, "a", "\n" + authorChannelId);
        }
        else
        {
            channelsAlreadyTreatedAndToTreatMutex.unlock();
        }
    }
    commentsCount++;
    commentsPerSecondCount++;
}
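
// Runs a shell command and returns its standard output, following the classic `popen`/`fgets` pattern.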
string exec(string cmd)
{
    array<char, 128> buffer;
    string result;
    unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
    if (!pipe)
    {
        throw runtime_error("popen() failed!");
    }
    while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
    {
        result += buffer.data();
    }
    return result;
}

bool writeFile(unsigned short threadId, string filePath, string option, string toWrite)
{
    FILE* file = fopen(filePath.c_str(), option.c_str());
    if(file != NULL)
    {
        fputs(toWrite.c_str(), file);
        fclose(file);
        return true;
    }
    else
    {
        PRINT(threadId, "writeFile error: " << strerror(errno))
    }
    return false;
}

bool doesFileExist(string filePath)
{
    struct stat buffer;
    return stat(filePath.c_str(), &buffer) == 0;
}

void createDirectory(string path)
{
    mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
}

void deleteDirectory(string path)
{
    filesystem::remove_all(path);
}
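
// Returns the current local date and time down to the millisecond, used to timestamp every log line (cf `print`).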
string getDate()
{
    auto t = time(nullptr);
    auto tm = *localtime(&t);
    ostringstream toString;
    toString << put_time(&tm, "%d-%m-%Y %H-%M-%S.");
    milliseconds ms = duration_cast<milliseconds>(system_clock::now().time_since_epoch());
    // Zero-pad the milliseconds, so that for instance 7 ms prints as `007`.
    toString << setw(3) << setfill('0') << (ms.count() % 1000);
    return toString.str();
}

vector<string> getFileContent(string filePath)
{
    vector<string> lines;
    ifstream infile(filePath.c_str());
    string line;
    while(getline(infile, line))
        lines.push_back(line);
    return lines;
}
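
// Performs a YouTube Data API v3 request (through the yt.lemnoslife.com no-key service or with `API_KEY`, depending on `USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE`), logs the raw response to the treated channel's directory and retries on most errors.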
json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior)
{
    #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
    string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
    #else
    string finalUrl = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY;
    #endif
    string content = getHttps(finalUrl);
    json data;
    try
    {
        data = json::parse(content);
    }
    catch (json::parse_error& ex)
    {
        PRINT(threadId, "Parse error for " << finalUrl << ", as got: " << content << " !")
        exit(1);
    }

    if(data.contains("error"))
    {
        PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
        string reason = data["error"]["errors"][0]["reason"];
        // Retry unless the reason is `commentsDisabled` (which the caller handles, except with `retryOnCommentsDisabled`), or return the error to the caller if it is `playlistNotFound` and the caller asked for `returnErrorIfPlaylistNotFound`.
        if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled)
        {
            return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, directoryPath);
        }
    }

    ostringstream toString;
    toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";
    requestsPerChannel++;
    writeFile(threadId, toString.str(), "w", url + "\n" + content);

    return data;
}

void print(ostringstream* toPrint)
{
    printMutex.lock();

    cout << getDate() << ": " << toPrint->str() << endl;
    toPrint->str("");

    printMutex.unlock();
}

// Note that libcurl is thread-safe as long as each thread uses its own easy handle, which is the case here, and `curl_global_init` has been called before the threads start (cf `main`).
string getHttps(string url)
{
    CURL* curl = curl_easy_init();
    string got;
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
    // `CURLOPT_SSL_VERIFYHOST` expects `2L` to verify that the certificate matches the host; `1L` has no separate meaning and was even rejected by some libcurl versions.
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 2L);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &got);
    curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    return got;
}
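
// libcurl write callback: appends the received bytes to the `string` passed through `CURLOPT_WRITEDATA` and returns the number of bytes taken care of, as `CURLOPT_WRITEFUNCTION` requires.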
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp)
{
    ((string*)userp)->append((char*)contents, size * nmemb);
    return size * nmemb;
}