#11: Add a first iteration for the CHANNELS retrieval

This commit is contained in:
Benjamin Loison 2023-01-15 02:19:31 +01:00
parent d1b84335d1
commit aa97c94bf8
3 changed files with 68 additions and 28 deletions

View File

@ -1,4 +1,4 @@
.PHONY: main .PHONY: main
main: main:
g++ main.cpp -g -std=c++17 -lcurl -lpthread -o main g++ main.cpp -g -std=c++17 -lcurl -lpthread -o youtubeCaptionsSearchEngine

View File

@ -15,5 +15,10 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
```sh ```sh
sudo apt install nlohmann-json3-dev sudo apt install nlohmann-json3-dev
make make
./main ```
Unless you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have to [host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
```sh
./youtubeCaptionsSearchEngine
``` ```

View File

@ -16,13 +16,14 @@ enum getJsonBehavior { normal, retryOnCommentsDisabled, returnErrorIfPlaylistNot
set<string> setFromVector(vector<string> vec); set<string> setFromVector(vector<string> vec);
vector<string> getFileContent(string filePath); vector<string> getFileContent(string filePath);
json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior = normal); json getJson(unsigned short threadId, string url, bool usingYouTubeDataApiV3, string directoryPath, getJsonBehavior behavior = normal);
void createDirectory(string path), void createDirectory(string path),
print(ostringstream* toPrint), print(ostringstream* toPrint),
treatComment(unsigned short threadId, json comment, string channelId), treatComment(unsigned short threadId, json comment, string channelId),
treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat), treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat),
treatChannels(unsigned short threadId), treatChannels(unsigned short threadId),
deleteDirectory(string path); deleteDirectory(string path),
addChannelToTreat(unsigned short threadId, string channelId);
string getHttps(string url), string getHttps(string url),
exec(string cmd); exec(string cmd);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp); size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
@ -196,7 +197,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
ostringstream toString; ostringstream toString;
toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken; toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
string url = toString.str(); string url = toString.str();
json data = getJson(threadId, url, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled); json data = getJson(threadId, url, true, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled);
bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled"; bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
if(doesRelyingOnCommentThreadsIsEnough) if(doesRelyingOnCommentThreadsIsEnough)
{ {
@ -213,7 +214,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
string pageToken = ""; string pageToken = "";
while(true) while(true)
{ {
json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat), json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, true, channelToTreat),
items = data["items"]; items = data["items"];
for(const auto& item : items) for(const auto& item : items)
{ {
@ -251,7 +252,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
else else
{ {
PRINT(threadId, "Comments disabled channel, treating differently...") PRINT(threadId, "Comments disabled channel, treating differently...")
json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, channelToTreat); json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, true, channelToTreat);
// YouTube Data API v3 Channels: list endpoint returns `videoCount` as a string and not an integer... // YouTube Data API v3 Channels: list endpoint returns `videoCount` as a string and not an integer...
unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str()); unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str());
PRINT(threadId, "The channel has about " << videoCount << " videos.") PRINT(threadId, "The channel has about " << videoCount << " videos.")
@ -263,7 +264,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
while(true) while(true)
{ {
// `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them. // `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them.
json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, channelToTreat, returnErrorIfPlaylistNotFound); json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
if(data.contains("error")) if(data.contains("error"))
{ {
PRINT(threadId, "Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!") PRINT(threadId, "Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
@ -300,15 +301,36 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
} }
} }
} }
if(isChannel)
{
string pageToken = "";
while(true)
{
json data = getJson(threadId, "channels?part=channels&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id),
items = data["items"];
for(const auto& item : items)
{
for(const auto& channel : item["channels"]["channels"])
{
PRINT(threadId, channel)
addChannelToTreat(threadId, channel["channelId"]);
}
}
if(!data["nextPageToken"].is_null())
{
pageToken = data["nextPageToken"];
}
else
{
break;
}
}
}
} }
void treatComment(unsigned short threadId, json comment, string channelId) // This function verifies that the given channel hasn't already been treated.
void addChannelToTreat(unsigned short threadId, string channelId)
{ {
json snippet = comment["snippet"];
// The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
if(snippet.contains("authorChannelId"))
{
string channelId = snippet["authorChannelId"]["value"];
channelsAlreadyTreatedAndToTreatMutex.lock(); channelsAlreadyTreatedAndToTreatMutex.lock();
if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end()) if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end())
{ {
@ -324,6 +346,16 @@ void treatComment(unsigned short threadId, json comment, string channelId)
{ {
channelsAlreadyTreatedAndToTreatMutex.unlock(); channelsAlreadyTreatedAndToTreatMutex.unlock();
} }
}
void treatComment(unsigned short threadId, json comment, string channelId)
{
json snippet = comment["snippet"];
// The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
if(snippet.contains("authorChannelId"))
{
string channelId = snippet["authorChannelId"]["value"];
addChannelToTreat(threadId, channelId);
} }
commentsCount++; commentsCount++;
commentsPerSecondCount++; commentsPerSecondCount++;
@ -405,10 +437,13 @@ vector<string> getFileContent(string filePath)
return lines; return lines;
} }
json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior) json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string directoryPath, getJsonBehavior behavior)
{ {
string finalUrl = USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ? "https://yt.lemnoslife.com/noKey/" + url : string finalUrl = usingYoutubeDataApiv3 ?
"https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey, (USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ?
"https://yt.lemnoslife.com/noKey/" + url :
"https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey) :
YOUTUBE_OPERATIONAL_API_INSTANCE_URL + "/" + url,
content = getHttps(finalUrl); content = getHttps(finalUrl);
json data; json data;
try try
@ -433,12 +468,12 @@ json getJson(unsigned short threadId, string url, string directoryPath, getJsonB
PRINT(threadId, "No more quota on " << apiKey << " switching to " << keys[0] << ".") PRINT(threadId, "No more quota on " << apiKey << " switching to " << keys[0] << ".")
apiKey = keys[0]; apiKey = keys[0];
quotaMutex.unlock(); quotaMutex.unlock();
return getJson(threadId, url, directoryPath); return getJson(threadId, url, true, directoryPath);
} }
PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !") PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled) if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled)
{ {
return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, directoryPath); return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, directoryPath);
} }
} }