#11: Add a first iteration for the CHANNELS retrieval

This commit is contained in:
Benjamin Loison 2023-01-15 02:19:31 +01:00
parent 270c48da02
commit 7456685f2b
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
3 changed files with 68 additions and 28 deletions

View File

@ -1,4 +1,4 @@
.PHONY: main
main:
g++ main.cpp -g -std=c++17 -lcurl -lpthread -o main
g++ main.cpp -g -std=c++17 -lcurl -lpthread -o youtubeCaptionsSearchEngine

View File

@ -15,5 +15,10 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
```sh
sudo apt install nlohmann-json3-dev
make
./main
```
Except if you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
```sh
./youtubeCaptionsSearchEngine
```

View File

@ -16,13 +16,14 @@ enum getJsonBehavior { normal, retryOnCommentsDisabled, returnErrorIfPlaylistNot
set<string> setFromVector(vector<string> vec);
vector<string> getFileContent(string filePath);
json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior = normal);
json getJson(unsigned short threadId, string url, bool usingYouTubeDataApiV3, string directoryPath, getJsonBehavior behavior = normal);
void createDirectory(string path),
print(ostringstream* toPrint),
treatComment(unsigned short threadId, json comment, string channelId),
treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat),
treatChannels(unsigned short threadId),
deleteDirectory(string path);
deleteDirectory(string path),
addChannelToTreat(unsigned short threadId, string channelId);
string getHttps(string url),
exec(string cmd);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
@ -196,7 +197,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
ostringstream toString;
toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
string url = toString.str();
json data = getJson(threadId, url, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled);
json data = getJson(threadId, url, true, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled);
bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
if(doesRelyingOnCommentThreadsIsEnough)
{
@ -213,7 +214,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
string pageToken = "";
while(true)
{
json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat),
json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, true, channelToTreat),
items = data["items"];
for(const auto& item : items)
{
@ -251,7 +252,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
else
{
PRINT(threadId, "Comments disabled channel, treating differently...")
json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, channelToTreat);
json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, true, channelToTreat);
// YouTube Data API v3 Videos: list endpoint returns `videoCount` as a string and not an integer...
unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str());
PRINT(threadId, "The channel has about " << videoCount << " videos.")
@ -263,7 +264,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
while(true)
{
// `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them.
json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, channelToTreat, returnErrorIfPlaylistNotFound);
json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
if(data.contains("error"))
{
PRINT(threadId, "Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
@ -300,15 +301,36 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
}
}
}
if(isChannel)
{
string pageToken = "";
while(true)
{
json data = getJson(threadId, "channels?part=channels&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id),
items = data["items"];
for(const auto& item : items)
{
for(const auto& channel : item["channels"]["channels"])
{
PRINT(threadId, channel)
addChannelToTreat(threadId, channel["channelId"]);
}
}
if(!data["nextPageToken"].is_null())
{
pageToken = data["nextPageToken"];
}
else
{
break;
}
}
}
}
void treatComment(unsigned short threadId, json comment, string channelId)
// This function verifies that the given hasn't already been treated.
void addChannelToTreat(unsigned short threadId, string channelId)
{
json snippet = comment["snippet"];
// The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
if(snippet.contains("authorChannelId"))
{
string channelId = snippet["authorChannelId"]["value"];
channelsAlreadyTreatedAndToTreatMutex.lock();
if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end())
{
@ -325,6 +347,16 @@ void treatComment(unsigned short threadId, json comment, string channelId)
channelsAlreadyTreatedAndToTreatMutex.unlock();
}
}
void treatComment(unsigned short threadId, json comment, string channelId)
{
json snippet = comment["snippet"];
// The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
if(snippet.contains("authorChannelId"))
{
string channelId = snippet["authorChannelId"]["value"];
addChannelToTreat(threadId, channelId);
}
commentsCount++;
commentsPerSecondCount++;
}
@ -405,10 +437,13 @@ vector<string> getFileContent(string filePath)
return lines;
}
json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior)
json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string directoryPath, getJsonBehavior behavior)
{
string finalUrl = USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ? "https://yt.lemnoslife.com/noKey/" + url :
"https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey,
string finalUrl = usingYoutubeDataApiv3 ?
(USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ?
"https://yt.lemnoslife.com/noKey/" + url :
"https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey) :
YOUTUBE_OPERATIONAL_API_INSTANCE_URL + "/" + url,
content = getHttps(finalUrl);
json data;
try
@ -433,12 +468,12 @@ json getJson(unsigned short threadId, string url, string directoryPath, getJsonB
PRINT(threadId, "No more quota on " << apiKey << " switching to " << keys[0] << ".")
apiKey = keys[0];
quotaMutex.unlock();
return getJson(threadId, url, directoryPath);
return getJson(threadId, url, true, directoryPath);
}
PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled)
{
return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, directoryPath);
return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, directoryPath);
}
}