Fix #20: YouTube Data API v3 rarely but suddenly returns a commentsDisabled error, which triggers an unwanted method switch

Also modified the compression command, as I got `sh: 1: zip: Argument list too long` when compressing the 248,868 JSON files of the most-subscribed French channel.
This commit is contained in:
Benjamin Loison 2023-01-08 15:43:27 +01:00
parent ba37d6a111
commit 7e35a6473a
2 changed files with 25 additions and 7 deletions

View File

@ -13,7 +13,7 @@ using namespace chrono;
using json = nlohmann::json; using json = nlohmann::json;
vector<string> getFileContent(string filePath); vector<string> getFileContent(string filePath);
json getJson(unsigned short threadId, string url, string directoryPath); json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled = false);
void createDirectory(string path), void createDirectory(string path),
print(ostringstream* toPrint), print(ostringstream* toPrint),
treatComment(unsigned short threadId, json comment, string channelId), treatComment(unsigned short threadId, json comment, string channelId),
@ -49,13 +49,15 @@ int main()
// To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated. // To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated.
// On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set. // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH); vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
// Note that using `set`s makes the search faster, but we lose the order of the `channels.txt` lines.
channelsToTreat = set(channelsVec.begin(), channelsVec.end()); channelsToTreat = set(channelsVec.begin(), channelsVec.end());
createDirectory(CHANNELS_DIRECTORY); createDirectory(CHANNELS_DIRECTORY);
for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY)) for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
{ {
string channelId = entry.path().filename(); string fileName = entry.path().filename(),
channelId = fileName.substr(0, fileName.length() - 4);
channelsToTreat.erase(channelId); channelsToTreat.erase(channelId);
channelsAlreadyTreated.insert(channelId); channelsAlreadyTreated.insert(channelId);
} }
@ -112,9 +114,14 @@ void treatChannels(unsigned short threadId)
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat); treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
// Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
PRINT(threadId, "Starting compression...")
// As I haven't found any well-known library that easily compresses a directory, I chose to rely on the `zip` CLI. // As I haven't found any well-known library that easily compresses a directory, I chose to rely on the `zip` CLI.
exec("cd " + channelToTreatDirectory + " && zip -r ../" + channelToTreat + ".zip *"); exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
PRINT(threadId, "Compression finished, started deleting initial directory...")
deleteDirectory(channelToTreatDirectory); deleteDirectory(channelToTreatDirectory);
PRINT(threadId, "Deleting directory finished.")
PRINT(threadId, commentsCount << " comments were found for this channel.") PRINT(threadId, commentsCount << " comments were found for this channel.")
commentsCount = 0; commentsCount = 0;
@ -132,7 +139,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
ostringstream toString; ostringstream toString;
toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken; toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
string url = toString.str(); string url = toString.str();
json data = getJson(threadId, url, channelToTreat); json data = getJson(threadId, url, channelToTreat, pageToken != "");
bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled"; bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
if(doesRelyingOnCommentThreadsIsEnough) if(doesRelyingOnCommentThreadsIsEnough)
{ {
@ -323,7 +330,7 @@ vector<string> getFileContent(string filePath)
return lines; return lines;
} }
json getJson(unsigned short threadId, string url, string directoryPath) json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled)
{ {
#ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
string finalUrl = "https://yt.lemnoslife.com/noKey/" + url; string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
@ -345,8 +352,11 @@ json getJson(unsigned short threadId, string url, string directoryPath)
if(data.contains("error")) if(data.contains("error"))
{ {
PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !") PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
if(data["error"]["errors"][0]["reason"] != "commentsDisabled" || retryOnCommentsDisabled)
{
return getJson(threadId, url, directoryPath); return getJson(threadId, url, directoryPath);
} }
}
ostringstream toString; ostringstream toString;
toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json"; toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";

View File

@ -1,6 +1,6 @@
#!/usr/bin/python3 #!/usr/bin/python3
import shutil import shutil, os
infix = ': Treating channel ' infix = ': Treating channel '
path = 'channels/' path = 'channels/'
@ -18,8 +18,16 @@ with open('nohup.out') as f:
for threadId in threads: for threadId in threads:
channelId = threads[threadId] channelId = threads[threadId]
print(threadId, channelId) print(threadId, channelId)
# There are three cases:
# - `channelId`/ exists
# - `channelId`/ and `channelId`.zip exist
# - `channelId`.zip exists
# To manage every case, we need to use two `try`/`except`.
try: try:
shutil.rmtree(path + channelId) shutil.rmtree(path + channelId)
except:
pass
try:
os.remove(path + channelId + ".zip") os.remove(path + channelId + ".zip")
except: except:
pass pass