Fix #20: YouTube Data API v3 on rare occasions unexpectedly returns a commentsDisabled
error, which causes an unwanted method switch
Also modified the compression command, as I got `sh: 1: zip: Argument list too long` when compressing the 248,868 JSON files of the most-subscribed French channel.
This commit is contained in:
parent
3ae0f4e924
commit
b3779fe49a
22
main.cpp
22
main.cpp
@ -13,7 +13,7 @@ using namespace chrono;
|
||||
using json = nlohmann::json;
|
||||
|
||||
vector<string> getFileContent(string filePath);
|
||||
json getJson(unsigned short threadId, string url, string directoryPath);
|
||||
json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled = false);
|
||||
void createDirectory(string path),
|
||||
print(ostringstream* toPrint),
|
||||
treatComment(unsigned short threadId, json comment, string channelId),
|
||||
@ -49,13 +49,15 @@ int main()
|
||||
// To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated.
|
||||
// On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
|
||||
vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
|
||||
// Note that using `set`s makes the search faster but we lose the `channels.txt` lines order.
|
||||
channelsToTreat = set(channelsVec.begin(), channelsVec.end());
|
||||
|
||||
createDirectory(CHANNELS_DIRECTORY);
|
||||
|
||||
for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
|
||||
{
|
||||
string channelId = entry.path().filename();
|
||||
string fileName = entry.path().filename(),
|
||||
channelId = fileName.substr(0, fileName.length() - 4);
|
||||
channelsToTreat.erase(channelId);
|
||||
channelsAlreadyTreated.insert(channelId);
|
||||
}
|
||||
@ -112,9 +114,14 @@ void treatChannels(unsigned short threadId)
|
||||
|
||||
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
|
||||
|
||||
// Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
|
||||
PRINT(threadId, "Starting compression...")
|
||||
// As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
|
||||
exec("cd " + channelToTreatDirectory + " && zip -r ../" + channelToTreat + ".zip *");
|
||||
exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
|
||||
|
||||
PRINT(threadId, "Compression finished, started deleting initial directory...")
|
||||
deleteDirectory(channelToTreatDirectory);
|
||||
PRINT(threadId, "Deleting directory finished.")
|
||||
|
||||
PRINT(threadId, commentsCount << " comments were found for this channel.")
|
||||
commentsCount = 0;
|
||||
@ -132,7 +139,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
|
||||
ostringstream toString;
|
||||
toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
|
||||
string url = toString.str();
|
||||
json data = getJson(threadId, url, channelToTreat);
|
||||
json data = getJson(threadId, url, channelToTreat, pageToken != "");
|
||||
bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
|
||||
if(doesRelyingOnCommentThreadsIsEnough)
|
||||
{
|
||||
@ -323,7 +330,7 @@ vector<string> getFileContent(string filePath)
|
||||
return lines;
|
||||
}
|
||||
|
||||
json getJson(unsigned short threadId, string url, string directoryPath)
|
||||
json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled)
|
||||
{
|
||||
#ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
|
||||
string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
|
||||
@ -345,7 +352,10 @@ json getJson(unsigned short threadId, string url, string directoryPath)
|
||||
if(data.contains("error"))
|
||||
{
|
||||
PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
|
||||
return getJson(threadId, url, directoryPath);
|
||||
if(data["error"]["errors"][0]["reason"] != "commentsDisabled" || retryOnCommentsDisabled)
|
||||
{
|
||||
return getJson(threadId, url, directoryPath);
|
||||
}
|
||||
}
|
||||
|
||||
ostringstream toString;
|
||||
|
@ -1,6 +1,6 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import shutil
|
||||
import shutil, os
|
||||
|
||||
infix = ': Treating channel '
|
||||
path = 'channels/'
|
||||
@ -18,8 +18,16 @@ with open('nohup.out') as f:
|
||||
for threadId in threads:
|
||||
channelId = threads[threadId]
|
||||
print(threadId, channelId)
|
||||
# There are three cases:
|
||||
# - `channelId`/ exists
|
||||
# - `channelId`/ and `channelId`.zip exist
|
||||
# - `channelId`.zip exists
|
||||
# To manage every case, we need to use two `try`/`except`.
|
||||
try:
|
||||
shutil.rmtree(path + channelId)
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
os.remove(path + channelId + ".zip")
|
||||
except:
|
||||
pass
|
||||
|
Loading…
x
Reference in New Issue
Block a user