Fix #20: YouTube Data API v3 rarely but unexpectedly returns a `commentsDisabled`
error, which triggers an unwanted method switch
Also modified the compression command, as I got `sh: 1: zip: Argument list too long` when compressing the 248,868 JSON files of the most-subscribed French channel.
This commit is contained in:
parent
ba37d6a111
commit
7e35a6473a
20
main.cpp
20
main.cpp
@ -13,7 +13,7 @@ using namespace chrono;
|
|||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
vector<string> getFileContent(string filePath);
|
vector<string> getFileContent(string filePath);
|
||||||
json getJson(unsigned short threadId, string url, string directoryPath);
|
json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled = false);
|
||||||
void createDirectory(string path),
|
void createDirectory(string path),
|
||||||
print(ostringstream* toPrint),
|
print(ostringstream* toPrint),
|
||||||
treatComment(unsigned short threadId, json comment, string channelId),
|
treatComment(unsigned short threadId, json comment, string channelId),
|
||||||
@ -49,13 +49,15 @@ int main()
|
|||||||
// To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated.
|
// To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated.
|
||||||
// On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
|
// On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
|
||||||
vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
|
vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
|
||||||
|
// Note that using `set`s makes the search faster but we lose the `channels.txt` lines order.
|
||||||
channelsToTreat = set(channelsVec.begin(), channelsVec.end());
|
channelsToTreat = set(channelsVec.begin(), channelsVec.end());
|
||||||
|
|
||||||
createDirectory(CHANNELS_DIRECTORY);
|
createDirectory(CHANNELS_DIRECTORY);
|
||||||
|
|
||||||
for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
|
for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
|
||||||
{
|
{
|
||||||
string channelId = entry.path().filename();
|
string fileName = entry.path().filename(),
|
||||||
|
channelId = fileName.substr(0, fileName.length() - 4);
|
||||||
channelsToTreat.erase(channelId);
|
channelsToTreat.erase(channelId);
|
||||||
channelsAlreadyTreated.insert(channelId);
|
channelsAlreadyTreated.insert(channelId);
|
||||||
}
|
}
|
||||||
@ -112,9 +114,14 @@ void treatChannels(unsigned short threadId)
|
|||||||
|
|
||||||
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
|
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
|
||||||
|
|
||||||
|
// Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
|
||||||
|
PRINT(threadId, "Starting compression...")
|
||||||
// As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
|
// As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
|
||||||
exec("cd " + channelToTreatDirectory + " && zip -r ../" + channelToTreat + ".zip *");
|
exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
|
||||||
|
|
||||||
|
PRINT(threadId, "Compression finished, started deleting initial directory...")
|
||||||
deleteDirectory(channelToTreatDirectory);
|
deleteDirectory(channelToTreatDirectory);
|
||||||
|
PRINT(threadId, "Deleting directory finished.")
|
||||||
|
|
||||||
PRINT(threadId, commentsCount << " comments were found for this channel.")
|
PRINT(threadId, commentsCount << " comments were found for this channel.")
|
||||||
commentsCount = 0;
|
commentsCount = 0;
|
||||||
@ -132,7 +139,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
|
|||||||
ostringstream toString;
|
ostringstream toString;
|
||||||
toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
|
toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
|
||||||
string url = toString.str();
|
string url = toString.str();
|
||||||
json data = getJson(threadId, url, channelToTreat);
|
json data = getJson(threadId, url, channelToTreat, pageToken != "");
|
||||||
bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
|
bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
|
||||||
if(doesRelyingOnCommentThreadsIsEnough)
|
if(doesRelyingOnCommentThreadsIsEnough)
|
||||||
{
|
{
|
||||||
@ -323,7 +330,7 @@ vector<string> getFileContent(string filePath)
|
|||||||
return lines;
|
return lines;
|
||||||
}
|
}
|
||||||
|
|
||||||
json getJson(unsigned short threadId, string url, string directoryPath)
|
json getJson(unsigned short threadId, string url, string directoryPath, bool retryOnCommentsDisabled)
|
||||||
{
|
{
|
||||||
#ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
|
#ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
|
||||||
string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
|
string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
|
||||||
@ -345,8 +352,11 @@ json getJson(unsigned short threadId, string url, string directoryPath)
|
|||||||
if(data.contains("error"))
|
if(data.contains("error"))
|
||||||
{
|
{
|
||||||
PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
|
PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
|
||||||
|
if(data["error"]["errors"][0]["reason"] != "commentsDisabled" || retryOnCommentsDisabled)
|
||||||
|
{
|
||||||
return getJson(threadId, url, directoryPath);
|
return getJson(threadId, url, directoryPath);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ostringstream toString;
|
ostringstream toString;
|
||||||
toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";
|
toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#!/usr/bin/python3
|
#!/usr/bin/python3
|
||||||
|
|
||||||
import shutil
|
import shutil, os
|
||||||
|
|
||||||
infix = ': Treating channel '
|
infix = ': Treating channel '
|
||||||
path = 'channels/'
|
path = 'channels/'
|
||||||
@ -18,8 +18,16 @@ with open('nohup.out') as f:
|
|||||||
for threadId in threads:
|
for threadId in threads:
|
||||||
channelId = threads[threadId]
|
channelId = threads[threadId]
|
||||||
print(threadId, channelId)
|
print(threadId, channelId)
|
||||||
|
# There are three cases:
|
||||||
|
# - `channelId`/ exists
|
||||||
|
# - `channelId`/ and `channelId`.zip exist
|
||||||
|
# - `channelId`.zip exists
|
||||||
|
# To manage every case, we need to use two `try`/`except`.
|
||||||
try:
|
try:
|
||||||
shutil.rmtree(path + channelId)
|
shutil.rmtree(path + channelId)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
os.remove(path + channelId + ".zip")
|
os.remove(path + channelId + ".zip")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
Loading…
Reference in New Issue
Block a user