// C system headers
#include <sys/stat.h>
#include <unistd.h>
// C++ standard library
#include <algorithm>
#include <atomic>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
// Third-party
#include <curl/curl.h>
#include <nlohmann/json.hpp>
using namespace std ;
using json = nlohmann : : json ;
vector < string > getFileContent ( string filePath ) ;
2023-01-03 04:56:19 +01:00
json getJson ( unsigned short threadId , string url , string directoryPath ) ;
2022-12-22 06:18:22 +01:00
void createDirectory ( string path ) ,
print ( ostringstream * toPrint ) ,
2023-01-03 04:56:19 +01:00
treatComment ( unsigned short threadId , json comment , string channelId ) ,
treatChannelOrVideo ( unsigned short threadId , bool isChannel , string id , string channelToTreat ) ,
treatChannels ( unsigned short threadId ) ;
2022-12-22 05:20:32 +01:00
string getHttps ( string url ) ;
size_t writeCallback ( void * contents , size_t size , size_t nmemb , void * userp ) ;
2022-12-22 06:18:22 +01:00
bool doesFileExist ( string filePath ) ,
2023-01-03 04:56:19 +01:00
writeFile ( unsigned short threadId , string filePath , string option , string toWrite ) ;
2022-12-22 05:20:32 +01:00
2023-01-02 18:30:18 +01:00
# define USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
2022-12-22 05:20:32 +01:00
# define API_KEY "AIzaSy..."
2023-01-03 04:56:19 +01:00
# define THREADS_NUMBER 10
2022-12-22 05:20:32 +01:00
2023-01-03 04:56:19 +01:00
# define PRINT(threadId, x) { ostringstream toPrint; toPrint << threadId << ": " << x; print(&toPrint); }
# define DEFAULT_THREAD_ID 0
2022-12-22 05:20:32 +01:00
2023-01-03 04:56:19 +01:00
mutex printMutex ,
allocateChannelMutex ;
2022-12-22 06:18:22 +01:00
set < string > channelsAlreadyTreated ,
2023-01-02 18:31:16 +01:00
channelsToTreat ;
2023-01-02 19:46:32 +01:00
unsigned int commentsCount = 0 ,
requestsPerChannel = 0 ;
string CHANNELS_DIRECTORY = " channels/ " ,
CHANNELS_FILE_PATH = " channels.txt " ;
2022-12-22 05:20:32 +01:00
int main ( )
{
2023-01-02 19:46:32 +01:00
// The starting set should be written to `CHANNELS_FILE_PATH`.
2023-01-03 04:56:19 +01:00
// To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated.
2023-01-02 19:46:32 +01:00
// On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat` or `channelsToTreat` otherwise before continuing, as if `CHANNELS_FILE_PATH` was containing a **treated** starting set.
vector < string > channelsVec = getFileContent ( CHANNELS_FILE_PATH ) ;
channelsToTreat = set ( channelsVec . begin ( ) , channelsVec . end ( ) ) ;
2022-12-22 06:18:22 +01:00
2023-01-02 19:46:32 +01:00
createDirectory ( CHANNELS_DIRECTORY ) ;
2022-12-22 06:18:22 +01:00
2023-01-02 19:46:32 +01:00
for ( const auto & entry : filesystem : : directory_iterator ( CHANNELS_DIRECTORY ) )
2022-12-22 06:18:22 +01:00
{
2023-01-02 19:46:32 +01:00
string channelId = entry . path ( ) . filename ( ) ;
channelsToTreat . erase ( channelId ) ;
channelsAlreadyTreated . insert ( channelId ) ;
2022-12-22 06:18:22 +01:00
}
2023-01-03 04:56:19 +01:00
PRINT ( DEFAULT_THREAD_ID , channelsToTreat . size ( ) < < " channel(s) to treat " )
PRINT ( DEFAULT_THREAD_ID , channelsAlreadyTreated . size ( ) < < " channel(s) already treated " )
2022-12-22 06:18:22 +01:00
2023-01-03 04:56:19 +01:00
thread threads [ THREADS_NUMBER ] ;
for ( unsigned short threadsIndex = 0 ; threadsIndex < THREADS_NUMBER ; threadsIndex + + )
2022-12-22 06:18:22 +01:00
{
2023-01-03 04:56:19 +01:00
threads [ threadsIndex ] = thread ( treatChannels , threadsIndex + 1 ) ;
}
for ( unsigned short threadsIndex = 0 ; threadsIndex < THREADS_NUMBER ; threadsIndex + + )
{
threads [ threadsIndex ] . join ( ) ;
}
return 0 ;
}
void treatChannels ( unsigned short threadId )
{
// For the moment we assume that we never have treated completely YouTube, otherwise we have to pay attention how to proceed if the starting set involves startvation for some threads.
while ( true )
{
allocateChannelMutex . lock ( ) ;
if ( channelsToTreat . empty ( ) )
{
allocateChannelMutex . unlock ( ) ;
sleep ( 1 ) ;
continue ;
}
2022-12-22 06:18:22 +01:00
string channelToTreat = * channelsToTreat . begin ( ) ;
2023-01-02 19:46:32 +01:00
2023-01-03 04:56:19 +01:00
PRINT ( threadId , " Treating channel " < < channelToTreat < < " (treated: " < < channelsAlreadyTreated . size ( ) < < " , to treat: " < < channelsToTreat . size ( ) < < " ) " )
2022-12-22 06:18:22 +01:00
2023-01-03 04:56:19 +01:00
channelsToTreat . erase ( channelToTreat ) ;
2023-01-02 19:46:32 +01:00
channelsAlreadyTreated . insert ( channelToTreat ) ;
2023-01-03 04:56:19 +01:00
allocateChannelMutex . unlock ( ) ;
2023-01-02 19:46:32 +01:00
string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + " / " ;
createDirectory ( channelToTreatDirectory ) ;
2023-01-03 04:56:19 +01:00
treatChannelOrVideo ( threadId , true , channelToTreat , channelToTreat ) ;
2023-01-03 02:56:07 +01:00
2023-01-03 04:56:19 +01:00
PRINT ( threadId , commentsCount < < " comments were found for this channel. " )
2023-01-03 02:56:07 +01:00
commentsCount = 0 ;
requestsPerChannel = 0 ;
}
2023-01-03 04:56:19 +01:00
allocateChannelMutex . unlock ( ) ;
2023-01-03 02:56:07 +01:00
}
2023-01-03 04:56:19 +01:00
// Crawls the YouTube Data API v3 `commentThreads` endpoint for either a whole
// channel (`isChannel` == true, `id` is the channel id) or a single video
// (`isChannel` == false, `id` is the video id), feeding every comment found to
// `treatComment`. `channelToTreat` names the channel folder raw API responses
// are logged into by `getJson`.
void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat)
{
    string pageToken = "";
    while (true)
    {
        ostringstream toString;
        toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
        string url = toString.str();
        json data = getJson(threadId, url, channelToTreat);
        // If the channel has comments disabled, `commentThreads` returns an
        // error and we fall back to enumerating the channel's videos instead.
        bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
        if (doesRelyingOnCommentThreadsIsEnough)
        {
            json items = data["items"];
            for (const auto& item : items)
            {
                json comment = item["snippet"]["topLevelComment"];
                string commentId = comment["id"];
                treatComment(threadId, comment, channelToTreat);
                if (item.contains("replies"))
                {
                    json replies = item["replies"]["comments"];
                    // `commentThreads` embeds only a few replies; presumably at
                    // 5 embedded replies more may exist, so page through the
                    // `comments` endpoint instead — TODO confirm the threshold.
                    if (replies.size() >= 5)
                    {
                        // NOTE: this inner `pageToken` deliberately shadows the
                        // outer one (separate pagination over replies).
                        string pageToken = "";
                        while (true)
                        {
                            json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat),
                                 items = data["items"];
                            for (const auto& item : items)
                            {
                                treatComment(threadId, item, channelToTreat);
                            }
                            if (data.contains("nextPageToken"))
                            {
                                pageToken = data["nextPageToken"];
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        // Fewer than 5 embedded replies: the embedded list is
                        // treated as complete.
                        for (const auto& reply : replies)
                        {
                            treatComment(threadId, reply, channelToTreat);
                        }
                    }
                }
            }
            if (data.contains("nextPageToken"))
            {
                pageToken = data["nextPageToken"];
            }
            else
            {
                break;
            }
        }
        else
        {
            PRINT(threadId, "Comments disabled channel, treating differently...")
            json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, channelToTreat);
            // YouTube Data API v3 Videos: list endpoint returns `videoCount` as a string and not an integer...
            unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str());
            PRINT(threadId, "The channel has about " << videoCount << " videos.")
            // `UC-3A9g4U1PpLaeAuD4jSP_w` has a `videoCount` of 2, while its `uploads` playlist contains 3 videos. So we use a strict inequality here.
            if (videoCount < 20000)
            {
                // The channel's uploads playlist id is the channel id with the
                // leading "UC" replaced by "UU".
                string playlistToTreat = "UU" + channelToTreat.substr(2),
                       pageToken = "";
                while (true)
                {
                    // `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them.
                    json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, channelToTreat),
                         items = data["items"];
                    for (const auto& item : items)
                    {
                        string videoId = item["contentDetails"]["videoId"];
                        // To keep the same amount of logs for each channel, I comment the following `PRINT`.
                        //PRINT("Treating video " << videoId)
                        // Recurse per video (`isChannel` == false).
                        treatChannelOrVideo(threadId, false, videoId, channelToTreat);
                    }
                    if (data.contains("nextPageToken"))
                    {
                        pageToken = data["nextPageToken"];
                    }
                    else
                    {
                        break;
                    }
                }
                // All videos were enumerated; leave the outer `while (true)` too.
                break;
            }
            else
            {
                PRINT(threadId, "The videos count of the channel exceeds the supported 20,000 limit!")
                exit(1);
            }
        }
    }
}
void treatComment ( unsigned short threadId , json comment , string channelId )
2022-12-22 05:20:32 +01:00
{
2022-12-22 06:18:22 +01:00
json snippet = comment [ " snippet " ] ;
// The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
if ( snippet . contains ( " authorChannelId " ) )
{
string channelId = snippet [ " authorChannelId " ] [ " value " ] ;
2023-01-02 19:46:32 +01:00
if ( find ( channelsAlreadyTreated . begin ( ) , channelsAlreadyTreated . end ( ) , channelId ) = = channelsAlreadyTreated . end ( ) & & find ( channelsToTreat . begin ( ) , channelsToTreat . end ( ) , channelId ) = = channelsToTreat . end ( ) )
{
2022-12-22 06:18:22 +01:00
channelsToTreat . insert ( channelId ) ;
2023-01-02 19:46:32 +01:00
2023-01-03 04:56:19 +01:00
writeFile ( threadId , CHANNELS_FILE_PATH , " a " , " \n " + channelId ) ;
2023-01-02 19:46:32 +01:00
}
2022-12-22 06:18:22 +01:00
}
commentsCount + + ;
}
2023-01-03 04:56:19 +01:00
bool writeFile ( unsigned short threadId , string filePath , string option , string toWrite )
2022-12-22 06:18:22 +01:00
{
FILE * file = fopen ( filePath . c_str ( ) , option . c_str ( ) ) ;
if ( file ! = NULL )
{
fputs ( toWrite . c_str ( ) , file ) ;
fclose ( file ) ;
return true ;
}
2023-01-02 19:46:32 +01:00
else
{
2023-01-03 04:56:19 +01:00
PRINT ( threadId , " writeFile error: " < < strerror ( errno ) )
2023-01-02 19:46:32 +01:00
}
2022-12-22 06:18:22 +01:00
return false ;
}
// Returns whether `filePath` exists on disk (any file type).
// Modernized: `std::filesystem::exists` (the file already relies on
// `std::filesystem` in `main`) replaces the raw `stat(2)` call; behavior is
// equivalent for the existence check performed here.
bool doesFileExist(std::string filePath)
{
    return std::filesystem::exists(filePath);
}
// Creates directory `path` with permissions rwxrwxr-x (0775, before umask).
// The `mkdir` result is deliberately ignored: an already-existing directory is
// the common, harmless case on restart.
void createDirectory(std::string path)
{
    const mode_t directoryMode = S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH;
    mkdir(path.c_str(), directoryMode);
}
// Returns the current local time formatted as "DD-MM-YYYY HH-MM-SS"; used as
// the log-line prefix by `print`.
std::string getDate()
{
    const std::time_t now = std::time(nullptr);
    const std::tm local = *std::localtime(&now);
    std::ostringstream formatted;
    formatted << std::put_time(&local, "%d-%m-%Y %H-%M-%S");
    return formatted.str();
}
// Reads `filePath` and returns its lines in order, without newline characters.
// A file that cannot be opened yields an empty vector.
std::vector<std::string> getFileContent(std::string filePath)
{
    std::vector<std::string> lines;
    std::ifstream input(filePath.c_str());
    std::string currentLine;
    while (std::getline(input, currentLine))
    {
        lines.push_back(currentLine);
    }
    return lines;
}
// Performs one YouTube Data API v3 request. `url` is the endpoint plus query
// string (no base URL, no key); the raw response is logged under
// `CHANNELS_DIRECTORY/<directoryPath>/<requestsPerChannel>.json` and the parsed
// JSON is returned.
// NOTE(review): `json::parse` throws on malformed content (e.g. an empty body
// after a network failure) — confirm callers accept the resulting termination.
// NOTE(review): `requestsPerChannel` is one global shared by all worker
// threads, so the per-channel file numbering interleaves across channels.
json getJson(unsigned short threadId, string url, string directoryPath)
{
    #ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
    // Route through the yt.lemnoslife.com no-key proxy instead of the official
    // endpoint with `API_KEY`.
    string finalUrl = "https://yt.lemnoslife.com/noKey/" + url;
    #else
    string finalUrl = "https://www.googleapis.com/youtube/v3/" + url + "&key=" + API_KEY;
    #endif
    string content = getHttps(finalUrl);
    json data = json::parse(content);
    ostringstream toString;
    toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";
    requestsPerChannel++;
    // The requested URL is stored on the log file's first line, followed by the
    // raw response body.
    writeFile(threadId, toString.str(), "w", url + "\n" + content);
    return data;
}
// Writes `toPrint` to stdout, prefixed with the current date, then clears the
// stream so the `PRINT` macro's buffer is left empty. `printMutex` serializes
// output from concurrent worker threads.
// Fix: manual `lock()`/`unlock()` replaced with RAII `lock_guard`, so the mutex
// is released even if a stream insertion throws.
void print(ostringstream* toPrint)
{
    lock_guard<mutex> lock(printMutex);
    cout << getDate() << ": " << toPrint->str() << endl;
    toPrint->str("");
}
2023-01-03 04:56:19 +01:00
// Is this function really multi-threading friendly? If not, could consider executing `curl` using the command line.
2022-12-22 05:20:32 +01:00
string getHttps ( string url )
{
CURL * curl = curl_easy_init ( ) ;
string got ;
curl_easy_setopt ( curl , CURLOPT_URL , url . c_str ( ) ) ;
curl_easy_setopt ( curl , CURLOPT_SSL_VERIFYPEER , 1 ) ;
curl_easy_setopt ( curl , CURLOPT_SSL_VERIFYHOST , 1 ) ;
curl_easy_setopt ( curl , CURLOPT_WRITEFUNCTION , writeCallback ) ;
curl_easy_setopt ( curl , CURLOPT_WRITEDATA , & got ) ;
curl_easy_perform ( curl ) ;
curl_easy_cleanup ( curl ) ;
return got ;
}
// libcurl write callback: appends the received chunk to the std::string that
// `userp` points at, and reports the whole chunk as consumed.
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp)
{
    const size_t totalBytes = size * nmemb;
    std::string* buffer = static_cast<std::string*>(userp);
    buffer->append(static_cast<const char*>(contents), totalBytes);
    return totalBytes;
}