2022-12-22 05:20:32 +01:00
# include <sys/stat.h>
# include <algorithm>
# include <cerrno>
# include <cstdio>
# include <cstdlib>
# include <cstring>
# include <ctime>
# include <filesystem>
# include <fstream>
# include <iomanip>
# include <iostream>
# include <set>
# include <sstream>
# include <string>
# include <vector>
# include <curl/curl.h>
# include <nlohmann/json.hpp>
using namespace std ;
using json = nlohmann : : json ;
vector < string > getFileContent ( string filePath ) ;
2023-01-02 19:46:32 +01:00
json getJson ( string url , string directoryPath ) ;
2022-12-22 06:18:22 +01:00
void createDirectory ( string path ) ,
print ( ostringstream * toPrint ) ,
2023-01-02 19:46:32 +01:00
treatComment ( json comment , string channelId ) ;
2022-12-22 05:20:32 +01:00
string getHttps ( string url ) ;
size_t writeCallback ( void * contents , size_t size , size_t nmemb , void * userp ) ;
2022-12-22 06:18:22 +01:00
bool doesFileExist ( string filePath ) ,
writeFile ( string filePath , string option , string toWrite ) ;
2022-12-22 05:20:32 +01:00
2023-01-02 18:30:18 +01:00
// When defined, `getJson` sends its requests to the keyless service instead of the official endpoint.
# define USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
// Key appended by `getJson` to requests sent to the official endpoint (only used when the macro above is not defined).
# define API_KEY "AIzaSy..."

// Streams `x` into the shared `toPrint` buffer and hands it to `print`.
// The expansion is a single braced block so that the macro stays one statement, even under an unbraced `if`.
// Note that this printing approach is only safe in a mono-thread context.
# define PRINT(x) { toPrint << x; print(&toPrint); }

// Buffer shared by every `PRINT` call; `print` resets it after each message.
ostringstream toPrint;

// Channels whose treatment has started or finished.
set<string> channelsAlreadyTreated;
// Channels discovered but not treated yet.
set<string> channelsToTreat;

// Number of comments counted by `treatComment` for the channel being treated; reset by `main` after each channel.
unsigned int commentsCount = 0;
// Number of requests made by `getJson` for the channel being treated; also used to name the saved response files.
unsigned int requestsPerChannel = 0;

// Directory holding one sub-directory per channel.
const string CHANNELS_DIRECTORY = " channels/ ";
// File listing the known channel ids, one per line.
const string CHANNELS_FILE_PATH = " channels.txt ";
2022-12-22 05:20:32 +01:00
int main ( )
{
2023-01-02 19:46:32 +01:00
// The starting set should be written to `CHANNELS_FILE_PATH`.
// To resume this algorithm after a shutdown, just restart it after having deleted the last channel folder in `CHANNELS_DIRECTORY` being treated.
// On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat` or `channelsToTreat` otherwise before continuing, as if `CHANNELS_FILE_PATH` was containing a **treated** starting set.
vector < string > channelsVec = getFileContent ( CHANNELS_FILE_PATH ) ;
channelsToTreat = set ( channelsVec . begin ( ) , channelsVec . end ( ) ) ;
2022-12-22 06:18:22 +01:00
2023-01-02 19:46:32 +01:00
createDirectory ( CHANNELS_DIRECTORY ) ;
2022-12-22 06:18:22 +01:00
2023-01-02 19:46:32 +01:00
for ( const auto & entry : filesystem : : directory_iterator ( CHANNELS_DIRECTORY ) )
2022-12-22 06:18:22 +01:00
{
2023-01-02 19:46:32 +01:00
string channelId = entry . path ( ) . filename ( ) ;
channelsToTreat . erase ( channelId ) ;
channelsAlreadyTreated . insert ( channelId ) ;
2022-12-22 06:18:22 +01:00
}
PRINT ( channelsToTreat . size ( ) < < " channel(s) to treat " )
PRINT ( channelsAlreadyTreated . size ( ) < < " channel(s) already treated " )
while ( ! channelsToTreat . empty ( ) )
{
string channelToTreat = * channelsToTreat . begin ( ) ;
2023-01-02 19:46:32 +01:00
2022-12-22 06:18:22 +01:00
PRINT ( " Treating channel " < < channelToTreat < < " (treated: " < < channelsAlreadyTreated . size ( ) < < " , to treat: " < < channelsToTreat . size ( ) < < " ) " )
2023-01-02 19:46:32 +01:00
channelsAlreadyTreated . insert ( channelToTreat ) ;
string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + " / " ;
createDirectory ( channelToTreatDirectory ) ;
2022-12-22 06:18:22 +01:00
string pageToken = " " ;
while ( true )
{
2023-01-02 19:46:32 +01:00
json data = getJson ( " commentThreads?part=snippet,replies&allThreadsRelatedToChannelId= " + channelToTreat + " &maxResults=100&pageToken= " + pageToken , channelToTreat ) ;
2022-12-22 06:18:22 +01:00
bool doesRelyingOnCommentThreadsIsEnough = data [ " error " ] [ " errors " ] [ 0 ] [ " reason " ] ! = " commentsDisabled " ;
if ( doesRelyingOnCommentThreadsIsEnough )
{
json items = data [ " items " ] ;
for ( const auto & item : items )
{
json comment = item [ " snippet " ] [ " topLevelComment " ] ;
string commentId = comment [ " id " ] ;
2023-01-02 19:46:32 +01:00
treatComment ( comment , channelToTreat ) ;
2022-12-22 06:18:22 +01:00
if ( item . contains ( " replies " ) )
{
json replies = item [ " replies " ] [ " comments " ] ;
if ( replies . size ( ) > = 5 )
{
string pageToken = " " ;
while ( true )
{
2023-01-02 19:46:32 +01:00
json data = getJson ( " comments?part=snippet&parentId= " + commentId + " &maxResults=100&pageToken= " + pageToken , channelToTreat ) ;
2022-12-22 06:18:22 +01:00
json items = data [ " items " ] ;
for ( const auto & item : items )
{
2023-01-02 19:46:32 +01:00
treatComment ( item , channelToTreat ) ;
2022-12-22 06:18:22 +01:00
}
if ( data . contains ( " nextPageToken " ) )
{
pageToken = data [ " nextPageToken " ] ;
}
else
{
break ;
}
}
}
else
{
for ( const auto & reply : replies )
{
2023-01-02 19:46:32 +01:00
treatComment ( reply , channelToTreat ) ;
2022-12-22 06:18:22 +01:00
}
}
}
}
if ( data . contains ( " nextPageToken " ) )
{
pageToken = data [ " nextPageToken " ] ;
}
else
{
break ;
}
}
else
{
PRINT ( " Comments disabled channel! " )
exit ( 1 ) ;
}
}
PRINT ( commentsCount )
commentsCount = 0 ;
2023-01-02 19:46:32 +01:00
requestsPerChannel = 0 ;
2022-12-22 06:18:22 +01:00
channelsToTreat . erase ( channelToTreat ) ;
}
return 0 ;
2022-12-22 05:20:32 +01:00
}
2023-01-02 19:46:32 +01:00
// Counts `comment` and, when its author is a channel not seen before, queues that channel and appends its id to `CHANNELS_FILE_PATH`.
// `comment`: a comment resource holding a `snippet` member.
// `channelId`: the channel being treated; not used by this function.
void treatComment(json comment, string channelId)
{
    json& snippet = comment[" snippet "];
    // The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
    if (snippet.contains(" authorChannelId "))
    {
        const string authorChannelId = snippet[" authorChannelId "][" value "];
        // Use the sets' own logarithmic lookups; `insert` reports whether the id was actually new to `channelsToTreat`.
        if (channelsAlreadyTreated.count(authorChannelId) == 0 && channelsToTreat.insert(authorChannelId).second)
        {
            writeFile(CHANNELS_FILE_PATH, " a ", " \n " + authorChannelId);
        }
    }
    commentsCount++;
}
bool writeFile ( string filePath , string option , string toWrite )
{
FILE * file = fopen ( filePath . c_str ( ) , option . c_str ( ) ) ;
if ( file ! = NULL )
{
fputs ( toWrite . c_str ( ) , file ) ;
fclose ( file ) ;
return true ;
}
2023-01-02 19:46:32 +01:00
else
{
PRINT ( " writeFile error: " < < strerror ( errno ) )
}
2022-12-22 06:18:22 +01:00
return false ;
}
// Returns whether `stat` succeeds on `filePath`, whatever the kind of entry found there.
bool doesFileExist(string filePath)
{
    struct stat fileInformation;
    const int status = stat(filePath.c_str(), &fileInformation);
    return status == 0;
}
void createDirectory ( string path )
{
mkdir ( path . c_str ( ) , S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH ) ;
2022-12-22 05:20:32 +01:00
}
2022-12-22 05:47:16 +01:00
// Returns the current local time as day-month-year followed by hour-minute-second.
string getDate()
{
    const time_t now = time(nullptr);
    // Copy the broken-down time out of the buffer returned by `localtime`.
    const tm localNow = *localtime(&now);
    ostringstream formatted;
    formatted << put_time(&localNow, " %d-%m-%Y %H-%M-%S ");
    return formatted.str();
}
2022-12-22 05:20:32 +01:00
// Returns the lines of the file at `filePath`, in order and without their line terminator.
// A file that cannot be opened yields an empty vector.
vector<string> getFileContent(string filePath)
{
    ifstream input(filePath.c_str());
    vector<string> content;
    for (string current; getline(input, current);)
    {
        content.push_back(current);
    }
    return content;
}
2023-01-02 19:46:32 +01:00
// Requests the API resource `url` (relative to the endpoint selected at compile time) and returns the parsed response.
// The relative `url` and the raw response are also saved to `<requestsPerChannel>.json` in the `directoryPath` sub-directory of `CHANNELS_DIRECTORY`.
// `json::parse` throws when the response is not valid JSON.
json getJson(string url, string directoryPath)
{
# ifdef USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE
    string finalUrl = " https://yt.lemnoslife.com/noKey/ " + url;
# else
    string finalUrl = " https://www.googleapis.com/youtube/v3/ " + url + " &key= " + API_KEY;
# endif
    string content = getHttps(finalUrl);

    // Save the raw response before parsing it, so that a payload making `json::parse` throw is still on disk for inspection.
    ostringstream toString;
    toString << CHANNELS_DIRECTORY << directoryPath << " / " << requestsPerChannel << " .json ";
    requestsPerChannel++;
    writeFile(toString.str(), " w ", url + " \n " + content);

    return json::parse(content);
}
void print ( ostringstream * toPrint )
{
2022-12-22 06:18:22 +01:00
cout < < getDate ( ) < < " : " < < toPrint - > str ( ) < < endl ;
toPrint - > str ( " " ) ;
2022-12-22 05:20:32 +01:00
}
string getHttps ( string url )
{
CURL * curl = curl_easy_init ( ) ;
string got ;
curl_easy_setopt ( curl , CURLOPT_URL , url . c_str ( ) ) ;
curl_easy_setopt ( curl , CURLOPT_SSL_VERIFYPEER , 1 ) ;
curl_easy_setopt ( curl , CURLOPT_SSL_VERIFYHOST , 1 ) ;
curl_easy_setopt ( curl , CURLOPT_WRITEFUNCTION , writeCallback ) ;
curl_easy_setopt ( curl , CURLOPT_WRITEDATA , & got ) ;
curl_easy_perform ( curl ) ;
curl_easy_cleanup ( curl ) ;
return got ;
}
// Write callback given to curl: appends the `size * nmemb` bytes at `contents` to the `string` pointed to by `userp`.
// Returns the number of bytes appended.
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp)
{
    const size_t byteCount = size * nmemb;
    string* received = static_cast<string*>(userp);
    received->append(static_cast<const char*>(contents), byteCount);
    return byteCount;
}