#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <set>
#include <map>
#include <vector>
#include <filesystem>
#include <mutex>
#include <thread>
#include <chrono>
#include <cerrno>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <sys/stat.h>
#include <unistd.h>
#include <curl/curl.h>
#include <nlohmann/json.hpp>

using namespace std;
using namespace chrono;

using json = nlohmann::json;

enum getJsonBehavior { normal, retryOnCommentsDisabled, returnErrorIfPlaylistNotFound };
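// `getJsonBehavior` controls how `getJson` reacts to particular YouTube Data API v3 error reasons:
// `retryOnCommentsDisabled` retries even on the `commentsDisabled` reason,
// while `returnErrorIfPlaylistNotFound` makes `getJson` return the error payload instead of retrying on `playlistNotFound`.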

set<string> setFromVector(vector<string> vec);
vector<string> getFileContent(string filePath);
json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string channelId, getJsonBehavior behavior = normal);
void createDirectory(string path),
     print(ostringstream* toPrint),
     treatComment(unsigned short threadId, json comment, string channelId),
     treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat),
     treatChannels(unsigned short threadId),
     deleteDirectory(string path),
     addChannelToTreat(unsigned short threadId, string channelId),
     exec(unsigned short threadId, string cmd, bool debug = true);
string getHttps(string url),
       join(vector<string> parts, string delimiter);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
bool doesFileExist(string filePath),
     writeFile(unsigned short threadId, string filePath, string option, string toWrite);

#define THREAD_PRINT(threadId, x) { ostringstream toPrint; toPrint << threadId << ": " << x; print(&toPrint); }
#define PRINT(x) THREAD_PRINT(threadId, x)
#define DEFAULT_THREAD_ID 0
#define MAIN_PRINT(x) THREAD_PRINT(DEFAULT_THREAD_ID, x)

#define EXIT_WITH_ERROR(x) { PRINT(x); exit(EXIT_FAILURE); }
#define MAIN_EXIT_WITH_ERROR(x) { MAIN_PRINT(x); exit(EXIT_FAILURE); }
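// For example, `PRINT("Treating channel " << channelToTreat)` prints a line such as
// `22-12-2022 05-20-32.123: 1: Treating channel UC...`: `print` prefixes the date and `THREAD_PRINT` the thread id.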

mutex printMutex,
      channelsAlreadyTreatedAndToTreatMutex,
      quotaMutex;
set<string> channelsAlreadyTreated;
// Two `map`s to simulate a bidirectional map.
map<unsigned int, string> channelsToTreat;
map<string, unsigned int> channelsToTreatRev;
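// For example, queueing a channel requires updating both maps consistently:
// `channelsToTreat[index] = channelId; channelsToTreatRev[channelId] = index;`
// so that lookups by insertion index and by channel id both stay logarithmic.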
vector<string> keys;
unsigned int channelsPerSecondCount = 0;
map<unsigned short, unsigned int> channelsCountThreads,
                                  requestsPerChannelThreads;
unsigned short THREADS_NUMBER = 1;
// Use `string` variables instead of macros to have `string` methods available, even if a meta-macro inlining them as `string`s could be used.
string CHANNELS_DIRECTORY = "channels/",
       CHANNELS_FILE_PATH = "channels.txt",
       KEYS_FILE_PATH = "keys.txt",
       UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
       apiKey = "", // Will initially be filled with the first line of `KEYS_FILE_PATH`.
       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
       CAPTIONS_DIRECTORY = "captions/",
       DEBUG_DIRECTORY = "debug/",
       YOUTUBE_API_REQUESTS_DIRECTORY = "requests/";
bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;

int main(int argc, char* argv[])
{
    for(unsigned short argvIndex = 1; argvIndex < argc; argvIndex++)
    {
        string argvStr = string(argv[argvIndex]);
        if(argvStr == "--no-keys")
        {
            USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = true;
        }
        else if(argvStr.rfind("--threads=", 0) == 0)
        {
            THREADS_NUMBER = atoi(argvStr.substr(10).c_str());
        }
        else if(argvStr == "-h" || argvStr == "--help")
        {
            MAIN_PRINT("Usage: " << argv[0] << " [--help/-h] [--no-keys] [--threads=N] [--youtube-operational-api-instance-url URL]")
            exit(EXIT_SUCCESS);
        }
        else if(argvStr == "--youtube-operational-api-instance-url")
        {
            if(argvIndex < argc - 1)
            {
                YOUTUBE_OPERATIONAL_API_INSTANCE_URL = string(argv[argvIndex + 1]);
                argvIndex++;
            }
            else
            {
                MAIN_EXIT_WITH_ERROR("YouTube operational API instance URL missing!")
            }
        }
        else
        {
            MAIN_EXIT_WITH_ERROR("Unrecognized parameter " << argvStr)
        }
    }
    // The starting set should be written to `CHANNELS_FILE_PATH`.
    // To resume this algorithm after a shutdown, just restart it after having deleted the last channel directories being treated in `CHANNELS_DIRECTORY`.
    // On a restart, `CHANNELS_FILE_PATH` is read, and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat*`, while the others are added to `channelsAlreadyTreated`, before continuing, as if `CHANNELS_FILE_PATH` contained a **treated** starting set.
    vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
    for(unsigned int channelsVecIndex = 0; channelsVecIndex < channelsVec.size(); channelsVecIndex++)
    {
        string channel = channelsVec[channelsVecIndex];
        channelsToTreat[channelsVecIndex] = channel;
        channelsToTreatRev[channel] = channelsVecIndex;
    }

    keys = getFileContent(KEYS_FILE_PATH);
    apiKey = keys[0];

    createDirectory(CHANNELS_DIRECTORY);

    for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
    {
        string fileName = entry.path().filename();
        // Skip files such as `UNLISTED_VIDEOS_FILE_PATH`.
        if(fileName.substr(0, 2) == "UC")
        {
            string channelId = fileName.substr(0, fileName.length() - 4);

            channelsToTreat.erase(channelsToTreatRev[channelId]);
            channelsToTreatRev.erase(channelId);

            channelsAlreadyTreated.insert(channelId);
        }
    }

    MAIN_PRINT(channelsToTreat.size() << " channel(s) to treat")
    MAIN_PRINT(channelsAlreadyTreated.size() << " channel(s) already treated")

    vector<thread> threads;
    for(unsigned short threadsIndex = 0; threadsIndex < THREADS_NUMBER; threadsIndex++)
    {
        threads.push_back(thread(treatChannels, threadsIndex + 1));
    }

    while(true)
    {
        MAIN_PRINT("Channels per second: " << channelsPerSecondCount)
        channelsPerSecondCount = 0;
        sleep(1);
    }

    // The following is dead code, as we assume below to never have completely treated YouTube.
    for(unsigned short threadsIndex = 0; threadsIndex < THREADS_NUMBER; threadsIndex++)
    {
        threads[threadsIndex].join();
    }

    return 0;
}

void treatChannels(unsigned short threadId)
{
    // For the moment we assume that we have never completely treated YouTube; otherwise we would have to pay attention to how to proceed if the starting set involves starvation for some threads.
    while(true)
    {
        channelsAlreadyTreatedAndToTreatMutex.lock();
        if(channelsToTreat.empty())
        {
            channelsAlreadyTreatedAndToTreatMutex.unlock();
            sleep(1);
            continue;
        }

        string channelToTreat = channelsToTreat.begin()->second;

        PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")

        channelsCountThreads[threadId] = 0;
        requestsPerChannelThreads[threadId] = 0;

        channelsToTreat.erase(channelsToTreatRev[channelToTreat]);
        channelsToTreatRev.erase(channelToTreat);

        channelsAlreadyTreated.insert(channelToTreat);
        channelsAlreadyTreatedAndToTreatMutex.unlock();

        string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
        createDirectory(channelToTreatDirectory);
2023-02-10 20:03:08 +01:00
createDirectory ( DEBUG_DIRECTORY ) ;
createDirectory ( channelToTreatDirectory + CAPTIONS_DIRECTORY ) ;
2023-02-10 20:17:49 +01:00
createDirectory ( channelToTreatDirectory + YOUTUBE_API_REQUESTS_DIRECTORY ) ;
2023-01-02 19:46:32 +01:00
2023-01-03 04:56:19 +01:00
treatChannelOrVideo ( threadId , true , channelToTreat , channelToTreat ) ;
2023-01-03 02:56:07 +01:00
2023-01-08 15:43:27 +01:00
// Note that compressing the French most subscribers channel took 4 minutes and 42 seconds.
2023-01-22 02:04:03 +01:00
PRINT ( " Starting compression... " )
2023-01-04 03:06:33 +01:00
// As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli.
2023-02-12 16:24:16 +01:00
// We precise no `debug`ging, as otherwise the zipping operation doesn't work as expected.
// As the zipping process isn't recursive, we can't just rely on `ls`, but we are obliged to use `find`.
exec ( threadId , " cd " + channelToTreatDirectory + " && find | zip ../ " + channelToTreat + " .zip -@ " , false ) ;
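        // For a channel id `UC...` this runs a command of the form
        // `cd channels/UC.../ && find | zip ../UC....zip -@`, feeding `zip` the file list produced by `find` through `-@`.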

        PRINT("Compression finished, started deleting initial directory...")
        deleteDirectory(channelToTreatDirectory);
        PRINT("Deleting directory finished.")

        PRINT(channelsCountThreads[threadId] << " comments were found for this channel.")
    }
    channelsAlreadyTreatedAndToTreatMutex.unlock();
}

// Pay attention not to call this function recursively with another channel, otherwise we would break the ability of the program to halt at any top-level channel.
void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat)
{
    string pageToken = "";
    while(true)
    {
        ostringstream toString;
        toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
        string url = toString.str();
        json data = getJson(threadId, url, true, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled);
        bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
        if(doesRelyingOnCommentThreadsIsEnough)
        {
            json items = data["items"];
            for(const auto& item : items)
            {
                json comment = item["snippet"]["topLevelComment"];
                string commentId = comment["id"];
                treatComment(threadId, comment, channelToTreat);
                if(item.contains("replies"))
                {
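                    // `commentThreads` inlines only a handful of replies per thread (at most 5 are assumed here),
                    // so above that threshold we page through the `comments` endpoint with `parentId` instead.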
                    if(item["snippet"]["totalReplyCount"] > 5)
                    {
                        string pageToken = "";
                        while(true)
                        {
                            json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, true, channelToTreat),
                                 items = data["items"];
                            for(const auto& item : items)
                            {
                                treatComment(threadId, item, channelToTreat);
                            }
                            if(data.contains("nextPageToken"))
                            {
                                pageToken = data["nextPageToken"];
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        json replies = item["replies"]["comments"];
                        for(const auto& reply : replies)
                        {
                            treatComment(threadId, reply, channelToTreat);
                        }
                    }
                }
            }
            if(data.contains("nextPageToken"))
            {
                pageToken = data["nextPageToken"];
            }
            else
            {
                break;
            }
        }
        else
        {
            PRINT("Comments disabled channel, treating differently...")
            json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, true, channelToTreat);
            // YouTube Data API v3 Channels: list endpoint returns `videoCount` as a string and not an integer...
            unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str());
            PRINT("The channel has about " << videoCount << " videos.")
            // `UC-3A9g4U1PpLaeAuD4jSP_w` has a `videoCount` of 2, while its `uploads` playlist contains 3 videos. So we use a strict inequality here.
            if(0 < videoCount && videoCount < 20000)
            {
                string playlistToTreat = "UU" + channelToTreat.substr(2),
                       pageToken = "";
                while(true)
                {
                    // `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them.
                    json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
                    if(data.contains("error"))
                    {
                        EXIT_WITH_ERROR("Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
                    }
                    json items = data["items"];
                    for(const auto& item : items)
                    {
                        string videoId = item["contentDetails"]["videoId"];
                        // To keep the same amount of logs for each channel, the following `PRINT` is commented out.
                        //PRINT("Treating video " << videoId)
                        treatChannelOrVideo(threadId, false, videoId, channelToTreat);
                    }
                    if(data.contains("nextPageToken"))
                    {
                        pageToken = data["nextPageToken"];
                    }
                    else
                    {
                        break;
                    }
                }
                break;
            }
            else if(videoCount == 0)
            {
                PRINT("Skipping listing comments on videos, as there shouldn't be any according to `channels?part=statistics`.")
                break;
            }
            else //if(videoCount >= 20000)
            {
                EXIT_WITH_ERROR("The video count of the channel exceeds the supported 20,000 limit!")
            }
        }
    }

    if(isChannel)
    {
        // `CHANNELS`
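        // `channels?part=channels` is served by the YouTube operational API instance (note the `false` argument to `getJson` below)
        // and lists the channels featured in the channel's sections.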
        string pageToken = "";
        while(true)
        {
            json data = getJson(threadId, "channels?part=channels&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id),
                 channelSections = data["items"][0]["channelSections"];
            for(const auto& channelSection : channelSections)
            {
                for(const auto& sectionChannel : channelSection["sectionChannels"])
                {
                    string channelId = sectionChannel["channelId"];
                    addChannelToTreat(threadId, channelId);
                }
            }
            if(channelSections.size() == 1)
            {
                json channelSection = channelSections[0];
                if(!channelSection["nextPageToken"].is_null())
                {
                    pageToken = channelSection["nextPageToken"];
                }
                else
                {
                    break;
                }
            }
            else
            {
                break;
            }
        }

        // `COMMUNITY`
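        // Community posts require three nested paginations: the channel's posts, each post's comment threads, and each comment thread's replies.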
        pageToken = "";
        while(true)
        {
            json data = getJson(threadId, "channels?part=community&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id);
            data = data["items"][0];
            json posts = data["community"];
            for(const auto& post : posts)
            {
                string postId = post["id"];
                json data = getJson(threadId, "community?part=snippet&id=" + postId + "&order=time", false, id);
                string pageToken = data["items"][0]["snippet"]["comments"]["nextPageToken"];
                while(pageToken != "")
                {
                    json data = getJson(threadId, "commentThreads?part=snippet,replies&pageToken=" + pageToken, false, id),
                         items = data["items"];
                    for(const auto& item : items)
                    {
                        json snippet = item["snippet"]["topLevelComment"]["snippet"],
                             authorChannelId = snippet["authorChannelId"];
                        if(!authorChannelId["value"].is_null())
                        {
                            string channelId = authorChannelId["value"];
                            addChannelToTreat(threadId, channelId);
                        }
                        string pageToken = snippet["nextPageToken"];
                        while(pageToken != "")
                        {
                            json data = getJson(threadId, "commentThreads?part=snippet,replies&pageToken=" + pageToken, false, id),
                                 items = data["items"];
                            for(const auto& item : items)
                            {
                                string channelId = item["snippet"]["authorChannelId"]["value"];
                                addChannelToTreat(threadId, channelId);
                            }
                            if(data.contains("nextPageToken"))
                            {
                                pageToken = data["nextPageToken"];
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                    if(data.contains("nextPageToken"))
                    {
                        pageToken = data["nextPageToken"];
                    }
                    else
                    {
                        break;
                    }
                }
            }
            if(data.contains("nextPageToken") && data["nextPageToken"] != "")
            {
                pageToken = data["nextPageToken"];
            }
            else
            {
                break;
            }
        }

        // `PLAYLISTS`
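        // `channels?part=playlists` (YouTube operational API) lists the playlist sections; each playlist is then walked with the YouTube Data API v3 `playlistItems` endpoint.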
        pageToken = "";
        while(true)
        {
            json data = getJson(threadId, "channels?part=playlists&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id),
                 playlistSections = data["items"][0]["playlistSections"];

            for(const auto& playlistSection : playlistSections)
            {
                for(const auto& playlist : playlistSection["playlists"])
                {
                    string playlistId = playlist["id"];
                    //PRINT(threadId, playlistId)
                    string pageToken = "";
                    while(true)
                    {
                        json data = getJson(threadId, "playlistItems?part=contentDetails,snippet,status&playlistId=" + playlistId + "&maxResults=50&pageToken=" + pageToken, true, id),
                             items = data["items"];
                        for(const auto& item : items)
                        {
                            json snippet = item["snippet"];
                            string privacyStatus = item["status"]["privacyStatus"];
                            // `5-CXVU8si3A` in `PLTYUE9O6WCrjQsnOm56rMMNmFy_A-SjUx` has its privacy status on `privacyStatusUnspecified` and is inaccessible.
                            // `GMiVi8xkEXA` in `PLTYUE9O6WCrgNpeSiryP8LYVX-7tOJ1f1` has its privacy status on `private`.
                            // Of course `commentThreads?videoId=` doesn't work for these videos (same result on the YouTube UI).
                            // By the hypothesis that the discovery algorithm never ends, we can't postpone the treatment of these unlisted videos, because we can find such unlisted videos at any point in time (before or after the given channel treatment).
                            // Maybe modifying this hypothesis would make sense; otherwise we have to treat them right away (note that, code architecture aside, there is no recursion problem, as documented on this function).
                            if(privacyStatus != "public" && privacyStatus != "private" && snippet["title"] != "Deleted video")
                            {
                                string videoId = snippet["resourceId"]["videoId"],
                                       channelId = snippet["videoOwnerChannelId"];
                                PRINT("Found non public video (" << videoId << ") in: " << playlistId)
                                string channelUnlistedVideosFilePath = CHANNELS_DIRECTORY + UNLISTED_VIDEOS_FILE_PATH;
                                bool doesChannelUnlistedVideosFileExist = doesFileExist(channelUnlistedVideosFilePath);
                                writeFile(threadId, channelUnlistedVideosFilePath, !doesChannelUnlistedVideosFileExist ? "w" : "a", (!doesChannelUnlistedVideosFileExist ? "" : "\n") + channelId);
                            }
                            if(snippet.contains("videoOwnerChannelId"))
                            {
                                // There isn't any `videoOwnerChannelId` to retrieve for `5-CXVU8si3A` for instance.
                                string channelId = snippet["videoOwnerChannelId"];
                                if(channelId != id)
                                {
                                    addChannelToTreat(threadId, channelId);
                                }
                            }
                        }
                        if(data.contains("nextPageToken"))
                        {
                            pageToken = data["nextPageToken"];
                        }
                        else
                        {
                            break;
                        }
                    }
                }
            }
            if(!data["nextPageToken"].is_null())
            {
                pageToken = data["nextPageToken"];
            }
            else
            {
                break;
            }
        }

        // `LIVE`
        pageToken = "";
        string playlistId = "UU" + id.substr(2);
        vector<string> videoIds;
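        // The `uploads` playlist id is derived by replacing the leading `UC` of the channel id with `UU`.
        // Video ids are batched by 50, as the YouTube Data API v3 `videos` endpoint accepts up to 50 comma-separated ids.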
        while(true)
        {
            json data = getJson(threadId, "playlistItems?part=contentDetails,snippet,status&playlistId=" + playlistId + "&maxResults=50&pageToken=" + pageToken, true, id, returnErrorIfPlaylistNotFound),
                 items = data["items"];
            for(const auto& item : items)
            {
                string videoId = item["snippet"]["resourceId"]["videoId"];
                videoIds.push_back(videoId);
            }
            bool hasNextPageToken = data.contains("nextPageToken");
            if(videoIds.size() == 50 || !hasNextPageToken)
            {
                json data = getJson(threadId, "videos?part=contentDetails,id,liveStreamingDetails,localizations,player,snippet,statistics,status,topicDetails&id=" + join(videoIds, ","), true, id),
                     items = data["items"];
                for(const auto& item : items)
                {
                    if(item.contains("liveStreamingDetails"))
                    {
                        string videoId = item["id"];
                        //PRINT(videoId)
                        json liveStreamingDetails = item["liveStreamingDetails"];
                        if(liveStreamingDetails.contains("activeLiveChatId"))
                        {
                            string activeLiveChatId = liveStreamingDetails["activeLiveChatId"];
                            json data = getJson(threadId, "liveChat/messages?part=snippet,authorDetails&liveChatId=" + activeLiveChatId, true, id),
                                 items = data["items"];
                            for(const auto& item : items)
                            {
                                string channelId = item["snippet"]["authorChannelId"];
                                addChannelToTreat(threadId, channelId);
                            }
                        }
                        else
                        {
                            // As there isn't the usual pagination mechanism for these ended livestreams, we proceed in an uncertain way as follows.
                            set<string> messageIds;
                            unsigned long long lastMessageTimestampRelativeMsec = 0;
                            while(true)
                            {
                                string time = to_string(lastMessageTimestampRelativeMsec);
                                json data = getJson(threadId, "liveChats?part=snippet&id=" + videoId + "&time=" + time, false, id),
                                     snippet = data["items"][0]["snippet"];
                                if(snippet.empty())
                                {
                                    break;
                                }
                                json firstMessage = snippet[0];
                                string firstMessageId = firstMessage["id"];
                                // We verify that we don't skip any message, by checking that the first message was already treated if we have already treated some messages.
                                if(!messageIds.empty() && messageIds.find(firstMessageId) == messageIds.end())
                                {
                                    PRINT("The verification that we don't skip any message failed! Continuing anyway...")
                                }
                                for(const auto& message : snippet)
                                {
                                    string messageId = message["id"];
                                    if(messageIds.find(messageId) == messageIds.end())
                                    {
                                        messageIds.insert(messageId);
                                        string channelId = message["authorChannelId"];
                                        addChannelToTreat(threadId, channelId);
                                    }
                                }
                                json lastMessage = snippet.back();
                                // If there isn't any new message, then we stop retrieving.
                                if(lastMessageTimestampRelativeMsec == lastMessage["videoOffsetTimeMsec"])
                                {
                                    break;
                                }
                                lastMessageTimestampRelativeMsec = lastMessage["videoOffsetTimeMsec"];
                            }
                        }
                    }
                }
                videoIds.clear();
            }
            if(hasNextPageToken)
            {
                pageToken = data["nextPageToken"];
            }
            else
            {
                break;
            }
        }

        // Captions retrieval, relying on `yt-dlp`, after having listed all video ids of the given channel.
        string playlistToTreat = "UU" + channelToTreat.substr(2);
        pageToken = "";
        while(true)
        {
            json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
            if(data.contains("error"))
            {
                // `UCFoBM1VginhMH7lR56GtVbQ` doesn't have videos and is in this case for instance.
                PRINT("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
                break;
            }
            json items = data["items"];
            for(const auto& item : items)
            {
                string videoId = item["contentDetails"]["videoId"];
                // We could proceed as follows by verifying `!isChannel`, but as we don't know how to manage unlisted videos, we don't proceed this way.
                //treatChannelOrVideo(threadId, false, videoId, channelToTreat);
                string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
                createDirectory(channelCaptionsToTreatDirectory);
                // Firstly download all captions that aren't automatically generated.
                // The underscore in the `-o` argument is used so we don't end up with hidden files.
                // We are obliged to specify the video id after `--`, otherwise a video id starting with `-` would be considered an argument.
                string cmdCommonPrefix = "yt-dlp --skip-download ",
                       cmdCommonPostfix = " -o '" + channelCaptionsToTreatDirectory + "_' -- " + videoId;
                string cmd = cmdCommonPrefix + "--write-sub --sub-lang all,-live_chat" + cmdCommonPostfix;
                exec(threadId, cmd);
                // Secondly download the automatically generated captions.
                cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
                exec(threadId, cmd);
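                // For a video id V, the two commands above take the form:
                // `yt-dlp --skip-download --write-sub --sub-lang all,-live_chat -o '<captionsDirectory>/_' -- V`
                // `yt-dlp --skip-download --write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt -o '<captionsDirectory>/_' -- V`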
            }
            if(data.contains("nextPageToken"))
            {
                pageToken = data["nextPageToken"];
            }
            else
            {
                break;
            }
        }
    }
}

// This function verifies that the given channel hasn't already been treated.
void addChannelToTreat(unsigned short threadId, string channelId)
{
    channelsPerSecondCount++;
    channelsCountThreads[threadId]++;
    channelsAlreadyTreatedAndToTreatMutex.lock();
    if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end())
    {
        // We can't dereference `channelsToTreat.end()`, so the next index is computed from the last (greatest) key.
        unsigned int channelsToTreatIndex = channelsToTreat.empty() ? 0 : channelsToTreat.rbegin()->first + 1;
        channelsToTreat[channelsToTreatIndex] = channelId;
        channelsToTreatRev[channelId] = channelsToTreatIndex;
        channelsAlreadyTreatedAndToTreatMutex.unlock();
        writeFile(threadId, CHANNELS_FILE_PATH, "a", "\n" + channelId);
    }
    else
    {
        channelsAlreadyTreatedAndToTreatMutex.unlock();
    }
}

void treatComment(unsigned short threadId, json comment, string channelId)
{
    json snippet = comment["snippet"];
    // The `else` case can happen (cf `95a9421ad0469a09335afeddb2983e31dc00bc36`).
    if(snippet.contains("authorChannelId"))
    {
        string channelId = snippet["authorChannelId"]["value"];
        addChannelToTreat(threadId, channelId);
    }
}
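
// For example, `join({"a", "b", "c"}, ",")` returns `"a,b,c"`.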
string join(vector<string> parts, string delimiter)
{
    string result = "";
    unsigned int partsSize = parts.size();
    for(unsigned int partsIndex = 0; partsIndex < partsSize; partsIndex++)
    {
        result += parts[partsIndex];
        if(partsIndex < partsSize - 1)
        {
            result += delimiter;
        }
    }
    return result;
}

void exec(unsigned short threadId, string cmd, bool debug)
{
    if(debug)
    {
        ostringstream toString;
        toString << threadId;
        string initialCmd = cmd,
               threadIdStr = toString.str(),
               debugCommonFilePath = DEBUG_DIRECTORY + threadIdStr,
               debugOutFilePath = debugCommonFilePath + ".out",
               debugErrFilePath = debugCommonFilePath + ".err";
        cmd += " >> " + debugOutFilePath;
        cmd += " 2>> " + debugErrFilePath;
        cmd += "; echo \"" + initialCmd + "\" | tee -a " + debugOutFilePath + " " + debugErrFilePath;
    }
    system(cmd.c_str());
}

bool writeFile(unsigned short threadId, string filePath, string option, string toWrite)
{
    FILE* file = fopen(filePath.c_str(), option.c_str());
    if(file != NULL)
    {
        fputs(toWrite.c_str(), file);
        fclose(file);
        return true;
    }
    else
    {
        PRINT("writeFile error: " << strerror(errno))
    }
    return false;
}

bool doesFileExist(string filePath)
{
    struct stat buffer;
    return stat(filePath.c_str(), &buffer) == 0;
}
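
// Creates `path` with permissions 0775 (`rwxrwxr-x`); a failure of `mkdir` (for instance if the directory already exists) is ignored.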
void createDirectory(string path)
{
    mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
}

void deleteDirectory(string path)
{
    filesystem::remove_all(path);
}
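
// Returns the current local date and time, such as `22-12-2022 05-20-32.123` (day-month-year hour-minute-second.milliseconds).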
string getDate()
{
    auto t = time(nullptr);
    auto tm = *localtime(&t);
    ostringstream toString;
    toString << put_time(&tm, "%d-%m-%Y %H-%M-%S.");
    milliseconds ms = duration_cast<milliseconds>(
        system_clock::now().time_since_epoch()
    );
    toString << (ms.count() % 1000);
    return toString.str();
}
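
// For example, `setFromVector({"a", "a", "b"})` returns `{"a", "b"}`, deduplicating entries.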
set<string> setFromVector(vector<string> vec)
{
    return set(vec.begin(), vec.end());
}

vector<string> getFileContent(string filePath)
{
    vector<string> lines;
    ifstream infile(filePath.c_str());
    string line;
    while(getline(infile, line))
        lines.push_back(line);
    return lines;
}

json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string channelId, getJsonBehavior behavior)
{
    string finalUrl = usingYoutubeDataApiv3 ?
        (USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ?
            "https://yt.lemnoslife.com/noKey/" + url :
            "https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey) :
        YOUTUBE_OPERATIONAL_API_INSTANCE_URL + "/" + url,
        content = getHttps(finalUrl);
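    // For example, `videos?part=snippet&id=X` resolves to `https://yt.lemnoslife.com/noKey/videos?part=snippet&id=X` with `--no-keys`,
    // to `https://www.googleapis.com/youtube/v3/videos?part=snippet&id=X&key=<apiKey>` otherwise,
    // and to `<YOUTUBE_OPERATIONAL_API_INSTANCE_URL>/videos?part=snippet&id=X` when `usingYoutubeDataApiv3` is `false`.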
    json data;
    try
    {
        data = json::parse(content);
    }
    catch(json::parse_error& ex)
    {
        // From experience, this sometimes happens due to an empty `content`, but retrying just after solves the problem.
        PRINT("Parse error for " << finalUrl << ", as got: " << content << "! Retrying...")
        return getJson(threadId, url, usingYoutubeDataApiv3, channelId);
    }

    if(data.contains("error"))
    {
        if(!usingYoutubeDataApiv3)
        {
            EXIT_WITH_ERROR("Found error in JSON retrieved from YouTube operational API at URL: " << finalUrl << " for content: " << content << "!")
        }
        string reason = data["error"]["errors"][0]["reason"];
        // Contrary to the YouTube operational API no-key service, we don't rotate keys in `KEYS_FILE_PATH`, as we keep them in memory here.
        if(reason == "quotaExceeded")
        {
            quotaMutex.lock();
            keys.erase(keys.begin());
            keys.push_back(apiKey);
            PRINT("No more quota on " << apiKey << ", switching to " << keys[0] << ".")
            apiKey = keys[0];
            quotaMutex.unlock();
            return getJson(threadId, url, true, channelId);
        }
        PRINT("Found error in JSON at URL: " << finalUrl << " for content: " << content << "!")
        if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled)
        {
            return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, channelId);
        }
    }

    ostringstream toString;
    toString << CHANNELS_DIRECTORY << channelId << "/" << YOUTUBE_API_REQUESTS_DIRECTORY;
    writeFile(threadId, toString.str() + "urls.txt", "a", url + " " + (usingYoutubeDataApiv3 ? "true" : "false") + "\n");
    toString << requestsPerChannelThreads[threadId]++ << ".json";
    writeFile(threadId, toString.str(), "w", content);

    return data;
}

void print(ostringstream* toPrint)
{
    printMutex.lock();
    cout << getDate() << ": " << toPrint->str() << endl;
    toPrint->str("");
    printMutex.unlock();
}

// Is this function really multi-threading friendly? If not, we could consider executing `curl` via the command line.
string getHttps(string url)
{
    CURL* curl = curl_easy_init();
    string got;
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 1L);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &got);
    curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    return got;
}
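
// libcurl write callback: appends the received bytes to the `string` passed through `CURLOPT_WRITEDATA`.
// Returning `size * nmemb` tells libcurl that all bytes were consumed; returning less would abort the transfer.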
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp)
{
    ((string*)userp)->append((char*)contents, size * nmemb);
    return size * nmemb;
}