Make the first channel of channels.txt be treated again, solve the issue of temporary empty responses from YouTube Data API v3 and temporarily remove a sanity check that fails very rarely #39

Fix #31 : List all occurrences of search within video captions
Fix #31 : Make a website with a search engine notably based on the captions extracted
2023-02-14 23:15:07 +01:00 · 2023-02-14 02:56:11 +01:00 · 2023-02-14 02:00:23 +01:00 · 2023-02-14 01:32:36 +01:00 · 2023-02-14 01:08:05 +01:00 · 2023-02-14 00:59:37 +01:00
12 changed files with 2296 additions and 86 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+keys.txt
+channels.txt
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
 .PHONY: main

 main:
-	g++ main.cpp -g -std=c++17 -lcurl -lpthread -o main
+	g++ main.cpp -g -std=c++17 -lcurl -lpthread -o youtubeCaptionsSearchEngine
--- a/README.md
+++ b/README.md
@@ -12,8 +12,24 @@ As would like to proceed channel per channel, the question is **how much time do

 Have to proceed with a breadth-first search approach as treating all *child* channels might take a time equivalent to treating the whole original tree.

+Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
+
 ```sh
-sudo apt install nlohmann-json3-dev
+sudo apt install nlohmann-json3-dev yt-dlp
 make
-./main
+./youtubeCaptionsSearchEngine -h
+```
+
+If you plan to use the front-end website, also run:
+
+```sh
+pip install webvtt-py
+```
+
+Unless you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
+
+Unless you provide the argument `--no-keys`, you have to provide at least one [YouTube Data API v3 key](https://developers.google.com/youtube/v3/getting-started) in `keys.txt`.
+
+```sh
+./youtubeCaptionsSearchEngine
 ```
--- a/channels.txt
+++ b/channels.txt
@@ -98,3 +98,13 @@ UCfih6kPJCpzWmtCFtlpYK6A
 UCdTyuXgmJkG_O8_75eqej-w
 UCxXFx2jz8N02sNqv1VeDEGA
 UCj8BKFCTH-mqRlYwcmX2xwg
+UCsT0YIqwnpJCM-mx7-gSA4Q
+UCAuUUnT6oDeKwE6v1NGQxug
+UCy0uwqmXSHVOgqo3nrN4RCQ
+UCawLcDd9clh27b1z55Gcawg
+UC6bfT6U4WED5EyzymREvKlQ
+UCINdSH_R15xft_ctNm50eGQ
+UCVx2ZvskbDkHpLlYEQ9FULw
+UCBcmi8nLrqfFluiexxjl7bg
+UCBnZ16ahKA2DZ_T5W0FPUXg
+UCf8w5m0YsRa8MHQ5bwSGmbw
--- a/main.cpp
+++ b/main.cpp
@@ -16,37 +16,51 @@ enum getJsonBehavior { normal, retryOnCommentsDisabled, returnErrorIfPlaylistNot

 set<string> setFromVector(vector<string> vec);
 vector<string> getFileContent(string filePath);
-json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior = normal);
+json getJson(unsigned short threadId, string url, bool usingYouTubeDataApiV3, string channelId, getJsonBehavior behavior = normal);
 void createDirectory(string path),
     print(ostringstream* toPrint),
     treatComment(unsigned short threadId, json comment, string channelId),
     treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat),
     treatChannels(unsigned short threadId),
-     deleteDirectory(string path);
+     deleteDirectory(string path),
+     addChannelToTreat(unsigned short threadId, string channelId),
+     exec(unsigned short threadId, string cmd, bool debug = true);
 string getHttps(string url),
-       exec(string cmd);
+       join(vector<string> parts, string delimiter);
 size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
 bool doesFileExist(string filePath),
     writeFile(unsigned short threadId, string filePath, string option, string toWrite);

-#define PRINT(threadId, x) { ostringstream toPrint; toPrint << threadId << ": " << x; print(&toPrint); }
+#define THREAD_PRINT(threadId, x) { ostringstream toPrint; toPrint << threadId << ": " << x; print(&toPrint); }
+#define PRINT(x) THREAD_PRINT(threadId, x)
 #define DEFAULT_THREAD_ID 0
-#define MAIN_PRINT(x) PRINT(DEFAULT_THREAD_ID, x)
+#define MAIN_PRINT(x) THREAD_PRINT(DEFAULT_THREAD_ID, x)
+
+#define EXIT_WITH_ERROR(x) { PRINT(x); exit(EXIT_FAILURE); }
+#define MAIN_EXIT_WITH_ERROR(x) { MAIN_PRINT(x); exit(EXIT_FAILURE); }

 mutex printMutex,
      channelsAlreadyTreatedAndToTreatMutex,
      quotaMutex;
-set<string> channelsAlreadyTreated,
-    channelsToTreat;
+set<string> channelsAlreadyTreated;
+// Two `map`s to simulate a bidirectional map.
+map<unsigned int, string> channelsToTreat;
+map<string, unsigned int> channelsToTreatRev;
 vector<string> keys;
-unsigned int commentsCount = 0,
-             commentsPerSecondCount = 0,
-             requestsPerChannel = 0;
+unsigned int channelsPerSecondCount = 0;
+map<unsigned short, unsigned int> channelsCountThreads,
+    requestsPerChannelThreads;
 unsigned short THREADS_NUMBER = 1;
+// Use `string` variables instead of macros to have `string` properties, even though a meta-macro expanding to `string`s could be used.
 string CHANNELS_DIRECTORY = "channels/",
       CHANNELS_FILE_PATH = "channels.txt",
       KEYS_FILE_PATH = "keys.txt",
-       apiKey = ""; // Will firstly be filled with `KEYS_FILE_PATH` first line.
+       UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
+       apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line.
+       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
+       CAPTIONS_DIRECTORY = "captions/",
+       DEBUG_DIRECTORY = "debug/",
+       YOUTUBE_API_REQUESTS_DIRECTORY = "requests/";
 bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;

 int main(int argc, char *argv[])
@@ -64,22 +78,37 @@ int main(int argc, char *argv[])
        }
        else if(argvStr == "-h" || argvStr == "--help")
        {
-            MAIN_PRINT("Usage: " << argv[0] << " [--help/-h] [--no-keys] [--threads=N]")
-            exit(0);
+            MAIN_PRINT("Usage: " << argv[0] << " [--help/-h] [--no-keys] [--threads=N] [--youtube-operational-api-instance-url URL]")
+            exit(EXIT_SUCCESS);
+        }
+        else if(argvStr == "--youtube-operational-api-instance-url")
+        {
+            if(argvIndex < argc - 1)
+            {
+                YOUTUBE_OPERATIONAL_API_INSTANCE_URL = string(argv[argvIndex + 1]);
+                argvIndex++;
+            }
+            else
+            {
+                MAIN_EXIT_WITH_ERROR("YouTube operational API instance URL missing!")
+            }
        }
        else
        {
-            MAIN_PRINT("Unrecognized parameter " << argvStr)
-            exit(1);
+            MAIN_EXIT_WITH_ERROR("Unrecognized parameter " << argvStr)
        }
    }

    // The starting set should be written to `CHANNELS_FILE_PATH`.
    // To resume this algorithm after a shutdown, just restart it after having deleted the last channel folders in `CHANNELS_DIRECTORY` being treated.
-    // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat` or `channelsToTreat` otherwise before continuing, as if `CHANNELS_FILE_PATH` was containing a **treated** starting set.
+    // On a restart, `CHANNELS_FILE_PATH` is read and every channel not found in `CHANNELS_DIRECTORY` is added to `channelsToTreat*`, or to `channelsAlreadyTreated` otherwise, before continuing, as if `CHANNELS_FILE_PATH` was containing a **treated** starting set.
    vector<string> channelsVec = getFileContent(CHANNELS_FILE_PATH);
-    // Note that using `set`s makes the search faster but we lose the `channels.txt` lines order.
-    channelsToTreat = setFromVector(channelsVec);
+    for(unsigned int channelsVecIndex = 0; channelsVecIndex < channelsVec.size(); channelsVecIndex++)
+    {
+        string channel = channelsVec[channelsVecIndex];
+        channelsToTreat[channelsVecIndex] = channel;
+        channelsToTreatRev[channel] = channelsVecIndex;
+    }

    keys = getFileContent(KEYS_FILE_PATH);
    apiKey = keys[0];
@@ -88,10 +117,16 @@ int main(int argc, char *argv[])

    for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
    {
-        string fileName = entry.path().filename(),
-               channelId = fileName.substr(0, fileName.length() - 4);
-        channelsToTreat.erase(channelId);
-        channelsAlreadyTreated.insert(channelId);
+        string fileName = entry.path().filename();
+        // Skip files such as `UNLISTED_VIDEOS_FILE_PATH`.
+        if (fileName.substr(0, 2) == "UC") {
+            string channelId = fileName.substr(0, fileName.length() - 4); // Drops the last 4 characters (presumably the `.zip` extension written by the compression step).
+            // Look up with `find`: `operator[]` would insert index 0 for a channel absent from `channelsToTreatRev` and so erase the first channel to treat.
+            auto channelToTreatRevIt = channelsToTreatRev.find(channelId);
+            if(channelToTreatRevIt != channelsToTreatRev.end()) channelsToTreat.erase(channelToTreatRevIt->second);
+            channelsToTreatRev.erase(channelId);
+            channelsAlreadyTreated.insert(channelId);
+        }
    }

    MAIN_PRINT(channelsToTreat.size() << " channel(s) to treat")
@@ -105,8 +140,8 @@ int main(int argc, char *argv[])

    while(true)
    {
-        MAIN_PRINT("Comments per second: " << commentsPerSecondCount)
-        commentsPerSecondCount = 0;
+        MAIN_PRINT("Channels per second: " << channelsPerSecondCount)
+        channelsPerSecondCount = 0;
        sleep(1);
    }

@@ -132,37 +167,46 @@ void treatChannels(unsigned short threadId)
            continue;
        }

-        string channelToTreat = *channelsToTreat.begin();
+        string channelToTreat = channelsToTreat.begin()->second;

-        PRINT(threadId, "Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")
+        PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")
+
+        channelsCountThreads[threadId] = 0;
+        requestsPerChannelThreads[threadId] = 0;
+
+        channelsToTreat.erase(channelsToTreatRev[channelToTreat]);
+        channelsToTreatRev.erase(channelToTreat);

-        channelsToTreat.erase(channelToTreat);
        channelsAlreadyTreated.insert(channelToTreat);

        channelsAlreadyTreatedAndToTreatMutex.unlock();

        string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
        createDirectory(channelToTreatDirectory);
+        createDirectory(DEBUG_DIRECTORY);
+        createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
+        createDirectory(channelToTreatDirectory + YOUTUBE_API_REQUESTS_DIRECTORY);

        treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);

        // Note that compressing the French most subscribers channel took 4 minutes and 42 seconds.
-        PRINT(threadId, "Starting compression...")
+        PRINT("Starting compression...")
        // As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli.
-        exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
+        // We disable `debug`ging here, as otherwise the zipping operation doesn't work as expected.
+        // As the zipping process isn't recursive, we can't just rely on `ls`; we have to use `find`.
+        exec(threadId, "cd " + channelToTreatDirectory + " && find | zip ../" + channelToTreat + ".zip -@", false);

-        PRINT(threadId, "Compression finished, started deleting initial directory...")
+        PRINT("Compression finished, started deleting initial directory...")
        deleteDirectory(channelToTreatDirectory);
-        PRINT(threadId, "Deleting directory finished.")
+        PRINT("Deleting directory finished.")

-        PRINT(threadId, commentsCount << " comments were found for this channel.")
-        commentsCount = 0;
-        requestsPerChannel = 0;
+        PRINT(channelsCountThreads[threadId] << " comments were found for this channel.")
    }

    channelsAlreadyTreatedAndToTreatMutex.unlock();
 }

+// Have to pay attention not to recursively call this function with another channel otherwise we break the ability of the program to halt at any top level channel.
 void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat)
 {
    string pageToken = "";
@@ -171,7 +215,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
        ostringstream toString;
        toString << "commentThreads?part=snippet,replies&" << (isChannel ? "allThreadsRelatedToChannelId" : "videoId") << "=" << id << "&maxResults=100&pageToken=" << pageToken;
        string url = toString.str();
-        json data = getJson(threadId, url, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled);
+        json data = getJson(threadId, url, true, channelToTreat, pageToken == "" ? normal : retryOnCommentsDisabled);
        bool doesRelyingOnCommentThreadsIsEnough = (!isChannel) || data["error"]["errors"][0]["reason"] != "commentsDisabled";
        if(doesRelyingOnCommentThreadsIsEnough)
        {
@@ -188,7 +232,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
                        string pageToken = "";
                        while(true)
                        {
-                            json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, channelToTreat),
+                            json data = getJson(threadId, "comments?part=snippet&parentId=" + commentId + "&maxResults=100&pageToken=" + pageToken, true, channelToTreat),
                                 items = data["items"];
                            for(const auto& item : items)
                            {
@@ -225,11 +269,11 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
        }
        else
        {
-            PRINT(threadId, "Comments disabled channel, treating differently...")
-            json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, channelToTreat);
+            PRINT("Comments disabled channel, treating differently...")
+            json data = getJson(threadId, "channels?part=statistics&id=" + channelToTreat, true, channelToTreat);
            // YouTube Data API v3 Videos: list endpoint returns `videoCount` as a string and not an integer...
            unsigned int videoCount = atoi(string(data["items"][0]["statistics"]["videoCount"]).c_str());
-            PRINT(threadId, "The channel has about " << videoCount << " videos.")
+            PRINT("The channel has about " << videoCount << " videos.")
            // `UC-3A9g4U1PpLaeAuD4jSP_w` has a `videoCount` of 2, while its `uploads` playlist contains 3 videos. So we use a strict inequality here.
            if(0 < videoCount && videoCount < 20000)
            {
@@ -238,11 +282,10 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
                while(true)
                {
                    // `snippet` and `status` are unneeded `part`s here but may be interesting later, as we log them.
-                    json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, channelToTreat, returnErrorIfPlaylistNotFound);
+                    json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
                    if(data.contains("error"))
                    {
-                        PRINT(threadId, "Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
-                        exit(1);
+                        EXIT_WITH_ERROR("Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
                    }
                    json items = data["items"];
                    for(const auto& item : items)
@@ -265,16 +308,331 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
            }
            else if(videoCount == 0)
            {
-                PRINT(threadId, "Skip listing comments on videos, as they shouldn't be any according to `channels?part=statistics`.")
+                PRINT("Skip listing comments on videos, as they shouldn't be any according to `channels?part=statistics`.")
                break;
            }
            else //if(videoCount >= 20000)
            {
-                PRINT(threadId, "The videos count of the channel exceeds the supported 20,000 limit!")
-                exit(1);
+                EXIT_WITH_ERROR("The videos count of the channel exceeds the supported 20,000 limit!")
            }
        }
    }
+    if(isChannel)
+    {
+        // `CHANNELS`
+        string pageToken = "";
+        while(true)
+        {
+            json data = getJson(threadId, "channels?part=channels&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id),
+                 channelSections = data["items"][0]["channelSections"];
+            for(const auto& channelSection : channelSections)
+            {
+                for(const auto& sectionChannel : channelSection["sectionChannels"])
+                {
+                    string channelId = sectionChannel["channelId"];
+                    addChannelToTreat(threadId, channelId);
+                }
+            }
+            if(channelSections.size() == 1)
+            {
+                json channelSection = channelSections[0];
+                if(!channelSection["nextPageToken"].is_null())
+                {
+                    pageToken = channelSection["nextPageToken"];
+                }
+                else
+                {
+                    break;
+                }
+            }
+            else
+            {
+                break;
+            }
+        }
+        // `COMMUNITY`
+        pageToken = "";
+        while(true)
+        {
+            json data = getJson(threadId, "channels?part=community&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id);
+            data = data["items"][0];
+            json posts = data["community"];
+            for(const auto& post : posts)
+            {
+                string postId = post["id"];
+                json data = getJson(threadId, "community?part=snippet&id=" + postId + "&order=time", false, id);
+                string pageToken = data["items"][0]["snippet"]["comments"]["nextPageToken"];
+                while(pageToken != "")
+                {
+                    json data = getJson(threadId, "commentThreads?part=snippet,replies&pageToken=" + pageToken, false, id),
+                         items = data["items"];
+                    for(const auto& item : items)
+                    {
+                        json snippet = item["snippet"]["topLevelComment"]["snippet"],
+                             authorChannelId = snippet["authorChannelId"];
+                        if(!authorChannelId["value"].is_null())
+                        {
+                            string channelId = authorChannelId["value"];
+                            addChannelToTreat(threadId, channelId);
+                        }
+                        string pageToken = snippet["nextPageToken"];
+                        while(pageToken != "")
+                        {
+                            json data = getJson(threadId, "commentThreads?part=snippet,replies&pageToken=" + pageToken, false, id),
+                                 items = data["items"];
+                            for(const auto& item : items)
+                            {
+                                string channelId = item["snippet"]["authorChannelId"]["value"];
+                                addChannelToTreat(threadId, channelId);
+                            }
+                            if(data.contains("nextPageToken"))
+                            {
+                                pageToken = data["nextPageToken"];
+                            }
+                            else
+                            {
+                                break;
+                            }
+                        }
+                    }
+                    if(data.contains("nextPageToken"))
+                    {
+                        pageToken = data["nextPageToken"];
+                    }
+                    else
+                    {
+                        break;
+                    }
+                }
+            }
+            if(data.contains("nextPageToken") && data["nextPageToken"] != "")
+            {
+                pageToken = data["nextPageToken"];
+            }
+            else
+            {
+                break;
+            }
+        }
+        // `PLAYLISTS`
+        pageToken = "";
+        while(true)
+        {
+            json data = getJson(threadId, "channels?part=playlists&id=" + id + (pageToken == "" ? "" : "&pageToken=" + pageToken), false, id),
+                 playlistSections = data["items"][0]["playlistSections"];
+
+            for(const auto& playlistSection : playlistSections)
+            {
+                for(const auto& playlist : playlistSection["playlists"])
+                {
+                    string playlistId = playlist["id"];
+                    //PRINT(threadId, playlistId)
+                    string pageToken = "";
+                    while(true)
+                    {
+                        json data = getJson(threadId, "playlistItems?part=contentDetails,snippet,status&playlistId=" + playlistId + "&maxResults=50&pageToken=" + pageToken, true, id),
+                             items = data["items"];
+                        for(const auto& item : items)
+                        {
+                            json snippet = item["snippet"];
+                            string privacyStatus = item["status"]["privacyStatus"];
+                            // `5-CXVU8si3A` in `PLTYUE9O6WCrjQsnOm56rMMNmFy_A-SjUx` has its privacy status on `privacyStatusUnspecified` and is inaccessible.
+                            // `GMiVi8xkEXA` in `PLTYUE9O6WCrgNpeSiryP8LYVX-7tOJ1f1` has its privacy status on `private`.
+                            // Of course `commentThreads?videoId=` doesn't work for these videos (same result on YouTube UI).
+                            // By hypothesis that the discovery algorithm never ends we can't postpone the treatment of these unlisted videos, because we can find such unlisted videos at any point in time (before or after the given channel treatment).
+                            // Maybe modifying this hypothesis would make sense, otherwise we have to treat them right-away (note that except code architecture, there is no recursion problem as documented on this function).
+                            if(privacyStatus != "public" && privacyStatus != "private" && snippet["title"] != "Deleted video")
+                            {
+                                string videoId = snippet["resourceId"]["videoId"],
+                                       channelId = snippet["videoOwnerChannelId"];
+                                PRINT("Found non public video (" << videoId << ") in: " << playlistId)
+                                string channelUnlistedVideosFilePath = CHANNELS_DIRECTORY + UNLISTED_VIDEOS_FILE_PATH;
+                                bool doesChannelUnlistedVideosFileExist = doesFileExist(channelUnlistedVideosFilePath);
+                                writeFile(threadId, channelUnlistedVideosFilePath, !doesChannelUnlistedVideosFileExist ? "w" : "a", (!doesChannelUnlistedVideosFileExist ? "" : "\n") + channelId);
+                            }
+                            if(snippet.contains("videoOwnerChannelId"))
+                            {
+                                // There isn't any `videoOwnerChannelId` to retrieve for `5-CXVU8si3A` for instance.
+                                string channelId = snippet["videoOwnerChannelId"];
+                                if(channelId != id)
+                                {
+                                    addChannelToTreat(threadId, channelId);
+                                }
+                            }
+                        }
+                        if(data.contains("nextPageToken"))
+                        {
+                            pageToken = data["nextPageToken"];
+                        }
+                        else
+                        {
+                            break;
+                        }
+                    }
+                }
+            }
+            if(!data["nextPageToken"].is_null())
+            {
+                pageToken = data["nextPageToken"];
+            }
+            else
+            {
+                break;
+            }
+        }
+        // `LIVE`
+        pageToken = "";
+        string playlistId = "UU" + id.substr(2);
+        vector<string> videoIds;
+        while(true)
+        {
+            json data = getJson(threadId, "playlistItems?part=contentDetails,snippet,status&playlistId=" + playlistId + "&maxResults=50&pageToken=" + pageToken, true, id, returnErrorIfPlaylistNotFound),
+                 items = data["items"];
+            for(const auto& item : items)
+            {
+                string videoId = item["snippet"]["resourceId"]["videoId"];
+                videoIds.push_back(videoId);
+            }
+            bool hasNextPageToken = data.contains("nextPageToken");
+            if(videoIds.size() == 50 || !hasNextPageToken)
+            {
+                json data = getJson(threadId, "videos?part=contentDetails,id,liveStreamingDetails,localizations,player,snippet,statistics,status,topicDetails&id=" + join(videoIds, ","), true, id),
+                     items = data["items"];
+                for(const auto& item : items)
+                {
+                    if(item.contains("liveStreamingDetails"))
+                    {
+                        string videoId = item["id"];
+                        //PRINT(videoId)
+                        json liveStreamingDetails = item["liveStreamingDetails"];
+                        if(liveStreamingDetails.contains("activeLiveChatId"))
+                        {
+                            string activeLiveChatId = liveStreamingDetails["activeLiveChatId"];
+                            json data = getJson(threadId, "liveChat/messages?part=snippet,authorDetails&liveChatId=" + activeLiveChatId, true, id),
+                                 items = data["items"];
+                            for(const auto& item : items)
+                            {
+                                string channelId = item["snippet"]["authorChannelId"];
+                                addChannelToTreat(threadId, channelId);
+                            }
+                        }
+                        else
+                        {
+                            // As there isn't the usual pagination mechanism for these ended livestreams, we proceed in an uncertain way as follows.
+                            set<string> messageIds;
+                            unsigned long long lastMessageTimestampRelativeMsec = 0;
+                            while(true)
+                            {
+                                string time = to_string(lastMessageTimestampRelativeMsec);
+                                json data = getJson(threadId, "liveChats?part=snippet&id=" + videoId + "&time=" + time, false, id),
+                                     snippet = data["items"][0]["snippet"];
+                                if(snippet.empty())
+                                {
+                                    break;
+                                }
+                                json firstMessage = snippet[0];
+                                string firstMessageId = firstMessage["id"];
+                                // We verify that we don't skip any message by verifying that the first message was already treated if we already treated some messages.
+                                if(!messageIds.empty() && messageIds.find(firstMessageId) == messageIds.end())
+                                {
+                                    PRINT("The verification that we don't skip any message failed! Continuing anyway...")
+                                }
+                                for(const auto& message : snippet)
+                                {
+                                    string messageId = message["id"];
+                                    if(messageIds.find(messageId) == messageIds.end())
+                                    {
+                                        messageIds.insert(messageId);
+                                        string channelId = message["authorChannelId"];
+                                        addChannelToTreat(threadId, channelId);
+                                    }
+                                }
+                                json lastMessage = snippet.back();
+                                // If there isn't any new message, then we stop the retrieving.
+                                if(lastMessageTimestampRelativeMsec == lastMessage["videoOffsetTimeMsec"])
+                                {
+                                    break;
+                                }
+                                lastMessageTimestampRelativeMsec = lastMessage["videoOffsetTimeMsec"];
+                            }
+                        }
+                    }
+                }
+                videoIds.clear();
+            }
+            if(hasNextPageToken)
+            {
+                pageToken = data["nextPageToken"];
+            }
+            else
+            {
+                break;
+            }
+        }
+        // Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
+        string playlistToTreat = "UU" + channelToTreat.substr(2);
+        pageToken = "";
+        while(true)
+        {
+            json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
+            if(data.contains("error"))
+            {
+                EXIT_WITH_ERROR("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
+            }
+            json items = data["items"];
+            for(const auto& item : items)
+            {
+                string videoId = item["contentDetails"]["videoId"];
+                // Could proceed as follows by verifying `!isChannel` but as we don't know how to manage unlisted videos, we don't proceed this way.
+                //treatChannelOrVideo(threadId, false, videoId, channelToTreat);
+
+                string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
+                createDirectory(channelCaptionsToTreatDirectory);
+
+                // Firstly download all not automatically generated captions.
+                // The underscore in `-o` argument is used to not end up with hidden files.
+                // We have to specify the video id after `--`, otherwise a video id starting with `-` would be considered as an argument.
+                string cmdCommonPrefix = "yt-dlp --skip-download ",
+                       cmdCommonPostfix = " -o '" + channelCaptionsToTreatDirectory + "_' -- " + videoId;
+                string cmd = cmdCommonPrefix + "--sub-lang all,-live_chat" + cmdCommonPostfix;
+                exec(threadId, cmd);
+
+                // Secondly download the automatically generated captions.
+                cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
+                exec(threadId, cmd);
+            }
+            if(data.contains("nextPageToken"))
+            {
+                pageToken = data["nextPageToken"];
+            }
+            else
+            {
+                break;
+            }
+        }
+    }
+}
+
+// Queues `channelId` in `channelsToTreat*` and passes it to `writeFile` for `CHANNELS_FILE_PATH`, unless the channel has already been treated or queued.
+void addChannelToTreat(unsigned short threadId, string channelId)
+{
+    channelsPerSecondCount++;
+    channelsCountThreads[threadId]++;
+    channelsAlreadyTreatedAndToTreatMutex.lock();
+    if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end())
+    {
+        // `end()` isn't dereferenceable, so the next index is derived from the greatest key (`rbegin()`), or is 0 if nothing is queued.
+        unsigned int channelsToTreatIndex = channelsToTreat.empty() ? 0 : channelsToTreat.rbegin()->first + 1;
+        channelsToTreat[channelsToTreatIndex] = channelId;
+        channelsToTreatRev[channelId] = channelsToTreatIndex;
+        channelsAlreadyTreatedAndToTreatMutex.unlock();
+
+        writeFile(threadId, CHANNELS_FILE_PATH, "a", "\n" + channelId);
+    }
+    else
+    {
+        channelsAlreadyTreatedAndToTreatMutex.unlock();
+    }
 }

 void treatComment(unsigned short threadId, json comment, string channelId)
@@ -284,39 +642,43 @@ void treatComment(unsigned short threadId, json comment, string channelId)
    if(snippet.contains("authorChannelId"))
    {
        string channelId = snippet["authorChannelId"]["value"];
-        channelsAlreadyTreatedAndToTreatMutex.lock();
-        if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreat.find(channelId) == channelsToTreat.end())
-        {
-            channelsToTreat.insert(channelId);
-            channelsAlreadyTreatedAndToTreatMutex.unlock();
-
-            writeFile(threadId, CHANNELS_FILE_PATH, "a", "\n" + channelId);
-        }
-        else
-        {
-            channelsAlreadyTreatedAndToTreatMutex.unlock();
-        }
+        addChannelToTreat(threadId, channelId);
    }
-    commentsCount++;
-    commentsPerSecondCount++;
 }

-string exec(string cmd)
+string join(vector<string> parts, string delimiter)
 {
-    array<char, 128> buffer;
-    string result;
-    unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
-    if (!pipe)
+    string result = "";
+    unsigned int partsSize = parts.size();
+    for(unsigned int partsIndex = 0; partsIndex < partsSize; partsIndex++)
    {
-        throw runtime_error("popen() failed!");
-    }
-    while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
-    {
-        result += buffer.data();
+        result += parts[partsIndex];
+        if(partsIndex < partsSize - 1)
+        {
+            result += delimiter;
+        }
    }
    return result;
 }

+// Runs `cmd` through the shell.
+// If `debug` is set, `stdout` and `stderr` of `cmd` are appended respectively to the `.out` and `.err` files named after `threadId` in `DEBUG_DIRECTORY`, then the command itself is appended to both files.
+void exec(unsigned short threadId, string cmd, bool debug)
+{
+    if(debug)
+    {
+        const string executedCmd = cmd,
+                     logFilePathPrefix = DEBUG_DIRECTORY + to_string(threadId),
+                     outLogFilePath = logFilePathPrefix + ".out",
+                     errLogFilePath = logFilePathPrefix + ".err";
+        cmd = executedCmd + " >> " + outLogFilePath
+                          + " 2>> " + errLogFilePath
+                          + "; echo \"" + executedCmd + "\" | tee -a " + outLogFilePath + " " + errLogFilePath;
+    }
+    system(cmd.c_str());
+}
+
 bool writeFile(unsigned short threadId, string filePath, string option, string toWrite)
 {
    FILE* file = fopen(filePath.c_str(), option.c_str());
@@ -328,7 +690,7 @@ bool writeFile(unsigned short threadId, string filePath, string option, string t
    }
    else
    {
-        PRINT(threadId, "writeFile error: " << strerror(errno))
+        PRINT("writeFile error: " << strerror(errno))
    }
    return false;
 }
@@ -377,10 +739,13 @@ vector<string> getFileContent(string filePath)
    return lines;
 }

-json getJson(unsigned short threadId, string url, string directoryPath, getJsonBehavior behavior)
+json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string channelId, getJsonBehavior behavior)
 {
-    string finalUrl = USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ? "https://yt.lemnoslife.com/noKey/" + url :
-                      "https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey,
+    string finalUrl = usingYoutubeDataApiv3 ?
+                      (USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ?
+                       "https://yt.lemnoslife.com/noKey/" + url :
+                       "https://www.googleapis.com/youtube/v3/" + url + "&key=" + apiKey) :
+                      YOUTUBE_OPERATIONAL_API_INSTANCE_URL + "/" + url,
                      content = getHttps(finalUrl);
    json data;
    try
@@ -389,12 +754,17 @@ json getJson(unsigned short threadId, string url, string directoryPath, getJsonB
    }
    catch (json::parse_error& ex)
    {
-        PRINT(threadId, "Parse error for " << finalUrl << ", as got: " << content << " !")
-        exit(1);
+        // From the experience this sometimes happens due to empty `content` but retrying just after solves the problem.
+        PRINT("Parse error for " << finalUrl << ", as got: " << content << " ! Retrying...")
+        return getJson(threadId, url, usingYoutubeDataApiv3, channelId);
    }

    if(data.contains("error"))
    {
+        if(!usingYoutubeDataApiv3)
+        {
+            EXIT_WITH_ERROR("Found error in JSON retrieve from YouTube operational API at URL: " << finalUrl << " for content: " << content << " !")
+        }
        string reason = data["error"]["errors"][0]["reason"];
        // Contrarily to YouTube operational API no-key service we don't rotate keys in `KEYS_FILE_PATH`, as we keep them in memory here.
        if(reason == "quotaExceeded")
@@ -402,22 +772,23 @@ json getJson(unsigned short threadId, string url, string directoryPath, getJsonB
            quotaMutex.lock();
            keys.erase(keys.begin());
            keys.push_back(apiKey);
-            PRINT(threadId, "No more quota on " << apiKey << " switching to " << keys[0] << ".")
+            PRINT("No more quota on " << apiKey << " switching to " << keys[0] << ".")
            apiKey = keys[0];
            quotaMutex.unlock();
-            return getJson(threadId, url, directoryPath);
+            return getJson(threadId, url, true, channelId);
        }
-        PRINT(threadId, "Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
+        PRINT("Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
        if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled)
        {
-            return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, directoryPath);
+            return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, channelId);
        }
    }

    ostringstream toString;
-    toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannel << ".json";
-    requestsPerChannel++;
-    writeFile(threadId, toString.str(), "w", url + "\n" + content);
+    toString << CHANNELS_DIRECTORY << channelId << "/" << YOUTUBE_API_REQUESTS_DIRECTORY;
+    writeFile(threadId, toString.str() + "urls.txt", "a", url + " " + (usingYoutubeDataApiv3 ? "true" : "false") + "\n");
+    toString << requestsPerChannelThreads[threadId]++ << ".json";
+    writeFile(threadId, toString.str(), "w", content);

    return data;
 }
--- a/website/channels.php
+++ b/website/channels.php
@@ -0,0 +1,42 @@
+<?php
+
+    // Serves the content of the channels archives:
+    // - `/channels/ARCHIVE` downloads the whole archive `ARCHIVE`.
+    // - `/channels/ARCHIVE/PATH` outputs the file `PATH` stored in the archive `ARCHIVE`.
+
+    // Polyfill for PHP versions not providing `str_contains`.
+    if (!function_exists('str_contains')) {
+        function str_contains($haystack, $needle)
+        {
+            return strpos($haystack, $needle) !== false;
+        }
+    }
+
+    // Polyfill for PHP versions not providing `str_ends_with`.
+    if (!function_exists('str_ends_with')) {
+        function str_ends_with($haystack, $needle)
+        {
+            $length = strlen($needle);
+            return $length > 0 ? substr($haystack, -$length) === $needle : true;
+        }
+    }
+
+    // Replaces only the first occurrence of `$needle` in `$haystack` with `$replace`.
+    function str_replace_first($needle, $replace, $haystack) {
+        $pos = strpos($haystack, $needle);
+        if ($pos !== false) {
+            $haystack = substr_replace($haystack, $replace, $pos, strlen($needle));
+        }
+        return $haystack;
+    }
+
+    $uri = $_SERVER['REQUEST_URI'];
+    $uri = str_replace('/channels/', '', $uri);
+    $prefix = '/mnt/HDD0/YouTube_captions_search_engine/channels/';
+    if (str_contains($uri, '/')) {
+        // `zip://ARCHIVE#PATH` is the syntax to read the file `PATH` stored in the archive `ARCHIVE`.
+        $uri = str_replace_first('/', '#', $uri);
+        $uri = $prefix . $uri;
+        $content = @file_get_contents("zip://$uri");
+        if ($content === false) {
+            http_response_code(404);
+            exit;
+        }
+        if (str_ends_with($uri, '.json')) {
+            header('Content-Type: application/json; charset=UTF-8');
+        }
+        echo $content;
+    } else {
+        $uri = $prefix . $uri;
+        if (!is_file($uri)) {
+            http_response_code(404);
+            exit;
+        }
+        header("Content-Type: application/zip");
+        // `readfile` outputs the file by itself, `echo`ing its return value would append the number of bytes read to the archive and corrupt it.
+        readfile($uri);
+    }
+
+?>
--- a/website/composer.json
+++ b/website/composer.json
@@ -0,0 +1,5 @@
+{
+    "require": {
+        "cboden/ratchet": "^0.4.4"
+    }
+}
--- a/website/composer.lock
+++ b/website/composer.lock
--- a/website/index.php
+++ b/website/index.php
@@ -0,0 +1,103 @@
+<?php
+
+    // Outputs an HTML link pointing to `$url` and displaying `$url` as text.
+    function echoUrl($url)
+    {
+        echo '<a href="' . $url . '">' . $url . '</a>';
+    }
+
+    ?>
+
+See <?php echoUrl('https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine'); ?> for more information.<br/>
+
+Access raw data with: <?php echoUrl('channels/'); ?>.
+
+<form id="form">
+    <input type="text" autofocus id="search" pattern="[A-Za-z0-9-_ ]+" placeholder="Your [A-Za-z0-9-_ ]+ search"></input>
+    <input type="submit" id="search" value="Search">
+    <input type="submit" id="search-only-captions" value="Search only captions">
+</form>
+
+Progress: <span id="progress"></span> channels
+
+<ul id="channels">
+</ul>
+
+<script>
+    var firstRun = true;
+    var conn;
+    // Could parse DOM instead of using following variable.
+    var channels = [];
+
+    // Returns a new `a` element displaying `text` and pointing to `href`.
+    function createA(text, href) {
+        const anchor = document.createElement('a');
+        anchor.appendChild(document.createTextNode(text));
+        anchor.href = href;
+        return anchor;
+    }
+
+    // Treats a line received from the WebSocket, which is either:
+    // - `progress:PROGRESS`, in which case the progress displayed is updated.
+    // - `CHANNEL/FILE` optionally followed by `|TIMESTAMP` for each occurrence found, in which case the result is appended to the list of its channel.
+    function treatLine(line) {
+        console.log(line);
+        if (line.startsWith('progress:')) {
+            // The progress is plain text, so it doesn't have to be interpreted as HTML.
+            document.getElementById('progress').textContent = line.replace('progress:', '');
+            return;
+        }
+        const channelsDom = document.getElementById('channels');
+        const lineParts = line.split('|');
+        // There isn't any timestamp if the line doesn't contain any `|`.
+        const timestamps = lineParts.slice(1).map(linePart => parseInt(linePart));
+        const channelFileParts = lineParts[0].split('/');
+        const channel = channelFileParts[0];
+        const channelFile = channelFileParts.slice(1).join('/');
+        const channelHref = `channels/${channel}`;
+        if (!channels.includes(channel)) {
+            channels.push(channel);
+            const newChannelDom = document.createElement('li');
+            newChannelDom.appendChild(createA(channel, channelHref));
+            newChannelDom.appendChild(document.createElement('ul'));
+            channelsDom.appendChild(newChannelDom);
+        }
+        // The result is appended to the last channel displayed, which relies on the results of a given channel being received consecutively.
+        const channelFilesDom = channelsDom.lastChild.lastChild;
+        const channelFileDom = document.createElement('li');
+        channelFileDom.appendChild(createA(channelFile, `${channelHref}/${channelFile}`));
+        // NOTE(review): presumably the video id is the third component of the path - confirm against the archives layout.
+        const id = channelFileParts[2];
+        for (const timestamp of timestamps) {
+            channelFileDom.appendChild(document.createTextNode('\u00A0'));
+            channelFileDom.appendChild(createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`));
+        }
+        channelFilesDom.appendChild(channelFileDom);
+    }
+
+    // Sends the search typed by the end-user through the WebSocket, which is opened on the first search.
+    // The id of the button used to submit the form prefixes the query sent.
+    function search(event) {
+        // Submitting a form refreshes the webpage by default, which we don't want.
+        event.preventDefault();
+        const searched = document.getElementById('search').value;
+        const query = `${event.submitter.id} ${searched}`;
+        if (!firstRun) {
+            // We assume at this point that the connection is established.
+            channels = [];
+            document.getElementById('channels').innerHTML = '';
+            conn.send(query);
+            return;
+        }
+        firstRun = false;
+        conn = new WebSocket('wss://crawler.yt.lemnoslife.com/websocket');
+        conn.onmessage = function(e) {
+            e.data.split('\n').forEach(treatLine);
+        };
+        // `conn.send` can't be called right away, as the connection may not be established yet.
+        conn.onopen = function(e) { conn.send(query); };
+    }
+
+    var form = document.getElementById('form');
+    form.addEventListener('submit', search);
+</script>
--- a/website/search.py
+++ b/website/search.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python3
+
+import sys, time, fcntl, os, zipfile, webvtt, re
+from io import StringIO
+
+path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'
+
+clientId = sys.argv[1]
+message = sys.argv[2]
+
+searchOnlyCaptions = message.startswith('search-only-captions ')
+message = message[message.find(' ') + 1:]
+
+clientFilePath = f'users/{clientId}.txt'
+
+def write(s):
+    # Appends `s` to the client output file while holding an exclusive lock on it.
+    # `s` is preceded by a line return if the file still contains output.
+    clientFile = open(clientFilePath, 'r+')
+    try:
+        fcntl.flock(clientFile, fcntl.LOCK_EX)
+        # An empty output file means that `websocket.php` read it. Anyway we don't wait for it, we just append our output.
+        pending = clientFile.read()
+        # Reading moved the in-file cursor to the end of the file, hence following writes append content.
+        if pending != '':
+            clientFile.write("\n")
+        clientFile.write(s)
+        clientFile.flush()
+        fcntl.flock(clientFile, fcntl.LOCK_UN)
+        clientFile.close()
+    except Exception as e:
+        sys.exit(e)
+
+# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
+files = [file for file in os.listdir(path) if file.endswith('.zip')]
+# `message` is searched literally, so it is escaped to not be interpreted as a regular expression.
+# The lookahead makes overlapping occurrences match too.
+messagePattern = re.compile(f'(?={re.escape(message)})')
+for fileIndex, file in enumerate(files):
+    write(f'progress:{fileIndex + 1} / {len(files)}')
+    with zipfile.ZipFile(path + file) as archive:
+        for fileInZip in archive.namelist():
+            endsWithVtt = fileInZip.endswith('.vtt')
+            if searchOnlyCaptions and not endsWithVtt:
+                continue
+            toWrite = f'{file}/{fileInZip}'
+            with archive.open(fileInZip) as f:
+                if endsWithVtt:
+                    content = f.read().decode('utf-8')
+                    # The captions are parsed a single time, whatever the number of occurrences is.
+                    captions = list(webvtt.read_buffer(StringIO(content)))
+                    # Captions are joined to find occurrences spread over multiple captions.
+                    wholeCaption = ' '.join([caption.text for caption in captions])
+                    messagePositions = [m.start() for m in messagePattern.finditer(wholeCaption)]
+                    if messagePositions != []:
+                        timestamps = []
+                        for messagePosition in messagePositions:
+                            # Look for the caption containing the beginning of the occurrence to retrieve its timestamp.
+                            for caption in captions:
+                                text = caption.text
+                                if messagePosition <= len(text):
+                                    timestamps += [str(int(caption.start_in_seconds))]
+                                    break
+                                # `+ 1` stands for the space joining captions.
+                                messagePosition -= len(text) + 1
+                        write(f'{toWrite}|{"|".join(timestamps)}')
+                else:
+                    for line in f:
+                        # `line` is `bytes` and `str(line)` is its representation (`b'...'` with non-ASCII characters escaped), in which for instance `b` would be found on every line, so we decode it instead.
+                        if message in line.decode('utf-8', errors='replace'):
+                            write(toWrite)
+                            break
+
+# Wait for `websocket.php` to have emptied the output file, that is to have read the whole output, before removing this file.
+f = open(clientFilePath)
+while True:
+    try:
+        fcntl.flock(f, fcntl.LOCK_EX)
+        # Read from the beginning of the file, otherwise once the end of the file is reached `read` returns an empty string even if the output hasn't been read by `websocket.php` yet.
+        f.seek(0)
+        if f.read() == '':
+            os.remove(clientFilePath)
+            break
+        else:
+            fcntl.flock(f, fcntl.LOCK_UN)
+            time.sleep(1)
+    except Exception as e:
+        sys.exit(e)
+
+f.close()
--- a/website/users/.gitignore
+++ b/website/users/.gitignore
@@ -0,0 +1,5 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
+
--- a/website/websocket.php
+++ b/website/websocket.php
@@ -0,0 +1,166 @@
+<?php
+
+use Ratchet\MessageComponentInterface;
+use Ratchet\ConnectionInterface;
+use React\EventLoop\LoopInterface;
+use React\EventLoop\Timer\Timer;
+
+// Make sure composer dependencies have been installed
+require __DIR__ . '/vendor/autoload.php';
+
+// State associated with a request of an end-user: the identifier naming the worker output file, the periodic timer forwarding this output and the pid of the worker.
+// `timer` and `pid` are `null` as long as no request was received.
+class Client
+{
+    public $id;
+    public $timer;
+    public $pid;
+
+    public function __construct($id)
+    {
+        $this->id = $id;
+    }
+
+    // Cancels the timer, terminates the worker and deletes the worker output file, if they exist.
+    // `__destruct` can't take arguments.
+    public function free($loop)
+    {
+        if ($this->timer !== null) {
+            $loop->cancelTimer($this->timer);
+            $this->timer = null;
+        }
+        // A pid equal to 0 would send the signal to every process of the process group, including `websocket.php` itself.
+        $pid = (int) $this->pid;
+        if ($pid > 0) {
+            // Should in theory verify that the pid wasn't re-assigned.
+            posix_kill($pid, SIGTERM);
+        }
+        $this->pid = null;
+        $clientFilePath = getClientFilePath($this->id);
+        if (file_exists($clientFilePath)) {
+            // The worker may have removed the file in the meantime.
+            $fp = @fopen($clientFilePath, "r+");
+            if ($fp === false) {
+                return;
+            }
+            // `LOCK_EX` waits for the lock by default.
+            if (flock($fp, LOCK_EX)) {   // acquire an exclusive lock
+                unlink($clientFilePath); // delete file
+                flock($fp, LOCK_UN);     // release the lock
+            } else {
+                echo "Couldn't get the lock!";
+            }
+            fclose($fp);
+        }
+    }
+}
+
+// Need to be passed as a reference to `flock`.
+$WAIT_IF_LOCKED = 1;
+
+define('USERS_FOLDER', 'users/');
+
+// Delete users outputs of previous `websocket.php` execution.
+// Only the workers output files (`*.txt`) are matched, hence `.gitignore` is kept without relying on the order of the directory entries.
+foreach (glob(USERS_FOLDER . '*.txt') ?: [] as $file) {
+    unlink($file);
+}
+
+// Returns the path of the worker output file of the client `$clientId`.
+function getClientFilePath($clientId)
+{
+    return USERS_FOLDER . $clientId . '.txt';
+}
+
+// Associates a `Client` with each WebSocket connection and, for each request received, starts a `search.py` worker whose output file is periodically forwarded to the end-user.
+// Current implementation may add latency across users.
+class MyProcess implements MessageComponentInterface
+{
+    protected $clients;
+    private $loop;
+    private $newClientId;
+    private $newClientIdSem;
+
+    public function __construct(LoopInterface $loop)
+    {
+        $this->clients = new \SplObjectStorage();
+        $this->loop = $loop;
+        $this->newClientId = 0;
+        $this->newClientIdSem = sem_get(1, 1);
+    }
+
+    // Returns a `Client` having an identifier not used since `websocket.php` start.
+    private function newClient()
+    {
+        // If `onOpen` and `onMessage` can't be called at the same time, then this semaphore is useless.
+        if (sem_acquire($this->newClientIdSem)) {
+            // Note that we don't re-use ids except on `websockets.php` restart, but as the maximal int in PHP is a very great number we are fine for a while (https://www.php.net/manual/en/reserved.constants.php#constant.php-int-max)
+            $clientId = $this->newClientId++;
+            sem_release($this->newClientIdSem);
+            return new Client($clientId);
+        } else {
+            exit('`newClient` error');
+        }
+    }
+
+    public function onOpen(ConnectionInterface $conn)
+    {
+        $client = $this->newClient();
+        $this->clients->attach($conn, $client);
+    }
+
+    // Starts a worker treating the request `$msg` and periodically forwards its output to `$from`.
+    public function onMessage(ConnectionInterface $from, $msg)
+    {
+        // As we are going to use this argument in a shell command, we verify a limited set of characters that are safe once quoted.
+        // The `D` modifier prevents `$` from also matching just before a trailing line return.
+        if (preg_match("/^[a-zA-Z0-9-_ ]+$/D", $msg) !== 1) {
+            return;
+        }
+        $client = $this->clients->offsetGet($from);
+        // If a previous request was received, we execute the new one with another client for simplicity otherwise with current file deletion approach, we can't tell the worker `search.py` that we don't care about its execution anymore.
+        if ($client->pid !== null) {
+            // As `$this->clients->detach` doesn't call `__destruct` for unknown reason, we clean manually the previous request.
+            $client->free($this->loop);
+            $client = $this->newClient();
+            // Associate the new client with the connection, otherwise `onClose` and following requests would clean the previous client again instead of this one.
+            $this->clients->attach($from, $client);
+        }
+        $clientId = $client->id;
+        $clientFilePath = getClientFilePath($clientId);
+        // Create the worker output file otherwise it would believe that we don't need this worker anymore.
+        file_put_contents($clientFilePath, '');
+        // Start the independent worker.
+        // Redirecting `stdout` is mandatory otherwise `exec` is blocking.
+        $client->pid = exec("./search.py $clientId '$msg' > /dev/null & echo $!");
+        // `addTimer` doesn't enable us to use independently `$from->send` multiple times with blocking instructions between.
+        $client->timer = $this->loop->addPeriodicTimer(1, function () use ($from, $clientId, $clientFilePath, $client) {
+            echo "Checking news from $clientId\n";
+            // If the worker output file doesn't exist anymore, then it means that the worker have finished its work and acknowledged that `websocket.php` completely read its output.
+            if (file_exists($clientFilePath)) {
+                // `flock` requires `r`eading permission and we need `w`riting one due to `ftruncate` usage.
+                $fp = @fopen($clientFilePath, "r+");
+                if ($fp === false) {
+                    // The worker removed its output file in the meantime, which is treated on next timer execution.
+                    return;
+                }
+                $read = null;
+                // `LOCK_EX` waits for the lock by default.
+                if (flock($fp, LOCK_EX)) { // acquire an exclusive lock
+                    // We assume that the temporary output is less than 1 MB long.
+                    $read = fread($fp, 1_000_000);
+                    ftruncate($fp, 0);     // truncate file
+                    fflush($fp);           // flush output before releasing the lock
+                    flock($fp, LOCK_UN);   // release the lock
+                } else {
+                    // We `die` instead of `echo`ing to force the developer to investigate the reason.
+                    die("Couldn't get the lock!");
+                }
+                fclose($fp);
+
+                // Assume that empty output doesn't need to be forwarded to the end-user.
+                if ($read !== null && $read !== '') {
+                    $from->send($read);
+                }
+            } else {
+                // We don't need the periodic timer anymore, as the worker finished its work and acknowledged that `websocket.php` completely read its output.
+                $this->loop->cancelTimer($client->timer);
+            }
+        });
+    }
+
+    public function onClose(ConnectionInterface $conn)
+    {
+        $client = $this->clients->offsetGet($conn);
+        $clientId = $client->id;
+        // There is nothing to clean if the end-user disconnects without having sent any request.
+        if ($client->pid !== null) {
+            $client->free($this->loop);
+        }
+        echo "$clientId disconnected\n";
+        $this->clients->detach($conn);
+    }
+
+    public function onError(ConnectionInterface $conn, \Exception $e)
+    {
+        $conn->close();
+        die('`onError`');
+    }
+}
+
+$loop = \React\EventLoop\Factory::create();
+
+// Run the server application through the WebSocket protocol on port 4430.
+// Note that named arguments come with PHP 8 which isn't current Debian one.
+$app = new Ratchet\App('crawler.yt.lemnoslife.com', 4430, '127.0.0.1', $loop);
+$app->route('/websocket', new MyProcess($loop), array('*'));
+$app->run();
Author	SHA1	Message	Date
Benjamin Loison	eb8431746e	Make the first channel of `channels.txt` being treated again, solve temporary empty response from YouTube Data API v3 issue and temporarily remove sanity check failing very rarely #39	2023-02-14 23:15:07 +01:00
Benjamin Loison	a7f6e1cd85	Fix #31 : List all occurrences of search within video captions	2023-02-14 02:56:11 +01:00
Benjamin Loison	21ad878be8	Fix #31 : Make a website with a search engine notably based on the captions extracted	2023-02-14 02:00:23 +01:00
Benjamin Loison	57572c6d6c	#31 : Make search within captions not limited by line wrapping	2023-02-14 01:32:36 +01:00
Benjamin Loison	e0faf053a1	Fix #38 : Add a loading message with progress on end-user interface	2023-02-14 01:08:05 +01:00
Benjamin Loison	77bafdd592	#31 : Add a first search only captions support	2023-02-14 00:59:37 +01:00
Benjamin Loison	fa7da64879	Add `.gitignore` to ignore `{keys, channels}.txt`	2023-02-13 06:18:42 +01:00
Benjamin Loison	9e650cf72a	Make the `COMMUNITY` tab process not infinitely loop Related to https://github.com/Benjamin-Loison/YouTube-operational-API/issues/49	2023-02-13 06:17:23 +01:00
Benjamin Loison	dc63de82f5	Add link to `channels/` to `index.php`	2023-02-13 05:55:44 +01:00
Benjamin Loison	dfdfbe3272	Modify website to support new sub-folders architecture	2023-02-13 05:45:08 +01:00
Benjamin Loison	a51e3b1a9a	Fix #37 : Use a number of channels seen (possibly repeated) instead of YouTube Data API v3 Comment(Thread): resource	2023-02-12 16:31:27 +01:00
Benjamin Loison	b572d078dd	Add logging to `exec` and make it crashless, `requests` and `captions` folders support for compressing, clean captions support for videos being livestreams and videos starting with `-`	2023-02-12 16:24:16 +01:00
Benjamin Loison	8df226e2bc	Move YouTube API requests logging to `requests/` channel sub-folder	2023-02-10 20:17:49 +01:00
Benjamin Loison	3c4664a4b1	Fix #13 : Add captions extraction I was about to commit in addition: ```c++ // Due to videos with automatically generated captions but being set to `Off` by default aren't retrieved with `--sub-langs '.orig'`. // My workaround is to first call YouTube Data API v3 Captions: list endpoint with `part=snippet` and retrieve the language that has `"trackKind": "asr"` (automatic speech recognition) in `snippet`. /json data = getJson(threadId, "captions?part=snippet&videoId=" + videoId, true, channelToTreat), items = data["items"]; for(const auto& item : items) { json snippet = item["snippet"]; if(snippet["trackKind"] == "asr") { string language = snippet["language"]; cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '" + language + "-orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix; exec(threadId, cmd); // As there should be a single automatic speech recognized track, there is no need to go through all tracks. break; } }/ ``` Instead of: ```c++ cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix; exec(threadId, cmd); ``` But I realized that, as the GitHub comment I was about to add to https://github.com/yt-dlp/yt-dlp/issues/2655, I was wrong: > `yt-dlp --cookies cookies.txt --sub-langs 'en.,.orig' --write-auto-subs https://www.youtube.com/watch?v=tQqDBySHYlc` work as expected. Many thanks again. > > ``` > 'subtitleslangs': ['en.','.orig'], > 'writeautomaticsub': True, > ``` > > Work as expected too. Thank you > > Very sorry for the video sample. I even not watched it. Thank you for this workaround. However note that videos having automatically generated subtitles but being set to `Off` by default aren't retrieved with your method (example of such video: [`mozyXsZJnQ4`](https://www.youtube.com/watch?v=mozyXsZJnQ4)). 
My workaround is to first call [YouTube Data API v3](https://developers.google.com/youtube/v3) [Captions: list](https://developers.google.com/youtube/v3/docs/captions/list) endpoint with [`part=snippet`](https://developers.google.com/youtube/v3/docs/captions/list#part) and retrieve the [`language`](https://developers.google.com/youtube/v3/docs/captions#snippet.language) that has [`"trackKind": "asr"`](https://developers.google.com/youtube/v3/docs/captions#snippet.trackKind) (automatic speech recognition) in [`snippet`](https://developers.google.com/youtube/v3/docs/captions#snippet).	2023-02-10 20:03:08 +01:00
Benjamin Loison	7fcc8b09fa	Fix #36 : Make the program stops by crashing on YouTube operational API instance being detected as sending unusual traffic	2023-02-10 12:02:39 +01:00
Benjamin Loison	87d67e4e85	Correct the termination of `COMMUNITY` tab process due to missing page tokens	2023-02-10 00:37:28 +01:00
Benjamin Loison	8f9b1275be	Remove the `Content-Type: application/json` HTTP header when retrieving `urls.txt` inside a `.zip`	2023-02-09 02:07:10 +01:00
Benjamin Loison	afd9e1b0b6	Add a verification that `snippet/authorChannelId/value` isn't null when using `commentThreads` for `COMMUNITY` As it can happen cf https://www.youtube.com/channel/UCWeg2Pkate69NFdBeuRFTAw/community?lc=UgwGfjNxGuwqP8qYPPN4AaABAg&lb=UgkxYiEAo9-b1vWPasxFy13f959rrctQpZwW	2023-02-09 01:51:22 +01:00
Benjamin Loison	5a1df71bb9	Append to `channels.txt` all channels mentioned in the Wiki	2023-02-08 16:28:44 +01:00
Benjamin Loison	622188d6d9	Add in `urls.txt` if the URL is related to YouTube Data API v3 or YouTube operational API	2023-02-08 16:05:03 +01:00
Benjamin Loison	0c51bd05bc	Fix #34 : Correct JSON files by putting first line in another metadata file	2023-02-07 23:08:09 +01:00
Benjamin Loison	e0f521d572	Restore ability to download whole archives As API keys aren't written in the first line of JSON files.	2023-02-07 23:01:26 +01:00
Benjamin Loison	e5a50bcba4	Remove ability in `channels.php` to download whole archive for not leaking API keys used	2023-02-07 22:42:24 +01:00
Benjamin Loison	2179e9b6f4	Add `channels.php` adding support for (file in) zip download	2023-02-07 22:39:43 +01:00
Benjamin Loison	e9b77369fb	#31 : Add zip files search	2023-02-07 20:15:36 +01:00
Benjamin Loison	b45384bab7	Comment WebSocket mechanism to work with an arbitrary number of independent send	2023-02-07 18:14:49 +01:00
Benjamin Loison	126cc75dc6	Make WebSocket able to manage arbitrary feedback to end-user While previous implementation was able to send two independent messages, now we can send an arbitrary amount of independent messages.	2023-02-07 17:25:17 +01:00
Benjamin Loison	7302679a81	Make `websockets.php` able to proceed blocking treatments	2023-02-07 01:22:26 +01:00
Benjamin Loison	0dba8e0c7d	Make a WebSocket example work with `crawler.yt.lemnoslife.com`	2023-01-31 01:05:09 +01:00
Benjamin Loison	155d372186	Run `php-cs-fixer fix --rules=@PSR12 websocket.php`	2023-01-31 00:57:06 +01:00
Benjamin Loison	bd184bd0f0	Rename `chat.php` to `websocket.php`	2023-01-30 22:24:02 +01:00
Benjamin Loison	0193f05143	Copy-pasted the `README.md` quick example of `ratchetphp/Ratchet` `5012dc9545 (a-quick-example)`	2023-01-30 22:19:04 +01:00
Benjamin Loison	931b2df563	Add static `website/index.php`	2023-01-30 22:14:05 +01:00
Benjamin Loison	0f4b89ccd9	Correct typo: the channel tab is `LIVE`, not `LIVES`	2023-01-25 01:00:29 +01:00
Benjamin Loison	4e162e34c3	Add comment in `README.md` about the usage of `--no-keys` or generating a YouTube Data API v3 key	2023-01-22 15:41:13 +01:00
Benjamin Loison	10e8811817	Introduce `{,MAIN_}EXIT_WITH_ERROR` macros for exiting with an error	2023-01-22 15:17:14 +01:00
Benjamin Loison	0f15bb0235	#11 : Add the discovering of channels having commented on ended livestreams	2023-01-22 15:15:27 +01:00
Benjamin Loison	bdb4e6443a	#11 : Add current livestreams support to discover channels	2023-01-22 04:00:11 +01:00
Benjamin Loison	d2391e5d54	Instead of looping on `items` where we expect only one to be, we just use `items[0]`	2023-01-22 02:19:26 +01:00
Benjamin Loison	993d0b9771	Make `PRINT` not requiring to precise `threadId`	2023-01-22 02:04:03 +01:00
Benjamin Loison	0fcb5a0426	#11 : Treat `COMMUNITY` post comments to discover channels	2023-01-22 01:37:32 +01:00
Benjamin Loison	57200da482	Add in `README.md` the fact that as documented in #30 , this algorithm is only known to be working fine on Linux	2023-01-21 22:20:45 +01:00
Benjamin Loison	a0880c79bb	#11 : Update channel `CHANNELS` tab treatment following YouTube-operational-API/issues/121 closure	2023-01-21 02:24:42 +01:00
Benjamin Loison	10c5c1d605	#11 : Add the treatment of channels' tab, but only postpone unlisted videos treatment	2023-01-15 14:56:44 +01:00
Benjamin Loison	51a70f6e54	#7 : Make `commentsCount` and `requestsPerChannel` compatible with multithreading	2023-01-15 14:31:55 +01:00
Benjamin Loison	aa97c94bf8	#11 : Add a first iteration for the `CHANNELS` retrieval	2023-01-15 02:19:31 +01:00
Benjamin Loison	d1b84335d1	#11 : Add `--youtube-operational-api-instance-url` parameter and use `exit(EXIT_{SUCCESS, FAILURE})` instead of `exit({0, 1})`	2023-01-15 00:49:32 +01:00
Benjamin Loison	6ce29051c0	Fix #26 : Keep efficient search algorithm while keeping order (notably of the starting set)	2023-01-14 15:14:24 +01:00