44 Commits

Author SHA1 Message Date
4a11ac4196 Fix #51: In recent days the algorithm seemed not to fully treat the starting set of channels before treating discovered channels
I verified that this commit solves the issue by treating only the `CHANNELS` tab of the channels in `channels.txt`.
2023-02-22 04:09:35 +01:00
c30847c1f5 #48: Stop relying on echo, tee and /dev/null to redirect the compression command output to debug/ 2023-02-22 03:47:06 +01:00
221956438d #48: Redirect compression command echo to /dev/null 2023-02-22 03:37:07 +01:00
ba78223c0c Fix #48: Redirect compression execution logs so that they do not overlap PRINTs 2023-02-22 03:27:49 +01:00
e86d629597 #48: Modify removeChannelsBeingTreated.py to temporarily solve the issue 2023-02-19 02:04:28 +01:00
78b2bf18fa #35: Make the non-automatically generated captions download correctly 2023-02-17 16:57:11 +01:00
5bfceccb8e Change the EXIT_WITH_ERROR to PRINT for channels not having an enumerable uploads playlist 2023-02-16 12:21:28 +01:00
eb8431746e Make the first channel of channels.txt be treated again, solve the temporary empty response from YouTube Data API v3 issue and temporarily remove a sanity check failing very rarely #39 2023-02-14 23:15:07 +01:00
a7f6e1cd85 Fix #31: List all occurrences of search within video captions 2023-02-14 02:56:11 +01:00
21ad878be8 Fix #31: Make a website with a search engine notably based on the captions extracted 2023-02-14 02:00:23 +01:00
57572c6d6c #31: Make search within captions not limited by line wrapping 2023-02-14 01:32:36 +01:00
e0faf053a1 Fix #38: Add a loading message with progress on end-user interface 2023-02-14 01:08:05 +01:00
77bafdd592 #31: Add a first search only captions support 2023-02-14 00:59:37 +01:00
fa7da64879 Add .gitignore to ignore {keys, channels}.txt 2023-02-13 06:18:42 +01:00
9e650cf72a Make the COMMUNITY tab process not infinitely loop
Related to https://github.com/Benjamin-Loison/YouTube-operational-API/issues/49
2023-02-13 06:17:23 +01:00
dc63de82f5 Add link to channels/ to index.php 2023-02-13 05:55:44 +01:00
dfdfbe3272 Modify website to support new sub-folders architecture 2023-02-13 05:45:08 +01:00
a51e3b1a9a Fix #37: Use a number of channels seen (possibly repeated) instead of YouTube Data API v3 Comment(Thread): resource 2023-02-12 16:31:27 +01:00
b572d078dd Add logging to exec and make it crashless, add requests and captions folder support for compression, and clean up captions support for livestream videos and videos whose ids start with - 2023-02-12 16:24:16 +01:00
8df226e2bc Move YouTube API requests logging to requests/ channel sub-folder 2023-02-10 20:17:49 +01:00
3c4664a4b1 Fix #13: Add captions extraction
I was about to commit in addition:

```c++
// Videos that have automatically generated captions but are set to `Off` by default aren't retrieved with `--sub-langs '.*orig'`.
// My workaround is to first call YouTube Data API v3 Captions: list endpoint with `part=snippet` and retrieve the language that has `"trackKind": "asr"` (automatic speech recognition) in `snippet`.
/*json data = getJson(threadId, "captions?part=snippet&videoId=" + videoId, true, channelToTreat),
     items = data["items"];
for(const auto& item : items)
{
    json snippet = item["snippet"];
    if(snippet["trackKind"] == "asr")
    {
        string language = snippet["language"];
        cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '" + language + "-orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
        exec(threadId, cmd);
        // As there should be a single automatic speech recognized track, there is no need to go through all tracks.
        break;
    }
}*/
```

Instead of:

```c++
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
exec(threadId, cmd);
```

But I realized that, regarding the GitHub comment I was about to add to https://github.com/yt-dlp/yt-dlp/issues/2655, I was wrong:

> `yt-dlp --cookies cookies.txt --sub-langs 'en.*,.*orig' --write-auto-subs https://www.youtube.com/watch?v=tQqDBySHYlc` work as expected. Many thanks again.
>
> ```
> 'subtitleslangs': ['en.*','.*orig'],
> 'writeautomaticsub': True,
> ```
>
> Work as expected too. Thank you
>
> Very sorry for the video sample. I even not watched it.

Thank you for this workaround. However, note that videos that have automatically generated subtitles but are set to `Off` by default aren't retrieved with your method (example of such a video: [`mozyXsZJnQ4`](https://www.youtube.com/watch?v=mozyXsZJnQ4)). My workaround is to first call the [YouTube Data API v3](https://developers.google.com/youtube/v3) [Captions: list](https://developers.google.com/youtube/v3/docs/captions/list) endpoint with [`part=snippet`](https://developers.google.com/youtube/v3/docs/captions/list#part) and retrieve the [`language`](https://developers.google.com/youtube/v3/docs/captions#snippet.language) that has [`"trackKind": "asr"`](https://developers.google.com/youtube/v3/docs/captions#snippet.trackKind) (automatic speech recognition) in [`snippet`](https://developers.google.com/youtube/v3/docs/captions#snippet).
2023-02-10 20:03:08 +01:00
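The commented-out C++ above boils down to scanning the Captions: list response for the track whose `trackKind` is `asr`. A minimal Python sketch of that selection (the response shape follows the YouTube Data API v3 documentation; `find_asr_language` is an illustrative name, not part of the codebase):

```python
def find_asr_language(captions_list_response):
    """Return the language of the automatic speech recognition track, if any.

    `captions_list_response` is the parsed JSON of a YouTube Data API v3
    Captions: list call made with `part=snippet`.
    """
    for item in captions_list_response.get("items", []):
        snippet = item["snippet"]
        # `"trackKind": "asr"` marks the automatically generated track;
        # there should be at most one, so we can stop at the first hit.
        if snippet.get("trackKind") == "asr":
            return snippet["language"]
    return None

# Hypothetical response shaped like the Captions: list documentation.
response = {
    "items": [
        {"snippet": {"trackKind": "standard", "language": "fr"}},
        {"snippet": {"trackKind": "asr", "language": "en"}},
    ]
}
```

The returned language would then be passed to yt-dlp as `--sub-langs '<language>-orig'`, as in the commented-out block.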
7fcc8b09fa Fix #36: Make the program stop by crashing when the YouTube operational API instance is detected as sending unusual traffic 2023-02-10 12:02:39 +01:00
87d67e4e85 Correct the termination of COMMUNITY tab process due to missing page tokens 2023-02-10 00:37:28 +01:00
8f9b1275be Remove the Content-Type: application/json HTTP header when retrieving urls.txt inside a .zip 2023-02-09 02:07:10 +01:00
afd9e1b0b6 Add a verification that snippet/authorChannelId/value isn't null when using commentThreads for COMMUNITY
As it can happen, cf. https://www.youtube.com/channel/UCWeg2Pkate69NFdBeuRFTAw/community?lc=UgwGfjNxGuwqP8qYPPN4AaABAg&lb=UgkxYiEAo9-b1vWPasxFy13f959rrctQpZwW
2023-02-09 01:51:22 +01:00
5a1df71bb9 Append to channels.txt all channels mentioned in the Wiki 2023-02-08 16:28:44 +01:00
622188d6d9 Add in urls.txt if the URL is related to YouTube Data API v3 or YouTube operational API 2023-02-08 16:05:03 +01:00
0c51bd05bc Fix #34: Correct JSON files by putting first line in another metadata file 2023-02-07 23:08:09 +01:00
e0f521d572 Restore ability to download whole archives
As API keys are no longer written in the first line of JSON files.
2023-02-07 23:01:26 +01:00
e5a50bcba4 Remove ability in channels.php to download whole archive, to avoid leaking the API keys used 2023-02-07 22:42:24 +01:00
2179e9b6f4 Add channels.php adding support for (file in) zip download 2023-02-07 22:39:43 +01:00
e9b77369fb #31: Add zip files search 2023-02-07 20:15:36 +01:00
b45384bab7 Comment the WebSocket mechanism to make it work with an arbitrary number of independent sends 2023-02-07 18:14:49 +01:00
126cc75dc6 Make WebSocket able to manage arbitrary feedback to end-user
While the previous implementation could send two independent messages, we can now send an arbitrary number of independent messages.
2023-02-07 17:25:17 +01:00
7302679a81 Make websocket.php able to process blocking treatments 2023-02-07 01:22:26 +01:00
0dba8e0c7d Make a WebSocket example work with crawler.yt.lemnoslife.com 2023-01-31 01:05:09 +01:00
155d372186 Run php-cs-fixer fix --rules=@PSR12 websocket.php 2023-01-31 00:57:06 +01:00
bd184bd0f0 Rename chat.php to websocket.php 2023-01-30 22:24:02 +01:00
0193f05143 Copy-paste the quick example from the ratchetphp/Ratchet README.md
5012dc9545 (a-quick-example)
2023-01-30 22:19:04 +01:00
931b2df563 Add static website/index.php 2023-01-30 22:14:05 +01:00
0f4b89ccd9 Correct typo: the channel tab is LIVE, not LIVES 2023-01-25 01:00:29 +01:00
4e162e34c3 Add comment in README.md about the usage of --no-keys or generating a YouTube Data API v3 key 2023-01-22 15:41:13 +01:00
10e8811817 Introduce {,MAIN_}EXIT_WITH_ERROR macros for exiting with an error 2023-01-22 15:17:14 +01:00
0f15bb0235 #11: Add the discovering of channels having commented on ended livestreams 2023-01-22 15:15:27 +01:00
12 changed files with 2003 additions and 59 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
keys.txt
channels.txt


@@ -15,12 +15,21 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
```sh
sudo apt install nlohmann-json3-dev
sudo apt install nlohmann-json3-dev yt-dlp
make
./youtubeCaptionsSearchEngine -h
```
If you plan to use the front-end website, also run:
```sh
pip install webvtt-py
```
Unless you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
Unless you provide the argument `--no-keys`, you have to provide at least one [YouTube Data API v3 key](https://developers.google.com/youtube/v3/getting-started) in `keys.txt`.
```sh
./youtubeCaptionsSearchEngine
```


@@ -97,4 +97,14 @@ UCWrtcU1OId_PQ_YoBN6lIRA
UCfih6kPJCpzWmtCFtlpYK6A
UCdTyuXgmJkG_O8_75eqej-w
UCxXFx2jz8N02sNqv1VeDEGA
UCj8BKFCTH-mqRlYwcmX2xwg
UCj8BKFCTH-mqRlYwcmX2xwg
UCsT0YIqwnpJCM-mx7-gSA4Q
UCAuUUnT6oDeKwE6v1NGQxug
UCy0uwqmXSHVOgqo3nrN4RCQ
UCawLcDd9clh27b1z55Gcawg
UC6bfT6U4WED5EyzymREvKlQ
UCINdSH_R15xft_ctNm50eGQ
UCVx2ZvskbDkHpLlYEQ9FULw
UCBcmi8nLrqfFluiexxjl7bg
UCBnZ16ahKA2DZ_T5W0FPUXg
UCf8w5m0YsRa8MHQ5bwSGmbw

223
main.cpp

@@ -16,16 +16,16 @@ enum getJsonBehavior { normal, retryOnCommentsDisabled, returnErrorIfPlaylistNot
set<string> setFromVector(vector<string> vec);
vector<string> getFileContent(string filePath);
json getJson(unsigned short threadId, string url, bool usingYouTubeDataApiV3, string directoryPath, getJsonBehavior behavior = normal);
json getJson(unsigned short threadId, string url, bool usingYouTubeDataApiV3, string channelId, getJsonBehavior behavior = normal);
void createDirectory(string path),
print(ostringstream* toPrint),
treatComment(unsigned short threadId, json comment, string channelId),
treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat),
treatChannels(unsigned short threadId),
deleteDirectory(string path),
addChannelToTreat(unsigned short threadId, string channelId);
addChannelToTreat(unsigned short threadId, string channelId),
exec(unsigned short threadId, string cmd, bool debug = true);
string getHttps(string url),
exec(string cmd),
join(vector<string> parts, string delimiter);
size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
bool doesFileExist(string filePath),
@@ -36,6 +36,9 @@ bool doesFileExist(string filePath),
#define DEFAULT_THREAD_ID 0
#define MAIN_PRINT(x) THREAD_PRINT(DEFAULT_THREAD_ID, x)
#define EXIT_WITH_ERROR(x) { PRINT(x); exit(EXIT_FAILURE); }
#define MAIN_EXIT_WITH_ERROR(x) { MAIN_PRINT(x); exit(EXIT_FAILURE); }
mutex printMutex,
channelsAlreadyTreatedAndToTreatMutex,
quotaMutex;
@@ -44,8 +47,8 @@ set<string> channelsAlreadyTreated;
map<unsigned int, string> channelsToTreat;
map<string, unsigned int> channelsToTreatRev;
vector<string> keys;
unsigned int commentsPerSecondCount = 0;
map<unsigned short, unsigned int> commentsCountThreads,
unsigned int channelsPerSecondCount = 0;
map<unsigned short, unsigned int> channelsCountThreads,
requestsPerChannelThreads;
unsigned short THREADS_NUMBER = 1;
// Use `string` variables instead of macros to have `string` properties, even if we could use a meta-macro inlining them as `string`s.
@@ -54,7 +57,11 @@ string CHANNELS_DIRECTORY = "channels/",
KEYS_FILE_PATH = "keys.txt",
UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line.
YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API"; // Can be "https://yt.lemnoslife.com" for instance.
YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
CAPTIONS_DIRECTORY = "captions/",
DEBUG_DIRECTORY = "debug/",
YOUTUBE_API_REQUESTS_DIRECTORY = "requests/",
CURRENT_WORKING_DIRECTORY;
bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
int main(int argc, char *argv[])
@@ -84,14 +91,12 @@ int main(int argc, char *argv[])
}
else
{
MAIN_PRINT("YouTube operational API instance URL missing!")
exit(EXIT_FAILURE);
MAIN_EXIT_WITH_ERROR("YouTube operational API instance URL missing!")
}
}
else
{
MAIN_PRINT("Unrecognized parameter " << argvStr)
exit(EXIT_FAILURE);
MAIN_EXIT_WITH_ERROR("Unrecognized parameter " << argvStr)
}
}
@@ -113,13 +118,23 @@ int main(int argc, char *argv[])
for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
{
string fileName = entry.path().filename(),
channelId = fileName.substr(0, fileName.length() - 4);
string fileName = entry.path().filename();
// Skip files such as `UNLISTED_VIDEOS_FILE_PATH`.
if (fileName.substr(0, 2) == "UC") {
string channelId = fileName.substr(0, fileName.length() - 4);
channelsToTreat.erase(channelsToTreatRev[channelId]);
channelsToTreatRev.erase(channelId);
channelsToTreat.erase(channelsToTreatRev[channelId]);
channelsToTreatRev.erase(channelId);
channelsAlreadyTreated.insert(channelId);
channelsAlreadyTreated.insert(channelId);
}
}
char cwd[PATH_MAX];
if (getcwd(cwd, sizeof(cwd)) != NULL) {
CURRENT_WORKING_DIRECTORY = string(cwd) + "/";
} else {
MAIN_EXIT_WITH_ERROR("`getcwd()` error");
}
MAIN_PRINT(channelsToTreat.size() << " channel(s) to treat")
@@ -133,8 +148,8 @@ int main(int argc, char *argv[])
while(true)
{
MAIN_PRINT("Comments per second: " << commentsPerSecondCount)
commentsPerSecondCount = 0;
MAIN_PRINT("Channels per second: " << channelsPerSecondCount)
channelsPerSecondCount = 0;
sleep(1);
}
@@ -164,7 +179,7 @@ void treatChannels(unsigned short threadId)
PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")
commentsCountThreads[threadId] = 0;
channelsCountThreads[threadId] = 0;
requestsPerChannelThreads[threadId] = 0;
channelsToTreat.erase(channelsToTreatRev[channelToTreat]);
@@ -176,19 +191,24 @@ void treatChannels(unsigned short threadId)
string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
createDirectory(channelToTreatDirectory);
createDirectory(DEBUG_DIRECTORY);
createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
createDirectory(channelToTreatDirectory + YOUTUBE_API_REQUESTS_DIRECTORY);
treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
// Note that compressing the most-subscribed French channel took 4 minutes and 42 seconds.
PRINT("Starting compression...")
// As I haven't found any well-known library that easily compresses a directory, I have chosen to rely on the `zip` CLI.
exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
// We specify no `debug`ging, as otherwise the zipping operation doesn't work as expected.
// As the zipping process isn't recursive, we can't just rely on `ls`; we have to use `find`.
exec(threadId, "cd " + channelToTreatDirectory + " && find | zip ../" + channelToTreat + ".zip -@");
PRINT("Compression finished, started deleting initial directory...")
deleteDirectory(channelToTreatDirectory);
PRINT("Deleting directory finished.")
PRINT(commentsCountThreads[threadId] << " comments were found for this channel.")
PRINT(channelsCountThreads[threadId] << " channels were found for this channel.")
}
channelsAlreadyTreatedAndToTreatMutex.unlock();
@@ -273,8 +293,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
if(data.contains("error"))
{
PRINT("Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
exit(EXIT_FAILURE);
EXIT_WITH_ERROR("Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
}
json items = data["items"];
for(const auto& item : items)
@@ -302,8 +321,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
}
else //if(videoCount >= 20000)
{
PRINT("The videos count of the channel exceeds the supported 20,000 limit!")
exit(EXIT_FAILURE);
EXIT_WITH_ERROR("The videos count of the channel exceeds the supported 20,000 limit!")
}
}
}
@@ -358,9 +376,13 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
items = data["items"];
for(const auto& item : items)
{
json snippet = item["snippet"]["topLevelComment"]["snippet"];
string channelId = snippet["authorChannelId"]["value"];
addChannelToTreat(threadId, channelId);
json snippet = item["snippet"]["topLevelComment"]["snippet"],
authorChannelId = snippet["authorChannelId"];
if(!authorChannelId["value"].is_null())
{
string channelId = authorChannelId["value"];
addChannelToTreat(threadId, channelId);
}
string pageToken = snippet["nextPageToken"];
while(pageToken != "")
{
@@ -391,11 +413,11 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
}
}
}
if(data.contains("nextPageToken"))
if(data.contains("nextPageToken") && data["nextPageToken"] != "")
{
pageToken = data["nextPageToken"];
}
if(pageToken == "")
else
{
break;
}
@@ -466,7 +488,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
break;
}
}
// `LIVES`
// `LIVE`
pageToken = "";
string playlistId = "UU" + id.substr(2);
vector<string> videoIds;
@@ -488,11 +510,11 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
{
if(item.contains("liveStreamingDetails"))
{
PRINT(item["id"])
string videoId = item["id"];
//PRINT(videoId)
json liveStreamingDetails = item["liveStreamingDetails"];
if(liveStreamingDetails.contains("activeLiveChatId"))
{
PRINT("streaming")
string activeLiveChatId = liveStreamingDetails["activeLiveChatId"];
json data = getJson(threadId, "liveChat/messages?part=snippet,authorDetails&liveChatId=" + activeLiveChatId, true, id),
items = data["items"];
@@ -500,12 +522,47 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
{
string channelId = item["snippet"]["authorChannelId"];
addChannelToTreat(threadId, channelId);
PRINT("Found: " << channelId)
}
}
else
{
PRINT("no more streaming")
// As the usual pagination mechanism isn't available for these ended livestreams, we proceed in an uncertain way as follows.
set<string> messageIds;
unsigned long long lastMessageTimestampRelativeMsec = 0;
while(true)
{
string time = to_string(lastMessageTimestampRelativeMsec);
json data = getJson(threadId, "liveChats?part=snippet&id=" + videoId + "&time=" + time, false, id),
snippet = data["items"][0]["snippet"];
if(snippet.empty())
{
break;
}
json firstMessage = snippet[0];
string firstMessageId = firstMessage["id"];
// We check that we don't skip any message by verifying that the first message of this page was already treated, if we have already treated some messages.
if(!messageIds.empty() && messageIds.find(firstMessageId) == messageIds.end())
{
PRINT("The verification that we don't skip any message failed! Continuing anyway...")
}
for(const auto& message : snippet)
{
string messageId = message["id"];
if(messageIds.find(messageId) == messageIds.end())
{
messageIds.insert(messageId);
string channelId = message["authorChannelId"];
addChannelToTreat(threadId, channelId);
}
}
json lastMessage = snippet.back();
// If there isn't any new message, then we stop retrieving.
if(lastMessageTimestampRelativeMsec == lastMessage["videoOffsetTimeMsec"])
{
break;
}
lastMessageTimestampRelativeMsec = lastMessage["videoOffsetTimeMsec"];
}
}
}
}
@@ -520,16 +577,62 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
break;
}
}
// Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
string playlistToTreat = "UU" + channelToTreat.substr(2);
pageToken = "";
while(true)
{
json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
if(data.contains("error"))
{
// `UCFoBM1VginhMH7lR56GtVbQ` doesn't have videos and is in this case for instance.
PRINT("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
break;
}
json items = data["items"];
for(const auto& item : items)
{
string videoId = item["contentDetails"]["videoId"];
// We could proceed as follows by verifying `!isChannel`, but as we don't know how to manage unlisted videos, we don't proceed this way.
//treatChannelOrVideo(threadId, false, videoId, channelToTreat);
string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
createDirectory(channelCaptionsToTreatDirectory);
// First download all non-automatically generated captions.
// The underscore in the `-o` argument is used to avoid ending up with hidden files.
// We have to specify the video id after `--`; otherwise, if the video id starts with `-`, it is treated as an argument.
string cmdCommonPrefix = "yt-dlp --skip-download ",
cmdCommonPostfix = " -o '" + channelCaptionsToTreatDirectory + "_' -- " + videoId;
string cmd = cmdCommonPrefix + "--write-sub --sub-lang all,-live_chat" + cmdCommonPostfix;
exec(threadId, cmd);
// Secondly download the automatically generated captions.
cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
exec(threadId, cmd);
}
if(data.contains("nextPageToken"))
{
pageToken = data["nextPageToken"];
}
else
{
break;
}
}
}
}
// This function verifies that the given channel hasn't already been treated.
void addChannelToTreat(unsigned short threadId, string channelId)
{
channelsPerSecondCount++;
channelsCountThreads[threadId]++;
channelsAlreadyTreatedAndToTreatMutex.lock();
if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end())
{
unsigned int channelsToTreatIndex = channelsToTreat.end()->first + 1;
// It is unclear to me why `channelsToTreat.end()->first + 1` doesn't work here.
unsigned int channelsToTreatIndex = channelsToTreat.rbegin()->first + 1;
channelsToTreat[channelsToTreatIndex] = channelId;
channelsToTreatRev[channelId] = channelsToTreatIndex;
@@ -552,8 +655,6 @@ void treatComment(unsigned short threadId, json comment, string channelId)
string channelId = snippet["authorChannelId"]["value"];
addChannelToTreat(threadId, channelId);
}
commentsCountThreads[threadId]++;
commentsPerSecondCount++;
}
string join(vector<string> parts, string delimiter)
@@ -571,20 +672,24 @@ string join(vector<string> parts, string delimiter)
return result;
}
string exec(string cmd)
void exec(unsigned short threadId, string cmd, bool debug)
{
array<char, 128> buffer;
string result;
unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
if (!pipe)
if(debug)
{
throw runtime_error("popen() failed!");
ostringstream toString;
toString << threadId;
string initialCmd = cmd,
threadIdStr = toString.str(),
debugCommonFilePath = CURRENT_WORKING_DIRECTORY + DEBUG_DIRECTORY + threadIdStr,
debugOutFilePath = debugCommonFilePath + ".out",
debugErrFilePath = debugCommonFilePath + ".err";
cmd += " >> " + debugOutFilePath;
cmd += " 2>> " + debugErrFilePath;
writeFile(threadId, debugOutFilePath, "a", initialCmd + "\n");
writeFile(threadId, debugErrFilePath, "a", initialCmd + "\n");
}
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
{
result += buffer.data();
}
return result;
system(cmd.c_str());
}
bool writeFile(unsigned short threadId, string filePath, string option, string toWrite)
@@ -647,7 +752,7 @@ vector<string> getFileContent(string filePath)
return lines;
}
json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string directoryPath, getJsonBehavior behavior)
json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string channelId, getJsonBehavior behavior)
{
string finalUrl = usingYoutubeDataApiv3 ?
(USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ?
@@ -662,12 +767,17 @@ json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, st
}
catch (json::parse_error& ex)
{
PRINT("Parse error for " << finalUrl << ", as got: " << content << " !")
exit(EXIT_FAILURE);
// From experience, this sometimes happens due to empty `content`, but retrying just after solves the problem.
PRINT("Parse error for " << finalUrl << ", as got: " << content << " ! Retrying...")
return getJson(threadId, url, usingYoutubeDataApiv3, channelId);
}
if(data.contains("error"))
{
if(!usingYoutubeDataApiv3)
{
EXIT_WITH_ERROR("Found error in JSON retrieved from YouTube operational API at URL: " << finalUrl << " for content: " << content << " !")
}
string reason = data["error"]["errors"][0]["reason"];
// Contrary to the YouTube operational API no-key service, we don't rotate keys in `KEYS_FILE_PATH`, as we keep them in memory here.
if(reason == "quotaExceeded")
@@ -678,19 +788,20 @@ json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, st
PRINT("No more quota on " << apiKey << " switching to " << keys[0] << ".")
apiKey = keys[0];
quotaMutex.unlock();
return getJson(threadId, url, true, directoryPath);
return getJson(threadId, url, true, channelId);
}
PRINT("Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled)
{
return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, directoryPath);
return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, channelId);
}
}
ostringstream toString;
toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannelThreads[threadId] << ".json";
requestsPerChannelThreads[threadId]++;
writeFile(threadId, toString.str(), "w", url + "\n" + content);
toString << CHANNELS_DIRECTORY << channelId << "/" << YOUTUBE_API_REQUESTS_DIRECTORY;
writeFile(threadId, toString.str() + "urls.txt", "a", url + " " + (usingYoutubeDataApiv3 ? "true" : "false") + "\n");
toString << requestsPerChannelThreads[threadId]++ << ".json";
writeFile(threadId, toString.str(), "w", content);
return data;
}
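The ended-livestream chat retrieval added above paginates by the last message's `videoOffsetTimeMsec`, deduplicates by message id, and warns when the overlap check fails. A Python sketch of that loop with an injected fetch function (all names are illustrative; the real code calls the YouTube operational API `liveChats` endpoint):

```python
def collect_chat_channels(fetch_page):
    """Collect author channel ids from an ended livestream chat.

    `fetch_page(time_msec)` stands in for the `liveChats` endpoint: it
    returns the messages starting at the given video offset (possibly
    overlapping previously seen ones), each a dict with `id`,
    `authorChannelId` and `videoOffsetTimeMsec`.
    """
    message_ids, channel_ids = set(), []
    last_timestamp_msec = 0
    while True:
        messages = fetch_page(last_timestamp_msec)
        if not messages:
            break
        # If we already treated messages, the first one of this page should
        # have been seen; otherwise we may have skipped some messages.
        if message_ids and messages[0]["id"] not in message_ids:
            print("The verification that we don't skip any message failed!")
        for message in messages:
            if message["id"] not in message_ids:
                message_ids.add(message["id"])
                channel_ids.append(message["authorChannelId"])
        # No new message since the last page: stop retrieving.
        if last_timestamp_msec == messages[-1]["videoOffsetTimeMsec"]:
            break
        last_timestamp_msec = messages[-1]["videoOffsetTimeMsec"]
    return channel_ids

# Two overlapping fake pages, then no progress.
pages = {
    0: [{"id": "a", "authorChannelId": "UC_A", "videoOffsetTimeMsec": 1000},
        {"id": "b", "authorChannelId": "UC_B", "videoOffsetTimeMsec": 2000}],
    2000: [{"id": "b", "authorChannelId": "UC_B", "videoOffsetTimeMsec": 2000},
           {"id": "c", "authorChannelId": "UC_C", "videoOffsetTimeMsec": 2000}],
}
```

The uncertainty the comment mentions is visible here: the overlap check can only warn, not recover skipped messages.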


@@ -14,7 +14,8 @@ with open('nohup.out') as f:
#print(line)
threadId = line.split(': ')[1]
channelId = line.split(infix)[1].split(' (')[0]
threads[threadId] = channelId
if threadId.isdigit() and channelId.startswith('UC') and len(channelId) == 24:
threads[threadId] = channelId
for threadId in threads:
channelId = threads[threadId]
print(threadId, channelId)
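The check added in the diff above keeps a parsed line only if the thread id is numeric and the channel id looks like a 24-character `UC…` identifier. As a standalone predicate (illustrative name):

```python
def is_valid_channel_line(thread_id, channel_id):
    """Mirror the sanity check added to removeChannelsBeingTreated.py:
    a numeric thread id and a 24-character channel id starting with `UC`."""
    return (thread_id.isdigit()
            and channel_id.startswith("UC")
            and len(channel_id) == 24)
```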

42
website/channels.php Normal file

@@ -0,0 +1,42 @@
<?php
if (!function_exists('str_contains')) {
function str_contains($haystack, $needle)
{
return strpos($haystack, $needle) !== false;
}
}
if (!function_exists('str_ends_with')) {
function str_ends_with($haystack, $needle)
{
$length = strlen($needle);
return $length > 0 ? substr($haystack, -$length) === $needle : true;
}
}
function str_replace_first($needle, $replace, $haystack) {
$pos = strpos($haystack, $needle);
if ($pos !== false) {
$haystack = substr_replace($haystack, $replace, $pos, strlen($needle));
}
return $haystack;
}
$uri = $_SERVER['REQUEST_URI'];
$uri = str_replace('/channels/', '', $uri);
$prefix = '/mnt/HDD0/YouTube_captions_search_engine/channels/';
if (str_contains($uri, '/')) {
$uri = str_replace_first('/', '#', $uri);
$uri = $prefix . $uri;
if (str_ends_with($uri, '.json')) {
header('Content-Type: application/json; charset=UTF-8');
}
echo file_get_contents("zip://$uri");
} else {
$uri = $prefix . $uri;
header("Content-Type: application/zip");
echo readfile($uri);
}
?>
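`channels.php` above serves either a whole `CHANNEL.zip` archive or, through PHP's `zip://` stream wrapper, a single member of it when the request path contains a `/`. A rough Python equivalent of that dispatch (paths and names are illustrative; the octet-stream fallback is an assumption, since channels.php sets no header for non-JSON members):

```python
import tempfile
import zipfile

def serve(path, archive_root):
    """Return (content_type, body) for a request path, as channels.php does.

    `path` is either `CHANNEL.zip` (download the whole archive) or
    `CHANNEL.zip/inner/file` (a single member read from inside the zip).
    """
    if "/" in path:
        archive, member = path.split("/", 1)
        with zipfile.ZipFile(archive_root + archive) as z:
            body = z.read(member)
        content_type = ("application/json; charset=UTF-8"
                        if member.endswith(".json")
                        else "application/octet-stream")
        return content_type, body
    with open(archive_root + path, "rb") as f:
        return "application/zip", f.read()

# Build a tiny archive to demonstrate.
root = tempfile.mkdtemp() + "/"
with zipfile.ZipFile(root + "UC_example.zip", "w") as z:
    z.writestr("requests/0.json", '{"ok": true}')
```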

5
website/composer.json Normal file

@@ -0,0 +1,5 @@
{
"require": {
"cboden/ratchet": "^0.4.4"
}
}

1411
website/composer.lock generated Normal file

File diff suppressed because it is too large

103
website/index.php Normal file

@@ -0,0 +1,103 @@
<?php
function echoUrl($url)
{
echo "<a href=\"$url\">$url</a>";
}
?>
See <?php echoUrl('https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine'); ?> for more information.<br/>
Access raw data with: <?php echoUrl('channels/'); ?>.
<form id="form">
<input type="text" autofocus id="search" pattern="[A-Za-z0-9-_ ]+" placeholder="Your [A-Za-z0-9-_ ]+ search"></input>
<input type="submit" id="search" value="Search">
<input type="submit" id="search-only-captions" value="Search only captions">
</form>
Progress: <span id="progress"></span> channels
<ul id="channels">
</ul>
<script>
var firstRun = true;
var conn;
// Could parse the DOM instead of using the following variable.
var channels = [];
function createA(text, href) {
var a = document.createElement('a');
var text = document.createTextNode(text);
a.appendChild(text);
a.href = href;
return a;
}
function treatLine(line) {
console.log(line);
if (line.startsWith('progress:')) {
document.getElementById('progress').innerHTML = line.replace('progress:', '');
} else {
var channelsDom = document.getElementById('channels');
var timestamp = [];
const lineParts = line.split('|');
if (lineParts.length > 0) {
timestamps = lineParts.slice(1).map(linePart => parseInt(linePart));
line = lineParts[0];
}
const channelFileParts = line.split('/');
const channel = channelFileParts[0];
const channelFile = channelFileParts.slice(1).join('/');
const channelHref = `channels/${channel}`;
if (!channels.includes(channel)) {
channels.push(channel);
channelDom = document.createElement('li');
var a = createA(channel, channelHref);
channelDom.appendChild(a);
var channelFilesDom = document.createElement('ul');
channelDom.appendChild(channelFilesDom);
channelsDom.appendChild(channelDom);
}
var channelDom = channelsDom.lastChild;
var channelFilesDom = channelDom.lastChild;
var channelFileDom = document.createElement('li');
var a = createA(channelFile, `${channelHref}/${channelFile}`);
channelFileDom.appendChild(a);
const id = channelFileParts[2];
for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) {
const space = document.createTextNode('\u00A0');
channelFileDom.appendChild(space);
const timestamp = timestamps[timestampsIndex];
var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
channelFileDom.appendChild(a);
}
channelFilesDom.appendChild(channelFileDom);
}
}
function search(event) {
// We don't want to refresh the webpage, which is the default behavior.
event.preventDefault();
const query = event.submitter.id + ' ' + document.getElementById('search').value;
if (firstRun) {
firstRun = false;
conn = new WebSocket('wss://crawler.yt.lemnoslife.com/websocket');
conn.onmessage = function(e) {
e.data.split('\n').forEach(treatLine);
};
// We can't directly proceed with `conn.send`, as the connection may not be established yet.
conn.onopen = function(e) { conn.send(query); };
} else {
// We assume at this point that the connection is established.
channels = [];
document.getElementById('channels').innerHTML = '';
conn.send(query);
}
}
var form = document.getElementById('form');
form.addEventListener('submit', search);
</script>
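The `treatLine` function above parses each WebSocket line as a `CHANNEL.zip/inner/path` followed by optional `|`-separated caption timestamps. The same parsing as a small Python function (illustrative name, not part of the repository):

```python
def parse_result_line(line):
    """Split a search result line the way `treatLine` in index.php does.

    The format is `CHANNEL.zip/inner/path`, optionally followed by
    `|`-separated caption timestamps in seconds.
    """
    parts = line.split("|")
    path, timestamps = parts[0], [int(part) for part in parts[1:]]
    channel, _, channel_file = path.partition("/")
    return channel, channel_file, timestamps
```

Each timestamp then becomes a `https://www.youtube.com/watch?v=<id>&t=<timestamp>` link in the result list.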

79
website/search.py Executable file

@@ -0,0 +1,79 @@
#!/usr/bin/python3

import sys, time, fcntl, os, zipfile, webvtt, re
from io import StringIO

path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

clientId = sys.argv[1]
message = sys.argv[2]

searchOnlyCaptions = message.startswith('search-only-captions ')
message = message[message.find(' ') + 1:]

clientFilePath = f'users/{clientId}.txt'

def write(s):
    f = open(clientFilePath, 'r+')
    try:
        fcntl.flock(f, fcntl.LOCK_EX)
        # If the output file is empty, then `websocket.php` has read it. Either way we don't wait for it and append what we want to output.
        read = f.read()
        # We are appending content, as the `read` above moved the in-file cursor to the end.
        if read != '':
            f.write("\n")
        f.write(s)
        f.flush()
        fcntl.flock(f, fcntl.LOCK_UN)
        f.close()
    except Exception as e:
        sys.exit(e)

# As `zipgrep` doesn't support an argument to stop at the first match in each file, we proceed manually to keep a good theoretical complexity.
files = [file for file in os.listdir(path) if file.endswith('.zip')]
for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex + 1} / {len(files)}')
    zip = zipfile.ZipFile(path + file)
    for fileInZip in zip.namelist():
        endsWithVtt = fileInZip.endswith('.vtt')
        if searchOnlyCaptions and not endsWithVtt:
            continue
        with zip.open(fileInZip) as f:
            toWrite = f'{file}/{fileInZip}'
            if endsWithVtt:
                content = f.read().decode('utf-8')
                stringIOf = StringIO(content)
                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
                messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
                if messagePositions != []:
                    timestamps = []
                    for messagePosition in messagePositions:
                        stringIOf = StringIO(content)
                        for caption in webvtt.read_buffer(stringIOf):
                            text = caption.text
                            if messagePosition <= len(text):
                                timestamp = str(int(caption.start_in_seconds))
                                timestamps += [timestamp]
                                break
                            messagePosition -= len(text) + 1
                    write(f'{toWrite}|{"|".join(timestamps)}')
            else:
                for line in f.readlines():
                    # Archive members are opened in binary mode, so decode each line before searching.
                    if message in line.decode('utf-8', errors='replace'):
                        write(toWrite)
                        break

f = open(clientFilePath)
while True:
    try:
        fcntl.flock(f, fcntl.LOCK_EX)
        # Seek back to the beginning, as a previous iteration moved the in-file cursor.
        f.seek(0)
        if f.read() == '':
            os.remove(clientFilePath)
            break
        else:
            fcntl.flock(f, fcntl.LOCK_UN)
            time.sleep(1)
    except Exception as e:
        sys.exit(e)
f.close()
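`search.py` and `websocket.php` communicate through a per-client file guarded by `flock`: the writer appends lines under an exclusive lock, the reader drains and truncates under the same lock, and the writer deletes the file once it stays empty. A sketch of that handshake with both roles in one process; the temporary path and helper names are illustrative:

```python
# Sketch of the flock-guarded append/drain handshake between
# `search.py` (writer) and `websocket.php` (reader).
import fcntl, os, tempfile

path = os.path.join(tempfile.mkdtemp(), '0.txt')
open(path, 'w').close()  # the reader side creates the empty file first

def writer_append(s):
    with open(path, 'r+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        existing = f.read()  # moves the cursor to the end
        f.write(('\n' if existing else '') + s)
        fcntl.flock(f, fcntl.LOCK_UN)

def reader_drain():
    with open(path, 'r+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        data = f.read()
        f.truncate(0)  # acknowledge by emptying the file
        fcntl.flock(f, fcntl.LOCK_UN)
    return data

writer_append('progress:1 / 2')
writer_append('a.zip/b.vtt|12')
print(reader_drain().split('\n'))
```

The newline-joining on append mirrors the `write` helper above; the truncate-on-read mirrors the periodic timer in `websocket.php`.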

5
website/users/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore

166
website/websocket.php Normal file

@@ -0,0 +1,166 @@
<?php

use Ratchet\MessageComponentInterface;
use Ratchet\ConnectionInterface;
use React\EventLoop\LoopInterface;
use React\EventLoop\Timer\Timer;

// Make sure composer dependencies have been installed.
require __DIR__ . '/vendor/autoload.php';

class Client
{
    public $id;
    public $timer;
    public $pid;

    public function __construct($id)
    {
        $this->id = $id;
    }

    // `__destruct` can't take arguments.
    public function free($loop)
    {
        global $WAIT_IF_LOCKED;
        $loop->cancelTimer($this->timer);
        // Should in theory verify that the pid wasn't re-assigned.
        posix_kill($this->pid, SIGTERM);
        $clientFilePath = getClientFilePath($this->id);
        if (file_exists($clientFilePath)) {
            $fp = fopen($clientFilePath, "r+");
            if (flock($fp, LOCK_EX, $WAIT_IF_LOCKED)) { // acquire an exclusive lock
                unlink($clientFilePath); // delete the file
                flock($fp, LOCK_UN); // release the lock
            } else {
                echo "Couldn't get the lock!";
            }
            fclose($fp);
        }
    }
}

// Needs to be a variable, as `flock` takes its third parameter by reference.
$WAIT_IF_LOCKED = 1;

define('USERS_FOLDER', 'users/');

// Delete user outputs of the previous `websocket.php` execution.
// We skip `.`, `..` and `.gitignore`.
foreach (array_slice(scandir(USERS_FOLDER), 3) as $file) {
    unlink(USERS_FOLDER . $file);
}

function getClientFilePath($clientId)
{
    return USERS_FOLDER . "$clientId.txt";
}

// The current implementation may add latency across users.
class MyProcess implements MessageComponentInterface
{
    protected $clients;
    private $loop;
    private $newClientId;
    private $newClientIdSem;

    public function __construct(LoopInterface $loop)
    {
        $this->clients = new \SplObjectStorage();
        $this->loop = $loop;
        $this->newClientId = 0;
        $this->newClientIdSem = sem_get(1, 1);
    }

    private function newClient()
    {
        // If `onOpen` and `onMessage` can't be called at the same time, then this semaphore is useless.
        if (sem_acquire($this->newClientIdSem)) {
            // Note that we don't re-use ids except on `websocket.php` restart, but as the maximal int in PHP is a very large number we are fine for a while (https://www.php.net/manual/en/reserved.constants.php#constant.php-int-max).
            $clientId = $this->newClientId++;
            sem_release($this->newClientIdSem);
            return new Client($clientId);
        } else {
            exit('`newClient` error');
        }
    }

    public function onOpen(ConnectionInterface $conn)
    {
        $client = $this->newClient();
        $this->clients->attach($conn, $client);
    }

    public function onMessage(ConnectionInterface $from, $msg)
    {
        // As we are going to use this argument in a shell command, we only allow a limited set of characters that are safe once quoted.
        if (preg_match("/^[a-zA-Z0-9-_ ]+$/", $msg) !== 1) {
            return;
        }
        $client = $this->clients->offsetGet($from);
        // If a previous request was received, for simplicity we execute the new one with another client; otherwise, with the current file deletion approach, we couldn't tell the worker `search.py` that we don't care about its execution anymore.
        if ($client->pid !== null) {
            // As `$this->clients->detach` doesn't call `__destruct` for an unknown reason, we clean up the previous request manually.
            $client->free($this->loop);
            $client = $this->newClient();
        }
        $clientId = $client->id;
        $clientFilePath = getClientFilePath($clientId);
        // Create the worker output file, otherwise the worker would believe that we don't need it anymore.
        file_put_contents($clientFilePath, '');
        // Start the independent worker.
        // Redirecting `stdout` is mandatory, otherwise `exec` blocks.
        $client->pid = exec("./search.py $clientId '$msg' > /dev/null & echo $!");
        // `addTimer` doesn't enable us to use `$from->send` independently multiple times with blocking instructions in between.
        $client->timer = $this->loop->addPeriodicTimer(1, function () use ($from, $clientId, $clientFilePath, $client) {
            global $WAIT_IF_LOCKED;
            echo "Checking news from $clientId\n";
            // If the worker output file doesn't exist anymore, then the worker has finished its work and acknowledged that `websocket.php` completely read its output.
            if (file_exists($clientFilePath)) {
                // `flock` requires `r`eading permission, and we need `w`riting permission due to the `ftruncate` usage.
                $fp = fopen($clientFilePath, "r+");
                $read = null;
                if (flock($fp, LOCK_EX, $WAIT_IF_LOCKED)) { // acquire an exclusive lock
                    // We assume that the temporary output is less than 1 MB long.
                    $read = fread($fp, 1_000_000);
                    ftruncate($fp, 0); // truncate the file
                    fflush($fp); // flush the output before releasing the lock
                    flock($fp, LOCK_UN); // release the lock
                } else {
                    // We `die` instead of `echo`ing to force the developer to investigate the reason.
                    die("Couldn't get the lock!");
                }
                fclose($fp);
                // We assume that an empty output doesn't need to be forwarded to the end-user.
                if ($read !== null && $read !== '') {
                    $from->send($read);
                }
            } else {
                // We don't need the periodic timer anymore, as the worker finished its work and acknowledged that `websocket.php` completely read its output.
                $this->loop->cancelTimer($client->timer);
            }
        });
    }

    public function onClose(ConnectionInterface $conn)
    {
        $client = $this->clients->offsetGet($conn);
        $clientId = $client->id;
        $client->free($this->loop);
        echo "$clientId disconnected\n";
        $this->clients->detach($conn);
    }

    public function onError(ConnectionInterface $conn, \Exception $e)
    {
        $conn->close();
        die('`onError`');
    }
}

$loop = \React\EventLoop\Factory::create();

// Run the server application through the WebSocket protocol on port 4430.
// Note that named arguments came with PHP 8, which isn't the current Debian one.
$app = new Ratchet\App('crawler.yt.lemnoslife.com', 4430, '127.0.0.1', $loop);
$app->route('/websocket', new MyProcess($loop), array('*'));
$app->run();
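Since `onMessage` interpolates the client's message into a shell command, it only accepts messages matching `[a-zA-Z0-9-_ ]+` before spawning `./search.py`. A sketch of the same allow-list check in Python; the helper name is hypothetical, and `shlex.quote` is an extra precaution beyond the single quotes the PHP code uses:

```python
# Allow-list validation before building a shell command, mirroring the
# `preg_match` guard in `onMessage` above.
import re, shlex

def build_command(client_id, msg):
    if re.fullmatch(r'[a-zA-Z0-9-_ ]+', msg) is None:
        return None  # reject anything outside the allow-list
    return f'./search.py {client_id} {shlex.quote(msg)} > /dev/null'

print(build_command(0, 'search-only-captions hello'))
print(build_command(0, "'; rm -rf /"))  # rejected
```

Rejecting early means no quoting corner case (embedded quotes, `$`, backticks) ever reaches the shell.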