Compare commits
44 Commits
bdb4e6443a ... 0.0.2

| SHA1 |
|---|
| 4a11ac4196 |
| c30847c1f5 |
| 221956438d |
| ba78223c0c |
| e86d629597 |
| 78b2bf18fa |
| 5bfceccb8e |
| eb8431746e |
| a7f6e1cd85 |
| 21ad878be8 |
| 57572c6d6c |
| e0faf053a1 |
| 77bafdd592 |
| fa7da64879 |
| 9e650cf72a |
| dc63de82f5 |
| dfdfbe3272 |
| a51e3b1a9a |
| b572d078dd |
| 8df226e2bc |
| 3c4664a4b1 |
| 7fcc8b09fa |
| 87d67e4e85 |
| 8f9b1275be |
| afd9e1b0b6 |
| 5a1df71bb9 |
| 622188d6d9 |
| 0c51bd05bc |
| e0f521d572 |
| e5a50bcba4 |
| 2179e9b6f4 |
| e9b77369fb |
| b45384bab7 |
| 126cc75dc6 |
| 7302679a81 |
| 0dba8e0c7d |
| 155d372186 |
| bd184bd0f0 |
| 0193f05143 |
| 931b2df563 |
| 0f4b89ccd9 |
| 4e162e34c3 |
| 10e8811817 |
| 0f15bb0235 |
.gitignore (vendored, new file, 2 lines)

```diff
@@ -0,0 +1,2 @@
+keys.txt
+channels.txt
```
README.md (11 changed lines)

````diff
@@ -15,12 +15,21 @@ Have to proceed with a breadth-first search approach as treating all *child* cha
 
 Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
 
 ```sh
-sudo apt install nlohmann-json3-dev
+sudo apt install nlohmann-json3-dev yt-dlp
 make
 ./youtubeCaptionsSearchEngine -h
 ```
 
+If you plan to use the front-end website, also run:
+
+```sh
+pip install webvtt-py
+```
+
 Except if you provide the argument `--youtube-operational-api-instance-url https://yt.lemnoslife.com`, you have [to host your own instance of the YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API/#install-your-own-instance-of-the-api).
 
 Except if you provide the argument `--no-keys`, you have to provide at least one [YouTube Data API v3 key](https://developers.google.com/youtube/v3/getting-started) in `keys.txt`.
 
 ```sh
 ./youtubeCaptionsSearchEngine
 ```
````
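The hunk header above references the README's breadth-first search approach; the crawl order and deduplication live in the three structures that the main.cpp diff below manipulates. Here is a minimal, self-contained C++ sketch of that bookkeeping, using a channel id taken from channels.txt; the simplification (single thread, no mutexes, no API calls) is ours, not the project's:

```cpp
#include <iostream>
#include <map>
#include <set>
#include <string>
using namespace std;

// Channels queued for treatment, indexed by insertion order and by id,
// plus the set of channels already treated, so nothing is crawled twice.
map<unsigned int, string> channelsToTreat;
map<string, unsigned int> channelsToTreatRev;
set<string> channelsAlreadyTreated;

// Queue a channel unless it was already treated or is already queued.
void addChannelToTreat(const string& channelId)
{
    if (channelsAlreadyTreated.count(channelId) == 0 && channelsToTreatRev.count(channelId) == 0)
    {
        // One past the largest key so far (0 for an empty queue).
        unsigned int index = channelsToTreat.empty() ? 0 : channelsToTreat.rbegin()->first + 1;
        channelsToTreat[index] = channelId;
        channelsToTreatRev[channelId] = index;
    }
}

int main()
{
    addChannelToTreat("UCAuUUnT6oDeKwE6v1NGQxug");
    while (!channelsToTreat.empty())
    {
        // Take the oldest queued channel: breadth-first order.
        auto oldest = channelsToTreat.begin();
        string channelId = oldest->second;
        channelsToTreat.erase(oldest);
        channelsToTreatRev.erase(channelId);
        channelsAlreadyTreated.insert(channelId);
        cout << "Treating " << channelId << endl;
        // A real crawl would call addChannelToTreat for every channel discovered here.
    }
}
```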
channels.txt (12 changed lines)

```diff
@@ -97,4 +97,14 @@ UCWrtcU1OId_PQ_YoBN6lIRA
 UCfih6kPJCpzWmtCFtlpYK6A
 UCdTyuXgmJkG_O8_75eqej-w
 UCxXFx2jz8N02sNqv1VeDEGA
 UCj8BKFCTH-mqRlYwcmX2xwg
+UCsT0YIqwnpJCM-mx7-gSA4Q
+UCAuUUnT6oDeKwE6v1NGQxug
+UCy0uwqmXSHVOgqo3nrN4RCQ
+UCawLcDd9clh27b1z55Gcawg
+UC6bfT6U4WED5EyzymREvKlQ
+UCINdSH_R15xft_ctNm50eGQ
+UCVx2ZvskbDkHpLlYEQ9FULw
+UCBcmi8nLrqfFluiexxjl7bg
+UCBnZ16ahKA2DZ_T5W0FPUXg
+UCf8w5m0YsRa8MHQ5bwSGmbw
```
main.cpp (223 changed lines)

```diff
@@ -16,16 +16,16 @@ enum getJsonBehavior { normal, retryOnCommentsDisabled, returnErrorIfPlaylistNot
 
 set<string> setFromVector(vector<string> vec);
 vector<string> getFileContent(string filePath);
-json getJson(unsigned short threadId, string url, bool usingYouTubeDataApiV3, string directoryPath, getJsonBehavior behavior = normal);
+json getJson(unsigned short threadId, string url, bool usingYouTubeDataApiV3, string channelId, getJsonBehavior behavior = normal);
 void createDirectory(string path),
      print(ostringstream* toPrint),
      treatComment(unsigned short threadId, json comment, string channelId),
      treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, string channelToTreat),
      treatChannels(unsigned short threadId),
      deleteDirectory(string path),
-     addChannelToTreat(unsigned short threadId, string channelId);
+     addChannelToTreat(unsigned short threadId, string channelId),
+     exec(unsigned short threadId, string cmd, bool debug = true);
 string getHttps(string url),
-       exec(string cmd),
        join(vector<string> parts, string delimiter);
 size_t writeCallback(void* contents, size_t size, size_t nmemb, void* userp);
 bool doesFileExist(string filePath),
@@ -36,6 +36,9 @@ bool doesFileExist(string filePath),
 #define DEFAULT_THREAD_ID 0
 #define MAIN_PRINT(x) THREAD_PRINT(DEFAULT_THREAD_ID, x)
 
+#define EXIT_WITH_ERROR(x) { PRINT(x); exit(EXIT_FAILURE); }
+#define MAIN_EXIT_WITH_ERROR(x) { MAIN_PRINT(x); exit(EXIT_FAILURE); }
+
 mutex printMutex,
       channelsAlreadyTreatedAndToTreatMutex,
       quotaMutex;
```
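A side note of ours on the `EXIT_WITH_ERROR`-style macros added above: a macro that expands to a bare `{ ... }` block misbehaves when invoked with a trailing semicolon inside an `if`/`else` chain (the diff's call sites avoid this by omitting the semicolon). The conventional defensive form is the `do { ... } while(0)` idiom, sketched here with a plain `fprintf` standing in for the project's `PRINT`:

```cpp
#include <cstdio>
#include <cstdlib>

// Expanding to `do { ... } while(0)` makes the macro behave as a single
// statement, so a trailing `;` at the call site is harmless even between
// an `if` and its `else`, unlike a bare `{ ... }` expansion.
#define EXIT_WITH_ERROR(x) do { fprintf(stderr, "%s\n", x); exit(EXIT_FAILURE); } while(0)

int main(int argc, char** argv)
{
    if (argc < 2)
        EXIT_WITH_ERROR("missing argument");
    else
        printf("argument: %s\n", argv[1]);
    return 0;
}
```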
```diff
@@ -44,8 +47,8 @@ set<string> channelsAlreadyTreated;
 map<unsigned int, string> channelsToTreat;
 map<string, unsigned int> channelsToTreatRev;
 vector<string> keys;
-unsigned int commentsPerSecondCount = 0;
-map<unsigned short, unsigned int> commentsCountThreads,
+unsigned int channelsPerSecondCount = 0;
+map<unsigned short, unsigned int> channelsCountThreads,
                                   requestsPerChannelThreads;
 unsigned short THREADS_NUMBER = 1;
 // Use `string` variables instead of macros to have `string` properties, even if could use a meta-macro inlining as `string`s.
@@ -54,7 +57,11 @@ string CHANNELS_DIRECTORY = "channels/",
        KEYS_FILE_PATH = "keys.txt",
        UNLISTED_VIDEOS_FILE_PATH = "unlistedVideos.txt",
        apiKey = "", // Will firstly be filled with `KEYS_FILE_PATH` first line.
-       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API"; // Can be "https://yt.lemnoslife.com" for instance.
+       YOUTUBE_OPERATIONAL_API_INSTANCE_URL = "http://localhost/YouTube-operational-API", // Can be "https://yt.lemnoslife.com" for instance.
+       CAPTIONS_DIRECTORY = "captions/",
+       DEBUG_DIRECTORY = "debug/",
+       YOUTUBE_API_REQUESTS_DIRECTORY = "requests/",
+       CURRENT_WORKING_DIRECTORY;
 bool USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE = false;
 
 int main(int argc, char *argv[])
@@ -84,14 +91,12 @@ int main(int argc, char *argv[])
         }
         else
         {
-            MAIN_PRINT("YouTube operational API instance URL missing!")
-            exit(EXIT_FAILURE);
+            MAIN_EXIT_WITH_ERROR("YouTube operational API instance URL missing!")
         }
     }
     else
     {
-        MAIN_PRINT("Unrecognized parameter " << argvStr)
-        exit(EXIT_FAILURE);
+        MAIN_EXIT_WITH_ERROR("Unrecognized parameter " << argvStr)
     }
 }
 
@@ -113,13 +118,23 @@ int main(int argc, char *argv[])
 
     for(const auto& entry : filesystem::directory_iterator(CHANNELS_DIRECTORY))
     {
-        string fileName = entry.path().filename(),
-               channelId = fileName.substr(0, fileName.length() - 4);
+        string fileName = entry.path().filename();
+        // Skip files such as `UNLISTED_VIDEOS_FILE_PATH`.
+        if (fileName.substr(0, 2) == "UC") {
+            string channelId = fileName.substr(0, fileName.length() - 4);
 
-        channelsToTreat.erase(channelsToTreatRev[channelId]);
-        channelsToTreatRev.erase(channelId);
+            channelsToTreat.erase(channelsToTreatRev[channelId]);
+            channelsToTreatRev.erase(channelId);
 
-        channelsAlreadyTreated.insert(channelId);
+            channelsAlreadyTreated.insert(channelId);
+        }
     }
 
+    char cwd[PATH_MAX];
+    if (getcwd(cwd, sizeof(cwd)) != NULL) {
+        CURRENT_WORKING_DIRECTORY = string(cwd) + "/";
+    } else {
+        MAIN_EXIT_WITH_ERROR("`getcwd()` error");
+    }
+
     MAIN_PRINT(channelsToTreat.size() << " channel(s) to treat")
@@ -133,8 +148,8 @@ int main(int argc, char *argv[])
 
     while(true)
     {
-        MAIN_PRINT("Comments per second: " << commentsPerSecondCount)
-        commentsPerSecondCount = 0;
+        MAIN_PRINT("Channels per second: " << channelsPerSecondCount)
+        channelsPerSecondCount = 0;
         sleep(1);
     }
 
@@ -164,7 +179,7 @@ void treatChannels(unsigned short threadId)
 
         PRINT("Treating channel " << channelToTreat << " (treated: " << channelsAlreadyTreated.size() << ", to treat: " << channelsToTreat.size() << ")")
 
-        commentsCountThreads[threadId] = 0;
+        channelsCountThreads[threadId] = 0;
         requestsPerChannelThreads[threadId] = 0;
 
         channelsToTreat.erase(channelsToTreatRev[channelToTreat]);
@@ -176,19 +191,24 @@ void treatChannels(unsigned short threadId)
 
         string channelToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/";
         createDirectory(channelToTreatDirectory);
+        createDirectory(DEBUG_DIRECTORY);
+        createDirectory(channelToTreatDirectory + CAPTIONS_DIRECTORY);
+        createDirectory(channelToTreatDirectory + YOUTUBE_API_REQUESTS_DIRECTORY);
 
         treatChannelOrVideo(threadId, true, channelToTreat, channelToTreat);
 
+        // Note that compressing the French most subscribers channel took 4 minutes and 42 seconds.
         PRINT("Starting compression...")
-        // As I haven't found any well-known library that compress easily a directory, I have chosen to rely on `zip` cli.
-        exec("cd " + channelToTreatDirectory + " && ls | zip ../" + channelToTreat + ".zip -@");
+        // We precise no `debug`ging, as otherwise the zipping operation doesn't work as expected.
+        // As the zipping process isn't recursive, we can't just rely on `ls`, but we are obliged to use `find`.
+        exec(threadId, "cd " + channelToTreatDirectory + " && find | zip ../" + channelToTreat + ".zip -@");
 
         PRINT("Compression finished, started deleting initial directory...")
         deleteDirectory(channelToTreatDirectory);
         PRINT("Deleting directory finished.")
 
-        PRINT(commentsCountThreads[threadId] << " comments were found for this channel.")
+        PRINT(channelsCountThreads[threadId] << " channels were found for this channel.")
     }
 
     channelsAlreadyTreatedAndToTreatMutex.unlock();
@@ -273,8 +293,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
             json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
             if(data.contains("error"))
             {
-                PRINT("Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
-                exit(EXIT_FAILURE);
+                EXIT_WITH_ERROR("Not listing comments on videos, as `playlistItems` hasn't found the `uploads` playlist!")
             }
             json items = data["items"];
             for(const auto& item : items)
@@ -302,8 +321,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
             }
             else //if(videoCount >= 20000)
             {
-                PRINT("The videos count of the channel exceeds the supported 20,000 limit!")
-                exit(EXIT_FAILURE);
+                EXIT_WITH_ERROR("The videos count of the channel exceeds the supported 20,000 limit!")
             }
         }
     }
@@ -358,9 +376,13 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
             items = data["items"];
             for(const auto& item : items)
             {
-                json snippet = item["snippet"]["topLevelComment"]["snippet"];
-                string channelId = snippet["authorChannelId"]["value"];
-                addChannelToTreat(threadId, channelId);
+                json snippet = item["snippet"]["topLevelComment"]["snippet"],
+                     authorChannelId = snippet["authorChannelId"];
+                if(!authorChannelId["value"].is_null())
+                {
+                    string channelId = authorChannelId["value"];
+                    addChannelToTreat(threadId, channelId);
+                }
                 string pageToken = snippet["nextPageToken"];
                 while(pageToken != "")
                 {
@@ -391,11 +413,11 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
                 }
             }
         }
-        if(data.contains("nextPageToken"))
+        if(data.contains("nextPageToken") && data["nextPageToken"] != "")
         {
             pageToken = data["nextPageToken"];
         }
-        if(pageToken == "")
+        else
         {
             break;
         }
@@ -466,7 +488,7 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
                 break;
             }
         }
-        // `LIVES`
+        // `LIVE`
         pageToken = "";
         string playlistId = "UU" + id.substr(2);
         vector<string> videoIds;
@@ -488,11 +510,11 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
         {
             if(item.contains("liveStreamingDetails"))
             {
-                PRINT(item["id"])
+                string videoId = item["id"];
+                //PRINT(videoId)
                 json liveStreamingDetails = item["liveStreamingDetails"];
                 if(liveStreamingDetails.contains("activeLiveChatId"))
                 {
-                    PRINT("streaming")
                     string activeLiveChatId = liveStreamingDetails["activeLiveChatId"];
                     json data = getJson(threadId, "liveChat/messages?part=snippet,authorDetails&liveChatId=" + activeLiveChatId, true, id),
                          items = data["items"];
@@ -500,12 +522,47 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
                     {
                         string channelId = item["snippet"]["authorChannelId"];
                         addChannelToTreat(threadId, channelId);
-                        PRINT("Found: " << channelId)
                     }
                 }
                 else
                 {
-                    PRINT("no more streaming")
+                    // As there isn't the usual pagination mechanism for these ended livestreams, we proceed in an uncertain way as follows.
+                    set<string> messageIds;
+                    unsigned long long lastMessageTimestampRelativeMsec = 0;
+                    while(true)
+                    {
+                        string time = to_string(lastMessageTimestampRelativeMsec);
+                        json data = getJson(threadId, "liveChats?part=snippet&id=" + videoId + "&time=" + time, false, id),
+                             snippet = data["items"][0]["snippet"];
+                        if(snippet.empty())
+                        {
+                            break;
+                        }
+                        json firstMessage = snippet[0];
+                        string firstMessageId = firstMessage["id"];
+                        // We verify that we don't skip any message by verifying that the first message was already treated if we already treated some messages.
+                        if(!messageIds.empty() && messageIds.find(firstMessageId) == messageIds.end())
+                        {
+                            PRINT("The verification that we don't skip any message failed! Continuing anyway...")
+                        }
+                        for(const auto& message : snippet)
+                        {
+                            string messageId = message["id"];
+                            if(messageIds.find(messageId) == messageIds.end())
+                            {
+                                messageIds.insert(messageId);
+                                string channelId = message["authorChannelId"];
+                                addChannelToTreat(threadId, channelId);
+                            }
+                        }
+                        json lastMessage = snippet.back();
+                        // If there isn't any new message, then we stop the retrieving.
+                        if(lastMessageTimestampRelativeMsec == lastMessage["videoOffsetTimeMsec"])
+                        {
+                            break;
+                        }
+                        lastMessageTimestampRelativeMsec = lastMessage["videoOffsetTimeMsec"];
+                    }
                 }
             }
         }
@@ -520,16 +577,62 @@ void treatChannelOrVideo(unsigned short threadId, bool isChannel, string id, str
                 break;
             }
         }
+        // Captions retrieval by relying on `yt-dlp` after having listed all videos ids of the given channel.
+        string playlistToTreat = "UU" + channelToTreat.substr(2);
+        pageToken = "";
+        while(true)
+        {
+            json data = getJson(threadId, "playlistItems?part=snippet,contentDetails,status&playlistId=" + playlistToTreat + "&maxResults=50&pageToken=" + pageToken, true, channelToTreat, returnErrorIfPlaylistNotFound);
+            if(data.contains("error"))
+            {
+                // `UCFoBM1VginhMH7lR56GtVbQ` doesn't have videos and is in this case for instance.
+                PRINT("Not listing captions on videos, as `playlistItems` hasn't found the `uploads` playlist!")
+                break;
+            }
+            json items = data["items"];
+            for(const auto& item : items)
+            {
+                string videoId = item["contentDetails"]["videoId"];
+                // Could proceed as follows by verifying `!isChannel` but as we don't know how to manage unlisted videos, we don't proceed this way.
+                //treatChannelOrVideo(threadId, false, videoId, channelToTreat);
+
+                string channelCaptionsToTreatDirectory = CHANNELS_DIRECTORY + channelToTreat + "/" + CAPTIONS_DIRECTORY + videoId + "/";
+                createDirectory(channelCaptionsToTreatDirectory);
+
+                // Firstly download all not automatically generated captions.
+                // The underscore in `-o` argument is used to not end up with hidden files.
+                // We are obliged to precise the video id after `--`, otherwise if the video id starts with `-` it's considered as an argument.
+                string cmdCommonPrefix = "yt-dlp --skip-download ",
+                       cmdCommonPostfix = " -o '" + channelCaptionsToTreatDirectory + "_' -- " + videoId;
+                string cmd = cmdCommonPrefix + "--write-sub --sub-lang all,-live_chat" + cmdCommonPostfix;
+                exec(threadId, cmd);
+
+                // Secondly download the automatically generated captions.
+                cmd = cmdCommonPrefix + "--write-auto-subs --sub-langs '.*orig' --sub-format ttml --convert-subs vtt" + cmdCommonPostfix;
+                exec(threadId, cmd);
+            }
+            if(data.contains("nextPageToken"))
+            {
+                pageToken = data["nextPageToken"];
+            }
+            else
+            {
+                break;
+            }
+        }
     }
 }
 
 // This function verifies that the given hasn't already been treated.
 void addChannelToTreat(unsigned short threadId, string channelId)
 {
+    channelsPerSecondCount++;
+    channelsCountThreads[threadId]++;
     channelsAlreadyTreatedAndToTreatMutex.lock();
     if(channelsAlreadyTreated.find(channelId) == channelsAlreadyTreated.end() && channelsToTreatRev.find(channelId) == channelsToTreatRev.end())
     {
-        unsigned int channelsToTreatIndex = channelsToTreat.end()->first + 1;
+        // It is unclear to me why `channelsToTreat.end()->first + 1` doesn't work here.
+        unsigned int channelsToTreatIndex = channelsToTreat.rbegin()->first + 1;
        channelsToTreat[channelsToTreatIndex] = channelId;
        channelsToTreatRev[channelId] = channelsToTreatIndex;
 
```
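About the `channelsToTreat.end()->first + 1` line that the diff replaces: dereferencing `end()` is undefined behavior in C++, since `end()` is a past-the-end iterator, while `rbegin()` designates the actual last element of an ordered `map`. A small sketch of ours illustrating the distinction, assuming a non-empty map as in the surrounding code:

```cpp
#include <iostream>
#include <map>
#include <string>
using namespace std;

int main()
{
    // Toy queue with placeholder channel ids.
    map<unsigned int, string> channelsToTreat = {{0, "UCa"}, {5, "UCb"}};
    // `end()` points one past the last element; dereferencing it, as the
    // removed line did, is undefined behavior (it may merely appear to work).
    // `rbegin()` points at the element with the largest key, so this is the
    // well-defined way to compute the next free index:
    unsigned int next = channelsToTreat.rbegin()->first + 1;
    cout << next << endl; // Prints 6.
}
```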
```diff
@@ -552,8 +655,6 @@ void treatComment(unsigned short threadId, json comment, string channelId)
         string channelId = snippet["authorChannelId"]["value"];
         addChannelToTreat(threadId, channelId);
     }
-    commentsCountThreads[threadId]++;
-    commentsPerSecondCount++;
 }
 
 string join(vector<string> parts, string delimiter)
@@ -571,20 +672,24 @@ string join(vector<string> parts, string delimiter)
     return result;
 }
 
-string exec(string cmd)
+void exec(unsigned short threadId, string cmd, bool debug)
 {
-    array<char, 128> buffer;
-    string result;
-    unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
-    if (!pipe)
+    if(debug)
     {
-        throw runtime_error("popen() failed!");
+        ostringstream toString;
+        toString << threadId;
+        string initialCmd = cmd,
+               threadIdStr = toString.str(),
+               debugCommonFilePath = CURRENT_WORKING_DIRECTORY + DEBUG_DIRECTORY + threadIdStr,
+               debugOutFilePath = debugCommonFilePath + ".out",
+               debugErrFilePath = debugCommonFilePath + ".err";
+        cmd += " >> " + debugOutFilePath;
+        cmd += " 2>> " + debugErrFilePath;
+
+        writeFile(threadId, debugOutFilePath, "a", initialCmd + "\n");
+        writeFile(threadId, debugErrFilePath, "a", initialCmd + "\n");
     }
-    while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)
-    {
-        result += buffer.data();
-    }
-    return result;
+    system(cmd.c_str());
 }
 
 bool writeFile(unsigned short threadId, string filePath, string option, string toWrite)
@@ -647,7 +752,7 @@ vector<string> getFileContent(string filePath)
     return lines;
 }
 
-json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string directoryPath, getJsonBehavior behavior)
+json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, string channelId, getJsonBehavior behavior)
 {
     string finalUrl = usingYoutubeDataApiv3 ?
         (USE_YT_LEMNOSLIFE_COM_NO_KEY_SERVICE ?
@@ -662,12 +767,17 @@ json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, st
     }
     catch (json::parse_error& ex)
     {
-        PRINT("Parse error for " << finalUrl << ", as got: " << content << " !")
-        exit(EXIT_FAILURE);
+        // From the experience this sometimes happens due to empty `content` but retrying just after solves the problem.
+        PRINT("Parse error for " << finalUrl << ", as got: " << content << " ! Retrying...")
+        return getJson(threadId, url, usingYoutubeDataApiv3, channelId);
     }
 
     if(data.contains("error"))
     {
+        if(!usingYoutubeDataApiv3)
+        {
+            EXIT_WITH_ERROR("Found error in JSON retrieve from YouTube operational API at URL: " << finalUrl << " for content: " << content << " !")
+        }
         string reason = data["error"]["errors"][0]["reason"];
         // Contrarily to YouTube operational API no-key service we don't rotate keys in `KEYS_FILE_PATH`, as we keep them in memory here.
         if(reason == "quotaExceeded")
@@ -678,19 +788,20 @@ json getJson(unsigned short threadId, string url, bool usingYoutubeDataApiv3, st
             PRINT("No more quota on " << apiKey << " switching to " << keys[0] << ".")
             apiKey = keys[0];
             quotaMutex.unlock();
-            return getJson(threadId, url, true, directoryPath);
+            return getJson(threadId, url, true, channelId);
         }
         PRINT("Found error in JSON at URL: " << finalUrl << " for content: " << content << " !")
         if(reason != "commentsDisabled" || behavior == retryOnCommentsDisabled)
         {
-            return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, directoryPath);
+            return reason == "playlistNotFound" && behavior == returnErrorIfPlaylistNotFound ? data : getJson(threadId, url, true, channelId);
         }
     }
 
     ostringstream toString;
-    toString << CHANNELS_DIRECTORY << directoryPath << "/" << requestsPerChannelThreads[threadId] << ".json";
-    requestsPerChannelThreads[threadId]++;
-    writeFile(threadId, toString.str(), "w", url + "\n" + content);
+    toString << CHANNELS_DIRECTORY << channelId << "/" << YOUTUBE_API_REQUESTS_DIRECTORY;
+    writeFile(threadId, toString.str() + "urls.txt", "a", url + " " + (usingYoutubeDataApiv3 ? "true" : "false") + "\n");
+    toString << requestsPerChannelThreads[threadId]++ << ".json";
+    writeFile(threadId, toString.str(), "w", content);
 
     return data;
 }
```
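Per the rewritten logging at the end of `getJson`, each channel directory accumulates, under `YOUTUBE_API_REQUESTS_DIRECTORY`, an append-only `urls.txt` (each line being the URL plus a boolean telling whether the YouTube Data API v3 was used) and one numbered JSON response per request. For a hypothetical channel id, the resulting layout would look like:

```
channels/UCAuUUnT6oDeKwE6v1NGQxug/requests/
├── urls.txt
├── 0.json
├── 1.json
└── ...
```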
```diff
@@ -14,7 +14,8 @@ with open('nohup.out') as f:
         #print(line)
         threadId = line.split(': ')[1]
         channelId = line.split(infix)[1].split(' (')[0]
-        threads[threadId] = channelId
+        if threadId.isdigit() and channelId.startswith('UC') and len(channelId) == 24:
+            threads[threadId] = channelId
 for threadId in threads:
     channelId = threads[threadId]
     print(threadId, channelId)
```
website/channels.php (new file, 42 lines)

```php
<?php

if (!function_exists('str_contains')) {
    function str_contains($haystack, $needle)
    {
        return strpos($haystack, $needle) !== false;
    }
}

if (!function_exists('str_ends_with')) {
    function str_ends_with($haystack, $needle)
    {
        $length = strlen($needle);
        return $length > 0 ? substr($haystack, -$length) === $needle : true;
    }
}

function str_replace_first($needle, $replace, $haystack) {
    $pos = strpos($haystack, $needle);
    if ($pos !== false) {
        $haystack = substr_replace($haystack, $replace, $pos, strlen($needle));
    }
    return $haystack;
}

$uri = $_SERVER['REQUEST_URI'];
$uri = str_replace('/channels/', '', $uri);
$prefix = '/mnt/HDD0/YouTube_captions_search_engine/channels/';
if (str_contains($uri, '/')) {
    $uri = str_replace_first('/', '#', $uri);
    $uri = $prefix . $uri;
    if (str_ends_with($uri, '.json')) {
        header('Content-Type: application/json; charset=UTF-8');
    }
    echo file_get_contents("zip://$uri");
} else {
    $uri = $prefix . $uri;
    header("Content-Type: application/zip");
    echo readfile($uri);
}

?>
```
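To make the rewriting above concrete: `str_replace_first` turns the first path separator after the archive name into `#`, which PHP's `zip://` stream wrapper uses to address a file inside an archive. With a hypothetical request (the channel id is taken from channels.txt):

```
GET /channels/UCAuUUnT6oDeKwE6v1NGQxug.zip/requests/urls.txt
→ zip:///mnt/HDD0/YouTube_captions_search_engine/channels/UCAuUUnT6oDeKwE6v1NGQxug.zip#requests/urls.txt
```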
website/composer.json (new file, 5 lines)

```json
{
    "require": {
        "cboden/ratchet": "^0.4.4"
    }
}
```
website/composer.lock (generated, new file, 1411 lines): file diff suppressed because it is too large.
website/index.php (new file, 103 lines)

```php
<?php

function echoUrl($url)
{
    echo "<a href=\"$url\">$url</a>";
}

?>

See <?php echoUrl('https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine'); ?> for more information.<br/>

Access raw data with: <?php echoUrl('channels/'); ?>.

<form id="form">
    <input type="text" autofocus id="search" pattern="[A-Za-z0-9-_ ]+" placeholder="Your [A-Za-z0-9-_ ]+ search"></input>
    <input type="submit" id="search" value="Search">
    <input type="submit" id="search-only-captions" value="Search only captions">
</form>

Progress: <span id="progress"></span> channels

<ul id="channels">
</ul>

<script>
    var firstRun = true;
    var conn;
    // Could parse DOM instead of using following variable.
    var channels = [];

    function createA(text, href) {
        var a = document.createElement('a');
        var text = document.createTextNode(text);
        a.appendChild(text);
        a.href = href;
        return a;
    }

    function treatLine(line) {
        console.log(line);
        if (line.startsWith('progress:')) {
            document.getElementById('progress').innerHTML = line.replace('progress:', '');
        } else {
            var channelsDom = document.getElementById('channels');
            var timestamp = [];
            const lineParts = line.split('|');
            if (lineParts.length > 0) {
                timestamps = lineParts.slice(1).map(linePart => parseInt(linePart));
                line = lineParts[0];
            }
            const channelFileParts = line.split('/');
            const channel = channelFileParts[0];
            const channelFile = channelFileParts.slice(1).join('/');
            const channelHref = `channels/${channel}`;
            if (!channels.includes(channel)) {
                channels.push(channel);
                channelDom = document.createElement('li');
                var a = createA(channel, channelHref);
                channelDom.appendChild(a);
                var channelFilesDom = document.createElement('ul');
                channelDom.appendChild(channelFilesDom);
                channelsDom.appendChild(channelDom);
            }
            var channelDom = channelsDom.lastChild;
            var channelFilesDom = channelDom.lastChild;
            var channelFileDom = document.createElement('li');
            var a = createA(channelFile, `${channelHref}/${channelFile}`);
            channelFileDom.appendChild(a);
            const id = channelFileParts[2];
            for(var timestampsIndex = 0; timestampsIndex < timestamps.length; timestampsIndex++) {
                const space = document.createTextNode('\u00A0');
                channelFileDom.appendChild(space);
                const timestamp = timestamps[timestampsIndex];
                var a = createA(`${timestamp} s`, `https://www.youtube.com/watch?v=${id}&t=${timestamp}`);
                channelFileDom.appendChild(a);
            }
            channelFilesDom.appendChild(channelFileDom);
        }
    }

    function search(event) {
        // We don't want to refresh the webpage which is the default behavior.
        event.preventDefault();
        const query = event.submitter.id + ' ' + document.getElementById('search').value;
        if (firstRun) {
            firstRun = false;
            conn = new WebSocket('wss://crawler.yt.lemnoslife.com/websocket');
            conn.onmessage = function(e) {
                e.data.split('\n').forEach(treatLine);
            };
            // We can't directly proceed with `conn.send`, as the connection may not be already established.
            conn.onopen = function(e) { conn.send(query); };
        } else {
            // We assume at this point that the connection is established.
            channels = [];
            document.getElementById('channels').innerHTML = '';
            conn.send(query);
        }
    }

    var form = document.getElementById('form');
    form.addEventListener('submit', search);
</script>
```
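The messages `treatLine` parses are newline-separated lines of two shapes, reconstructed from the `write` calls in `search.py` below: a progress counter, and a matching file path optionally followed by `|`-separated caption timestamps in seconds. Illustrative examples with hypothetical values (`<fileInZip>` stands for a path inside the channel archive):

```
progress:3 / 44
UCAuUUnT6oDeKwE6v1NGQxug.zip/<fileInZip>|42|128
```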
website/search.py (executable file, new, 79 lines)

```python
#!/usr/bin/python3

import sys, time, fcntl, os, zipfile, webvtt, re
from io import StringIO

path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

clientId = sys.argv[1]
message = sys.argv[2]

searchOnlyCaptions = message.startswith('search-only-captions ')
message = message[message.find(' ') + 1:]

clientFilePath = f'users/{clientId}.txt'

def write(s):
    f = open(clientFilePath, 'r+')
    try:
        fcntl.flock(f, fcntl.LOCK_EX)
        # If the output file is empty, then it means that `websocket.php` read it. Anyway we don't wait it and we append what we want to output.
        read = f.read()
        # We are appening content, as we moved in-file cursor.
        if read != '':
            f.write("\n")
        f.write(s)
        f.flush()
        fcntl.flock(f, fcntl.LOCK_UN)
        f.close()
    except Exception as e:
        sys.exit(e)

# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
files = [file for file in os.listdir(path) if file.endswith('.zip')]
for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex + 1} / {len(files)}')
    zip = zipfile.ZipFile(path + file)
    for fileInZip in zip.namelist():
        endsWithVtt = fileInZip.endswith('.vtt')
        if searchOnlyCaptions and not endsWithVtt:
            continue
        with zip.open(fileInZip) as f:
            toWrite = f'{file}/{fileInZip}'
            if endsWithVtt:
                content = f.read().decode('utf-8')
                stringIOf = StringIO(content)
                wholeCaption = ' '.join([caption.text for caption in webvtt.read_buffer(stringIOf)])
                messagePositions = [m.start() for m in re.finditer(f'(?={message})', wholeCaption)]
                if messagePositions != []:
                    timestamps = []
                    for messagePosition in messagePositions:
                        stringIOf = StringIO(content)
                        for caption in webvtt.read_buffer(stringIOf):
                            text = caption.text
                            if messagePosition <= len(text):
                                timestamp = str(int(caption.start_in_seconds))
                                timestamps += [timestamp]
                                break
                            messagePosition -= len(text) + 1
                    write(f'{toWrite}|{"|".join(timestamps)}')
            else:
                for line in f.readlines():
                    if message in str(line):
                        write(toWrite)
                        break

f = open(clientFilePath)
while True:
    try:
        fcntl.flock(f, fcntl.LOCK_EX)
        if f.read() == '':
            os.remove(clientFilePath)
            break
        else:
            fcntl.flock(f, fcntl.LOCK_UN)
            time.sleep(1)
    except Exception as e:
        sys.exit(e)

f.close()
```
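The inner loop above maps each match offset in the space-joined caption text back to the caption containing it, which is why it subtracts `len(text) + 1` per caption skipped (the `+ 1` being the joining space). A standalone C++ transcription of that logic, with toy data of ours:

```cpp
#include <iostream>
#include <string>
#include <vector>
using namespace std;

struct Caption { string text; unsigned int startInSeconds; };

// Returns the start time of the caption containing `offset`, where `offset`
// indexes into the captions' texts joined with single spaces.
unsigned int timestampForOffset(const vector<Caption>& captions, size_t offset)
{
    for (const Caption& caption : captions)
    {
        if (offset <= caption.text.size())
        {
            return caption.startInSeconds;
        }
        offset -= caption.text.size() + 1; // `+ 1` skips the joining space.
    }
    return 0; // Out of range; the Python loop simply never breaks in this case.
}

int main()
{
    vector<Caption> captions = {{"hello world", 0}, {"foo bar", 7}};
    // Joined text is "hello world foo bar"; offset 12 falls in "foo bar".
    cout << timestampForOffset(captions, 12) << endl; // Prints 7.
}
```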
website/users/.gitignore (vendored, new file, 5 lines)

```
# Ignore everything in this directory
*
# Except this file
!.gitignore
```
website/websocket.php (new file, 166 lines)

```php
<?php

use Ratchet\MessageComponentInterface;
use Ratchet\ConnectionInterface;
use React\EventLoop\LoopInterface;
use React\EventLoop\Timer\Timer;

// Make sure composer dependencies have been installed
require __DIR__ . '/vendor/autoload.php';

class Client
{
    public $id;
    public $timer;
    public $pid;

    public function __construct($id)
    {
        $this->id = $id;
    }

    // `__destruct` can't take arguments.
    public function free($loop)
    {
        $loop->cancelTimer($this->timer);
        // Should in theory verify that the pid wasn't re-assigned.
        posix_kill($this->pid, SIGTERM);
        $clientFilePath = getClientFilePath($this->id);
        if (file_exists($clientFilePath)) {
            $fp = fopen($clientFilePath, "r+");
            if (flock($fp, LOCK_EX, $WAIT_IF_LOCKED)) { // acquire an exclusive lock
                unlink($clientFilePath); // delete file
                flock($fp, LOCK_UN); // release the lock
            } else {
                echo "Couldn't get the lock!";
            }
            fclose($fp);
        }
    }
}

// Need to be passed as a reference to `flock`.
$WAIT_IF_LOCKED = 1;

define('USERS_FOLDER', 'users/');

// Delete users outputs of previous `websocket.php` execution.
// We skip `.`, `..` and `.gitignore`.
foreach (array_slice(scandir(USERS_FOLDER), 3) as $file) {
    unlink(USERS_FOLDER . $file);
}

function getClientFilePath($clientId)
{
    return USERS_FOLDER . "$clientId.txt";
}

// Current implementation may add latency across users.
class MyProcess implements MessageComponentInterface
{
    protected $clients;
    private $loop;
    private $newClientId;
    private $newClientIdSem;

    public function __construct(LoopInterface $loop)
    {
        $this->clients = new \SplObjectStorage();
        $this->loop = $loop;
        $this->newClientId = 0;
        $this->newClientIdSem = sem_get(1, 1);
    }

    private function newClient()
    {
        // If `onOpen` and `onMessage` can't be called at the same time, then this semaphore is useless.
        if (sem_acquire($this->newClientIdSem)) {
            // Note that we don't re-use ids except on `websockets.php` restart, but as the maximal int in PHP is a very great number we are fine for a while (https://www.php.net/manual/en/reserved.constants.php#constant.php-int-max)
            $clientId = $this->newClientId++;
            sem_release($this->newClientIdSem);
            return new Client($clientId);
        } else {
            exit('`newClient` error');
        }
    }

    public function onOpen(ConnectionInterface $conn)
    {
        $client = $this->newClient();
        $this->clients->attach($conn, $client);
    }

    public function onMessage(ConnectionInterface $from, $msg)
    {
        // As we are going to use this argument in a shell command, we verify a limited set of characters that are safe once quoted.
        if (preg_match("/^[a-zA-Z0-9-_ ]+$/", $msg) !== 1) {
            return;
        }
        $client = $this->clients->offsetGet($from);
        // If a previous request was received, we execute the new one with another client for simplicity otherwise with current file deletion approach, we can't tell the worker `search.py` that we don't care about its execution anymore.
        if ($client->pid !== null) {
            // As `$this->clients->detach` doesn't call `__destruct` for unknown reason, we clean manually the previous request.
            $client->free($this->loop);
            $client = $this->newClient();
        }
        $clientId = $client->id;
        $clientFilePath = getClientFilePath($clientId);
        // Create the worker output file otherwise it would believe that we don't need this worker anymore.
        file_put_contents($clientFilePath, '');
        // Start the independent worker.
        // Redirecting `stdout` is mandatory otherwise `exec` is blocking.
        $client->pid = exec("./search.py $clientId '$msg' > /dev/null & echo $!");
        // `addTimer` doesn't enable us to use independently `$from->send` multiple times with blocking instructions between.
        $client->timer = $this->loop->addPeriodicTimer(1, function () use ($from, $clientId, $clientFilePath, $client) {
            echo "Checking news from $clientId\n";
            // If the worker output file doesn't exist anymore, then it means that the worker have finished its work and acknowledged that `websocket.php` completely read its output.
            if (file_exists($clientFilePath)) {
                // `flock` requires `r`eading permission and we need `w`riting one due to `ftruncate` usage.
                $fp = fopen($clientFilePath, "r+");
                $read = null;
                if (flock($fp, LOCK_EX, $WAIT_IF_LOCKED)) { // acquire an exclusive lock
                    // We assume that the temporary output is less than 1 MB long.
                    $read = fread($fp, 1_000_000);
                    ftruncate($fp, 0); // truncate file
                    fflush($fp); // flush output before releasing the lock
                    flock($fp, LOCK_UN); // release the lock
                } else {
                    // We `die` instead of `echo`ing to force the developer to investigate the reason.
                    die("Couldn't get the lock!");
                }
                fclose($fp);

                // Assume that empty output doesn't need to me forwarded to the end-user.
                if ($read !== null && $read !== '') {
                    $from->send($read);
                }
            } else {
                // We don't need the periodic timer anymore, as the worker finished its work and acknowledged that `websocket.php` completely read its output.
                $this->loop->cancelTimer($client->timer);
            }
        });
    }

    public function onClose(ConnectionInterface $conn)
    {
        $client = $this->clients->offsetGet($conn);
        $clientId = $client->id;
        $client->free($this->loop);
        echo "$clientId disconnected\n";
        $this->clients->detach($conn);
    }

    public function onError(ConnectionInterface $conn, \Exception $e)
    {
        $conn->close();
        die('`onError`');
    }
}

$loop = \React\EventLoop\Factory::create();

// Run the server application through the WebSocket protocol on port 4430.
// Note that named arguments come with PHP 8 which isn't current Debian one.
$app = new Ratchet\App('crawler.yt.lemnoslife.com', 4430, '127.0.0.1', $loop);
$app->route('/websocket', new MyProcess($loop), array('*'));
$app->run();
```
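Pieced together from `websocket.php` and `search.py` above, the worker handshake works as follows: `onMessage` creates an empty `users/<clientId>.txt`, spawns `search.py` in the background, and polls the file every second under an exclusive lock, forwarding and then truncating whatever the worker appended; `search.py` appends its results under the same lock, and once finished waits until the file is empty (that is, fully read) before deleting it; the disappearance of the file is the signal for `websocket.php` to cancel its periodic timer.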