From e1aff6f469d54d0319d5145b3af6e5761ac5ba6a Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Sun, 26 Feb 2023 15:12:06 +0100
Subject: [PATCH] #35: Move Python scripts to `scripts/` and describe the project structure in `README.md`

---
 README.md                                          | 18 ++++++++++++++++++
 .../findAlreadyTreatedCommentsCount.py             |  4 +++-
 ...stTreatedCommentsForChannelsBeingTreated.py     | 12 ++++++++----
 .../findTreatedChannelWithMostComments.py          |  0
 .../findTreatedChannelWithMostSubscribers.py       |  2 +-
 .../removeChannelsBeingTreated.py                  |  0
 .../retrieveTop100SubscribersFrance.py             |  3 +++
 7 files changed, 33 insertions(+), 6 deletions(-)
 rename findAlreadyTreatedCommentsCount.py => scripts/findAlreadyTreatedCommentsCount.py (73%)
 rename findLatestTreatedCommentsForChannelsBeingTreated.py => scripts/findLatestTreatedCommentsForChannelsBeingTreated.py (78%)
 rename findTreatedChannelWithMostComments.py => scripts/findTreatedChannelWithMostComments.py (100%)
 rename findTreatedChannelWithMostSubscribers.py => scripts/findTreatedChannelWithMostSubscribers.py (88%)
 rename removeChannelsBeingTreated.py => scripts/removeChannelsBeingTreated.py (100%)
 rename retrieveTop100SubscribersFrance.py => scripts/retrieveTop100SubscribersFrance.py (81%)

diff --git a/README.md b/README.md
index 6ae89dc..ab57bbf 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,24 @@ A ready to be used by the end-user website instance of this project is hosted at
 
 See more details on [the Wiki](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/wiki).
 
+# The project structure:
+
+- `main.cpp` contains the C++ multi-threaded algorithm performing the YouTube channel discovery. It is notably made of the following functions:
+    - `main`, which parses the command line arguments, loads variables from files (`channels.txt`, `keys.txt`, the `channels/` content) and starts the threads executing the `treatChannels` function
+    - `treatChannels`, which gets a YouTube channel to treat, treats it in the `treatChannelOrVideo` function and compresses the retrieved data
+    - `treatChannelOrVideo`, which, provided a YouTube channel id or a video id, treats this resource. In both cases it treats the comments left on this resource. In the case of a channel it also treats its `CHANNELS`, `COMMUNITY`, `PLAYLISTS` and `LIVE` tabs and downloads the captions of the channel's videos.
+    - `markChannelAsRequiringTreatmentIfNeeded`, which, provided a YouTube channel id, marks it as requiring treatment if it wasn't already treated
+    - `execute`, which, provided a `yt-dlp` command, executes it in a shell
+    - `getJson`, which, provided an API request, returns a JSON structure with its result. If the requested API is the YouTube Data API v3 and a set of keys is provided (see `keys.txt` below), it rotates the keys as required
+- `channels.txt` contains the starting set of channels, mostly the 100 most subscribed French channels
+- `keys.txt` contains a set of YouTube Data API v3 keys (not provided) making it possible to request this API (see the `--no-keys` command line argument in the section below for an alternative to filling it)
+- `scripts/` contains Python scripts to:
+    - generate `channels.txt` as described above (`retrieveTop100SubscribersFrance.py`)
+    - remove the channels being treated before a restart of the algorithm, as described in [the `main` function documentation](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/src/commit/8dd89e6e881da0a905b6fa4b23775c4344dd0d9d/main.cpp#L126-L128) (`removeChannelsBeingTreated.py`)
+- `website/` is a PHP website using WebSocket to let the end-user run requests on the retrieved dataset. When fetching the website, the end-user receives the interpreted `index.php`, which upon making a request interacts with `websocket.php`; in the back-end, `websocket.php` dispatches the requests of the various end-users to `search.py` (which treats the actual end-user request on the compressed dataset), using `users/` for the inter-process communication.
+
+Note that this project heavily relies on the [YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API), [which was modified for this project](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/wiki/YouTube-operational-API-commits).
+
 # Running the YouTube graph discovery algorithm:
 
 Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.

diff --git a/findAlreadyTreatedCommentsCount.py b/scripts/findAlreadyTreatedCommentsCount.py
similarity index 73%
rename from findAlreadyTreatedCommentsCount.py
rename to scripts/findAlreadyTreatedCommentsCount.py
index 7fc6ad9..d617260 100755
--- a/findAlreadyTreatedCommentsCount.py
+++ b/scripts/findAlreadyTreatedCommentsCount.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-PREFIX = 'Comments per second: '
+PREFIX = 'Channels per second: '
 
 alreadyTreatedCommentsCount = 0
 with open('nohup.out') as f:
@@ -8,5 +8,7 @@ with open('nohup.out') as f:
     for line in lines:
         if PREFIX in line:
             alreadyTreatedCommentsCount += int(line.split(PREFIX)[-1])
+        #if 'UCsT0YIqwnpJCM-mx7-gSA4Q' in line:
+        #    break
 
 print(alreadyTreatedCommentsCount)

diff --git a/findLatestTreatedCommentsForChannelsBeingTreated.py b/scripts/findLatestTreatedCommentsForChannelsBeingTreated.py
similarity index 78%
rename from findLatestTreatedCommentsForChannelsBeingTreated.py
rename to scripts/findLatestTreatedCommentsForChannelsBeingTreated.py
index 9076348..961355a 100755
--- a/findLatestTreatedCommentsForChannelsBeingTreated.py
+++ b/scripts/findLatestTreatedCommentsForChannelsBeingTreated.py
@@ -1,5 +1,7 @@
 #!/usr/bin/python3
 
+# This algorithm should also take into account other features that we use to retrieve channels.
+
 import os, requests, json, time, datetime
 
 path = 'channels/'
@@ -12,11 +14,13 @@ def getTimestampFromDateString(dateString):
 for channelId in list(os.walk('.'))[1]:
     channelId = channelId[2:]
     #print(channelId)
-    numberOfRequests = len(list(os.walk(channelId))[0][2])
+    numberOfRequests = len(list(os.walk(f'{channelId}/requests'))[0][2]) - 1
     # Assume that the folder isn't empty (may not be the case, but it is most of the time).
-    with open(f'{channelId}/{str(numberOfRequests - 1)}.json') as f:
-        content = "\n".join(f.read().splitlines()[1:])
-        data = json.loads(content)
+    filePath = f'{channelId}/requests/{str(numberOfRequests - 1)}.json'
+    with open(filePath) as f:
+        print(filePath)
+        #content = "\n".join(f.read().splitlines()[1:])
+        data = json.load(f) #json.loads(content)
     snippet = data['items'][-1]['snippet']
     if 'topLevelComment' in snippet:
         snippet = snippet['topLevelComment']['snippet']

diff --git a/findTreatedChannelWithMostComments.py b/scripts/findTreatedChannelWithMostComments.py
similarity index 100%
rename from findTreatedChannelWithMostComments.py
rename to scripts/findTreatedChannelWithMostComments.py

diff --git a/findTreatedChannelWithMostSubscribers.py b/scripts/findTreatedChannelWithMostSubscribers.py
similarity index 88%
rename from findTreatedChannelWithMostSubscribers.py
rename to scripts/findTreatedChannelWithMostSubscribers.py
index 0a452cf..bd8f0e6 100755
--- a/findTreatedChannelWithMostSubscribers.py
+++ b/scripts/findTreatedChannelWithMostSubscribers.py
@@ -2,7 +2,7 @@
 
 import os, requests, json
 
-channelIds = next(os.walk('channels/'))[1]
+channelIds = [channelId.replace('.zip', '') for channelId in next(os.walk('channels/'))[2]]
 
 maxResults = 50
 channelIdsChunks = [channelIds[i : i + maxResults] for i in range(0, len(channelIds), maxResults)]

diff --git a/removeChannelsBeingTreated.py b/scripts/removeChannelsBeingTreated.py
similarity index 100%
rename from removeChannelsBeingTreated.py
rename to scripts/removeChannelsBeingTreated.py

diff --git a/retrieveTop100SubscribersFrance.py b/scripts/retrieveTop100SubscribersFrance.py
similarity index 81%
rename from retrieveTop100SubscribersFrance.py
rename to scripts/retrieveTop100SubscribersFrance.py
index 6c8df4f..b523194 100755
--- a/retrieveTop100SubscribersFrance.py
+++ b/scripts/retrieveTop100SubscribersFrance.py
@@ -1,4 +1,7 @@
+#!/usr/bin/python3
+
 # We can't proceed automatically by using `requests` Python module because https://socialblade.com/youtube/top/country/fr/mostsubscribed is protected by CloudFlare.
+# Note that `undetected-chromedriver` might be a workaround for this limitation.
 
 with open('mostsubscribed.html') as f:
     lines = f.read().splitlines()
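
As a companion to the `getJson` description in the `README.md` hunk above, here is a minimal Python sketch of the described key-rotation behavior. The actual implementation is the C++ `getJson` in `main.cpp`; the function name is taken from the README, but the URL layout and the quota check below are assumptions made for illustration only.

```python
# Illustrative sketch only: the real `getJson` lives in `main.cpp` (C++).
import requests

# `keys.txt` is expected to contain one YouTube Data API v3 key per line.
with open('keys.txt') as f:
    keys = [line.strip() for line in f if line.strip()]

keyIndex = 0

def getJson(request):
    """Return the JSON result of a YouTube Data API v3 request, rotating keys as required."""
    global keyIndex
    for _ in range(len(keys)):
        url = f'https://www.googleapis.com/youtube/v3/{request}&key={keys[keyIndex]}'
        data = requests.get(url).json()
        if 'error' not in data:
            return data
        # Assume an error means the current key has exhausted its quota:
        # rotate to the next key and retry with it.
        keyIndex = (keyIndex + 1) % len(keys)
    raise RuntimeError('All YouTube Data API v3 keys are exhausted.')
```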
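
Concerning the `undetected-chromedriver` note added to `retrieveTop100SubscribersFrance.py`: a possible shape of that workaround is sketched below, assuming the `undetected-chromedriver` package is installed; it is untested against SocialBlade's CloudFlare protection.

```python
# Hypothetical workaround sketch, assuming `pip install undetected-chromedriver`.
import undetected_chromedriver as uc

driver = uc.Chrome()
driver.get('https://socialblade.com/youtube/top/country/fr/mostsubscribed')

# Save the rendered page so the existing parsing of `mostsubscribed.html` still applies.
with open('mostsubscribed.html', 'w') as f:
    f.write(driver.page_source)

driver.quit()
```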