#35: Move Python scripts to scripts/ and describe the project structure in README.md
parent ff5542d8b0
commit e1aff6f469
README.md | 18
@@ -9,6 +9,24 @@ A ready to be used by the end-user website instance of this project is hosted at
 See more details on [the Wiki](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/wiki).
 
+# The project structure:
+
+- `main.cpp` contains the C++ multi-threaded algorithm performing the YouTube channels discovery. It notably consists of the following functions:
+    - `main` which takes into account the command line arguments, loads variables from files (`channels.txt`, `keys.txt`, `channels/` content) and starts the threads executing the `treatChannels` function
+    - `treatChannels` which gets a YouTube channel to treat, treats it in the `treatChannelOrVideo` function and compresses the retrieved data
+    - `treatChannelOrVideo` which, provided a YouTube channel id or a video id, treats this resource. In both cases it treats the comments left on it. In the case of a channel it also treats its `CHANNELS`, `COMMUNITY`, `PLAYLISTS` and `LIVE` tabs and downloads the captions of the channel videos.
+    - `markChannelAsRequiringTreatmentIfNeeded` which, provided a YouTube channel id, marks it as requiring treatment if it wasn't already treated
+    - `execute` which, provided a `yt-dlp` command, executes it in a shell
+    - `getJson` which, provided an API request, returns a JSON structure with its result. If the requested API is the YouTube Data API v3 and a set of keys is provided (see `keys.txt` below), it rotates the keys as required (an illustrative sketch follows this diff)
+- `channels.txt` contains a starting set of channels, mostly the 100 most subscribed French channels
+- `keys.txt` contains a set of YouTube Data API v3 keys (not provided) to be able to request this API (see the `--no-keys` command line argument in the section below for an alternative to filling it)
+- `scripts/` contains Python scripts to:
+    - generate `channels.txt` as described above (`retrieveTop100SubscribersFrance.py`)
+    - remove channels being treated before a restart of the algorithm, as described in [the `main` function documentation](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/src/commit/8dd89e6e881da0a905b6fa4b23775c4344dd0d9d/main.cpp#L126-L128) (`removeChannelsBeingTreated.py`)
+- `website/` is a PHP website using WebSocket to let the end-user run requests against the retrieved dataset. When fetching the website, the end-user receives the interpreted `index.php` which, upon a request, interacts with `websocket.php`, which in the back-end dispatches the requests from the various end-users to `search.py` (which treats the actual end-user request on the compressed dataset), using `users/` for the inter-process communication (a hypothetical sketch also follows this diff).
+
+Note that this project heavily relies on [YouTube operational API](https://github.com/Benjamin-Loison/YouTube-operational-API) [which was modified for this project](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/wiki/YouTube-operational-API-commits).
+
 # Running the YouTube graph discovery algorithm:
 
 Because of [the current compression mechanism](https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions_search_engine/issues/30), Linux is the only known OS able to run this algorithm.
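The key rotation the new README section describes for `getJson` can be pictured with a minimal Python sketch. The real implementation is the C++ one in `main.cpp`; the function shape, the error check and the URL layout below are illustrative assumptions, not the actual code:

```python
import requests

# Illustrative sketch only: `getJson` is implemented in C++ in `main.cpp`.
# Assumed behavior: try keys from `keys.txt` in order and move on to the
# next one when the current key is rejected (e.g. its quota is exhausted).
def getJsonSketch(request, keys):
    for key in keys:
        response = requests.get(f'https://www.googleapis.com/youtube/v3/{request}&key={key}').json()
        if 'error' in response:
            continue  # rotate to the next YouTube Data API v3 key
        return response
    raise RuntimeError('All YouTube Data API v3 keys are exhausted.')

# Usage sketch:
# keys = open('keys.txt').read().splitlines()
# channel = getJsonSketch('channels?part=snippet&id=<channelId>', keys)
```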
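Similarly, the `users/` inter-process communication between `websocket.php` and `search.py` could look like the following purely hypothetical Python sketch; the `request.txt`/`response.txt` file names and the polling loop are assumptions, as the commit doesn't show the actual protocol:

```python
import os, time

# Purely hypothetical sketch of the search.py side of the `users/` IPC:
# one directory per end-user, holding an assumed request.txt/response.txt pair.
USERS = 'users/'

while True:
    for user in next(os.walk(USERS))[1]:
        requestPath = os.path.join(USERS, user, 'request.txt')
        if os.path.exists(requestPath):
            with open(requestPath) as f:
                query = f.read()
            # ... search `query` in the compressed dataset ...
            with open(os.path.join(USERS, user, 'response.txt'), 'w') as f:
                f.write(f'results for {query}')
            os.remove(requestPath)
    time.sleep(0.1)
```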
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-PREFIX = 'Comments per second: '
+PREFIX = 'Channels per second: '
 alreadyTreatedCommentsCount = 0
 
 with open('nohup.out') as f:
@@ -8,5 +8,7 @@ with open('nohup.out') as f:
     for line in lines:
         if PREFIX in line:
             alreadyTreatedCommentsCount += int(line.split(PREFIX)[-1])
+            #if 'UCsT0YIqwnpJCM-mx7-gSA4Q' in line:
+            #    break
 
 print(alreadyTreatedCommentsCount)
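For context, this script sums the throughput figures that the discovery algorithm logs to `nohup.out`. A self-contained demonstration of the summing logic on made-up log lines (the log excerpt is an assumption about the log format):

```python
# Made-up nohup.out excerpt; assumption: the algorithm periodically logs
# lines of the form 'Channels per second: <count>'.
lines = [
    'Channels per second: 3',
    'some unrelated log line',
    'Channels per second: 5',
]

PREFIX = 'Channels per second: '
count = 0
for line in lines:
    if PREFIX in line:
        count += int(line.split(PREFIX)[-1])

print(count)  # 8
```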
@@ -1,5 +1,7 @@
 #!/usr/bin/python3
 
+# This algorithm should also take into account other features that we use to retrieve channels.
+
 import os, requests, json, time, datetime
 
 path = 'channels/'
@@ -12,11 +14,13 @@ def getTimestampFromDateString(dateString):
 for channelId in list(os.walk('.'))[1]:
     channelId = channelId[2:]
     #print(channelId)
-    numberOfRequests = len(list(os.walk(channelId))[0][2])
+    numberOfRequests = len(list(os.walk(f'{channelId}/requests'))[0][2]) - 1
     # Assume that the folder isn't empty (may not be the case, but it is most of the time).
-    with open(f'{channelId}/{str(numberOfRequests - 1)}.json') as f:
-        content = "\n".join(f.read().splitlines()[1:])
-        data = json.loads(content)
+    filePath = f'{channelId}/requests/{str(numberOfRequests - 1)}.json'
+    with open(filePath) as f:
+        print(filePath)
+        #content = "\n".join(f.read().splitlines()[1:])
+        data = json.load(f)#json.loads(content)
     snippet = data['items'][-1]['snippet']
     if 'topLevelComment' in snippet:
         snippet = snippet['topLevelComment']['snippet']
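A note on the `os.walk` idiom these scripts rely on, since the indexing is terse: each step of `os.walk` yields a `(dirpath, dirnames, filenames)` tuple, so `list(os.walk(directory))[0][2]` is the list of file names directly inside `directory`. A quick self-contained check (the temporary directory stands in for a `channels/<channelId>/requests` folder):

```python
import os, tempfile

with tempfile.TemporaryDirectory() as requestsDir:
    # Create two fake request files, like 0.json and 1.json in the diff above.
    for name in ('0.json', '1.json'):
        open(os.path.join(requestsDir, name), 'w').close()
    dirpath, dirnames, filenames = next(os.walk(requestsDir))
    print(len(filenames))  # 2, i.e. len(list(os.walk(requestsDir))[0][2])
```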
@@ -2,7 +2,7 @@
 
 import os, requests, json
 
-channelIds = next(os.walk('channels/'))[1]
+channelIds = [channelId.replace('.zip', '') for channelId in next(os.walk('channels/'))[2]]
 maxResults = 50
 
 channelIdsChunks = [channelIds[i : i + maxResults] for i in range(0, len(channelIds), maxResults)]
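The 50-id chunking above matches the YouTube Data API v3 limit of at most 50 comma-separated ids per request. A sketch of how such chunks could be consumed (the `part` value and the key handling are assumptions, not this script's actual code):

```python
import os, requests

# Assumption for the sketch: use a single key, e.g. the first line of keys.txt.
key = open('keys.txt').read().splitlines()[0]

channelIds = [channelId.replace('.zip', '') for channelId in next(os.walk('channels/'))[2]]
maxResults = 50
channelIdsChunks = [channelIds[i : i + maxResults] for i in range(0, len(channelIds), maxResults)]

for channelIdsChunk in channelIdsChunks:
    # One `channels` endpoint request treats up to 50 channels at once.
    url = f'https://www.googleapis.com/youtube/v3/channels?part=snippet&id={",".join(channelIdsChunk)}&key={key}'
    data = requests.get(url).json()
    print(len(data.get('items', [])))
```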
@@ -1,4 +1,7 @@
+#!/usr/bin/python3
+
 # We can't proceed automatically by using `requests` Python module because https://socialblade.com/youtube/top/country/fr/mostsubscribed is protected by CloudFlare.
+# Note that `undetected-chromedriver` might be a workaround for this limitation.
 
 with open('mostsubscribed.html') as f:
     lines = f.read().splitlines()
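The `undetected-chromedriver` workaround suggested in the added comment could look like this sketch (an assumption, not part of the commit): fetch the CloudFlare-protected page with a detection-evading Chrome instance and save it as the `mostsubscribed.html` the script expects.

```python
import undetected_chromedriver as uc

# Hypothetical workaround sketch: download the page that the script
# otherwise expects to be saved manually as mostsubscribed.html.
driver = uc.Chrome()
driver.get('https://socialblade.com/youtube/top/country/fr/mostsubscribed')
with open('mostsubscribed.html', 'w') as f:
    f.write(driver.page_source)
driver.quit()
```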