#35: Move Python scripts to scripts/ and describe the project structure in README.md

2023-02-26 15:12:06 +01:00
parent ff5542d8b0
commit e1aff6f469
7 changed files with 33 additions and 6 deletions


@@ -0,0 +1,14 @@
#!/usr/bin/python3
PREFIX = 'Channels per second: '
alreadyTreatedCommentsCount = 0
with open('nohup.out') as f:
    lines = f.read().splitlines()
for line in lines:
    if PREFIX in line:
        alreadyTreatedCommentsCount += int(line.split(PREFIX)[-1])
    #if 'UCsT0YIqwnpJCM-mx7-gSA4Q' in line:
    #    break
print(alreadyTreatedCommentsCount)


@@ -0,0 +1,37 @@
#!/usr/bin/python3
# This algorithm should also take into account other features that we use to retrieve channels.
import os, requests, json, time, datetime
path = 'channels/'
os.chdir(path)

def getTimestampFromDateString(dateString):
    return int(time.mktime(datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ").timetuple()))

# `next(os.walk('.'))[1]` lists the channel directories contained in `channels/`.
for channelId in next(os.walk('.'))[1]:
    #print(channelId)
    numberOfRequests = len(list(os.walk(f'{channelId}/requests'))[0][2]) - 1
    # Assume that the folder isn't empty (may not be the case, but it is most of the time).
    filePath = f'{channelId}/requests/{str(numberOfRequests - 1)}.json'
    with open(filePath) as f:
        print(filePath)
        #content = "\n".join(f.read().splitlines()[1:])
        data = json.load(f) #json.loads(content)
    snippet = data['items'][-1]['snippet']
    if 'topLevelComment' in snippet:
        snippet = snippet['topLevelComment']['snippet']
    latestTreatedCommentDate = snippet['publishedAt']
    url = f'https://yt.lemnoslife.com/noKey/channels?part=snippet&id={channelId}'
    content = requests.get(url).text
    data = json.loads(content)
    channelCreationDate = data['items'][0]['snippet']['publishedAt']
    #print(channelCreationDate)
    # Timing percentage, not taking into account the non-uniform distribution of comments over time. Note that when the last request lists replies to a comment, the percentage might go back a bit, as replies are posted after the initial comment.
    currentTimestamp = int(time.time())
    timingPercentage = round(100 * (currentTimestamp - getTimestampFromDateString(latestTreatedCommentDate)) / (currentTimestamp - getTimestampFromDateString(channelCreationDate)), 3)
    print(f'{channelId} {latestTreatedCommentDate} / {channelCreationDate} ({timingPercentage}%)')
    # Only the first channel directory is treated because of this `break`.
    break
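# Worked example with hypothetical numbers: for a channel created 100 days ago whose latest
# treated comment was posted 25 days ago, timingPercentage = round(100 * 25 / 100, 3) = 25.0,
# i.e. the treatment has worked its way back through roughly a quarter of the channel's lifetime,
# assuming comments were uniformly distributed over time.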


@@ -0,0 +1,16 @@
#!/usr/bin/python3
infix = ' comments were found for this channel.'
biggestCommentsCount = 0
with open('nohup.out') as f:
    lines = f.read().splitlines()
for line in lines:
    if infix in line:
        #print(line)
        commentsCount = int(line.split(': ')[-1].split(infix)[0])
        #print(commentsCount)
        if biggestCommentsCount < commentsCount:
            biggestCommentsCount = commentsCount
print(biggestCommentsCount)


@@ -0,0 +1,23 @@
#!/usr/bin/python3
import os, requests, json
channelIds = [channelId.replace('.zip', '') for channelId in next(os.walk('channels/'))[2]]
maxResults = 50
channelIdsChunks = [channelIds[i : i + maxResults] for i in range(0, len(channelIds), maxResults)]
mostSubscriberCount = 0
mostSubscriberChannel = None
for channelIdsChunk in channelIdsChunks:
    url = 'https://yt.lemnoslife.com/noKey/channels?part=statistics&id=' + ','.join(channelIdsChunk)
    content = requests.get(url).text
    data = json.loads(content)
    items = data['items']
    for item in items:
        subscriberCount = int(item['statistics']['subscriberCount'])
        if mostSubscriberCount < subscriberCount:
            mostSubscriberCount = subscriberCount
            mostSubscriberChannel = item['id']
print(mostSubscriberChannel, mostSubscriberCount)


@@ -0,0 +1,34 @@
#!/usr/bin/python3
import shutil, os
infix = ': Treating channel '
path = 'channels/'
threads = {}
with open('nohup.out') as f:
    lines = f.read().splitlines()
for line in lines:
    if infix in line:
        #print(line)
        # `threadId` is the second `': '`-separated field; `channelId` follows the infix, up to the next ` (`.
        threadId = line.split(': ')[1]
        channelId = line.split(infix)[1].split(' (')[0]
        if threadId.isdigit() and channelId.startswith('UC') and len(channelId) == 24:
            threads[threadId] = channelId
for threadId in threads:
    channelId = threads[threadId]
    print(threadId, channelId)
    # There are three cases:
    # - `channelId`/ exists
    # - `channelId`/ and `channelId`.zip exist
    # - `channelId`.zip exists
    # To manage every case, we need to use two `try`/`except`.
    try:
        shutil.rmtree(path + channelId)
    except FileNotFoundError:
        pass
    try:
        os.remove(path + channelId + ".zip")
    except FileNotFoundError:
        pass


@@ -0,0 +1,14 @@
#!/usr/bin/python3
# We can't proceed automatically by using the `requests` Python module because https://socialblade.com/youtube/top/country/fr/mostsubscribed is protected by Cloudflare.
# Note that `undetected-chromedriver` might be a workaround for this limitation.
with open('mostsubscribed.html') as f:
    lines = f.read().splitlines()
PREFIX = ' <a href = "/youtube/channel/'
for line in lines:
    if PREFIX in line:
        channelId = line.split(PREFIX)[1].split('">')[0]
        print(channelId)
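As the comment above suggests, `undetected-chromedriver` might get past the Cloudflare protection; a minimal, untested sketch (assuming the `undetected_chromedriver` package and a local Chrome installation) could save the rendered page for the parsing loop above:

#!/usr/bin/python3
# Hypothetical sketch: fetch the Cloudflare-protected Social Blade page with undetected-chromedriver
# instead of `requests`, then save it as `mostsubscribed.html` for the parsing script above.
import undetected_chromedriver as uc

driver = uc.Chrome()
driver.get('https://socialblade.com/youtube/top/country/fr/mostsubscribed')
with open('mostsubscribed.html', 'w') as f:
    f.write(driver.page_source)
driver.quit()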