#35: Move Python scripts to scripts/ and describe the project structure in README.md

2023-02-26 15:12:06 +01:00
parent ff5542d8b0
commit e1aff6f469
7 changed files with 33 additions and 6 deletions


@@ -0,0 +1,14 @@
#!/usr/bin/python3
PREFIX = 'Channels per second: '
alreadyTreatedCommentsCount = 0
with open('nohup.out') as f:
    lines = f.read().splitlines()
for line in lines:
    if PREFIX in line:
        alreadyTreatedCommentsCount += int(line.split(PREFIX)[-1])
    #if 'UCsT0YIqwnpJCM-mx7-gSA4Q' in line:
    #    break
print(alreadyTreatedCommentsCount)


@@ -0,0 +1,37 @@
#!/usr/bin/python3
# This algorithm should also take into account other features that we use to retrieve channels.
import os, requests, json, time, datetime
path = 'channels/'
os.chdir(path)

def getTimestampFromDateString(dateString):
    return int(time.mktime(datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ").timetuple()))

# `next(os.walk('.'))[1]` lists the channel directories contained in `channels/`.
for channelId in next(os.walk('.'))[1]:
    #print(channelId)
    numberOfRequests = len(list(os.walk(f'{channelId}/requests'))[0][2]) - 1
    # Assume that the folder isn't empty (may not be the case, but it is most of the time).
    filePath = f'{channelId}/requests/{str(numberOfRequests - 1)}.json'
    with open(filePath) as f:
        print(filePath)
        #content = "\n".join(f.read().splitlines()[1:])
        data = json.load(f) #json.loads(content)
    snippet = data['items'][-1]['snippet']
    if 'topLevelComment' in snippet:
        snippet = snippet['topLevelComment']['snippet']
    latestTreatedCommentDate = snippet['publishedAt']
    url = f'https://yt.lemnoslife.com/noKey/channels?part=snippet&id={channelId}'
    content = requests.get(url).text
    data = json.loads(content)
    channelCreationDate = data['items'][0]['snippet']['publishedAt']
    #print(channelCreationDate)
    # Timing percentage, not taking into account the non-uniform distribution of comments over time. Note that when the last request lists replies to a comment, the percentage might go back a bit, as replies are posted after the initial comment.
    currentTimestamp = int(time.time())
    timingPercentage = round(100 * (currentTimestamp - getTimestampFromDateString(latestTreatedCommentDate)) / (currentTimestamp - getTimestampFromDateString(channelCreationDate)), 3)
    print(f'{channelId} {latestTreatedCommentDate} / {channelCreationDate} ({timingPercentage}%)')
    # Only the first channel directory is treated because of this `break`.
    break
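# Worked example with hypothetical numbers: for a channel created 100 days ago whose latest
# treated comment was posted 25 days ago, timingPercentage = round(100 * 25 / 100, 3) = 25.0,
# i.e. the treatment has worked its way back through roughly a quarter of the channel's lifetime,
# assuming comments were uniformly distributed over time.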


@@ -0,0 +1,16 @@
#!/usr/bin/python3
infix = ' comments were found for this channel.'
biggestCommentsCount = 0
with open('nohup.out') as f:
    lines = f.read().splitlines()
for line in lines:
    if infix in line:
        #print(line)
        commentsCount = int(line.split(': ')[-1].split(infix)[0])
        #print(commentsCount)
        if biggestCommentsCount < commentsCount:
            biggestCommentsCount = commentsCount
print(biggestCommentsCount)


@@ -0,0 +1,23 @@
#!/usr/bin/python3
import os, requests, json
channelIds = [channelId.replace('.zip', '') for channelId in next(os.walk('channels/'))[2]]
maxResults = 50
channelIdsChunks = [channelIds[i : i + maxResults] for i in range(0, len(channelIds), maxResults)]
mostSubscriberCount = 0
mostSubscriberChannel = None
for channelIdsChunk in channelIdsChunks:
    url = 'https://yt.lemnoslife.com/noKey/channels?part=statistics&id=' + ','.join(channelIdsChunk)
    content = requests.get(url).text
    data = json.loads(content)
    items = data['items']
    for item in items:
        subscriberCount = int(item['statistics']['subscriberCount'])
        if mostSubscriberCount < subscriberCount:
            mostSubscriberCount = subscriberCount
            mostSubscriberChannel = item['id']
print(mostSubscriberChannel, mostSubscriberCount)


@@ -0,0 +1,34 @@
#!/usr/bin/python3
import shutil, os
infix = ': Treating channel '
path = 'channels/'
threads = {}
with open('nohup.out') as f:
    lines = f.read().splitlines()
for line in lines:
    if infix in line:
        #print(line)
        # `threadId` is the second `': '`-separated field; `channelId` follows the infix, up to the next ` (`.
        threadId = line.split(': ')[1]
        channelId = line.split(infix)[1].split(' (')[0]
        if threadId.isdigit() and channelId.startswith('UC') and len(channelId) == 24:
            threads[threadId] = channelId
for threadId in threads:
    channelId = threads[threadId]
    print(threadId, channelId)
    # There are three cases:
    # - `channelId`/ exists
    # - `channelId`/ and `channelId`.zip exist
    # - `channelId`.zip exists
    # To manage every case, we need to use two `try`/`except`.
    try:
        shutil.rmtree(path + channelId)
    except FileNotFoundError:
        pass
    try:
        os.remove(path + channelId + ".zip")
    except FileNotFoundError:
        pass


@@ -0,0 +1,14 @@
#!/usr/bin/python3
# We can't proceed automatically by using the `requests` Python module because https://socialblade.com/youtube/top/country/fr/mostsubscribed is protected by Cloudflare.
# Note that `undetected-chromedriver` might be a workaround for this limitation.
with open('mostsubscribed.html') as f:
    lines = f.read().splitlines()
PREFIX = ' <a href = "/youtube/channel/'
for line in lines:
    if PREFIX in line:
        channelId = line.split(PREFIX)[1].split('">')[0]
        print(channelId)
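As the comment above suggests, `undetected-chromedriver` might get past the Cloudflare protection; a minimal, untested sketch (assuming the `undetected_chromedriver` package and a local Chrome installation) could save the rendered page for the parsing loop above:

#!/usr/bin/python3
# Hypothetical sketch: fetch the Cloudflare-protected Social Blade page with undetected-chromedriver
# instead of `requests`, then save it as `mostsubscribed.html` for the parsing script above.
import undetected_chromedriver as uc

driver = uc.Chrome()
driver.get('https://socialblade.com/youtube/top/country/fr/mostsubscribed')
with open('mostsubscribed.html', 'w') as f:
    f.write(driver.page_source)
driver.quit()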