#35: Move Python scripts to scripts/
and describe the project structure in README.md
This commit is contained in:
14
scripts/findAlreadyTreatedCommentsCount.py
Executable file
14
scripts/findAlreadyTreatedCommentsCount.py
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/python3

# Sum the per-line counters logged to `nohup.out` to obtain the total number
# of already treated comments.

# Lines of interest end with the count, e.g. `... Channels per second: <count>`.
PREFIX = 'Channels per second: '

alreadyTreatedCommentsCount = 0

# Iterate the log lazily instead of `f.read().splitlines()`, as `nohup.out`
# can grow large for a long-running crawl; `int()` tolerates the trailing
# newline kept by file iteration.
with open('nohup.out') as f:
    for line in f:
        if PREFIX in line:
            # Keep only what follows the (last occurrence of the) prefix.
            alreadyTreatedCommentsCount += int(line.split(PREFIX)[-1])

print(alreadyTreatedCommentsCount)
|
37
scripts/findLatestTreatedCommentsForChannelsBeingTreated.py
Executable file
37
scripts/findLatestTreatedCommentsForChannelsBeingTreated.py
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/python3

# For a channel being treated, print how far its comment crawl has progressed
# in time, as a percentage between the channel creation date and now.
# This algorithm should also take into account other features that we use to
# retrieve channels.

import os, requests, json, time, datetime

os.chdir('channels/')


def getTimestampFromDateString(dateString):
    # Convert an ISO-8601 UTC date string (`%Y-%m-%dT%H:%M:%SZ`) to a Unix timestamp.
    return int(time.mktime(datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ").timetuple()))


for channelId in list(os.walk('.'))[1]:
    # Strip the leading `./` from the walked path.
    channelId = channelId[2:]
    numberOfRequests = len(list(os.walk(f'{channelId}/requests'))[0][2]) - 1
    # Assume that the folder isn't empty (may not be the case, but it is most of the time).
    filePath = f'{channelId}/requests/{numberOfRequests - 1}.json'
    with open(filePath) as f:
        print(filePath)
        data = json.load(f)
        snippet = data['items'][-1]['snippet']
        # A reply carries its snippet one level deeper than a top-level comment.
        if 'topLevelComment' in snippet:
            snippet = snippet['topLevelComment']['snippet']
        latestTreatedCommentDate = snippet['publishedAt']
        url = f'https://yt.lemnoslife.com/noKey/channels?part=snippet&id={channelId}'
        content = requests.get(url).text
        data = json.loads(content)
        channelCreationDate = data['items'][0]['snippet']['publishedAt']
        # Timing percentage not taking into account the not uniform in time
        # distribution of comments. Note that in the case of the last request
        # is to list replies to a comment, the percentage might goes a bit
        # backward, as replies are posted after the initial comment.
        currentTimestamp = int(time.time())
        timingPercentage = round(100 * (currentTimestamp - getTimestampFromDateString(latestTreatedCommentDate)) / (currentTimestamp - getTimestampFromDateString(channelCreationDate)), 3)
        print(f'{channelId} {latestTreatedCommentDate} / {channelCreationDate} ({timingPercentage}%)')
    # Only the first walked entry is treated.
    break
|
16
scripts/findTreatedChannelWithMostComments.py
Executable file
16
scripts/findTreatedChannelWithMostComments.py
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/python3

# Find the largest number of comments found for a single treated channel,
# according to the `nohup.out` log.

# Lines of interest look like: `...: <count> comments were found for this channel.`
infix = ' comments were found for this channel.'

biggestCommentsCount = 0

# Iterate the log lazily instead of `f.read().splitlines()`, as `nohup.out`
# can grow large for a long-running crawl.
with open('nohup.out') as f:
    for line in f:
        if infix in line:
            # The count sits between the last `: ` and the infix.
            commentsCount = int(line.split(': ')[-1].split(infix)[0])
            biggestCommentsCount = max(biggestCommentsCount, commentsCount)

print(biggestCommentsCount)
|
23
scripts/findTreatedChannelWithMostSubscribers.py
Executable file
23
scripts/findTreatedChannelWithMostSubscribers.py
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/python3

# Among the already treated channels (the `channels/*.zip` archives), find the
# one with the most subscribers, using the YouTube Data API v3 `channels`
# endpoint through the no-key proxy.

import os, requests, json

# Archive names are `<channel id>.zip`; strip only the trailing `.zip` suffix
# (the previous `replace('.zip', '')` would also have removed a `.zip`
# occurring anywhere inside the name).
channelIds = [fileName[:-len('.zip')] if fileName.endswith('.zip') else fileName
              for fileName in next(os.walk('channels/'))[2]]

# The `channels` endpoint accepts at most 50 ids per request.
maxResults = 50

channelIdsChunks = [channelIds[i : i + maxResults] for i in range(0, len(channelIds), maxResults)]
mostSubscriberCount = 0
mostSubscriberChannel = None

# Use a distinct loop variable so the full `channelIds` list isn't shadowed.
for channelIdsChunk in channelIdsChunks:
    url = 'https://yt.lemnoslife.com/noKey/channels?part=statistics&id=' + ','.join(channelIdsChunk)
    content = requests.get(url).text
    data = json.loads(content)
    for item in data['items']:
        subscriberCount = int(item['statistics']['subscriberCount'])
        if mostSubscriberCount < subscriberCount:
            mostSubscriberCount = subscriberCount
            mostSubscriberChannel = item['id']

print(mostSubscriberChannel, mostSubscriberCount)
|
34
scripts/removeChannelsBeingTreated.py
Executable file
34
scripts/removeChannelsBeingTreated.py
Executable file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/python3

# Remove the partial data of the channels that were still being treated
# (one per worker thread, according to the `nohup.out` log), so they will be
# treated again from scratch.

import shutil, os

# Log lines of interest look like: `<thread id>: Treating channel <channel id> (...`.
infix = ': Treating channel '
path = 'channels/'

# Map each thread id to the channel it was last treating; later log lines
# overwrite earlier ones, so only the most recent channel per thread remains.
threads = {}

# Iterate the log lazily instead of `f.read().splitlines()`; strip the
# newline explicitly to keep the exact `splitlines()` field semantics.
with open('nohup.out') as f:
    for line in f:
        line = line.rstrip('\n')
        if infix in line:
            threadId = line.split(': ')[1]
            channelId = line.split(infix)[1].split(' (')[0]
            # Only keep well-formed entries: a numeric thread id and a
            # 24-character channel id starting with `UC`.
            if threadId.isdigit() and channelId.startswith('UC') and len(channelId) == 24:
                threads[threadId] = channelId

for threadId, channelId in threads.items():
    print(threadId, channelId)
    # There are three cases:
    # - `channelId`/ exists
    # - `channelId`/ and `channelId`.zip exist
    # - `channelId`.zip exists
    # To manage every case, attempt both removals and ignore the missing one.
    # `OSError` (instead of a bare `except:`) keeps the best-effort behavior
    # without swallowing e.g. KeyboardInterrupt.
    try:
        shutil.rmtree(path + channelId)
    except OSError:
        pass
    try:
        os.remove(path + channelId + ".zip")
    except OSError:
        pass
|
14
scripts/retrieveTop100SubscribersFrance.py
Executable file
14
scripts/retrieveTop100SubscribersFrance.py
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/usr/bin/python3

# Extract channel ids from a manually saved copy of the Social Blade
# "most subscribed in France" page (`mostsubscribed.html`).
# We can't proceed automatically by using `requests` Python module because
# https://socialblade.com/youtube/top/country/fr/mostsubscribed is protected
# by CloudFlare.
# Note that `undetected-chromedriver` might be a workaround for this limitation.

# Anchor lines look like: `<PREFIX><channel id>">...`.
PREFIX = ' <a href = "/youtube/channel/'

with open('mostsubscribed.html') as f:
    lines = f.read().splitlines()

for line in lines:
    if PREFIX in line:
        # The channel id sits between the prefix and the closing `">`.
        print(line.split(PREFIX)[1].split('">')[0])
|
Reference in New Issue
Block a user