#!/usr/bin/python3
# findLatestTreatedCommentsForChannelsBeingTreated.py
#
# Added by commit f4360078364ed5482f630bb7a6b6bb0328da3e87
# (Benjamin Loison, Fri, 6 Jan 2023 17:51:00 +0100):
# "Fix #16: Provide an algorithm to determine the progress of retrieving
# comments for huge YouTube channels".
"""Report the comment retrieval progress of every channel folder.

For each sub-directory of ``path`` (one per channel id), the script reads
the highest-numbered ``<n>.json`` request file, takes the publication date of
its last item, asks the ``yt.lemnoslife.com`` API for the channel creation
date and prints one line of the form::

    <channelId> <latestTreatedCommentDate> / <channelCreationDate> (<pct>%)
"""

import calendar
import datetime
import json
import os
import sys
import time

# Directory holding one sub-directory per channel id.
path = 'channels/'

# Seconds to wait for the channel API before giving up on a request.
REQUEST_TIMEOUT_SECONDS = 30


def getTimestampFromDateString(dateString):
    """Return the Unix timestamp of a ``YYYY-MM-DDTHH:MM:SSZ`` date string.

    The trailing ``Z`` marks the date as UTC, so the conversion is done with
    ``calendar.timegm`` (UTC) and not ``time.mktime`` (local time); the result
    therefore does not depend on the time zone of the machine.

    Raises:
        ValueError: if ``dateString`` does not match the expected format.
    """
    parsedDate = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
    return calendar.timegm(parsedDate.timetuple())


def extractLatestCommentDate(data):
    """Return the ``publishedAt`` value of the last item of a parsed response.

    An item whose snippet contains a ``topLevelComment`` is a comment thread:
    the date is then read from the nested comment snippet.

    Raises:
        KeyError, IndexError: if ``data`` has no item or lacks the keys read.
    """
    snippet = data['items'][-1]['snippet']
    if 'topLevelComment' in snippet:
        snippet = snippet['topLevelComment']['snippet']
    return snippet['publishedAt']


def computeTimingPercentage(latestTreatedCommentDate, channelCreationDate,
                            currentTimestamp=None):
    """Return the share of the channel lifetime lying after the latest comment.

    The value is ``100 * (now - latest) / (now - creation)`` rounded to three
    decimals. It does not take into account the non-uniform distribution of
    comments over time.

    Args:
        latestTreatedCommentDate: UTC date string of the latest treated comment.
        channelCreationDate: UTC date string of the channel creation.
        currentTimestamp: Unix timestamp used as "now"; defaults to the
            current time.
    """
    if currentTimestamp is None:
        currentTimestamp = int(time.time())
    treatedSpan = currentTimestamp - getTimestampFromDateString(latestTreatedCommentDate)
    channelSpan = currentTimestamp - getTimestampFromDateString(channelCreationDate)
    return round(100 * treatedSpan / channelSpan, 3)


def listChannelIds(channelsPath):
    """Return the sorted names of the immediate sub-directories of a folder."""
    with os.scandir(channelsPath) as entries:
        return sorted(entry.name for entry in entries if entry.is_dir())


def loadLatestTreatedCommentDate(channelDirectory):
    """Return the date of the last comment stored for a channel, or ``None``.

    The request files are expected to be named ``0.json`` to ``<n - 1>.json``,
    so the number of files in the folder gives the name of the latest one.
    ``None`` is returned when the folder contains no file at all.
    """
    with os.scandir(channelDirectory) as entries:
        numberOfRequests = sum(1 for entry in entries if entry.is_file())
    if numberOfRequests == 0:
        return None
    latestFilePath = os.path.join(channelDirectory, f'{numberOfRequests - 1}.json')
    with open(latestFilePath) as f:
        # The first line of a request file is not part of the JSON payload.
        f.readline()
        data = json.load(f)
    return extractLatestCommentDate(data)


def fetchChannelCreationDate(channelId):
    """Return the ``publishedAt`` date of a channel as given by the API.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
    """
    # Imported here so that the helpers above remain usable (and testable)
    # without the third-party dependency and without network access.
    import requests

    url = f'https://yt.lemnoslife.com/noKey/channels?part=snippet&id={channelId}'
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    data = json.loads(response.text)
    return data['items'][0]['snippet']['publishedAt']


def main():
    """Print one progress line per channel folder found in ``path``."""
    for channelId in listChannelIds(path):
        latestTreatedCommentDate = loadLatestTreatedCommentDate(os.path.join(path, channelId))
        if latestTreatedCommentDate is None:
            print(f'{channelId}: no request file found, skipping', file=sys.stderr)
            continue
        channelCreationDate = fetchChannelCreationDate(channelId)
        timingPercentage = computeTimingPercentage(latestTreatedCommentDate, channelCreationDate)
        print(f'{channelId} {latestTreatedCommentDate} / {channelCreationDate} ({timingPercentage}%)')


if __name__ == '__main__':
    main()