YouTube_captions_search_engine/scripts/findLatestTreatedCommentsForChannelsBeingTreated.py

#!/usr/bin/python3

# This algorithm should also take into account the other features that we use to retrieve channels.

import os, requests, json, time, datetime

# Folder the script works in, relative to the directory the script is launched from.
# The loop below expects it to contain `<channelId>/requests/` sub-folders.
path = 'channels/'

# Switch the working directory so that every relative path built below resolves inside `path`.
os.chdir(path)

def getTimestampFromDateString(dateString):
    """Convert a UTC date string to a Unix timestamp in whole seconds.

    Accepted forms are `YYYY-MM-DDTHH:MM:SSZ` and the same with fractional
    seconds (`YYYY-MM-DDTHH:MM:SS.ffffffZ`); the fraction is truncated.

    The trailing `Z` means UTC, so the parsed date is explicitly tagged as UTC
    before conversion. Going through `time.mktime` would interpret it as local
    time and shift the result by the machine's UTC offset.

    Raises `ValueError` if `dateString` matches neither form.
    """
    dateFormat = '%Y-%m-%dT%H:%M:%S.%fZ' if '.' in dateString else '%Y-%m-%dT%H:%M:%SZ'
    parsedDate = datetime.datetime.strptime(dateString, dateFormat)
    return int(parsedDate.replace(tzinfo=datetime.timezone.utc).timestamp())

def findLatestRequestFilePath(channelId):
    """Return the path of the latest request file of `channelId`, or `None` if there is none.

    The file index is derived from the number of files in `<channelId>/requests`:
    one file is discounted and the remaining ones are taken to be numbered from
    `0.json` upwards.
    NOTE(review): presumably the discounted file is a non-JSON bookkeeping file
    stored next to the responses - confirm against the code writing this folder.
    """
    requestsFolder = f'{channelId}/requests'
    # The first tuple yielded by `os.walk` describes the folder itself; a missing folder yields nothing.
    fileNames = next(os.walk(requestsFolder), (None, [], []))[2]
    numberOfRequests = len(fileNames) - 1
    if numberOfRequests < 1:
        return None
    return f'{requestsFolder}/{numberOfRequests - 1}.json'


def getLatestTreatedCommentDate(filePath):
    """Return the `publishedAt` date of the last item stored in `filePath`, or `None` if it has no items.

    The last item is either a comment thread, whose date lives in its
    `topLevelComment`, or an item carrying `publishedAt` directly in its snippet.
    """
    with open(filePath, encoding='utf-8') as f:
        data = json.load(f)
    items = data.get('items')
    if not items:
        return None
    snippet = items[-1]['snippet']
    if 'topLevelComment' in snippet:
        snippet = snippet['topLevelComment']['snippet']
    return snippet['publishedAt']


def getChannelCreationDate(channelId):
    """Return the `publishedAt` date of the channel `channelId`, as reported by the remote API.

    Raises `requests.RequestException` on network or HTTP errors and
    `KeyError`/`IndexError` if the response does not describe the channel.
    """
    url = f'https://yt.lemnoslife.com/noKey/channels?part=snippet&id={channelId}'
    # A timeout prevents a stalled connection from blocking the script forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()['items'][0]['snippet']['publishedAt']


# Every immediate sub-folder of the working directory is treated as a channel, named by its channel id.
# `next(os.walk('.'))[1]` is the list of those sub-folder names; sorting makes the output order stable.
for channelId in sorted(next(os.walk('.'), (None, [], []))[1]):
    filePath = findLatestRequestFilePath(channelId)
    if filePath is None:
        print(f'{channelId}: no request file found, skipping')
        continue
    print(filePath)
    try:
        latestTreatedCommentDate = getLatestTreatedCommentDate(filePath)
        if latestTreatedCommentDate is None:
            print(f'{channelId}: {filePath} contains no item, skipping')
            continue
        channelCreationDate = getChannelCreationDate(channelId)
    except (OSError, ValueError, KeyError, IndexError, requests.RequestException) as error:
        # A single unreadable file or failed request must not prevent reporting the other channels.
        print(f'{channelId}: skipped ({error!r})')
        continue
    # The timing percentage does not take into account that comments are not uniformly distributed over time.
    # Note that when the last request lists the replies to a comment, the percentage might go a bit backward,
    # as replies are posted after the initial comment.
    # NOTE(review): presumably comments are retrieved from newest to oldest, so this is the share of the
    # channel's lifetime already covered - confirm.
    currentTimestamp = int(time.time())
    timingPercentage = round(100 * (currentTimestamp - getTimestampFromDateString(latestTreatedCommentDate)) / (currentTimestamp - getTimestampFromDateString(channelCreationDate)), 3)
    print(f'{channelId} {latestTreatedCommentDate} / {channelCreationDate} ({timingPercentage}%)')
Fix #16: Provide an algorithm to determine the progress of retrieving comments for huge YouTube channels 2023-01-06 17:51:00 +01:00			`#!/usr/bin/python3`

#35: Move Python scripts to `scripts/` and describe the project structure in `README.md` 2023-02-26 15:12:06 +01:00			`# This algorithm should also take in account other features that we use to retrieve channels.`

Fix #16: Provide an algorithm to determine the progress of retrieving comments for huge YouTube channels 2023-01-06 17:51:00 +01:00			`import os, requests, json, time, datetime`

			`path = 'channels/'`

			`os.chdir(path)`

			`def getTimestampFromDateString(dateString):`
Simplify `scripts/findLatestTreatedCommentsForChannelsBeingTreated.py` 2023-07-28 11:32:52 +02:00			`return int(time.mktime(datetime.datetime.strptime(dateString, '%Y-%m-%dT%H:%M:%SZ').timetuple()))`
Fix #16: Provide an algorithm to determine the progress of retrieving comments for huge YouTube channels 2023-01-06 17:51:00 +01:00
			`for channelId in list(os.walk('.'))[1]:`
			`channelId = channelId[2:]`
			`#print(channelId)`
#35: Move Python scripts to `scripts/` and describe the project structure in `README.md` 2023-02-26 15:12:06 +01:00			`numberOfRequests = len(list(os.walk(f'{channelId}/requests'))[0][2]) - 1`
Fix #16: Provide an algorithm to determine the progress of retrieving comments for huge YouTube channels 2023-01-06 17:51:00 +01:00			`# Assume that the folder isn't empty (may not be the case, but it is most of the time).`
#35: Move Python scripts to `scripts/` and describe the project structure in `README.md` 2023-02-26 15:12:06 +01:00			`filePath = f'{channelId}/requests/{str(numberOfRequests - 1)}.json'`
			`with open(filePath) as f:`
			`print(filePath)`
			`#content = "\n".join(f.read().splitlines()[1:])`
			`data = json.load(f)#json.loads(content)`
Fix #16: Provide an algorithm to determine the progress of retrieving comments for huge YouTube channels 2023-01-06 17:51:00 +01:00			`snippet = data['items'][-1]['snippet']`
			`if 'topLevelComment' in snippet:`
			`snippet = snippet['topLevelComment']['snippet']`
			`latestTreatedCommentDate = snippet['publishedAt']`
			`url = f'https://yt.lemnoslife.com/noKey/channels?part=snippet&id={channelId}'`
Simplify `scripts/findLatestTreatedCommentsForChannelsBeingTreated.py` 2023-07-28 11:32:52 +02:00			`data = requests.get(url).json()`
Fix #16: Provide an algorithm to determine the progress of retrieving comments for huge YouTube channels 2023-01-06 17:51:00 +01:00			`channelCreationDate = data['items'][0]['snippet']['publishedAt']`
			`#print(channelCreationDate)`
Add a note about the timing percentage of `findLatestTreatedCommentsForChannelsBeingTreated.py` going backward 2023-01-07 15:35:12 +01:00			`# Timing percentage not taking into account the not uniform in time distribution of comments. Note that in the case of the last request is to list replies to a comment, the percentage might goes a bit backward, as replies are posted after the initial comment.`
Fix #16: Provide an algorithm to determine the progress of retrieving comments for huge YouTube channels 2023-01-06 17:51:00 +01:00			`currentTimestamp = int(time.time())`
			`timingPercentage = round(100 * (currentTimestamp - getTimestampFromDateString(latestTreatedCommentDate)) / (currentTimestamp - getTimestampFromDateString(channelCreationDate)), 3)`
			`print(f'{channelId} {latestTreatedCommentDate} / {channelCreationDate} ({timingPercentage}%)')`
			`break`