From e1e087267aca7a16cc54212e5efd67623f8c28a9 Mon Sep 17 00:00:00 2001 From: Benjamin_Loison Date: Wed, 25 Jan 2023 00:51:13 +0100 Subject: [PATCH] Add Python code associated to paragraph concerning YouTube exact search inconsistency --- Home.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/Home.md b/Home.md index ee602e9..1fad285 100644 --- a/Home.md +++ b/Home.md @@ -74,6 +74,92 @@ Note that [YouTube UI](https://www.youtube.com/results?search_query=%22kids+have From [my experience with YouTube](https://stackoverflow.com/users/7123660/benjamin-loison), which is starting to be significant, we can't rely on the YouTube search feature, as it gives weird results, as shown. However, YouTube reports the information concerning a given video id quite correctly, so [the best approach that I am aware of](https://stackoverflow.com/a/69259093) to return exactly correct and, as far as possible, exhaustive results consists in discovering the maximum number of videos through some crawling approach, as I sketch in the last paragraph of the project proposal. +
"""Code associated with the crawling approach to YouTube exact search.

The script runs three steps, separated by `##` cell markers:

1. List every video id of the channel's uploads playlist.
2. For each video (resuming from a given offset), report whether it has at
   most two caption tracks, one of which is a standard English track.
3. Among the results of an exact-phrase search, find the shortest video.
"""

import json
import subprocess

import requests

channelId = 'UCAuUUnT6oDeKwE6v1NGQxug'
# The uploads playlist id is the channel id with its first two characters
# replaced by 'UU'.
uploadsPlaylistId = 'UU' + channelId[2:]

# Seconds to wait for the server before giving up instead of hanging forever.
requestTimeout = 60


def getJson(url):
    """Return the decoded JSON answer of `https://yt.lemnoslife.com/{url}`.

    Raises `requests.HTTPError` on an HTTP error status, so a failed request
    is reported as such rather than as a later `KeyError` on the payload.
    """
    url = f'https://yt.lemnoslife.com/{url}'
    response = requests.get(url, timeout=requestTimeout)
    response.raise_for_status()
    return json.loads(response.text)


def execute(command):
    """Run `command` through the shell, raising if it exits with an error.

    Only meant for fixed, trusted command strings (e.g. a desktop
    notification); never pass externally provided text to it.
    """
    subprocess.check_output(command, shell=True)


videoIds = []

# Follow `nextPageToken` until the playlist is exhausted (50 items per page).
pageToken = ''
while True:
    data = getJson(f'noKey/playlistItems?part=snippet&playlistId={uploadsPlaylistId}&maxResults=50&pageToken={pageToken}')
    items = data['items']
    # Progress indicator: number of video ids collected so far.
    print(len(videoIds))
    videoIds.extend(item['snippet']['resourceId']['videoId'] for item in items)
    pageToken = data.get('nextPageToken')
    if pageToken is None:
        break

print(len(videoIds))
# 4185

# Process the videos in the reverse of the order the playlist returned them.
videoIds = videoIds[::-1]

# Index (in the reversed list) at which to resume a previously interrupted run.
# 2968 SMnKboI4fvY
resumeIndex = 2968
videoIds = videoIds[resumeIndex:]

##

# Indices are printed relative to the full reversed list, so the last printed
# index can be reused directly as `resumeIndex`.
for videoIndex, videoId in enumerate(videoIds, start=resumeIndex):
    print(videoIndex, videoId)
    data = getJson(f'noKey/captions?part=snippet&videoId={videoId}')
    items = data['items']
    # Only videos with at most two caption tracks are considered.
    if len(items) <= 2 and any(
        item['snippet']['language'] == 'en'
        and item['snippet']['trackKind'] == 'standard'
        for item in items
    ):
        print('Found')
        # Optionally raise a desktop notification: execute('notify-send "Found"')

##

# Find shortest video:

searchUrl = 'noKey/search?part=snippet&q="your software Linux is in millions of computers"&maxResults=50'
searchItems = getJson(searchUrl)['items']
setVideoIds = []
shortestVideo = 10 ** 9
shortestVideoId = None
for item in searchItems:
    # Search results may also be channels or playlists, whose `id` carries no
    # `videoId`; skip those instead of failing with a KeyError.
    videoId = item['id'].get('videoId')
    if videoId is None:
        continue
    print(videoId)
    setVideoIds.append(videoId)
    videoItems = getJson(f'videos?part=contentDetails&id={videoId}')['items']
    # No details returned for this id: nothing to compare.
    if not videoItems:
        continue
    # NOTE(review): `duration` is compared with integers below, so it is
    # assumed to be a number (presumably seconds) - confirm against the API.
    duration = videoItems[0]['contentDetails']['duration']
    # A non-positive duration is not a usable length; ignore it.
    if 0 < duration < shortestVideo:
        shortestVideo = duration
        shortestVideoId = videoId

print(shortestVideoId, shortestVideo)
+ ## Concerning 20,000 videos limit for YouTube Data API v3 PlaylistItems: list endpoint Could try both (`-i` was required for ignoring errors such as age-restricted videos):