Add Python code associated to paragraph concerning YouTube exact search inconsistency
parent
49ff87ac54
commit
e1e087267a
86
Home.md
86
Home.md
@ -74,6 +74,92 @@ Note that [YouTube UI](https://www.youtube.com/results?search_query=%22kids+have
|
|||||||
|
|
||||||
From [my experience with YouTube](https://stackoverflow.com/users/7123660/benjamin-loison), which is starting to be significant, we can't rely on the YouTube search feature, as it gives weird results, as shown. However, YouTube returns quite accurate information for a given video id, so [the best approach that I am aware of](https://stackoverflow.com/a/69259093) to return exactly correct and, as far as possible, exhaustive results consists in discovering as many videos as possible through some crawling approach, as I sketch in the last paragraph of the project proposal.
|
From [my experience with YouTube](https://stackoverflow.com/users/7123660/benjamin-loison), which is starting to be significant, we can't rely on the YouTube search feature, as it gives weird results, as shown. However, YouTube returns quite accurate information for a given video id, so [the best approach that I am aware of](https://stackoverflow.com/a/69259093) to return exactly correct and, as far as possible, exhaustive results consists in discovering as many videos as possible through some crawling approach, as I sketch in the last paragraph of the project proposal.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>The code associated with this approach is here:</summary>
|
||||||
|
|
||||||
|
```py
|
||||||
|
import requests, json, subprocess
|
||||||
|
|
||||||
|
# Id of the channel whose uploads playlist is derived just below.
channelId = 'UCAuUUnT6oDeKwE6v1NGQxug'
# Uploads playlist id: the channel id with its first two characters replaced by 'UU'.
uploadsPlaylistId = f'UU{channelId[2:]}'
|
||||||
|
|
||||||
|
def getJson(url, timeout = 30):
    """Fetch a path from https://yt.lemnoslife.com/ and return the decoded JSON.

    Parameters:
        url: path and query string relative to https://yt.lemnoslife.com/
            (for instance 'noKey/captions?part=snippet&videoId=...').
        timeout: number of seconds to wait for the server before giving up.
            Without a timeout a single stalled connection would block the
            whole crawl forever.

    Returns:
        The parsed JSON document.

    Raises:
        requests.exceptions.Timeout: the server did not answer in time.
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx
            status.
        json.JSONDecodeError: the response body is not valid JSON.
    """
    url = f'https://yt.lemnoslife.com/{url}'
    response = requests.get(url, timeout = timeout)
    # Fail here, with the offending URL and status, rather than later on a
    # missing key in an error payload.
    response.raise_for_status()
    return json.loads(response.text)
|
||||||
|
|
||||||
|
# Walk the uploads playlist page by page and collect the id of every video.
videoIds = []
pageToken = ''

while True:
    data = getJson(f'noKey/playlistItems?part=snippet&playlistId={uploadsPlaylistId}&maxResults=50&pageToken={pageToken}')
    items = data['items']
    # Progress indicator: number of ids gathered before this page is added.
    print(len(videoIds))
    videoIds.extend(item['snippet']['resourceId']['videoId'] for item in items)
    # The last page carries no 'nextPageToken'.
    if 'nextPageToken' not in data:
        break
    pageToken = data['nextPageToken']

print(len(videoIds))
# 4185 (value noted in the original script, presumably the count printed by a past run)

# Process the videos in the opposite order to the one the playlist returned.
videoIds.reverse()
|
||||||
|
|
||||||
|
def execute(command):
    """Run `command` through the shell and fail loudly if it does not succeed.

    The command's standard output is captured and discarded; nothing is
    returned. A non-zero exit status raises subprocess.CalledProcessError.
    """
    subprocess.run(command, shell = True, check = True, stdout = subprocess.PIPE)
|
||||||
|
|
||||||
|
# Number of leading videos to skip, so that an interrupted scan can resume
# where it stopped instead of starting over.
resumeIndex = 2968
videoIds = videoIds[resumeIndex:]

##

# 2968 SMnKboI4fvY

# Look for videos having at most two caption tracks, one of which is a
# standard English track.
# Indices start at resumeIndex so that the printed progress stays an absolute
# position in the full list and can be reused as the next resumeIndex.
for videoIndex, videoId in enumerate(videoIds, start = resumeIndex):
    print(videoIndex, videoId)
    data = getJson(f'noKey/captions?part=snippet&videoId={videoId}')
    items = data['items']
    if len(items) <= 2:
        for item in items:
            snippet = item['snippet']
            trackKind = snippet['trackKind']
            language = snippet['language']
            if language == 'en' and trackKind == 'standard':
                print('Found')
                #execute('notify-send "Found"')
                # One matching track is enough for this video.
                break
|
||||||
|
|
||||||
|
##
|
||||||
|
|
||||||
|
# Find shortest video:

url = 'noKey/search?part=snippet&q="your software Linux is in millions of computers"&maxResults=50'
data = getJson(url)
items = data['items']
# Ids of all the videos returned by the search (a list, despite its name).
setVideoIds = []
# Sentinel larger than any duration expected here.
shortestVideo = 10 ** 9
shortestVideoId = None
for item in items:
    # The query has no `type` filter, so a result may be a channel or a
    # playlist, whose 'id' carries no 'videoId': skip those instead of
    # crashing with a KeyError.
    videoId = item['id'].get('videoId')
    if videoId is None:
        continue
    print(videoId)
    setVideoIds.append(videoId)
    # NOTE(review): unlike the other calls this path is not under 'noKey/';
    # presumably a distinct endpoint of the same service - confirm.
    url = f'videos?part=contentDetails&id={videoId}'
    data = getJson(url)
    videoItems = data['items']
    # No details returned for this id: nothing to compare.
    if not videoItems:
        continue
    # Compared as a number below; assumes this endpoint returns the duration
    # as a number (for instance seconds) - TODO confirm.
    duration = videoItems[0]['contentDetails']['duration']
    # Non-positive durations are ignored.
    if 0 < duration < shortestVideo:
        shortestVideo = duration
        shortestVideoId = videoId

print(shortestVideoId, shortestVideo)
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
## Concerning 20,000 videos limit for YouTube Data API v3 PlaylistItems: list endpoint
|
## Concerning 20,000 videos limit for YouTube Data API v3 PlaylistItems: list endpoint
|
||||||
|
|
||||||
Could try both (`-i` was required for ignoring errors such as age-restricted videos):
|
Could try both (`-i` was required for ignoring errors such as age-restricted videos):
|
||||||
|
Loading…
Reference in New Issue
Block a user