YouTube_Audio_library_extra.../media_files_extractor.py

131 lines
6.7 KiB
Python

import undetected_chromedriver as uc
"""
pip install undetected-chromedriver==3.2.1
Relying on Linux Mint apt chromium which is only at version 109.
"""
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import os, json, time
"""
As there seems to be an anti-bot protection on media file downloads, we use a Selenium-based approach.
"""
# Working directory for the first part of the script: `music.json` is opened
# relative to it further down, before the script switches to another folder.
path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library'
os.chdir(path)
# Page loaded by `goToAudioLibraryAndSelect100RowsPerPage`.
AUDIO_LIBRARY_URL = 'https://studio.youtube.com/channel/UC/music'
options = Options()
# Relative Chrome profile directory; presumably resolved against the working
# directory set above so that the browser session is kept between runs
# (TODO confirm how undetected-chromedriver resolves it).
options.add_argument("--user-data-dir=selenium")
# `version_main=109` presumably pins the driver to the Chromium 109 mentioned
# in the install note at the top of the file.
browser = uc.Chrome(options=options, version_main=109)
def goToAudioLibraryAndSelect100RowsPerPage():
    """Load the audio library page and pick a rows-per-page option.

    Navigates `browser` to `AUDIO_LIBRARY_URL`, then clicks the dropdown
    trigger followed by the `#text-item-2` entry of the opened menu (going by
    the function name, the "100 rows per page" option).
    """
    # Elements to click, in order: first the dropdown trigger, then the menu entry.
    clickSequence = (
        (By.XPATH, '//*[@id="trigger"]/ytcp-dropdown-trigger/div/div[2]/span'),
        (By.CSS_SELECTOR, '#text-item-2 > ytcp-ve > div > div > yt-formatted-string'),
    )
    browser.get(AUDIO_LIBRARY_URL)
    for strategy, selector in clickSequence:
        browser.find_element(strategy, selector).click()
# Open the audio library once up front, so that the main loop further down
# starts from the library page with the rows-per-page option already selected.
goToAudioLibraryAndSelect100RowsPerPage()
"""
For `Music` tab, YouTube UI returns 3,000 entries while my reverse-engineering approach returns 5,819 entries.
For `Sound effects` tab, YouTube UI returns 400 entries while my reverse-engineering approach returns 2021 entries.
So I assume the YouTube UI pagination doesn't work properly. Hence, to retrieve all media files, the idea is to filter by `Track title` and download all entries, preferably only those that have the title we are looking for, as some tracks share the same title.
As for `Sound effects`, even when combining `Sound effect`, `Duration`, `Category` and `Added`, it remains ambiguous which file we refer to (for instance for `Truck Driving in Parking Structure`, whose entries are all different).
"""
# Load the track metadata. `music.json` is opened relative to the current
# working directory, i.e. before the `os.chdir` just below.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# locale's default encoding.
with open('music.json', encoding='utf-8') as json_file:
    tracks = json.load(json_file)
# From here on, relative paths (notably `rename.txt`) resolve against this folder.
path = '/home/benjamin/Downloads'
os.chdir(path)
# The main loop keeps polling the result counter while it reports more than
# this many results (presumably the unfiltered library shows a larger count,
# so a bigger value means the filter is not applied yet - TODO confirm).
MAXIMAL_NUMBER_OF_RESULTS = 367
# `title - artist - duration` keys of the tracks already processed, so that
# entries sharing the same key are only handled once.
alreadyTreatedMultipleOccurrences = set()
# Main loop: for every track of `music.json`, filter the library table by the
# track title, click the download button of the matching row(s) and append one
# `<file name>|<viperId>` line per track/download to `rename.txt`.
# The encoding is explicit so that non-ASCII titles do not depend on the locale.
with open('rename.txt', 'w', encoding='utf-8') as f:
    #tracks = tracks[36:]
    for trackIndex, track in enumerate(tracks):
        seconds = int(track["duration"]["seconds"])
        # Note that the leading `0` for seconds may be missing.
        cleanDuration = f'{seconds // 60}:{seconds % 60}'
        # Deduplication key (named `trackKey` to avoid shadowing the `id` builtin).
        trackKey = f'{track["title"]} - {track["artist"]["name"]} - {cleanDuration}'
        print(f'{trackIndex} / {len(tracks)}: {trackKey}')
        if trackKey in alreadyTreatedMultipleOccurrences:
            print('Already treated these multiple occurrences')
            continue
        alreadyTreatedMultipleOccurrences.add(trackKey)
        # Type the title in the filter box and click the second entry of the
        # suggestion menu (presumably the `Track title` filter - TODO confirm).
        browser.find_element(By.ID, 'text-input').send_keys(track['title'])
        browser.find_element(By.XPATH, '/html/body/ytcp-text-menu/tp-yt-paper-dialog/tp-yt-paper-listbox/tp-yt-paper-item[2]/ytcp-ve/div/div/yt-formatted-string/span[1]').click()
        # Poll the result counter until it drops to at most
        # `MAXIMAL_NUMBER_OF_RESULTS`. The initial value is a sentinel that
        # forces a first read without sleeping.
        numberOfResults = MAXIMAL_NUMBER_OF_RESULTS + 1
        while numberOfResults > MAXIMAL_NUMBER_OF_RESULTS:
            if numberOfResults != MAXIMAL_NUMBER_OF_RESULTS + 1:
                time.sleep(1)
            # The counter is the last whitespace-separated token of the page description.
            numberOfResults = int(browser.find_element(By.CSS_SELECTOR, '.page-description').get_attribute('innerHTML').split()[-1])
        print(f'Found {numberOfResults} results')
        # I noticed the fact that after a round-trip to the chrome://downloads, the initial setting of rows per page is reset.
        #rowsPerPage = int(browser.find_element(By.CSS_SELECTOR, '#trigger > ytcp-dropdown-trigger > div > div.left-container.style-scope.ytcp-dropdown-trigger > span').get_attribute('innerHTML'))
        #print('rowsPerPage:', rowsPerPage)
        # `DOWNLOAD`
        # Doesn't block.
        hasMultipleOccurrences = False
        if numberOfResults > 1:
            print('found multiple')
            occurrences = 0
            # Number of result pages, at 100 rows per page.
            upperBound = ((numberOfResults - 1) // 100) + 1
            for i in range(upperBound):
                # The last page may hold fewer than 100 rows.
                for resultsIndex in range(min(100, numberOfResults - i * 100)):
                    row = f'/html/body/ytcp-app/ytcp-entity-page/div/div/main/div/ytcp-animatable[24]/ytmus-page/ytmus-library-table/div[1]/ytmus-library-row[{resultsIndex + 1}]/div/'
                    title = browser.find_element(By.XPATH, row + 'div[2]/div').get_attribute('innerHTML')
                    artistCommon = 'div[5]/div'
                    # The artist cell either wraps the name in a link or holds
                    # it directly; fall back only when the link is absent,
                    # instead of swallowing every exception (including
                    # `KeyboardInterrupt`) as a bare `except:` would.
                    try:
                        artist = browser.find_element(By.XPATH, row + artistCommon + '/ytcp-hover-anchor/a/span')
                    except NoSuchElementException:
                        artist = browser.find_element(By.XPATH, row + artistCommon)
                    artist = artist.get_attribute('innerHTML')
                    print(resultsIndex, title, artist)
                    if title == track['title'] and artist == track['artist']['name']:
                        print("it's a match")
                        # Download button of the matching row.
                        browser.find_element(By.XPATH, row + 'div[8]/div[2]/ytcp-button/div').click()
                        occurrences += 1
                # Move to the next page of results once the current page is scanned.
                if upperBound > 1:
                    browser.find_element(By.CSS_SELECTOR, '#navigate-after > tp-yt-iron-icon').click()
            if occurrences > 1:
                print('MULTIPLE OCCURRENCES')
                # This isn't a clean solution.
                # Questionable if it's even necessary when plugged with optic fiber and power and not running anything else on the computer.
                time.sleep(4)
                # Several files were downloaded for the same title and artist:
                # read each download entry from Chrome's downloads page to map
                # its file name to the identifier found in its URL.
                browser.get('chrome://downloads/')
                for occurrence in range(occurrences):
                    # `frb<N>` entries are read from the highest index down to
                    # 0 (presumably newest first, so that `occurrence` follows
                    # the download order - TODO confirm).
                    download = browser.find_element(By.XPATH, '/html/body/downloads-manager').shadow_root.find_element(By.ID, 'downloadsList').find_element(By.ID, f'frb{occurrences - occurrence - 1}').shadow_root
                    url = download.find_element(By.ID, 'url').get_attribute('href')
                    downloadFileName = download.find_element(By.ID, 'file-link').get_attribute('innerHTML')
                    # Value of the `id` query parameter of the download URL.
                    viperId = url.split('&id=')[1].split('&')[0]
                    print(viperId)
                    # Expected name of the downloaded file: ` (1)`, ` (2)`...
                    # is appended from the second occurrence on.
                    occurrenceStr = '' if occurrence == 0 else f' ({occurrence})'
                    fileName = f"{track['title']} - {track['artist']['name']}{occurrenceStr}"
                    if f'{fileName}.mp3' != downloadFileName:
                        print("The solution isn't clean enough!'")
                        # Same effect as `exit(1)` without depending on the `site` module.
                        raise SystemExit(1)
                    f.write(f"{fileName}|{viperId}\n")
                hasMultipleOccurrences = True
                # Leaving for chrome://downloads lost the library page, so reload it.
                goToAudioLibraryAndSelect100RowsPerPage()
                #break
        else:
            # At most one result: click the download button found by this selector.
            browser.find_element(By.CSS_SELECTOR, 'ytcp-button.style-scope:nth-child(1) > div:nth-child(2)').click()
        if not hasMultipleOccurrences:
            f.write(f"{track['title']} - {track['artist']['name']}|{track['viperId']}\n")
            # Click the delete icon (presumably clearing the title filter
            # before the next track - TODO confirm).
            browser.find_element(By.CSS_SELECTOR, '#delete-icon > tp-yt-iron-icon').click()
        #break
#browser.quit()