2023-02-20 14:19:16 +01:00
import undetected_chromedriver as uc
"""
pip install undetected-chromedriver==3.2.1
Relying on Linux Mint apt chromium which is only at version 109.
"""
2023-02-04 14:18:26 +01:00
from selenium . webdriver . common . by import By
2023-02-04 19:08:02 +01:00
from selenium . webdriver . chrome . options import Options
2023-02-04 19:12:54 +01:00
import os , json , time
2023-02-04 14:18:26 +01:00
"""
As there seems to be an anti-bot mechanism guarding the download of media files, we use a Selenium-based approach.
"""
2023-02-20 14:19:16 +01:00
# Work from the project directory: `music.json` is opened with a relative path further down.
# The literal must not carry leading/trailing spaces, otherwise `os.chdir` looks for a
# relative directory named " " and fails.
path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library'
os.chdir(path)
2024-01-03 14:43:30 +01:00
# Page loaded by `goToAudioLibraryAndSelect100RowsPerPage`; kept free of padding spaces so it is a valid URL.
AUDIO_LIBRARY_URL = 'https://youtube.com/audiolibrary'
2023-02-04 19:12:54 +01:00
2023-02-04 19:08:02 +01:00
# Chrome options: store the browser profile in the `selenium` directory (relative to the current
# working directory) - presumably so that the logged-in session survives between runs.
# The switch must start with `--` (no leading space) to be recognised as a command-line switch.
options = Options()
options.add_argument("--user-data-dir=selenium")
2023-02-20 14:19:16 +01:00
# Module-level browser shared by the whole script, started through undetected-chromedriver with the options above.
# `version_main = 109` matches the Chromium version mentioned in the note at the top of the file.
browser = uc . Chrome ( options = options , version_main = 109 )
def goToAudioLibraryAndSelect100RowsPerPage():
    """Open the Audio Library page and switch the table to 100 rows per page.

    Loads ``AUDIO_LIBRARY_URL`` in the module-level ``browser``, clicks the
    rows-per-page dropdown (the ``#trigger`` element) and then clicks its
    ``#text-item-2`` entry (presumably the "100" choice, as the function name
    states - confirm against the live page).

    Raises whatever Selenium raises when one of the two elements cannot be
    found (no explicit wait is performed).
    """
    browser.get(AUDIO_LIBRARY_URL)
    # Open the rows-per-page dropdown. The id must be compared with "trigger" exactly:
    # padding spaces inside the quoted value would never match the element.
    browser.find_element(By.XPATH, '//*[@id="trigger"]/ytcp-dropdown-trigger/div/div[2]/span').click()
    # Pick the entry of the dropdown that has just been opened.
    browser.find_element(By.CSS_SELECTOR, '#text-item-2 > ytcp-ve > div > div > yt-formatted-string').click()
# Initial navigation, done once before the tracks are processed (the main loop calls it again after visiting chrome://downloads).
goToAudioLibraryAndSelect100RowsPerPage ( )
2023-02-04 14:18:26 +01:00
"""
For the `Music` tab, the YouTube UI returns 3,000 entries while my reverse-engineering approach returns 5,819 entries.
For the `Sound effects` tab, the YouTube UI returns 400 entries while my reverse-engineering approach returns 2021 entries.
2023-02-04 19:08:02 +01:00
So I assume the YouTube UI pagination doesn't work properly; to retrieve all media files, the idea is to filter by `Track title` and download all entries, preferably only those that have the title we are looking for, as some tracks share the same title.
As for `Sound effects`, even with `Sound effect`, `Duration`, `Category` and `Added` there is an ambiguity about which files we refer to (for instance for `Truck Driving in Parking Structure`, as they are all different).
2023-02-04 14:18:26 +01:00
"""
# Load the track metadata. The loop further down reads `title`, `artist.name`,
# `duration.seconds` and `viperId` from each entry.
# The file name must not be padded with spaces, and JSON text is read explicitly as UTF-8.
with open('music.json', encoding='utf-8') as json_file:
    tracks = json.load(json_file)
2023-02-04 19:12:54 +01:00
# Switch to the downloads directory: `rename.txt` is created there through a relative path
# (presumably also where Chrome stores the downloaded files - confirm).
# The literal must not carry leading/trailing spaces, otherwise `os.chdir` fails.
path = '/home/benjamin/Downloads'
os.chdir(path)
2023-02-20 14:19:16 +01:00
# Threshold used by the polling loop below: the result count shown on the page is re-read
# until it is at most this value.
# NOTE(review): the origin of 367 is not visible here - presumably the largest result count expected for a single title filter; confirm.
MAXIMAL_NUMBER_OF_RESULTS = 367
# Keys (title - artist - duration) of the tracks already processed, used to skip repeated entries of `tracks`.
alreadyTreatedMultipleOccurrences = set ( )
2023-02-04 19:12:54 +01:00
# Main download loop.
# For every entry of `tracks`: filter the Audio Library table by the track title, click the button of the
# matching row(s) (presumably DOWNLOAD, see the `DOWNLOAD` note below) and write one line per downloaded
# file to `rename.txt`, pairing a file name with a viperId.
# NOTE(review): indentation was lost in this copy of the file and bare timestamp lines (they look like
# version-control annotations, not Python) are interleaved; the nesting described by the comments below is
# inferred from the control flow and must be confirmed when the formatting is restored.
with open ( ' rename.txt ' , ' w ' ) as f :
2023-02-20 14:19:16 +01:00
#tracks = tracks[36:]
2023-02-04 19:12:54 +01:00
for trackIndex , track in enumerate ( tracks ) :
2023-02-04 19:45:30 +01:00
# Duration in whole seconds, rendered below as minutes:seconds.
seconds = int ( track [ " duration " ] [ " seconds " ] )
2023-02-20 14:19:16 +01:00
# Note that the leading `0` for seconds may be missing.
2023-02-04 19:12:54 +01:00
cleanDuration = f ' { seconds / / 60 } : { seconds % 60 } '
2023-02-20 14:19:16 +01:00
# De-duplication key built from title, artist name and duration.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = f ' { track [ " title " ] } - { track [ " artist " ] [ " name " ] } - { cleanDuration } '
print ( f ' { trackIndex } / { len ( tracks ) } : { id } ' )
# Skip an entry whose key was already handled earlier in this run.
if id in alreadyTreatedMultipleOccurrences :
print ( ' Already treated these multiple occurrences ' )
continue
alreadyTreatedMultipleOccurrences . add ( id )
2023-02-04 19:12:54 +01:00
# Type the title into the `text-input` filter box, then click the second item of the menu that opens
# (presumably the `Track title` filter mentioned in the note above - TODO confirm).
browser . find_element ( By . ID , ' text-input ' ) . send_keys ( track [ ' title ' ] )
browser . find_element ( By . XPATH , ' /html/body/ytcp-text-menu/tp-yt-paper-dialog/tp-yt-paper-listbox/tp-yt-paper-item[2]/ytcp-ve/div/div/yt-formatted-string/span[1] ' ) . click ( )
# Poll the last word of `.page-description` until it is at most MAXIMAL_NUMBER_OF_RESULTS.
# MAXIMAL_NUMBER_OF_RESULTS + 1 is a sentinel marking the first pass, which therefore does not sleep.
# NOTE(review): a page reporting exactly MAXIMAL_NUMBER_OF_RESULTS + 1 results would be re-polled without any sleep.
numberOfResults = MAXIMAL_NUMBER_OF_RESULTS + 1
while numberOfResults > MAXIMAL_NUMBER_OF_RESULTS :
if numberOfResults != MAXIMAL_NUMBER_OF_RESULTS + 1 :
time . sleep ( 1 )
numberOfResults = int ( browser . find_element ( By . CSS_SELECTOR , ' .page-description ' ) . get_attribute ( ' innerHTML ' ) . split ( ) [ - 1 ] )
2023-02-20 14:19:16 +01:00
print ( f ' Found { numberOfResults } results ' )
# I noticed the fact that after a round-trip to the chrome://downloads, the initial setting of rows per page is reset.
#rowsPerPage = int(browser.find_element(By.CSS_SELECTOR, '#trigger > ytcp-dropdown-trigger > div > div.left-container.style-scope.ytcp-dropdown-trigger > span').get_attribute('innerHTML'))
#print('rowsPerPage:', rowsPerPage)
2023-02-04 14:18:26 +01:00
2023-02-04 19:12:54 +01:00
# `DOWNLOAD`
# Doesn't block.
2023-02-20 14:19:16 +01:00
# Becomes True only when several matching rows were downloaded and logged one by one below.
hasMultipleOccurrences = False
2023-02-04 19:12:54 +01:00
# Several rows are listed: scan them and keep those whose title AND artist both equal the track's.
if numberOfResults > 1 :
print ( ' found multiple ' )
occurrences = 0
2023-02-20 14:19:16 +01:00
# Number of result pages at 100 rows per page.
upperBound = ( ( numberOfResults - 1 ) / / 100 ) + 1
for i in range ( upperBound ) :
# Rows on page `i`: 100, or the remainder on the last page.
for resultsIndex in range ( min ( 100 , numberOfResults - i * 100 ) ) :
# Absolute XPath prefix of the current row (XPath positions are 1-based, hence `resultsIndex + 1`).
row = f ' /html/body/ytcp-app/ytcp-entity-page/div/div/main/div/ytcp-animatable[24]/ytmus-page/ytmus-library-table/div[1]/ytmus-library-row[ { resultsIndex + 1 } ]/div/ '
title = browser . find_element ( By . XPATH , row + ' div[2]/div ' ) . get_attribute ( ' innerHTML ' )
artistCommon = ' div[5]/div '
# The artist is read from a nested link's span when there is one, otherwise from the cell itself.
# NOTE(review): the bare `except` also hides unrelated errors; presumably only "element not found" is expected here.
try :
artist = browser . find_element ( By . XPATH , row + artistCommon + ' /ytcp-hover-anchor/a/span ' )
except :
artist = browser . find_element ( By . XPATH , row + artistCommon )
artist = artist . get_attribute ( ' innerHTML ' )
print ( resultsIndex , title , artist )
if title == track [ ' title ' ] and artist == track [ ' artist ' ] [ ' name ' ] :
print ( " it ' s a match " )
# Click the button in the row's 8th cell (presumably DOWNLOAD - TODO confirm) and count the match.
browser . find_element ( By . XPATH , row + ' div[8]/div[2]/ytcp-button/div ' ) . click ( )
occurrences + = 1
# With more than one page, click `#navigate-after` (presumably "next page" - TODO confirm).
if upperBound > 1 :
browser . find_element ( By . CSS_SELECTOR , ' #navigate-after > tp-yt-iron-icon ' ) . click ( )
2023-02-04 19:12:54 +01:00
# More than one matching row was clicked: the viperId of each downloaded file is read back from chrome://downloads.
if occurrences > 1 :
2023-02-20 14:19:16 +01:00
print ( ' MULTIPLE OCCURRENCES ' )
2023-02-04 19:45:30 +01:00
# This isn't a clean solution.
2023-02-20 14:19:16 +01:00
# Questionable if it's even necessary when plugged with optic fiber and power and not running anything else on the computer.
time . sleep ( 4 )
2023-02-04 19:45:30 +01:00
browser . get ( ' chrome://downloads/ ' )
for occurrence in range ( occurrences ) :
2023-02-20 14:19:16 +01:00
# Entries of the downloads list are looked up by an id of the form `frb<N>`, N going from
# occurrences - 1 down to 0 (presumably `frb0` is the most recent download, so this walks oldest first - TODO confirm).
download = browser . find_element ( By . XPATH , f ' /html/body/downloads-manager ' ) . shadow_root . find_element ( By . ID , ' downloadsList ' ) . find_element ( By . ID , f ' frb { occurrences - occurrence - 1 } ' ) . shadow_root
url = download . find_element ( By . ID , ' url ' ) . get_attribute ( ' href ' )
downloadFileName = download . find_element ( By . ID , ' file-link ' ) . get_attribute ( ' innerHTML ' )
2023-02-04 19:45:30 +01:00
# The viperId is the value of the `id` query parameter of the download URL.
viperId = url . split ( ' &id= ' ) [ 1 ] . split ( ' & ' ) [ 0 ]
print ( viperId )
2023-02-20 14:19:16 +01:00
# Expected file name: no suffix for the first download, a parenthesised occurrence number for the following ones.
occurrenceStr = ' ' if occurrence == 0 else f ' ( { occurrence } ) '
fileName = f " { track [ ' title ' ] } - { track [ ' artist ' ] [ ' name ' ] } { occurrenceStr } "
# Abort the whole script (exit status 1) when the listed file name is not the expected one.
if f ' { fileName } .mp3 ' != downloadFileName :
print ( " The solution isn ' t clean enough! ' " )
exit ( 1 )
f . write ( f " { fileName } | { viperId } \n " )
hasMultipleOccurrences = True
# Leave chrome://downloads and reload the library; rows per page has to be selected again (see the note above).
goToAudioLibraryAndSelect100RowsPerPage ( )
2023-02-04 19:12:54 +01:00
#break
# At most one result is listed: click the button matched by the selector below
# (presumably the DOWNLOAD button of the only row - TODO confirm).
else :
browser . find_element ( By . CSS_SELECTOR , ' ytcp-button.style-scope:nth-child(1) > div:nth-child(2) ' ) . click ( )
2023-02-20 14:19:16 +01:00
# Default case: log the track with the viperId coming from `music.json`.
if not hasMultipleOccurrences :
2023-02-04 19:45:30 +01:00
f . write ( f " { track [ ' title ' ] } - { track [ ' artist ' ] [ ' name ' ] } | { track [ ' viperId ' ] } \n " )
2023-02-20 14:19:16 +01:00
# Click `#delete-icon` (presumably removes the title filter before the next track - TODO confirm,
# including whether this line belongs inside the `if` above).
browser . find_element ( By . CSS_SELECTOR , ' #delete-icon > tp-yt-iron-icon ' ) . click ( )
2023-02-04 19:45:30 +01:00
2023-02-04 19:12:54 +01:00
#break
2023-02-04 14:18:26 +01:00
2024-01-03 14:43:30 +01:00
#browser.quit()