# YouTube_Audio_library_extra.../media_files_extractor.py

import html
import os, json, time

import undetected_chromedriver.v2 as uc
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

"""
Downloading media files appears to be protected by some kind of anti-bot mechanism, so we use a Selenium-based approach.
"""

# Page of YouTube Studio that the script automates; it is opened at start-up and
# re-opened after a detour through `chrome://downloads/`.
# NOTE(review): the bare `UC` channel segment looks like a truncated or
# placeholder channel id -- confirm before running.
AUDIO_LIBRARY_URL = 'https://studio.youtube.com/channel/UC/music'

# Start Chrome through undetected_chromedriver with a dedicated profile
# directory named `selenium` (relative to the current working directory).
# NOTE(review): presumably this keeps the signed-in session between runs -- confirm.
options = Options()
options.add_argument("--user-data-dir=selenium")
browser = uc.Chrome(options=options)
# Every UI interaction further down assumes this page is the one displayed.
browser.get(AUDIO_LIBRARY_URL)

"""
For the `Music` tab, the YouTube UI returns 3,000 entries while my reverse-engineering approach returns 5,819 entries.
For the `Sound effects` tab, the YouTube UI returns 400 entries while my reverse-engineering approach returns 2,021 entries.

So I assume that the YouTube UI pagination doesn't work properly. To retrieve all media files, the idea is therefore to filter by `Track title` and download the entries, preferably only those whose title is exactly the one we are looking for, as some tracks share the same title.
As for `Sound effects`, even with `Sound effect`, `Duration`, `Category` and `Added` there is an ambiguity about which file we are referring to (for instance for `Truck Driving in Parking Structure`, as they are all different).
"""

# Track metadata to process, read relative to the directory the script is
# started from (the working directory only changes further down).
# The encoding is given explicitly so that decoding does not depend on the
# locale of the machine running the script.
with open('music.json', encoding='utf-8') as json_file:
    tracks = json.load(json_file)

# Directory the rest of the script works in: `rename.txt` is created here.
# NOTE(review): presumably this is also the browser's download directory, so
# that `rename.txt` ends up next to the downloaded files -- confirm.
path = '/home/benjamin/Downloads'

os.chdir(path)

# Threshold used while polling the result counter: a count above this value is
# treated as "the title filter has not been applied yet".
MAXIMAL_NUMBER_OF_RESULTS = 162

# Download every track of `tracks` through the audio library UI and record, in
# `rename.txt`, one `<title> - <artist>[(n)]|<viperId>` line per download.
#
# Changes compared with the previous revision of this loop:
# * only `NoSuchElementException` is caught when looking for the artist link
#   (a bare `except:` also swallowed Ctrl+C and driver failures);
# * cell contents read through `innerHTML` are HTML-unescaped before being
#   compared with the JSON metadata (`&` is serialized as `&amp;`);
# * no rename entry is written when none of the listed rows matches;
# * the polling loop no longer confuses its "first pass" marker with a real
#   result count, and tolerates a thousands separator in the count;
# * the printed duration is zero-padded (`1:05` instead of `1:5`).
#
# NOTE(review): the file is opened in `w` mode while the loop skips the first
# 38 tracks, so entries written by an earlier run are discarded -- confirm this
# is intended before resuming a run.
with open('rename.txt', 'w', encoding='utf-8') as f:
    # Open the filter dropdown and pick its `#text-item-2` entry.
    # NOTE(review): presumably the `Track title` filter mentioned in the module
    # notes -- confirm against the UI.
    browser.find_element(By.XPATH, '//*[@id="trigger"]/ytcp-dropdown-trigger/div/div[2]/span').click()
    browser.find_element(By.CSS_SELECTOR, '#text-item-2 > ytcp-ve > div > div > yt-formatted-string').click()
    # NOTE(review): hard-coded offset, looks like a resume point of a previous run.
    tracks = tracks[38:]
    for trackIndex, track in enumerate(tracks):
        minutes, remainingSeconds = divmod(int(track["duration"]["seconds"]), 60)
        cleanDuration = f'{minutes}:{remainingSeconds:02d}'
        print(f'{trackIndex} / {len(tracks)}: {track["title"]} - {track["artist"]["name"]} - {cleanDuration}')
        # Type the title into the filter input, then click the second item of
        # the text menu to apply it.
        browser.find_element(By.ID, 'text-input').send_keys(track['title'])
        browser.find_element(By.XPATH, '/html/body/ytcp-text-menu/tp-yt-paper-dialog/tp-yt-paper-listbox/tp-yt-paper-item[2]/ytcp-ve/div/div/yt-formatted-string/span[1]').click()

        # Poll the result counter (last word of `.page-description`) until it
        # drops to at most MAXIMAL_NUMBER_OF_RESULTS; wait one second between
        # two reads, but not before the first one.
        isFirstPoll = True
        while True:
            if not isFirstPoll:
                time.sleep(1)
            isFirstPoll = False
            pageDescription = browser.find_element(By.CSS_SELECTOR, '.page-description').get_attribute('innerHTML')
            # Defensive: the module notes quote UI counts such as `3,000`.
            numberOfResults = int(pageDescription.split()[-1].replace(',', ''))
            print(f'Found {numberOfResults} results')
            if numberOfResults <= MAXIMAL_NUMBER_OF_RESULTS:
                break

        # `DOWNLOAD`
        # Doesn't block.
        # More than 100 results stops the whole run, not only this track.
        if numberOfResults > 100:
            print('More than 100 results')
            break

        renameEntry = f"{track['title']} - {track['artist']['name']}"
        if numberOfResults > 1:
            print('found multiple')
            occurrences = 0
            artistCommon = 'div[5]/div'
            for resultsIndex in range(numberOfResults):
                row = f'/html/body/ytcp-app/ytcp-entity-page/div/div/main/div/ytcp-animatable[24]/ytmus-page/ytmus-library-table/div[1]/ytmus-library-row[{resultsIndex + 1}]/div/'
                title = html.unescape(browser.find_element(By.XPATH, row + 'div[2]/div').get_attribute('innerHTML'))
                # The artist cell either wraps the name in a link or holds it directly.
                try:
                    artist = browser.find_element(By.XPATH, row + artistCommon + '/ytcp-hover-anchor/a/span')
                except NoSuchElementException:
                    artist = browser.find_element(By.XPATH, row + artistCommon)
                artist = html.unescape(artist.get_attribute('innerHTML'))
                print(title, artist)
                if title == track['title'] and artist == track['artist']['name']:
                    print("it's a match")
                    browser.find_element(By.XPATH, row + 'div[8]/div[2]/ytcp-button/div').click()
                    occurrences += 1
            if occurrences > 1:
                print('multiple occurrences')
                # Several rows share the same title and artist, so the JSON entry
                # cannot tell the files apart: read the id of each download from
                # its URL on `chrome://downloads/` instead.
                # This isn't a clean solution.
                time.sleep(1)
                browser.get('chrome://downloads/')
                for occurrence in range(occurrences):
                    # NOTE(review): presumably the list shows the most recent
                    # download first, hence the reversed index -- confirm.
                    downloadsManager = browser.find_element(By.XPATH, '/html/body/downloads-manager')
                    downloadsList = downloadsManager.shadow_root.find_element(By.ID, 'downloadsList')
                    downloadItem = downloadsList.find_element(By.ID, f'frb{occurrences - occurrence - 1}')
                    url = downloadItem.shadow_root.find_element(By.ID, 'url').get_attribute('href')
                    viperId = url.split('&id=')[1].split('&')[0]
                    print(viperId)
                    # NOTE(review): presumably mirrors the `(n)` suffix given to
                    # duplicate file names -- confirm.
                    occurrenceStr = '' if occurrence == 0 else f'({occurrence})'
                    f.write(f"{renameEntry}{occurrenceStr}|{viperId}\n")
                browser.get(AUDIO_LIBRARY_URL)
                # The run stops here. NOTE(review): the reloaded page no longer
                # has the filter selected before the loop, which is presumably
                # why the loop is not resumed -- confirm.
                break
            elif occurrences == 1:
                f.write(f"{renameEntry}|{track['viperId']}\n")
            else:
                # Nothing was downloaded, so there is nothing to rename.
                print('no exact match found')
        else:
            # At most one row is listed: click its download button directly.
            browser.find_element(By.CSS_SELECTOR, 'ytcp-button.style-scope:nth-child(1) > div:nth-child(2)').click()
            f.write(f"{renameEntry}|{track['viperId']}\n")
        # Remove the current filter before handling the next track.
        browser.find_element(By.CSS_SELECTOR, '#delete-icon > tp-yt-iron-icon').click()

# NOTE(review): the browser is deliberately left open; since clicking `DOWNLOAD`
# doesn't block, quitting here could presumably interrupt pending downloads.
#browser.quit()
Move from Firefox to Chromium to be able to retrieve download URL Thanks to `chrome://downloads`. 2023-02-04 19:08:02 +01:00			`import undetected_chromedriver.v2 as uc`
Add `media_files_extractor.py` 2023-02-04 14:18:26 +01:00			`from selenium.webdriver.common.by import By`
Move from Firefox to Chromium to be able to retrieve download URL Thanks to `chrome://downloads`. 2023-02-04 19:08:02 +01:00			`from selenium.webdriver.chrome.options import Options`
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`import os, json, time`
Add `media_files_extractor.py` 2023-02-04 14:18:26 +01:00
			`"""`
			`As there is something looking as an anti-bot for downloading media files, we use a Selenium-based approach.`
			`"""`

Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`AUDIO_LIBRARY_URL = 'https://studio.youtube.com/channel/UC/music'`

Move from Firefox to Chromium to be able to retrieve download URL Thanks to `chrome://downloads`. 2023-02-04 19:08:02 +01:00			`options = Options()`
			`options.add_argument("--user-data-dir=selenium")`
			`browser = uc.Chrome(options=options)`
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`browser.get(AUDIO_LIBRARY_URL)`
Add `media_files_extractor.py` 2023-02-04 14:18:26 +01:00
			`"""`
			For `Music` tab, YouTube UI returns 3,000 entries while my reverse-engineering approach returns 5,819 entries.
			For `Sound effects` tab, YouTube UI returns 400 entries while my reverse-engineering approach returns 2021 entries.

Move from Firefox to Chromium to be able to retrieve download URL Thanks to `chrome://downloads`. 2023-02-04 19:08:02 +01:00			So I assume YouTube UI pagination doesn't work fine, so to retrieve all media files, the idea is to filter by `Track title` and download all entries, preferably only those that have the title we are looking for, as some tracks have the same titles.
			As for `Sound effects`, even with `Sound effect`, `Duration`, `Category` and `Added` there is an ambiguity on which files do we refer to (for instance for `Truck Driving in Parking Structure`, as they all are different).
Add `media_files_extractor.py` 2023-02-04 14:18:26 +01:00			`"""`

			`with open('music.json') as json_file:`
			`tracks = json.load(json_file)`

Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`path = '/home/benjamin/Downloads'`

			`os.chdir(path)`

			`MAXIMAL_NUMBER_OF_RESULTS = 162`

			`with open('rename.txt', 'w') as f:`
			`browser.find_element(By.XPATH, '//*[@id="trigger"]/ytcp-dropdown-trigger/div/div[2]/span').click()`
			`browser.find_element(By.CSS_SELECTOR, '#text-item-2 > ytcp-ve > div > div > yt-formatted-string').click()`
			`tracks = tracks[38:]`
			`for trackIndex, track in enumerate(tracks):`
Add media export support for duplicates 2023-02-04 19:45:30 +01:00			`seconds = int(track["duration"]["seconds"])`
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`cleanDuration = f'{seconds // 60}:{seconds % 60}'`
			`print(f'{trackIndex} / {len(tracks)}: {track["title"]} - {track["artist"]["name"]} - {cleanDuration}')`
			`browser.find_element(By.ID, 'text-input').send_keys(track['title'])`
			`browser.find_element(By.XPATH, '/html/body/ytcp-text-menu/tp-yt-paper-dialog/tp-yt-paper-listbox/tp-yt-paper-item[2]/ytcp-ve/div/div/yt-formatted-string/span[1]').click()`

			`numberOfResults = MAXIMAL_NUMBER_OF_RESULTS + 1`
			`while numberOfResults > MAXIMAL_NUMBER_OF_RESULTS:`
			`if numberOfResults != MAXIMAL_NUMBER_OF_RESULTS + 1:`
			`time.sleep(1)`
			`numberOfResults = int(browser.find_element(By.CSS_SELECTOR, '.page-description').get_attribute('innerHTML').split()[-1])`
			`print(f'Found {numberOfResults} results')`
Add `media_files_extractor.py` 2023-02-04 14:18:26 +01:00
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			# `DOWNLOAD`
			`# Doesn't block.`
			`if numberOfResults > 100:`
			`print('More than 100 results')`
			`break`
Add `media_files_extractor.py` 2023-02-04 14:18:26 +01:00
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`if numberOfResults > 1:`
			`print('found multiple')`
			`occurrences = 0`
			`for resultsIndex in range(numberOfResults):`
			`row = f'/html/body/ytcp-app/ytcp-entity-page/div/div/main/div/ytcp-animatable[24]/ytmus-page/ytmus-library-table/div[1]/ytmus-library-row[{resultsIndex + 1}]/div/'`
			`title = browser.find_element(By.XPATH, row + 'div[2]/div').get_attribute('innerHTML')`
			`artistCommon = 'div[5]/div'`
			`try:`
			`artist = browser.find_element(By.XPATH, row + artistCommon + '/ytcp-hover-anchor/a/span')`
			`except:`
			`artist = browser.find_element(By.XPATH, row + artistCommon)`
			`artist = artist.get_attribute('innerHTML')`
			`print(title, artist)`
			`if title == track['title'] and artist == track['artist']['name']:`
			`print("it's a match")`
			`browser.find_element(By.XPATH, row + 'div[8]/div[2]/ytcp-button/div').click()`
			`occurrences += 1`
			`if occurrences > 1:`
			`print('multiple occurrences')`
Add media export support for duplicates 2023-02-04 19:45:30 +01:00			`# This isn't a clean solution.`
			`time.sleep(1)`
			`browser.get('chrome://downloads/')`
			`for occurrence in range(occurrences):`
			`url = browser.find_element(By.XPATH, f'/html/body/downloads-manager').shadow_root.find_element(By.ID, 'downloadsList').find_element(By.ID, f'frb{occurrences - occurrence - 1}').shadow_root.find_element(By.ID, 'url').get_attribute('href')`
			`viperId = url.split('&id=')[1].split('&')[0]`
			`print(viperId)`
			`occurrenceStr = '' if occurrence == 0 else f'({occurrence})'`
			`f.write(f"{track['title']} - {track['artist']['name']}{occurrenceStr}\|{viperId}\n")`
			`browser.get(AUDIO_LIBRARY_URL)`
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`break`
Add media export support for duplicates 2023-02-04 19:45:30 +01:00			`else:`
			`f.write(f"{track['title']} - {track['artist']['name']}\|{track['viperId']}\n")`
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`#break`
			`else:`
			`browser.find_element(By.CSS_SELECTOR, 'ytcp-button.style-scope:nth-child(1) > div:nth-child(2)').click()`
Add media export support for duplicates 2023-02-04 19:45:30 +01:00			`f.write(f"{track['title']} - {track['artist']['name']}\|{track['viperId']}\n")`
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`browser.find_element(By.CSS_SELECTOR, '#delete-icon > tp-yt-iron-icon').click()`
Add media export support for duplicates 2023-02-04 19:45:30 +01:00
Make the media download work up to 100 results or duplicates The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to: ```py import os, json path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' os.chdir(path) with open('sound_effects.json') as json_file: tracks = json.load(json_file) mostResults = 0 mostResultsTitle = None for track in tracks: title = track['title'] results = 0 for otherTrack in tracks: if title in otherTrack['title']: results += 1 if results > mostResults: mostResults = results mostResultsTitle = title print(mostResults, mostResultsTitle) ``` 2023-02-04 19:12:54 +01:00			`#break`
Add `media_files_extractor.py` 2023-02-04 14:18:26 +01:00
			`#browser.quit()`