From 5540098e96cf6bae60ca8dbb3957ccc6a51ef649 Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Sat, 4 Feb 2023 19:12:54 +0100
Subject: [PATCH] Make the media download work up to 100 results or duplicates

The `MAXIMAL_NUMBER_OF_RESULTS` constant was computed thanks to:

```py
import os, json

path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library'

os.chdir(path)

with open('sound_effects.json') as json_file:
    tracks = json.load(json_file)

mostResults = 0
mostResultsTitle = None
for track in tracks:
    title = track['title']
    results = 0
    for otherTrack in tracks:
        if title in otherTrack['title']:
            results += 1
    if results > mostResults:
        mostResults = results
        mostResultsTitle = title

print(mostResults, mostResultsTitle)
```
---
Review notes (text here is ignored by `git am`; it is not part of the commit
message):

- This patch was re-flowed from a whitespace-collapsed copy. Line boundaries
  were rebuilt from the original hunk counts; Python indentation of the added
  lines was inferred from syntax. Please double-check the nesting of
  `print(f'Found ...')`, `if occurrences > 1:` and the trailing
  `#delete-icon` click / `f.write(...)` lines against your working tree.
- The seconds part of `cleanDuration` is now zero-padded (65 s is printed as
  `1:05` instead of `1:5`).
- The bare `except:` around the artist lookup is narrowed to
  `NoSuchElementException`, so Ctrl-C and unrelated WebDriver failures are no
  longer silently swallowed. This adds one import line, hence the updated
  hunk headers and diffstat.
- The `break` statements on "More than 100 results" and
  "multiple occurrences" still stop the whole run; switch them to `continue`
  if skipping the track was the intent.

 media_files_extractor.py | 71 ++++++++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 10 deletions(-)

diff --git a/media_files_extractor.py b/media_files_extractor.py
index 1a3fe27..6a5bcff 100644
--- a/media_files_extractor.py
+++ b/media_files_extractor.py
@@ -1,16 +1,19 @@
 import undetected_chromedriver.v2 as uc
 from selenium.webdriver.common.by import By
 from selenium.webdriver.chrome.options import Options
-import json
+from selenium.common.exceptions import NoSuchElementException
+import os, json, time
 
 """
 As there is something looking as an anti-bot for downloading media files, we use a Selenium-based approach.
 """
 
+AUDIO_LIBRARY_URL = 'https://studio.youtube.com/channel/UC/music'
+
 options = Options()
 options.add_argument("--user-data-dir=selenium")
 browser = uc.Chrome(options=options)
-browser.get('https://studio.youtube.com/channel/UC/music')
+browser.get(AUDIO_LIBRARY_URL)
 
 """
 For `Music` tab, YouTube UI returns 3,000 entries while my reverse-engineering approach returns 5,819 entries.
@@ -23,15 +26,63 @@ As for `Sound effects`, even with `Sound effect`, `Duration`, `Category` and `Ad
 with open('music.json') as json_file:
     tracks = json.load(json_file)
 
-for track in tracks:
-    browser.find_element(By.ID, 'text-input').send_keys(track['title'])
-    browser.find_element(By.XPATH, '/html/body/ytcp-text-menu/tp-yt-paper-dialog/tp-yt-paper-listbox/tp-yt-paper-item[2]/ytcp-ve/div/div/yt-formatted-string/span[1]').click()
+path = '/home/benjamin/Downloads'
 
-    number_of_results = int(browser.find_element(By.CSS_SELECTOR, '.page-description').get_attribute('innerHTML').split()[-1])
-    print(number_of_results)
+os.chdir(path)
 
-    # `DOWNLOAD`
-    browser.find_element(By.XPATH, 'div.overflow-actions:nth-child(12) > ytcp-button:nth-child(1) > div:nth-child(2)').click()
-    break
+MAXIMAL_NUMBER_OF_RESULTS = 162
+
+with open('rename.txt', 'w') as f:
+    browser.find_element(By.XPATH, '//*[@id="trigger"]/ytcp-dropdown-trigger/div/div[2]/span').click()
+    browser.find_element(By.CSS_SELECTOR, '#text-item-2 > ytcp-ve > div > div > yt-formatted-string').click()
+    tracks = tracks[38:]
+    for trackIndex, track in enumerate(tracks):
+        seconds = track["duration"]["seconds"]
+        cleanDuration = f'{seconds // 60}:{seconds % 60:02}'  # zero-pad the seconds: 65 -> 1:05
+        print(f'{trackIndex} / {len(tracks)}: {track["title"]} - {track["artist"]["name"]} - {cleanDuration}')
+        browser.find_element(By.ID, 'text-input').send_keys(track['title'])
+        browser.find_element(By.XPATH, '/html/body/ytcp-text-menu/tp-yt-paper-dialog/tp-yt-paper-listbox/tp-yt-paper-item[2]/ytcp-ve/div/div/yt-formatted-string/span[1]').click()
+
+        numberOfResults = MAXIMAL_NUMBER_OF_RESULTS + 1
+        while numberOfResults > MAXIMAL_NUMBER_OF_RESULTS:
+            if numberOfResults != MAXIMAL_NUMBER_OF_RESULTS + 1:
+                time.sleep(1)
+            numberOfResults = int(browser.find_element(By.CSS_SELECTOR, '.page-description').get_attribute('innerHTML').split()[-1])
+            print(f'Found {numberOfResults} results')
+
+        # `DOWNLOAD`
+        # Doesn't block.
+        if numberOfResults > 100:
+            print('More than 100 results')
+            break
+
+        if numberOfResults > 1:
+            print('found multiple')
+            occurrences = 0
+            for resultsIndex in range(numberOfResults):
+                row = f'/html/body/ytcp-app/ytcp-entity-page/div/div/main/div/ytcp-animatable[24]/ytmus-page/ytmus-library-table/div[1]/ytmus-library-row[{resultsIndex + 1}]/div/'
+                title = browser.find_element(By.XPATH, row + 'div[2]/div').get_attribute('innerHTML')
+                artistCommon = 'div[5]/div'
+                try:
+                    artist = browser.find_element(By.XPATH, row + artistCommon + '/ytcp-hover-anchor/a/span')
+                except NoSuchElementException:  # no hover-anchor link in the artist cell: read the cell itself
+                    artist = browser.find_element(By.XPATH, row + artistCommon)
+                artist = artist.get_attribute('innerHTML')
+                print(title, artist)
+                if title == track['title'] and artist == track['artist']['name']:
+                    print("it's a match")
+                    browser.find_element(By.XPATH, row + 'div[8]/div[2]/ytcp-button/div').click()
+                    occurrences += 1
+            if occurrences > 1:
+                print('multiple occurrences')
+                break
+            #break
+            #browser.get('chrome://downloads/')
+            #browser.get(AUDIO_LIBRARY_URL)
+        else:
+            browser.find_element(By.CSS_SELECTOR, 'ytcp-button.style-scope:nth-child(1) > div:nth-child(2)').click()
+        browser.find_element(By.CSS_SELECTOR, '#delete-icon > tp-yt-iron-icon').click()
+        f.write(f"{track['title']} - {track['artist']['name']}|{track['viperId']}\n")
+        #break
 
 #browser.quit()
\ No newline at end of file