From ead3fef5b39f0299be06a0e9b60ea561e15a3e71 Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Mon, 20 Feb 2023 14:19:16 +0100 Subject: [PATCH] Add `goToAudioLibraryAndSelect100RowsPerPage` function to keep rows per page setting even after round-tripping `chrome://downloads` --- media_files_extractor.py | 107 +++++++++++++++++++++++++-------------- 1 file changed, 70 insertions(+), 37 deletions(-) diff --git a/media_files_extractor.py b/media_files_extractor.py index 483cef1..c687b36 100644 --- a/media_files_extractor.py +++ b/media_files_extractor.py @@ -1,4 +1,8 @@ -import undetected_chromedriver.v2 as uc +import undetected_chromedriver as uc +""" +pip install undetected-chromedriver==3.2.1 +Relying on Linux Mint apt chromium which is only at version 109. +""" from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options import os, json, time @@ -7,12 +11,22 @@ import os, json, time As there is something looking as an anti-bot for downloading media files, we use a Selenium-based approach. """ +path = '/home/benjamin/Desktop/bens_folder/dev/yt/audio_library' + +os.chdir(path) + AUDIO_LIBRARY_URL = 'https://studio.youtube.com/channel/UC/music' options = Options() options.add_argument("--user-data-dir=selenium") -browser = uc.Chrome(options=options) -browser.get(AUDIO_LIBRARY_URL) +browser = uc.Chrome(options=options, version_main=109) + +def goToAudioLibraryAndSelect100RowsPerPage(): + browser.get(AUDIO_LIBRARY_URL) + browser.find_element(By.XPATH, '//*[@id="trigger"]/ytcp-dropdown-trigger/div/div[2]/span').click() + browser.find_element(By.CSS_SELECTOR, '#text-item-2 > ytcp-ve > div > div > yt-formatted-string').click() + +goToAudioLibraryAndSelect100RowsPerPage() """ For `Music` tab, YouTube UI returns 3,000 entries while my reverse-engineering approach returns 5,819 entries. @@ -29,16 +43,24 @@ path = '/home/benjamin/Downloads' os.chdir(path) -MAXIMAL_NUMBER_OF_RESULTS = 162 +MAXIMAL_NUMBER_OF_RESULTS = 367 + +alreadyTreatedMultipleOccurrences = set() with open('rename.txt', 'w') as f: - browser.find_element(By.XPATH, '//*[@id="trigger"]/ytcp-dropdown-trigger/div/div[2]/span').click() - browser.find_element(By.CSS_SELECTOR, '#text-item-2 > ytcp-ve > div > div > yt-formatted-string').click() - tracks = tracks[38:] + #tracks = tracks[36:] for trackIndex, track in enumerate(tracks): seconds = int(track["duration"]["seconds"]) + # Note that the leading `0` for seconds may be missing. cleanDuration = f'{seconds // 60}:{seconds % 60}' - print(f'{trackIndex} / {len(tracks)}: {track["title"]} - {track["artist"]["name"]} - {cleanDuration}') + id = f'{track["title"]} - {track["artist"]["name"]} - {cleanDuration}' + print(f'{trackIndex} / {len(tracks)}: {id}') + if id in alreadyTreatedMultipleOccurrences: + print('Already treated these multiple occurrences') + continue + + alreadyTreatedMultipleOccurrences.add(id) + browser.find_element(By.ID, 'text-input').send_keys(track['title']) browser.find_element(By.XPATH, '/html/body/ytcp-text-menu/tp-yt-paper-dialog/tp-yt-paper-listbox/tp-yt-paper-item[2]/ytcp-ve/div/div/yt-formatted-string/span[1]').click() @@ -47,51 +69,62 @@ with open('rename.txt', 'w') as f: if numberOfResults != MAXIMAL_NUMBER_OF_RESULTS + 1: time.sleep(1) numberOfResults = int(browser.find_element(By.CSS_SELECTOR, '.page-description').get_attribute('innerHTML').split()[-1]) - print(f'Found {numberOfResults} results') + print(f'Found {numberOfResults} results') + + # I noticed the fact that after a round-trip to the chrome://downloads, the initial setting of rows per page is reset. + #rowsPerPage = int(browser.find_element(By.CSS_SELECTOR, '#trigger > ytcp-dropdown-trigger > div > div.left-container.style-scope.ytcp-dropdown-trigger > span').get_attribute('innerHTML')) + #print('rowsPerPage:', rowsPerPage) # `DOWNLOAD` # Doesn't block. - if numberOfResults > 100: - print('More than 100 results') - break - + hasMultipleOccurrences = False if numberOfResults > 1: print('found multiple') occurrences = 0 - for resultsIndex in range(numberOfResults): - row = f'/html/body/ytcp-app/ytcp-entity-page/div/div/main/div/ytcp-animatable[24]/ytmus-page/ytmus-library-table/div[1]/ytmus-library-row[{resultsIndex + 1}]/div/' - title = browser.find_element(By.XPATH, row + 'div[2]/div').get_attribute('innerHTML') - artistCommon = 'div[5]/div' - try: - artist = browser.find_element(By.XPATH, row + artistCommon + '/ytcp-hover-anchor/a/span') - except: - artist = browser.find_element(By.XPATH, row + artistCommon) - artist = artist.get_attribute('innerHTML') - print(title, artist) - if title == track['title'] and artist == track['artist']['name']: - print("it's a match") - browser.find_element(By.XPATH, row + 'div[8]/div[2]/ytcp-button/div').click() - occurrences += 1 + upperBound = ((numberOfResults - 1) // 100) + 1 + for i in range(upperBound): + for resultsIndex in range(min(100, numberOfResults - i * 100)): + row = f'/html/body/ytcp-app/ytcp-entity-page/div/div/main/div/ytcp-animatable[24]/ytmus-page/ytmus-library-table/div[1]/ytmus-library-row[{resultsIndex + 1}]/div/' + title = browser.find_element(By.XPATH, row + 'div[2]/div').get_attribute('innerHTML') + artistCommon = 'div[5]/div' + try: + artist = browser.find_element(By.XPATH, row + artistCommon + '/ytcp-hover-anchor/a/span') + except: + artist = browser.find_element(By.XPATH, row + artistCommon) + artist = artist.get_attribute('innerHTML') + print(resultsIndex, title, artist) + if title == track['title'] and artist == track['artist']['name']: + print("it's a match") + browser.find_element(By.XPATH, row + 'div[8]/div[2]/ytcp-button/div').click() + occurrences += 1 + if upperBound > 1: + browser.find_element(By.CSS_SELECTOR, '#navigate-after > tp-yt-iron-icon').click() if occurrences > 1: - print('multiple occurrences') + print('MULTIPLE OCCURRENCES') # This isn't a clean solution. - time.sleep(1) + # Questionable if it's even necessary when plugged with optic fiber and power and not running anything else on the computer. + time.sleep(4) browser.get('chrome://downloads/') for occurrence in range(occurrences): - url = browser.find_element(By.XPATH, f'/html/body/downloads-manager').shadow_root.find_element(By.ID, 'downloadsList').find_element(By.ID, f'frb{occurrences - occurrence - 1}').shadow_root.find_element(By.ID, 'url').get_attribute('href') + download = browser.find_element(By.XPATH, f'/html/body/downloads-manager').shadow_root.find_element(By.ID, 'downloadsList').find_element(By.ID, f'frb{occurrences - occurrence - 1}').shadow_root + url = download.find_element(By.ID, 'url').get_attribute('href') + downloadFileName = download.find_element(By.ID, 'file-link').get_attribute('innerHTML') viperId = url.split('&id=')[1].split('&')[0] print(viperId) - occurrenceStr = '' if occurrence == 0 else f'({occurrence})' - f.write(f"{track['title']} - {track['artist']['name']}{occurrenceStr}|{viperId}\n") - browser.get(AUDIO_LIBRARY_URL) - break - else: - f.write(f"{track['title']} - {track['artist']['name']}|{track['viperId']}\n") + occurrenceStr = '' if occurrence == 0 else f' ({occurrence})' + fileName = f"{track['title']} - {track['artist']['name']}{occurrenceStr}" + if f'{fileName}.mp3' != downloadFileName: + print("The solution isn't clean enough!'") + exit(1) + f.write(f"{fileName}|{viperId}\n") + hasMultipleOccurrences = True + goToAudioLibraryAndSelect100RowsPerPage() #break else: browser.find_element(By.CSS_SELECTOR, 'ytcp-button.style-scope:nth-child(1) > div:nth-child(2)').click() + if not hasMultipleOccurrences: f.write(f"{track['title']} - {track['artist']['name']}|{track['viperId']}\n") - browser.find_element(By.CSS_SELECTOR, '#delete-icon > tp-yt-iron-icon').click() + browser.find_element(By.CSS_SELECTOR, '#delete-icon > tp-yt-iron-icon').click() #break