From 73a8d77a325c88bf6f252f3af58faff4e6774c4c Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Sat, 4 Feb 2023 16:14:49 +0100
Subject: [PATCH] Detail different possibilities in `media_files_extractor.py` to manage duplicates

---
 media_files_extractor.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/media_files_extractor.py b/media_files_extractor.py
index 71a68d9..8b166b3 100644
--- a/media_files_extractor.py
+++ b/media_files_extractor.py
@@ -17,8 +17,10 @@ browser.get('https://studio.youtube.com/channel/UC/music')
 For `Music` tab, YouTube UI returns 3,000 entries while my reverse-engineering approach returns 5,819 entries.
 For `Sound effects` tab, YouTube UI returns 400 entries while my reverse-engineering approach returns 2021 entries.
-So I assume YouTube UI pagination doesn't work fine, so to retrieve all media files, the idea is to filter by `Track title` and download all returned entries, as some tracks have the same titles.
-We could verify the `trackId` or `viperId` to avoid downloading other media files containing the provided title.
+So I assume YouTube UI pagination doesn't work correctly, so to retrieve all media files (for `Music`), the idea is to filter by `Track title` and download the single entry that exactly matches (not just `contains`) `artist/name`, `title` and `duration/nanos` (converted from `seconds` when only `seconds` is provided), as some tracks share the same title.
+Only `trackId` and `viperId` differ when entries are identified by `artist/name`, `title` and `duration/nanos` (cf. the comment above) (example: `Dyalla_Ringside_116`); I verified all such duplicates and they are binary identical.
+So we will have to duplicate the media file under each of the different `trackId`s even though the files are *identical* (note that `trackId`, like `viperId`, uniquely identifies an entry).
+Otherwise I could clean the metadata by removing duplicates (but then, if we update the database, we have to make sure that the ids we kept remain the ones kept).
 """
 with open('music.json') as json_file:
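
Below the patch, a minimal sketch (not part of the patch itself) of the exact-match grouping described in the added comment. It assumes `music.json` holds a list of entries shaped roughly like `{'artist': {'name': ...}, 'title': ..., 'duration': {'seconds': ..., 'nanos': ...}, 'trackId': ..., 'viperId': ...}`; the exact schema, field nesting and duration layout are assumptions, not something the patch confirms.

# Minimal sketch (assumed schema, see note above): group `music.json` entries
# by the exact (artist/name, title, duration in nanoseconds) triple and list
# the `trackId`/`viperId` pairs that differ within each group.
import json
from collections import defaultdict


def duration_in_nanos(entry):
    # The patch matches on `duration/nanos`, converting from `seconds` when
    # only `seconds` is present; summing both fields is an assumption covering
    # durations that carry `seconds` plus a fractional `nanos`.
    duration = entry.get('duration', {})
    return int(duration.get('seconds', 0)) * 1_000_000_000 + int(duration.get('nanos', 0))


with open('music.json') as json_file:
    entries = json.load(json_file)

groups = defaultdict(list)
for entry in entries:
    key = (entry['artist']['name'], entry['title'], duration_in_nanos(entry))
    groups[key].append(entry)

for key, duplicates in groups.items():
    if len(duplicates) > 1:
        # Entries in the same group differ only by their ids; the media file
        # only needs to be downloaded once and then copied under each `trackId`.
        print(key, [(e['trackId'], e['viperId']) for e in duplicates])

Each group with more than one entry corresponds to the binary-identical duplicates mentioned in the patch, so one download per group suffices, duplicated afterwards under each of its `trackId`s.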