From 408b7a2ba95c9ef3c6fac8cad19c679917c7568a Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Thu, 21 Mar 2024 17:15:56 +0100
Subject: [PATCH] Add `flat` vision dataset size computation

It is about 8.7 GB.
---
 datasets/vision/README.md     |  1 +
 datasets/vision/total_size.py | 65 +++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 datasets/vision/README.md
 create mode 100644 datasets/vision/total_size.py

diff --git a/datasets/vision/README.md b/datasets/vision/README.md
new file mode 100644
index 0000000..4a06d9b
--- /dev/null
+++ b/datasets/vision/README.md
@@ -0,0 +1 @@
+https://lesc.dinfo.unifi.it/VISION/dataset/
diff --git a/datasets/vision/total_size.py b/datasets/vision/total_size.py
new file mode 100644
index 0000000..ccc8abf
--- /dev/null
+++ b/datasets/vision/total_size.py
@@ -0,0 +1,65 @@
+import requests
+from lxml import html
+
+url = 'https://lesc.dinfo.unifi.it/VISION/dataset/'
+
+# Byte multipliers of the size unit suffixes of the directory listing.
+# NOTE(review): the listing may use binary multiples (1024), the decimal ones
+# are kept to stay consistent with the 8.7 GB figure - confirm.
+SIZE_UNITS = {
+    'K': 1_000,
+    'M': 1_000_000,
+    'G': 1_000_000_000,
+}
+
+def parseEntrySize(entrySize):
+    """Return the number of bytes described by a listing size.
+
+    `-`, used by entries without a size, is returned unchanged. A size
+    without a known unit suffix is read as a plain number of bytes.
+    """
+    if entrySize == '-':
+        return entrySize
+    sizeUnit = entrySize[-1]
+    if sizeUnit in SIZE_UNITS:
+        return int(float(entrySize[:-1]) * SIZE_UNITS[sizeUnit])
+    return int(float(entrySize))
+
+def getFolderEntries(url):
+    """Return the `[name, size]` entries of the directory listing at `url`.
+
+    `size` is a number of bytes, or `-` for entries without a size.
+    """
+    response = requests.get(url, timeout=60)
+    # Fail loudly instead of parsing an error page as an empty listing.
+    response.raise_for_status()
+    tree = html.fromstring(response.text)
+    # Remove legend, design and `Parent Directory` entries.
+    entriesLines = tree.xpath('//tr')[3:-1]
+    entries = []
+    for entryLine in entriesLines:
+        entryColumns = entryLine.xpath('td')
+        entryName = entryColumns[1].text_content()
+        entrySize = parseEntrySize(entryColumns[3].text_content().strip())
+        entries.append([entryName, entrySize])
+    return entries
+
+totalPhotoSizes = 0
+phoneFolders = getFolderEntries(url)
+for phoneFolder, phoneFolderSize in phoneFolders:
+    # Only entries listed without a size (`-`) are browsed as phone folders.
+    if phoneFolderSize != '-':
+        continue
+    # Drop the last character, presumably the trailing `/` of a folder name.
+    phoneName = phoneFolder[:-1]
+    print(phoneName)
+    # `url` already ends with `/`, so do not add another one.
+    phonePhotos = getFolderEntries(f'{url}{phoneName}/images/flat/')
+    for phonePhotoName, phonePhotoSize in phonePhotos:
+        # Entries without a size have nothing to add to the total.
+        if phonePhotoSize == '-':
+            continue
+        print(phonePhotoName, phonePhotoSize)
+        totalPhotoSizes += phonePhotoSize
+
+print(totalPhotoSizes)