"""Estimate the total size of the photos in the VISION dataset.

Dataset index: https://lesc.dinfo.unifi.it/VISION/dataset/

The script walks the HTML directory listing of every phone folder found at
the dataset root, sums the sizes reported for the files in
``<phone>/images/flat/`` and prints the grand total.  Sizes come from the
rounded, human-readable values shown in the listing, so the total is an
estimate rather than an exact byte count.
"""
import requests
from lxml import html

url = 'https://lesc.dinfo.unifi.it/VISION/dataset/'

# Seconds to wait for the server before giving up on a request, so that a
# stalled connection cannot hang the script forever.
requestTimeout = 30

# Multipliers for the unit suffix of a listed size.
# NOTE(review): decimal units are kept from the original script; Apache's
# mod_autoindex is believed to format sizes with 1024-based units -- confirm
# and switch to powers of 1024 if a tighter estimate is needed.
sizeMultipliers = {
    'K': 1_000,
    'M': 1_000_000,
    'G': 1_000_000_000,
    'T': 1_000_000_000_000,
}


def _parseSize(sizeText):
    """Convert the size column of a listing row to a number of bytes.

    Args:
        sizeText: Stripped text of the size cell, e.g. ``'1.5K'``, ``'512'``
            or ``'-'``.

    Returns:
        The size as an ``int``.  The text is returned unchanged when it holds
        no size (``'-'``, which the listing uses for directories, or an empty
        cell).

    Raises:
        KeyError: If the text ends with an unknown unit suffix.
        ValueError: If the numeric part cannot be parsed.
    """
    if sizeText in ('-', ''):
        return sizeText
    unit = sizeText[-1]
    if unit.isdigit():
        # No suffix at all: the listing shows small files as plain bytes.
        return round(float(sizeText))
    # round() instead of int(): float error must not truncate e.g.
    # 2299.9999999999995 down to 2299.
    return round(float(sizeText[:-1]) * sizeMultipliers[unit.upper()])


def getFolderEntries(url):
    """Return the entries of the HTML directory listing served at ``url``.

    Args:
        url: Address of a directory listing rendered as an HTML table.

    Returns:
        A list of ``[name, size]`` lists, one per listed entry.  ``size`` is
        an ``int`` number of bytes, or the raw cell text (``'-'``) for
        entries that report no size.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an error status.
        KeyError: If a size uses an unknown unit suffix.
    """
    response = requests.get(url, timeout=requestTimeout)
    # Fail loudly instead of parsing an error page as an empty listing.
    response.raise_for_status()
    tree = html.fromstring(response.text)
    # Skip the three leading rows (legend, design/separator and
    # `Parent Directory`); the last row is dropped as well (presumably the
    # closing separator -- confirm against the server's markup).
    entriesLines = tree.xpath('//tr')[3:-1]
    entries = []
    for entryLine in entriesLines:
        entryColumns = entryLine.xpath('td')
        if len(entryColumns) < 4:
            # Not a regular entry row (no name/size cells); nothing to read.
            continue
        entryName = entryColumns[1].text_content()
        entrySize = _parseSize(entryColumns[3].text_content().strip())
        entries.append([entryName, entrySize])
    return entries


def main():
    """Print every phone, every photo with its size, and the total size."""
    # `url` already ends with a slash; normalise it so joined paths do not
    # contain `//`.
    baseUrl = url.rstrip('/')
    totalPhotoSizes = 0
    for phoneFolder, phoneFolderSize in getFolderEntries(url):
        if phoneFolderSize != '-':
            # A plain file at the dataset root, not a phone folder.
            continue
        phoneName = phoneFolder.strip().rstrip('/')
        print(phoneName)
        phonePhotos = getFolderEntries(f'{baseUrl}/{phoneName}/images/flat/')
        for phonePhotoName, phonePhotoSize in phonePhotos:
            print(phonePhotoName, phonePhotoSize)
            # Sub-directories report '-' instead of a size; leave them out
            # of the sum rather than crashing on `int + str`.
            if isinstance(phonePhotoSize, int):
                totalPhotoSizes += phonePhotoSize

    print(totalPhotoSizes)


if __name__ == '__main__':
    main()