Files
Robust_image_source_identif…/datasets/vision/total_size.py

46 lines
1.4 KiB
Python

import requests
from lxml import html
import os
# NOTE(review): presumably a switch for downloading the photos; nothing in
# the visible part of this script reads it - confirm it is still needed.
DOWNLOAD = True

# Root directory listing that the script crawls.
# Mirroring it with `wget` would be another option, e.g.:
# ```bash
# wget --reject 'index.html*' -l inf -nH --cut-dirs=2 --recursive --no-parent https://lesc.dinfo.unifi.it/VISION/dataset/
# ```
# but that would fetch everything, not just the `flat` images.
url = "https://lesc.dinfo.unifi.it/VISION/dataset/"
# Multipliers for the unit suffixes that may end a listing size cell.
# NOTE(review): Apache's mod_autoindex abbreviates sizes with 1024-based
# units; the original decimal factors for `K` and `M` are kept here so that
# existing totals stay comparable - confirm which convention is wanted.
_SIZE_MULTIPLIERS = {
    'K': 1_000,
    'M': 1_000_000,
    'G': 1_000_000_000,
    'T': 1_000_000_000_000,
}


def _parseEntrySize(sizeText):
    """Convert a listing size such as `1.4M`, `12K` or `532` to an int."""
    multiplier = _SIZE_MULTIPLIERS.get(sizeText[-1:])
    if multiplier is None:
        # No known unit suffix: the cell holds a plain byte count.
        return round(float(sizeText))
    # `round` rather than `int` so float noise cannot drop a byte.
    return round(float(sizeText[:-1]) * multiplier)


def getFolderEntries(url):
    """Return the entries of the HTML directory listing served at `url`.

    The page is fetched and every table row of the listing is turned into a
    `[name, size]` pair. `size` is an `int` (approximate number of bytes,
    since the listing abbreviates sizes) or the string `'-'` when the
    listing shows no size for the entry, which is the case for folders.

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
        ValueError: if a size cell is neither `'-'` nor a parsable size.
    """
    # Without a timeout `requests` may wait forever on a stalled server.
    text = requests.get(url, timeout=30).text
    tree = html.fromstring(text)
    # Remove legend, design and `Parent Directory` entries (first three
    # rows) as well as the last row.
    entriesLines = tree.xpath('//tr')[3:-1]
    entries = []
    for entryLine in entriesLines:
        entryColumns = entryLine.xpath('td')
        if len(entryColumns) < 4:
            # Not a regular entry row (name and size cells are missing).
            continue
        entryName = entryColumns[1].text_content()
        entrySize = entryColumns[3].text_content().strip()
        if entrySize != '-':
            entrySize = _parseEntrySize(entrySize)
        entries.append([entryName, entrySize])
    return entries
# Create the local `dataset` folder; tolerate it already existing so that
# the script can be run more than once.
os.makedirs('dataset', exist_ok=True)

# `url` already ends with `/`; strip it so the URLs built below do not
# contain a double slash.
baseUrl = url.rstrip('/')

# Running sum of the sizes reported by the listings for all `flat` photos.
totalPhotoSizes = 0
phoneFolders = getFolderEntries(url)
for phoneFolder, _ in phoneFolders:
    # Folder names in the listing end with `/`; drop it to get the phone name.
    phoneName = phoneFolder[:-1]
    print(phoneName)
    phonePhotos = getFolderEntries(f'{baseUrl}/{phoneName}/images/flat/')
    for phonePhotoName, phonePhotoSize in phonePhotos:
        print(phonePhotoName, phonePhotoSize)
        # Entries without a size (`'-'`, e.g. sub-folders) cannot be summed.
        if isinstance(phonePhotoSize, int):
            totalPhotoSizes += phonePhotoSize

# NOTE(review): the original indentation was lost; the total is assumed to
# be printed once, after every phone has been processed.
print(totalPhotoSizes)