46 lines
1.4 KiB
Python
46 lines
1.4 KiB
Python
import requests
|
|
from lxml import html
|
|
import os
|
|
|
|
# NOTE(review): `DOWNLOAD` is not read anywhere in this file - presumably it
# is meant to toggle the actual download of the photos; confirm.
DOWNLOAD = True
# A recursive `wget` could be used for downloading instead, e.g.:
# ```bash
# wget --reject 'index.html*' -l inf -nH --cut-dirs=2 --recursive --no-parent https://lesc.dinfo.unifi.it/VISION/dataset/
# ```
# That approach, however, would fetch more than just the `flat` images.

# Root of the directory listing that is crawled below.
url = 'https://lesc.dinfo.unifi.it/VISION/dataset/'
|
|
|
|
# Decimal multipliers for the unit suffix that ends a listed size.
# NOTE(review): Apache-style listings usually use 1024-based units; the
# decimal factors of the original implementation are kept - confirm.
_SIZE_MULTIPLIERS = {
    'K': 1_000,
    'M': 1_000_000,
    'G': 1_000_000_000,
    'T': 1_000_000_000_000,
}


def _parseEntrySize(sizeText):
    """Convert a listed size such as `1.4K` to an integer; `-` is returned unchanged."""
    if sizeText == '-':
        return sizeText
    # A size that ends with a digit carries no unit suffix.
    if sizeText[-1:].isdigit():
        return int(float(sizeText))
    # An unknown suffix still raises `KeyError`, as it did before.
    return int(float(sizeText[:-1]) * _SIZE_MULTIPLIERS[sizeText[-1]])


def getFolderEntries(url, timeout=60):
    """Return the entries of the HTML directory listing found at `url`.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds passed to `requests.get` so that an unresponsive
            server cannot block the script forever.

    Returns:
        A list of `[name, size]` lists, one per kept table row. `size` is an
        integer computed from the listed value and its unit suffix, or the
        string `-` when the listing shows `-` in the size column.

    Raises:
        KeyError: If a size ends with a suffix missing from
            `_SIZE_MULTIPLIERS`.
    """
    response = requests.get(url, timeout=timeout)
    tree = html.fromstring(response.text)
    # Skip the first three rows and the last one; according to the original
    # author these are the legend, design and `Parent Directory` entries.
    entriesLines = tree.xpath('//tr')[3:-1]
    entries = []
    for entryLine in entriesLines:
        entryColumns = entryLine.xpath('td')
        # The name is read from the second cell and the size from the fourth.
        entryName = entryColumns[1].text_content()
        entrySize = _parseEntrySize(entryColumns[3].text_content().strip())
        entries.append([entryName, entrySize])
    return entries
|
|
|
|
# Create the output folder; `exist_ok=True` lets the script run again when
# the folder is already there instead of failing with `FileExistsError`.
os.makedirs('dataset', exist_ok=True)

# Sum of the listed sizes of all photos found in the `images/flat/` folders.
totalPhotoSizes = 0
# Strip trailing slashes so that joining with `/` below does not yield `//`.
baseUrl = url.rstrip('/')
phoneFolders = getFolderEntries(url)
for phoneFolder, _ in phoneFolders:
    # Drop the last character of the listed name (presumably the trailing
    # `/` of a folder entry - confirm against the listing).
    phoneName = phoneFolder[:-1]
    print(phoneName)
    phonePhotos = getFolderEntries(f'{baseUrl}/{phoneName}/images/flat/')
    for phonePhotoName, phonePhotoSize in phonePhotos:
        print(phonePhotoName, phonePhotoSize)
        # Entries without a numeric size are reported as `-`; adding such a
        # value to the total would raise `TypeError`, so they are skipped.
        if isinstance(phonePhotoSize, int):
            totalPhotoSizes += phonePhotoSize

print(totalPhotoSizes)