My aim is to apply k-fold cross-validation when training a VGG19 model. To do so, I read my images from a directory using the following code:
DIR = "/Images"

# Resolve the "Images" folder inside the current working directory.
# pathlib's "/" operator joins path components with the platform's
# separator, so no hard-coded backslash is needed.
data_dir = pathlib.Path(os.getcwd()) / "Images"

# 80 % of the files found under data_dir; the same seed must be used for
# both subsets so that the training and validation splits do not overlap.
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(224, 224),
    batch_size=32,
)

# The remaining 20 % of the files.
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(224, 224),
    batch_size=32,
)
and it worked properly without k-fold cross-validation. But when I want to use k-fold cross-validation, I need the labels and images of `train_ds` separately, and I couldn't find a way to get them other than reading the images with another method. Therefore, I decided to read the images using `ImageDataGenerator` and `flow_from_directory`. But as far as I understand, in order to load images using `flow_from_directory`, I need two separate subsets, `training` and `test`, in the images directory, while I don't have `training` and `test` folders in my case. Is there any solution for either of these approaches?
Furthermore, the first method, `tf.keras.utils.image_dataset_from_directory`, finds a different number of images than `flow_from_directory`.
Here is the output of the first method:
Found 1060 files belonging to 4 classes. Using 848 files for training.
Here is the output of the second approach:
# `subset` only takes effect when the generator itself is created with a
# `validation_split`; without it no split is applied. 0.2 mirrors the 80/20
# split requested from image_dataset_from_directory and is consistent with
# the "Found 849 images" output reported for this snippet.
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1.0 / 255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    vertical_flip=True,
    validation_split=0.2,
)

# Training portion of the images under data_dir (one sub-folder per class).
Wheat_data = img_gen.flow_from_directory(
    data_dir,
    subset="training",
    seed=123,
)
Found 849 images belonging to 4 classes.
CodePudding user response:
You could convert your datasets to NumPy arrays, and it should work as usual:
import tensorflow as tf
import pathlib
import numpy as np
from sklearn.model_selection import KFold
# Download (and cache) the example flower data set and point at its folder.
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_dir = tf.keras.utils.get_file('flower_photos', origin=dataset_url, untar=True)
data_dir = pathlib.Path(data_dir)

batch_size = 32

train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(180, 180),
    batch_size=batch_size)

val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(180, 180),
    batch_size=batch_size)

# Materialise every dataset in ONE pass. These datasets are shuffled and
# reshuffle on each iteration, so pulling the images in one pass and the
# labels in a second pass could pair images with the wrong labels.
train_batches = list(train_ds.as_numpy_iterator())
train_images = np.concatenate([images for images, _ in train_batches])
train_labels = np.concatenate([labels for _, labels in train_batches])

val_batches = list(val_ds.as_numpy_iterator())
val_images = np.concatenate([images for images, _ in val_batches])
val_labels = np.concatenate([labels for _, labels in val_batches])

# The per-batch copies are no longer needed once concatenated.
del train_batches, val_batches

# K-fold does its own splitting, so merge both subsets into one pool.
inputs = np.concatenate((train_images, val_images), axis=0)
targets = np.concatenate((train_labels, val_labels), axis=0)

kfold = KFold(n_splits=5, shuffle=True)
for train, test in kfold.split(inputs, targets):
    # Build a fresh model for every fold so no weights carry over.
    model = tf.keras.Sequential([
        tf.keras.layers.Rescaling(1./255, input_shape=(180, 180, 3)),
        tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(5)])

    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    # `train` / `test` are index arrays into the pooled inputs/targets.
    history = model.fit(inputs[train], targets[train],
                        batch_size=batch_size,
                        epochs=2)
    scores = model.evaluate(inputs[test], targets[test], verbose=0)
Or you can use `tf.keras.utils.image_dataset_from_directory` with a batch size of 1 and `shuffle=False`, but it is not very efficient:
import tensorflow as tf
import pathlib
import numpy as np
from sklearn.model_selection import KFold
# Download (and cache) the example flower data set and point at its folder.
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_dir = tf.keras.utils.get_file('flower_photos', origin=dataset_url, untar=True)
data_dir = pathlib.Path(data_dir)

# One image per element and no shuffling, so that position i in the dataset
# always refers to the same image and can be addressed by a K-fold index.
batch_size = 1

train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(180, 180),
    batch_size=batch_size,
    shuffle=False)

val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(180, 180),
    batch_size=batch_size,
    shuffle=False)

# K-fold does its own splitting, so merge both subsets into one pool.
ds = train_ds.concatenate(val_ds)

kfold = KFold(n_splits=5, shuffle=True)
for train, test in kfold.split(np.arange(len(ds))):
    # skip(i).take(1) selects exactly the i-th element. Taking more than one
    # element here would pull in neighbouring samples that belong to the
    # other fold. The final map strips the leading batch-of-1 dimension.
    train_ds = (
        tf.data.Dataset.from_tensor_slices(
            [ds.skip(int(i)).take(1) for i in train])
        .flat_map(lambda x: x)
        .map(lambda x, y: (x[0, ...], y[0, ...]))
        .batch(64, drop_remainder=True)
    )
    # Keep the final partial batch so every held-out sample is evaluated.
    test_ds = (
        tf.data.Dataset.from_tensor_slices(
            [ds.skip(int(i)).take(1) for i in test])
        .flat_map(lambda x: x)
        .map(lambda x, y: (x[0, ...], y[0, ...]))
        .batch(64)
    )

    # Build a fresh model for every fold so no weights carry over.
    model = tf.keras.Sequential([
        tf.keras.layers.Rescaling(1./255, input_shape=(180, 180, 3)),
        tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(5)])

    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    history = model.fit(train_ds,
                        epochs=2)
    scores = model.evaluate(test_ds, verbose=0)
Another option would be to use dictionaries to store indices and tensors:
#...
ds = train_ds.concatenate(val_ds)

# Walk the combined dataset once and remember each element's images and
# labels under its position, so folds can later be assembled by index.
lookup_images, lookup_labels = {}, {}
for position, (image_part, label_part) in enumerate(ds):
    lookup_images[position] = image_part
    lookup_labels[position] = label_part

kfold = KFold(n_splits=5, shuffle=True)
for train, test in kfold.split(np.arange(len(ds))):
    # Gather the stored tensors for this fold's indices and join them
    # along the first axis.
    images_train = np.concatenate([lookup_images[i] for i in train])
    labels_train = np.concatenate([lookup_labels[i] for i in train])
    images_test = np.concatenate([lookup_images[i] for i in test])
    labels_test = np.concatenate([lookup_labels[i] for i in test])

    # Fresh network per fold: rescaling, three conv/pool stages, then a
    # dense head with 5 output logits.
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Rescaling(1./255, input_shape=(180, 180, 3)))
    for n_filters in (16, 32, 64):
        model.add(tf.keras.layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
        model.add(tf.keras.layers.MaxPooling2D())
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(128, activation='relu'))
    model.add(tf.keras.layers.Dense(5))

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

    history = model.fit(images_train, labels_train, epochs=2)
    scores = model.evaluate(images_test, labels_test, verbose=0)