I am working on a neural network that recognizes handwritten digits from the MNIST digits dataset. I wanted to use Keras's ImageDataGenerator to see whether data augmentation would improve the prediction accuracy. However, when I try to train the model I get this error: ValueError: Shapes (None, None) and (None, 28, 28, 10) are incompatible. The relevant code is:
# Excerpt of the complete script shown further down; this is the code the
# question reports the ValueError for.
datagen = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization=True,
rotation_range=20,
validation_split=0.2)
# The sample count (126000) is hard-coded in this reshape.
datagen.fit(X_train.reshape(126000, 28, 28, 1))
print(X_train.shape)
print(y_train.shape)
model = keras.Sequential([
# NOTE(review): Dense only transforms the last axis of its input. With
# input_shape=(28, 28, 1) and no Flatten layer in front, the final layer
# therefore emits (None, 28, 28, 10) -- the shape named in the error --
# rather than one 10-way prediction per image.
layers.Dense(784, activation='relu', input_shape=(28, 28, 1)),
layers.BatchNormalization(),
layers.Dropout(rate=0.5),
layers.Dense(784, activation='relu'),
layers.BatchNormalization(),
layers.Dropout(rate=0.5),
layers.Dense(784, activation='relu'),
layers.BatchNormalization(),
layers.Dropout(rate=0.5),
layers.Dense(784, activation='relu'),
layers.BatchNormalization(),
layers.Dropout(rate=0.5),
layers.Dense(10, activation='softmax'),
])
model.compile(
optimizer="Adam",
loss="categorical_crossentropy",
metrics=["accuracy"]
)
# Give every sample an explicit (28, 28, 1) image shape before it is fed to
# the generator.
X_train = np.reshape(X_train, (X_train.shape[0], 28, 28, 1))
# NOTE(review): the Keras `fit` documentation says not to pass batch_size
# when the data comes from a generator; it presumably belongs in
# datagen.flow(...) instead -- confirm.
history = model.fit(datagen.flow(
X_train,
y_train),
validation_data=(X_test, y_test),
batch_size=640,
epochs=100,
)
And this is the error I get:
ValueError: in user code:
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1021, in train_function *
return step_function(self, iterator)
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1010, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1000, in run_step **
outputs = model.train_step(data)
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 860, in train_step
loss = self.compute_loss(x, y, y_pred, sample_weight)
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 918, in compute_loss
return self.compiled_loss(
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 201, in __call__
loss_value = loss_obj(y_t, y_p, sample_weight=sw)
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/losses.py", line 141, in __call__
losses = call_fn(y_true, y_pred)
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/losses.py", line 245, in call **
return ag_fn(y_true, y_pred, **self._fn_kwargs)
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/losses.py", line 1789, in categorical_crossentropy
return backend.categorical_crossentropy(
File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/backend.py", line 5083, in categorical_crossentropy
target.shape.assert_is_compatible_with(output.shape)
ValueError: Shapes (None, None) and (None, 28, 28, 10) are incompatible
Complete code:
# %%
import pandas as pd
from scipy.ndimage import shift
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from sklearn.model_selection import train_test_split
import numpy as np
# Setup plotting
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
# Set Matplotlib defaults
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
# Best-effort: ask TensorFlow to grow GPU memory on demand for the first GPU.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:  # an empty list simply means there is no GPU to configure
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except (ValueError, RuntimeError):
        # Invalid device or cannot modify virtual devices once initialized.
        pass
# %%
# train = pd.read_csv("../data/train.csv")
# test = pd.read_csv("../data/test.csv")
# Read the labelled CSV as float32, then separate the label column from the
# pixel columns.
train = pd.read_csv('../data/train.csv').astype('float32')
y = train['label']
X = train.drop(columns='label')
# First split, using train_test_split's default hold-out size and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# The second CSV is loaded as-is (no float32 cast, no split).
X_predict = pd.read_csv('../data/test.csv')
# %%
# Display names for the ten digit classes: '0' .. '9'.
class_names = list(map(str, range(10)))
print(class_names)
# %% [markdown]
# # Separating targets and features
#
# `X_train` contains the features: all 784 pixels from the images in an array
#
# `y_train` contains the targets: the numerical labels from 0-9
# %%
# y_train = train["label"]
# X_train = train.drop(labels = ["label"],axis = 1)
# %%
# Show the first training row as a 28x28 image with a colour scale.
first_image = X_train.iloc[0].to_numpy().reshape(28, 28)
plt.figure()
plt.imshow(first_image)
plt.colorbar()
plt.grid(False)
plt.show()
# %% [markdown]
# # Distribution
#
# Data is more or less evenly distributed. Category 5 contains some 1000 elements less than Category 1 but it's still quite a lot of training examples.
# %%
# Number of training samples per label.
y_train.value_counts()
# %%
# Same information as a histogram.
y_train.hist()
plt.show()
# %% [markdown]
# # Data preparation
#
# 1. Transform our data from 0-255 pixel value to 0-1 pixel value
# 2. Transform our labels from numerical values to one-hot encoded values
# %%
# Scale pixel intensities from the 0-255 range down to 0-1.
X_train = X_train / 255.0
X_test = X_test / 255.0
# %%
# Preview the first 25 training digits in a 5x5 grid.
plt.figure(figsize=(10,10))
for i in range(25):
    # Subplot positions are 1-based, hence i + 1.
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(X_train.iloc[i].values.reshape(28,28), cmap=plt.cm.binary)
plt.show()
# %%
# y_train = keras.utils.to_categorical(y_train, num_classes=10)
# %%
# y_train[:5]
# %% [markdown]
# # Splitting our data
#
# We use a 80/20 test split since there's quite a lot of data and we just want to learn on as much as possible. Random state of The One True Number so we can safely repeat the split.
# %%
# Split the current training portion once more; X_test / y_test are rebound
# to this new 20 % hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0)
# %%
# One-hot encode both label vectors over the 10 digit classes.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes=10)
    for labels in (y_train, y_test)
)
# %%
print(X_train.shape)
# %%
# Only X_train is converted to a NumPy array here; X_test stays a DataFrame.
X_train = X_train.to_numpy()
print(X_train.shape)
print(y_train.shape)
# %%
# Method to shift the image by given dimension
# def shift_image(image, dx, dy):
# image = image.reshape((28, 28))
# shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
# return shifted_image.reshape([-1])
# %%
# Creating Augmented Dataset
# X_train_augmented = [image for image in X_train]
# y_train_augmented = [image for image in y_train]
# for dx, dy in ((1,0), (-1,0), (0,1), (0,-1)):
# for image, label in zip(X_train, y_train):
# X_train_augmented.append(shift_image(image, dx, dy))
# y_train_augmented.append(label)
# %%
# Shuffle the dataset
# shuffle_idx = np.random.permutation(len(X_train_augmented))
# X_train = np.array(X_train_augmented)[shuffle_idx]
# y_train = np.array(y_train_augmented)[shuffle_idx]
# %%
print(X_train.shape)
print(y_train.shape)
# %%
print(X_train.shape)
# %% [markdown]
# # Images of our data
#
# Which ones could the model have trouble with?
# %%
# Preview the first 30 training digits in a 3x10 grid.
plt.figure(figsize=(10, 4))
for i in range(30):
    # Subplot positions are 1-based, hence i + 1.
    plt.subplot(3, 10, i + 1)
    plt.imshow(X_train[i].reshape((28,28)))
    plt.colorbar()
    plt.axis('off')
plt.show()
# %%
type(X_train)
# %% [markdown]
# # Building our network
#
# Loosely based on 3Blue1Brown's model. The 28x28 input is flattened to 784 values (1 per pixel), followed by four hidden layers of 784 neurons with ReLU activation (each with batch normalization and dropout) and finally a SoftMax layer with 10 outputs.
# %%
# Augmentation settings: featurewise centering and std-normalization plus a
# rotation range of 20.
augmentation_options = dict(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
    validation_split=0.2,
)
datagen = ImageDataGenerator(**augmentation_options)
# fit() is given the samples in (n, 28, 28, 1) image layout.
datagen.fit(np.reshape(X_train, (X_train.shape[0], 28, 28, 1)))
print(X_train.shape)
print(y_train.shape)
# %%
# From here on X_train itself is kept in image layout.
X_train = np.reshape(X_train, (X_train.shape[0], 28, 28, 1))
# %%
# Four identical hidden stages: Dense(784, relu) -> BatchNorm -> Dropout(0.5).
hidden_layers = []
for _ in range(4):
    hidden_layers.extend([
        layers.Dense(784, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(rate=0.5),
    ])

model = keras.Sequential([
    # Flatten first: Dense only acts on the last axis, so feeding it the
    # (28, 28, 1) images directly would produce one prediction per pixel row
    # instead of one per image.
    layers.Flatten(input_shape=(28, 28, 1)),
    *hidden_layers,
    # One softmax probability per digit class.
    layers.Dense(10, activation='softmax'),
])
# %% [markdown]
# # Compiling the model
#
# Adam optimizer is generally the best, but every year new ones come out so challenge it!
#
# Loss and metrics are hopefully obvious.
# %%
model.compile(
    optimizer="Adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# %% [markdown]
# # Training the model
#
# Generally 5 epochs is enough to do better than most humans. Let's do 50 in the lecture as well to see the difference.
# %%
# The validation images must look like the training batches: same
# (28, 28, 1) layout and the same featurewise standardization. np.array()
# copies, so standardize() (which works in place) leaves X_test untouched.
X_val = datagen.standardize(
    np.array(X_test, dtype='float32').reshape(-1, 28, 28, 1))
history = model.fit(
    # The batch size belongs to the generator; `fit` must not receive
    # batch_size when its input already yields batches.
    datagen.flow(X_train, y_train, batch_size=640),
    validation_data=(X_val, y_test),
    epochs=100,
)
# %%
# Loss curves for training and validation, plus their extremes.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot()
print(f"Maximum Loss is: {history_df['loss'].max():0.4f}")
print(f"Minimum Validation Loss: {history_df['val_loss'].min():0.4f}")
# %%
# Accuracy curves for training and validation, plus their extremes.
history_df = pd.DataFrame(history.history)
history_df[['accuracy', 'val_accuracy']].plot()
print(f"Maximum accuracy is: {history_df['accuracy'].max():0.4f}")
print(f"Minimum Validation accuracy: {history_df['val_accuracy'].min():0.4f}")
# %% [markdown]
# # The predictions
#
# Unfortunately, the `predict` method just returns the SoftMax layer's values (one probability per class), so we still need to retrieve the actual prediction from that. `np.argmax` does that: it returns the index of the highest value, and that index is the predicted digit. Parameter `axis=1` makes it do this separately for each row, so we get one digit per image.
# %%
# The model expects standardized (28, 28, 1) images, exactly like the batches
# it was trained on. np.array() copies, so the in-place standardize() call
# does not modify X_test itself.
X_test_images = datagen.standardize(
    np.array(X_test, dtype='float32').reshape(-1, 28, 28, 1))
predictions = model.predict(X_test_images)
print(predictions[:5])
# Index of the largest probability in each row = predicted digit.
results = np.argmax(predictions, axis=1)
print(results[:5])
# %%
def plot_image(i, predictions_array, true_label, img):
    """Draw image ``i`` and caption it with the predicted and true class.

    Args:
        i: Position of the sample in ``true_label`` and ``img``.
        predictions_array: Class probabilities for this one sample.
        true_label: Indexable collection of integer class labels.
        img: Images as an array or DataFrame; rows may be flat 784-pixel
            vectors or already image-shaped.

    The caption reads "<predicted> <confidence>% (<true>)" and is blue when
    the prediction is correct, red otherwise.
    """
    true_label = true_label[i]
    # np.asarray makes the lookup positional even for a DataFrame (where
    # img[i] would be a column lookup); squeeze drops a trailing channel axis.
    pixels = np.squeeze(np.asarray(img)[i])
    if pixels.ndim == 1:
        # Flat pixel row -> 28x28 image so imshow can display it.
        pixels = pixels.reshape(28, 28)
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(pixels, cmap=plt.cm.binary)
    predicted_label = np.argmax(predictions_array)
    color = 'blue' if predicted_label == true_label else 'red'
    plt.xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                         100*np.max(predictions_array),
                                         class_names[true_label]),
               color=color)
def plot_value_array(i, predictions_array, true_label):
    """Draw the ten class probabilities of one sample as a bar chart.

    Args:
        i: Position of the sample in ``true_label``.
        predictions_array: Class probabilities for this one sample.
        true_label: Indexable collection of integer class labels.

    The predicted class bar is coloured red and the true class bar blue; when
    they coincide the bar ends up blue because that colour is applied last.
    """
    actual = true_label[i]
    predicted = np.argmax(predictions_array)
    class_ids = range(10)
    plt.grid(False)
    plt.xticks(class_ids)
    plt.yticks([])
    bars = plt.bar(class_ids, predictions_array, color="#777777")
    plt.ylim([0, 1])
    bars[predicted].set_color('red')
    bars[actual].set_color('blue')
# %%
# Turn the one-hot label rows back into integer class ids.
y_test = np.argmax(y_test, axis=1)
# %%
y_train = np.argmax(y_train, axis=1)
# %%
# Keep X_train in (n, 28, 28, 1) image layout.
X_train = X_train.reshape(X_train.shape[0], 28, 28, 1)
# %%
# plot_image looks images up by position, so hand it a NumPy stack of 28x28
# images rather than the X_test DataFrame.
test_images = np.asarray(X_test).reshape(-1, 28, 28)
i = 0
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, predictions[i], y_test, test_images)
plt.subplot(1,2,2)
plot_value_array(i, predictions[i], y_test)
plt.show()
# %%
# Plot the first X test images, their predicted labels, and the true labels.
# Color correct predictions in blue and incorrect predictions in red.
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
    # Each sample takes two adjacent 1-based grid slots: image, then bars.
    plt.subplot(num_rows, 2*num_cols, 2*i + 1)
    plot_image(i, predictions[i], y_test, test_images)
    plt.subplot(num_rows, 2*num_cols, 2*i + 2)
    plot_value_array(i, predictions[i], y_test)
plt.tight_layout()
plt.show()
# %%
# The submission must cover the images loaded from test.csv (X_predict), not
# the hold-out split the earlier `results` were computed on. They get the
# same preparation as the training data: 0-1 scaling, (28, 28, 1) layout and
# the generator's featurewise standardization.
submission_images = datagen.standardize(
    np.array(X_predict, dtype='float32').reshape(-1, 28, 28, 1) / 255.0)
submission_labels = np.argmax(model.predict(submission_images), axis=1)
# ImageId is 1-based, hence the + 1 on the upper bound.
my_submission = pd.DataFrame({
    'ImageId': list(range(1, len(submission_labels) + 1)),
    'label': submission_labels,
})
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)
# %%
CodePudding user response: