Convolutional Neural Networks (CNNs)

CNNs are designed for grid-structured data (images). A convolutional layer applies \(K\) learnable filters to local neighborhoods. Conceptually, for an input image \(X\) and filter \(W\):

  • The output feature map is built by sliding \(W\) over \(X\) and computing local dot products.

CNNs exploit:

  • Locality: nearby pixels are strongly related,
  • Weight sharing: the same filter is applied across the image,
  • Hierarchical features: deeper layers compose simple patterns into complex ones.

In this lesson we:

  • load Fashion-MNIST,
  • build a modern CNN with batch normalization and dropout,
  • validate with early stopping, and
  • evaluate test accuracy.

Step 1: Load Fashion-MNIST

# Load Fashion-MNIST: 28x28 grayscale clothing images, 10 classes.
# If dataset_fashion_mnist() is not available in your install, use dataset_mnist() instead.
fashion <- dataset_fashion_mnist()

# Training split: 60,000 images with integer labels 0-9.
x_train <- fashion$train$x
y_train <- fashion$train$y

# Held-out test split: 10,000 images.
x_test <- fashion$test$x
y_test <- fashion$test$y

# Sanity-check the array dimensions before preprocessing.
dim(x_train); length(y_train)
## [1] 60000    28    28
## [1] 60000
dim(x_test);  length(y_test)
## [1] 10000    28    28
## [1] 10000

Step 2: Preprocess (scale and reshape)

A CNN expects a 4D input tensor \((n, H, W, C)\): batch size, height, width, and channels (here \(C = 1\) for grayscale).

# Add a singleton channel axis and rescale pixel intensities
# from [0, 255] to [0, 1].
n_train_imgs <- dim(x_train)[1]
n_test_imgs  <- dim(x_test)[1]
x_train <- array_reshape(x_train, dim = c(n_train_imgs, 28, 28, 1)) / 255
x_test  <- array_reshape(x_test,  dim = c(n_test_imgs,  28, 28, 1)) / 255

# One-hot encode the integer labels for categorical cross-entropy.
num_classes <- 10
y_train_oh <- to_categorical(y_train, num_classes)
y_test_oh  <- to_categorical(y_test,  num_classes)

dim(x_train); dim(y_train_oh)
## [1] 60000    28    28     1
## [1] 60000    10

Step 3: Train/validation split

# Hold out 10% of the training data for validation.
set.seed(123)
n_obs <- dim(x_train)[1]
shuffled <- sample.int(n_obs)

n_keep <- floor(0.9 * n_obs)
tr_idx <- head(shuffled, n_keep)
va_idx <- tail(shuffled, n_obs - n_keep)

# drop = FALSE preserves the 4D (n, H, W, C) structure of the arrays.
X_tr <- x_train[tr_idx, , , , drop = FALSE]
Y_tr <- y_train_oh[tr_idx, , drop = FALSE]

X_va <- x_train[va_idx, , , , drop = FALSE]
Y_va <- y_train_oh[va_idx, , drop = FALSE]

Step 4: CNN architecture (regularized)

We use:

  • Conv → BN → ReLU blocks
  • MaxPool to reduce spatial resolution
  • Dropout to regularize
  • Dense classifier head with softmax output
input <- layer_input(shape = c(28, 28, 1), name = "image")

# Conv -> BN -> ReLU: normalizing pre-activations stabilizes training.
conv_bn_relu <- function(t, n_filters) {
  t |>
    layer_conv_2d(filters = n_filters, kernel_size = c(3, 3), padding = "same") |>
    layer_batch_normalization() |>
    layer_activation("relu")
}

# Two convolutional stages (32 then 64 filters), each closed by
# max-pooling and dropout, followed by a dense classifier head.
features <- input |>
  conv_bn_relu(32) |>
  conv_bn_relu(32) |>
  layer_max_pooling_2d(pool_size = c(2, 2)) |>
  layer_dropout(rate = 0.25) |>
  conv_bn_relu(64) |>
  conv_bn_relu(64) |>
  layer_max_pooling_2d(pool_size = c(2, 2)) |>
  layer_dropout(rate = 0.30) |>
  layer_flatten() |>
  layer_dense(units = 128, activation = "relu") |>
  layer_dropout(rate = 0.40)

# 10-way softmax over the Fashion-MNIST classes.
output <- features |>
  layer_dense(units = num_classes, activation = "softmax", name = "class_prob")

cnn <- keras_model(inputs = input, outputs = output)

cnn |>
  compile(
    optimizer = optimizer_adam(learning_rate = 1e-3),
    loss = "categorical_crossentropy",
    metrics = "accuracy"
  )

cnn
## Model: "functional_10"
## ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
## ┃ Layer (type)                               ┃ Output Shape                      ┃          Param # ┃ Trainable ┃
## ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
## │ image (InputLayer)                         │ (None, 28, 28, 1)                 │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ conv2d (Conv2D)                            │ (None, 28, 28, 32)                │              320 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ batch_normalization (BatchNormalization)   │ (None, 28, 28, 32)                │              128 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ activation (Activation)                    │ (None, 28, 28, 32)                │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ conv2d_1 (Conv2D)                          │ (None, 28, 28, 32)                │            9,248 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ batch_normalization_1 (BatchNormalization) │ (None, 28, 28, 32)                │              128 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ activation_1 (Activation)                  │ (None, 28, 28, 32)                │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ max_pooling2d (MaxPooling2D)               │ (None, 14, 14, 32)                │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ dropout_5 (Dropout)                        │ (None, 14, 14, 32)                │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ conv2d_2 (Conv2D)                          │ (None, 14, 14, 64)                │           18,496 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ batch_normalization_2 (BatchNormalization) │ (None, 14, 14, 64)                │              256 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ activation_2 (Activation)                  │ (None, 14, 14, 64)                │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ conv2d_3 (Conv2D)                          │ (None, 14, 14, 64)                │           36,928 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ batch_normalization_3 (BatchNormalization) │ (None, 14, 14, 64)                │              256 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ activation_3 (Activation)                  │ (None, 14, 14, 64)                │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ max_pooling2d_1 (MaxPooling2D)             │ (None, 7, 7, 64)                  │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ dropout_6 (Dropout)                        │ (None, 7, 7, 64)                  │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ flatten (Flatten)                          │ (None, 3136)                      │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ dense_16 (Dense)                           │ (None, 128)                       │          401,536 │     Y     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ dropout_7 (Dropout)                        │ (None, 128)                       │                0 │     -     │
## ├────────────────────────────────────────────┼───────────────────────────────────┼──────────────────┼───────────┤
## │ class_prob (Dense)                         │ (None, 10)                        │            1,290 │     Y     │
## └────────────────────────────────────────────┴───────────────────────────────────┴──────────────────┴───────────┘
##  Total params: 468,586 (1.79 MB)
##  Trainable params: 468,202 (1.79 MB)
##  Non-trainable params: 384 (1.50 KB)

Step 5: Training with early stopping and LR scheduling

# Early stopping: halt when validation accuracy stops improving and
# roll back to the best epoch's weights.
cb_es <- callback_early_stopping(
  monitor = "val_accuracy",
  mode = "max",
  patience = 5,
  restore_best_weights = TRUE
)

# Halve the learning rate whenever validation loss plateaus.
cb_rlr <- callback_reduce_lr_on_plateau(
  monitor = "val_loss",
  factor = 0.5,
  patience = 2,
  min_lr = 1e-5
)

# set.seed() alone seeds only R's RNG, not the backend; keras3's
# set_random_seed() seeds R, Python, NumPy, and TensorFlow so batch
# shuffling, weight initialization, and dropout are reproducible.
set_random_seed(123)

history <- cnn |>
  fit(
    x = X_tr, y = Y_tr,
    validation_data = list(X_va, Y_va),
    epochs = 50,
    batch_size = 128,
    callbacks = list(cb_es, cb_rlr),
    verbose = 2
  )

# Training vs. validation loss/accuracy per epoch.
plot(history) +
  ggtitle("CNN learning curves — Fashion-MNIST") +
  theme_minimal()

Step 6: Test evaluation

# In Keras 3 the R evaluate() method already returns a *named* list;
# cnn$metrics_names reports c("loss", "compile_metrics"), which does not
# line up with the value order — the previous setNames() call printed
# the accuracy (0.9379, matching the confusion-matrix diagonal) under
# the label "loss". Use the names evaluate() itself provides.
metrics <- cnn |>
  evaluate(x_test, y_test_oh, verbose = 0)

round(unlist(metrics), 4)
## accuracy     loss 
##   0.9379   0.1879

Step 7: Confusion matrix (optional)

# p is a (10000 x 10) matrix of softmax class probabilities.
p <- cnn |>
  predict(x_test)

# max.col() defaults to ties.method = "random", which breaks ties
# non-deterministically; "first" makes the predictions reproducible.
# Subtract 1 because Fashion-MNIST labels are 0-based.
pred <- max.col(p, ties.method = "first") - 1

# Rows are true labels, columns predicted labels; the diagonal holds
# the correctly classified counts.
cm <- table(truth = y_test, pred = pred)

cm
##      pred
## truth   0   1   2   3   4   5   6   7   8   9
##     0 896   0  11   9   0   0  82   0   2   0
##     1   1 988   0   6   1   0   3   0   1   0
##     2  14   2 906   8  34   0  36   0   0   0
##     3   8   2   8 936  21   0  25   0   0   0
##     4   0   0  15  18 925   0  42   0   0   0
##     5   0   0   0   0   0 985   0  10   0   5
##     6  79   0  37  22  59   0 800   0   3   0
##     7   0   0   0   0   0   2   0 986   0  12
##     8   2   0   0   2   2   1   0   0 993   0
##     9   0   0   0   0   0   4   0  32   0 964

Summary

  • CNNs are the standard architecture for image-like data because they exploit locality and weight sharing.
  • Batch normalization and dropout stabilize training and improve generalization.
  • Early stopping and learning-rate scheduling are practical tools for robust training.
  • A clean validation protocol is required to interpret improvements beyond chance and beyond overfitting.
 

A work by Gianluca Sottile

gianluca.sottile@unipa.it