An autoencoder is a neural network trained to reconstruct its input: \[ f_\theta: x \mapsto \hat{x} \] with the architecture split into an encoder \(E_\phi: x \mapsto z\) and a decoder \(D_\psi: z \mapsto \hat{x}\):
The overall model minimizes a reconstruction loss, e.g. mean squared error: \[ \mathcal{L}(\phi,\psi) = \frac{1}{n}\sum_{i=1}^n \| x_i - D_\psi(E_\phi(x_i)) \|_2^2 \]
Deep autoencoders use multiple layers in encoder and decoder, allowing non-linear representation learning.
Typical uses:
In this lesson we use the Wholesale Customers dataset (spending by product category) to:
# Cached local copy and canonical remote source of the Wholesale Customers data.
local_path <- "raw_data/wholesale_customers.csv"
url_path <- "https://raw.githubusercontent.com/SobiaNoorAI/Wholesale-Customer-Segmentation-and-Spending-Behavior-Analysis-By-ML/main/Data/Wholesale%20customers%20data.csv"
if (!file.exists(local_path)) {
  # Derive the cache directory from local_path instead of hard-coding "raw_data";
  # recursive = TRUE makes this robust to nested paths.
  dir.create(dirname(local_path), showWarnings = FALSE, recursive = TRUE)
  # mode = "wb" keeps the download byte-exact on Windows.
  download.file(url_path, local_path, mode = "wb")
}
wholesale <- read_csv(local_path, show_col_types = FALSE)
glimpse(wholesale)## Rows: 440
## Columns: 8
## $ Channel <dbl> 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2,…
## $ Region <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ Fresh <dbl> 12669, 7057, 6353, 13265, 22615, 9413, 12126, 7579, 5963, 6006, 3366, 13146, 31714, 21…
## $ Milk <dbl> 9656, 9810, 8808, 1196, 5410, 8259, 3199, 4956, 3648, 11093, 5403, 1124, 12319, 6208, …
## $ Grocery <dbl> 7561, 9568, 7684, 4221, 7198, 5126, 6975, 9426, 6192, 18881, 12974, 4523, 11757, 14982…
## $ Frozen <dbl> 214, 1762, 2405, 6404, 3915, 666, 480, 1669, 425, 1159, 4400, 1420, 287, 3095, 294, 39…
## $ Detergents_Paper <dbl> 2674, 3293, 3516, 507, 1777, 1795, 3140, 3321, 1716, 7425, 5977, 549, 3881, 6707, 5058…
## $ Delicassen <dbl> 1338, 1776, 7844, 1788, 5185, 1451, 545, 2566, 750, 2098, 1744, 497, 2931, 602, 2168, …
Focus on the numeric spending variables:
# The six product-category spending columns used for modeling.
spend_vars <- c("Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen")
# Spending columns only, complete cases only.
X_raw <- na.omit(select(wholesale, all_of(spend_vars)))
summary(X_raw)## Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 55 Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 3128 1st Qu.: 1533 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 8504 Median : 3627 Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 12000 Mean : 5796 Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.: 16934 3rd Qu.: 7190 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :112151 Max. :73498 Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
Spending is strongly right-skewed. We apply \(\log(1+x)\) and standardization:
# log1p compresses the heavy right tail while handling zero spending exactly.
X_log <- X_raw |>
  mutate(across(everything(), log1p))
# Per-column means / SDs, kept as named vectors so new data could be
# standardized with the identical transform. vapply (not sapply) guarantees
# a numeric vector regardless of input shape.
means <- vapply(X_log, mean, numeric(1))
sds <- vapply(X_log, sd, numeric(1))
# z-score every column: (x - mean) / sd, looked up by column name.
X <- X_log |>
  mutate(across(everything(), ~ (.x - means[cur_column()]) / sds[cur_column()]))
X_mat <- as.matrix(X)
dim(X_mat)## [1] 440 6
We will train the autoencoder only on this standardized matrix.
Even though there is no separate target variable — the autoencoder's target is its own input — we still need a held-out validation set to detect overfitting.
# Reproducible 80/20 random split. There are no labels, but a held-out
# set is still needed to monitor reconstruction loss for overfitting.
set.seed(123)
n <- nrow(X_mat)
idx <- sample.int(n)
train_frac <- 0.8
n_train <- floor(train_frac * n)
train_rows <- idx[seq_len(n_train)]
valid_rows <- idx[seq(n_train + 1, n)]
# drop = FALSE keeps the result a matrix even for a single row.
X_train <- X_mat[train_rows, , drop = FALSE]
X_valid <- X_mat[valid_rows, , drop = FALSE]
dim(X_train)## [1] 352 6
## [1] 88 6
We choose a symmetric deep autoencoder:
We use:
Mathematically: \[ z = \sigma(W_2 \sigma(W_1 x + b_1) + b_2), \quad \hat{x} = W_4 \sigma(W_3 z + b_3) + b_4 \]
with \(\sigma\) = ReLU and loss \(\frac{1}{n}\sum_i \|x_i - \hat{x}_i\|^2\).
input_dim <- ncol(X_train)
latent_dim <- 2L

# Build and compile a symmetric deep autoencoder.
#
# Architecture: input -> 64 -> dropout -> 32 -> latent (linear bottleneck)
#               -> 32 -> 64 -> input (linear reconstruction).
# All hidden layers use ReLU and L2 weight regularization.
#
# Args:
#   input_dim    number of input features
#   latent_dim   size of the bottleneck code z
#   l2_lambda    L2 penalty applied to every dense layer's kernel
#   dropout_rate dropout rate after the first encoder layer
# Returns: a compiled keras model (Adam, lr 1e-3, MSE loss).
build_autoencoder <- function(input_dim, latent_dim,
                              l2_lambda = 1e-4, dropout_rate = 0.1) {
  reg <- regularizer_l2(l2_lambda)
  x_in <- layer_input(shape = input_dim, name = "input_layer")

  # Encoder: two ReLU layers, then a linear bottleneck named "latent_code"
  # so a standalone encoder can be built from it later.
  h <- layer_dense(x_in, units = 64, activation = "relu",
                   kernel_regularizer = reg)
  h <- layer_dropout(h, rate = dropout_rate)
  h <- layer_dense(h, units = 32, activation = "relu",
                   kernel_regularizer = reg)
  z <- layer_dense(h, units = latent_dim, activation = "linear",
                   name = "latent_code")

  # Decoder mirrors the encoder; linear output since inputs are standardized.
  d <- layer_dense(z, units = 32, activation = "relu",
                   kernel_regularizer = reg)
  d <- layer_dense(d, units = 64, activation = "relu",
                   kernel_regularizer = reg)
  x_out <- layer_dense(d, units = input_dim, activation = "linear",
                       name = "reconstruction")

  model <- keras_model(inputs = x_in, outputs = x_out)
  compile(model,
          optimizer = optimizer_adam(learning_rate = 1e-3),
          loss = "mse")
  model
}
autoencoder <- build_autoencoder(input_dim, latent_dim)
autoencoder## Model: "functional_3"
## ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
## ┃ Layer (type) ┃ Output Shape ┃ Param # ┃
## ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
## │ input_layer (InputLayer) │ (None, 6) │ 0 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dense_5 (Dense) │ (None, 64) │ 448 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dropout_2 (Dropout) │ (None, 64) │ 0 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dense_6 (Dense) │ (None, 32) │ 2,080 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ latent_code (Dense) │ (None, 2) │ 66 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dense_7 (Dense) │ (None, 32) │ 96 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dense_8 (Dense) │ (None, 64) │ 2,112 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ reconstruction (Dense) │ (None, 6) │ 390 │
## └─────────────────────────────────────────────────┴──────────────────────────────────────┴──────────────────────┘
## Total params: 5,192 (20.28 KB)
## Trainable params: 5,192 (20.28 KB)
## Non-trainable params: 0 (0.00 B)
We can also define a standalone encoder model to map \(x \mapsto z\):
# Standalone encoder: reuse the trained layers up to the bottleneck,
# mapping x to its latent code z.
latent_layer <- autoencoder$get_layer("latent_code")
encoder <- keras_model(
  inputs = autoencoder$input,
  outputs = latent_layer$output
)
encoder## Model: "functional_4"
## ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
## ┃ Layer (type) ┃ Output Shape ┃ Param # ┃
## ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
## │ input_layer (InputLayer) │ (None, 6) │ 0 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dense_5 (Dense) │ (None, 64) │ 448 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dropout_2 (Dropout) │ (None, 64) │ 0 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dense_6 (Dense) │ (None, 32) │ 2,080 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ latent_code (Dense) │ (None, 2) │ 66 │
## └─────────────────────────────────────────────────┴──────────────────────────────────────┴──────────────────────┘
## Total params: 2,594 (10.13 KB)
## Trainable params: 2,594 (10.13 KB)
## Non-trainable params: 0 (0.00 B)
# Early stopping: halt once validation loss has not improved for 10 epochs,
# then roll the model back to the best weights seen.
callback_es <- callback_early_stopping(
monitor = "val_loss",
patience = 10,
restore_best_weights = TRUE
)
# NOTE(review): set.seed() fixes R's RNG only; Keras/TF weight initialization
# uses its own seed — confirm if exact reproducibility is required.
set.seed(123)
# Train the autoencoder to reproduce its own input: x is both input and
# target. Up to 200 epochs, mini-batches of 64, early stopping above.
history <- autoencoder |>
fit(
x = X_train,
y = X_train, # target = input
validation_data = list(X_valid, X_valid),
epochs = 200,
batch_size = 64,
callbacks = list(callback_es),
verbose = 2
)Learning curve:
We want training and validation losses to converge to a low value without strong divergence, which would signal overfitting to idiosyncratic patterns.
Compute latent codes for all observations:
| z1 | z2 |
|---|---|
| -0.4417788 | 1.7300138 |
| 0.4743341 | 2.4331424 |
| 1.0019404 | 2.9899631 |
| 1.9196881 | -0.0989299 |
| 1.7836455 | 2.3567364 |
| -0.0447464 | 1.2960854 |
Compare with a 2D PCA projection:
# Linear baseline: principal components of the standardized spending matrix.
pc <- prcomp(X_mat)
pc_df <- data.frame(PC1 = pc$x[, 1], PC2 = pc$x[, 2])

# Side-by-side 2D maps: linear (PCA) vs non-linear (autoencoder codes).
p1 <- ggplot(pc_df, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.6) +
  ggtitle("PCA (2D) of wholesale spending") +
  theme_minimal()

# NOTE(review): latent_df (columns z1, z2) is produced in a chunk not shown
# here — presumably encoder predictions over all rows; verify upstream.
p2 <- ggplot(latent_df, aes(x = z1, y = z2)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  ggtitle("Autoencoder latent space (2D)") +
  theme_minimal()
p1Discussion:
You can also examine how each latent dimension relates to original features via correlation:
## z1 z2 Fresh Milk Grocery Frozen Detergents_Paper
## z1 1.00000000 -0.01124613 0.76648757 -0.14774319 -0.3525622 0.72284490 -0.3609554
## z2 -0.01124613 1.00000000 0.06621823 0.88090330 0.8665913 0.02941615 0.8050640
## Fresh 0.76648757 0.06621823 1.00000000 -0.02109638 -0.1329889 0.38625790 -0.1587060
## Milk -0.14774319 0.88090330 -0.02109638 1.00000000 0.7611276 -0.05522851 0.6787248
## Grocery -0.35256221 0.86659126 -0.13298890 0.76112760 1.0000000 -0.16452522 0.7971412
## Frozen 0.72284490 0.02941615 0.38625790 -0.05522851 -0.1645252 1.00000000 -0.2127709
## Detergents_Paper -0.36095537 0.80506395 -0.15870598 0.67872480 0.7971412 -0.21277086 1.0000000
## Delicassen 0.51135379 0.56015721 0.25644186 0.34231006 0.2399975 0.25631819 0.1675729
## Delicassen
## z1 0.5113538
## z2 0.5601572
## Fresh 0.2564419
## Milk 0.3423101
## Grocery 0.2399975
## Frozen 0.2563182
## Detergents_Paper 0.1675729
## Delicassen 1.0000000
For each observation, the reconstruction error is \[ e_i = \| x_i - \hat{x}_i \|_2^2 \] which we can compute on the standardized scale.
# Reconstruct every observation, then score each row by its squared
# reconstruction error e_i = ||x_i - x_hat_i||^2 (standardized scale).
X_hat <- predict(autoencoder, X_mat)
sq_residuals <- (X_mat - X_hat)^2
recon_error <- rowSums(sq_residuals)
summary(recon_error)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04672 0.44954 0.89665 1.51253 1.68348 19.81549
# Wrap the per-row errors in a data frame for ggplot.
error_df <- data.frame(error = recon_error)
# Histogram of per-observation reconstruction error; a long right tail
# marks observations the model reconstructs poorly (anomaly candidates).
ggplot(error_df, aes(x = error)) +
geom_histogram(bins = 40, fill = "firebrick", alpha = 0.8, color = "white") +
theme_minimal() +
labs(
title = "Distribution of reconstruction error",
x = "Squared reconstruction error",
y = "Count"
)A simple heuristic for “potential anomalies” is to flag observations in the upper tail of the reconstruction error distribution (e.g., top 5–10%).
# Flag the top 5% of reconstruction errors as candidate anomalies
# (>= the 95th percentile, so ~5% of rows are flagged by construction).
threshold <- quantile(recon_error, 0.95)
idx_anom <- which(recon_error >= threshold)
length(idx_anom) / n # proportion flagged## [1] 0.05
| Fresh | Milk | Grocery | Frozen | Detergents_Paper | Delicassen |
|---|---|---|---|---|---|
| 4591 | 15729 | 16709 | 33 | 6956 | 433 |
| 10850 | 7555 | 14961 | 188 | 6899 | 46 |
| 18291 | 1266 | 21042 | 5373 | 4173 | 14472 |
| 20398 | 1137 | 3 | 4407 | 3 | 975 |
| 717 | 3587 | 6532 | 7530 | 529 | 894 |
| 7864 | 542 | 4042 | 9735 | 165 | 46 |
Caveats:
A denoising autoencoder receives corrupted inputs \(\tilde{x} = x + \epsilon\) but is trained to reconstruct the clean \(x\). The training objective becomes \[ \mathcal{L} = \frac{1}{n}\sum_i \| x_i - D_\psi(E_\phi(\tilde{x}_i)) \|^2 \] which forces the model to learn a representation robust to noise.
We can simulate noise by adding Gaussian perturbations during training.
set.seed(123)
# Return a copy of matrix X corrupted by i.i.d. Gaussian noise with
# mean 0 and standard deviation `noise_sd` (one draw per element).
add_noise <- function(X, noise_sd = 0.2) {
  eps <- rnorm(length(X), mean = 0, sd = noise_sd)
  X + matrix(eps, nrow = nrow(X))
}
# Corrupted copies of the inputs; the clean matrices remain the targets below.
X_train_noisy <- add_noise(X_train, noise_sd = 0.2)
X_valid_noisy <- add_noise(X_valid, noise_sd = 0.2)
# Same architecture as before, but dropout disabled: the injected input
# noise itself acts as the regularizer.
denoising_ae <- build_autoencoder(input_dim, latent_dim,
l2_lambda = 1e-4, dropout_rate = 0.0)
# Denoising objective: noisy input x_tilde, clean target x.
history_denoise <- denoising_ae |>
fit(
x = X_train_noisy,
y = X_train, # target = clean input
validation_data = list(X_valid_noisy, X_valid),
epochs = 200,
batch_size = 64,
callbacks = list(callback_es),
verbose = 0
)
plot(history_denoise) +
theme_minimal() +
ggtitle("Denoising autoencoder training")The denoising autoencoder tends to learn codes that emphasize stable structure, improving robustness for downstream tasks (e.g., clustering on the latent space).
We can use the latent representation \(z\) as input to clustering algorithms (e.g., K-means) to obtain segmentations in a low-dimensional, non-linear feature space.
# Expose the denoising autoencoder's bottleneck as a standalone encoder model.
latent_out <- denoising_ae$get_layer("latent_code")$output
encoder_denoise <- keras_model(inputs = denoising_ae$input,
                               outputs = latent_out)
# Latent codes z for every observation (one row per customer).
Z_all <- predict(encoder_denoise, X_mat)
# Segment customers in the 2D latent space; 50 restarts for a stable solution.
set.seed(123)
km_latent <- kmeans(Z_all, centers = 4, nstart = 50)
# Assemble codes plus cluster labels for plotting.
latent_cluster_df <- data.frame(Z_all)
names(latent_cluster_df) <- c("z1", "z2")
latent_cluster_df$cluster <- factor(km_latent$cluster)
# Latent-space scatter colored by K-means cluster assignment.
ggplot(latent_cluster_df, aes(z1, z2, color = cluster)) +
geom_point(alpha = 0.8) +
theme_minimal() +
labs(
title = "K-means on autoencoder latent space",
x = "z1", y = "z2"
)Comparing this clustering to K-means on the original standardized space can reveal whether the autoencoder has highlighted meaningful non-linear structure.
Autoencoders are versatile tools for:
In this lesson you:
A work by Gianluca Sottile
gianluca.sottile@unipa.it