Many business problems are sequential: sales, demand, traffic, call
volumes, inventory levels, and financial series.
A sequence model aims to predict a future value \(y_{t+1}\) (or a horizon of future values) from a window of the \(L\) most recent past values \((y_{t-L+1}, \dots, y_t)\).
A standard supervised formulation for one-step forecasting is: \(\hat{y}_{t+1} = f(y_{t-L+1}, \dots, y_t)\), where \(f\) is the model learned from the training windows.
We will use an LSTM, which maintains a hidden state \(h_t\) and a memory cell \(c_t\) to model long-range dependencies. In compact form: \((h_t, c_t) = \mathrm{LSTM}(y_t, h_{t-1}, c_{t-1})\), with the forecast produced from the final hidden state.
In this lesson we:
We use hourly electricity consumption data (typical business use case: demand forecasting, energy trading, facility management).
# Source URL and on-disk cache location for the PJME hourly load data.
url_path <- "https://raw.githubusercontent.com/archd3sai/Hourly-Energy-Consumption-Prediction/master/PJME_hourly.csv"
local_path <- "raw_data/PJME_hourly.csv"

# Download once; subsequent runs reuse the cached copy.
if (!file.exists(local_path)) {
  dir.create(dirname(local_path), showWarnings = FALSE)
  download.file(url_path, destfile = local_path, mode = "wb")
}
# Load and tidy the PJME series: parse timestamps as UTC, coerce the load
# to numeric, drop missing readings, and keep observations from 2016 on.
electricity <- read_csv(local_path, show_col_types = FALSE) |>
  mutate(
    Datetime = as.POSIXct(Datetime, tz = "UTC"),
    PJME_MW = as.numeric(PJME_MW)
  ) |>
  filter(!is.na(PJME_MW)) |>
  rename(y = PJME_MW, date = Datetime) |>
  # BUG FIX: comparing a POSIXct column to a bare string parses the string
  # in the session's *local* timezone, so the cutoff shifted per machine,
  # and the stray ".0001" fraction excluded midnight itself. Use an
  # explicit UTC timestamp so 2016-01-01 00:00:00 is included, matching
  # the printed output below.
  filter(date >= as.POSIXct("2016-01-01 00:00:00", tz = "UTC")) |>
  select(date, y)
glimpse(electricity)## Rows: 22,680
## Columns: 2
## $ date <dttm> 2016-01-01 00:00:00, 2016-12-31 01:00:00, 2016-12-31 02:00:00, 2016-12-31 03:00:00, 2016-12-31 04…
## $ y <dbl> 26686, 29627, 28744, 28274, 28162, 28434, 29132, 30252, 31056, 31787, 32116, 31807, 31084, 30461, …
# Visualize the full hourly series to eyeball trend and seasonality.
ggplot(electricity, aes(x = date, y = y)) +
  geom_line(color = "steelblue", linewidth = 0.8) +
  theme_minimal() +
  labs(
    title = "Hourly electricity consumption (PJME)",
    subtitle = "From 2016 to 2018",
    x = "Date",
    y = "Power (MW)"
  )

For time series, use contiguous blocks to avoid data leakage:
# Chronological 70/15/15 split — shuffling would leak future information
# into training.
n <- nrow(electricity)
train_end <- floor(0.70 * n)
valid_end <- floor(0.85 * n)

idx_train <- seq_len(train_end)
idx_valid <- seq(train_end + 1, valid_end)
idx_test <- seq(valid_end + 1, n)

y_train_raw <- electricity$y[idx_train]
y_valid_raw <- electricity$y[idx_valid]
y_test_raw <- electricity$y[idx_test]

# Report split sizes.
c(
  train = length(y_train_raw),
  valid = length(y_valid_raw),
  test = length(y_test_raw)
)## train valid test
## 15875 3403 3402
We create supervised examples with window length \(L=168\) (1 week of hourly data):
#' Build supervised (window, target) pairs from a univariate series.
#'
#' Sample i pairs the window series[i:(i + L - 1)] with the value h steps
#' after the window's end, series[i + L + h - 1]. With the default h = 1
#' this reproduces the original one-step-ahead formulation exactly.
#'
#' @param series Numeric vector (the time series).
#' @param L Window length (>= 2); default 168 = one week of hourly data.
#' @param h Forecast horizon (>= 1); default 1 (one-step ahead).
#' @return list(X = array of dim c(n_samples, L, 1), y = numeric targets).
make_windows <- function(series, L = 168, h = 1) {
  stopifnot(is.numeric(series), L >= 2, h >= 1, length(series) >= L + h)
  n_samples <- length(series) - L - h + 1
  X <- array(0, dim = c(n_samples, L, 1))
  y <- numeric(n_samples)
  # seq_len() is safe if n_samples were ever 0 (1:0 would iterate twice).
  for (i in seq_len(n_samples)) {
    X[i, , 1] <- series[i:(i + L - 1)]
    y[i] <- series[i + L + h - 1]
  }
  list(X = X, y = y)
}
L <- 168 # window length: 1 week of hourly observations

# NOTE(review): y_train / y_valid / y_test are presumably the standardized
# versions of y_*_raw produced by a separate scaling chunk — confirm that
# chunk runs before this one.
w_train <- make_windows(y_train, L)
# Prefix each later split with the last L points that precede it, so the
# first target of the split still has a full week of history behind it.
w_valid <- make_windows(c(tail(y_train, L), y_valid), L)
# BUG FIX: the test windows were built from
# c(c(y_train, y_valid), tail(c(y_train, y_valid), L)), which appends the
# history AFTER train+valid — every test target was a recycled
# train/valid value and y_test was never used (hence the printed 19278
# samples = 15875 + 3403). Prepend the history to y_test instead.
w_test <- make_windows(c(tail(c(y_train, y_valid), L), y_test), L)
dim(w_train$X); length(w_train$y)## [1] 15707 168 1
## [1] 15707
## [1] 3403 168 1
## [1] 3403
## [1] 3402 168 1
## [1] 3402
Note: as a baseline we use the naive persistence forecast, which simply repeats the last observed value of each window: \(\hat{y}_{t+1} = y_t\).
yhat_naive_test <- w_test$X[, L, 1]
# Root-mean-squared error of forecasts yhat against observed values y.
rmse <- function(y, yhat) {
  sqrt(mean((y - yhat)^2))
}

# Mean absolute error of forecasts yhat against observed values y.
mae <- function(y, yhat) {
  mean(abs(y - yhat))
}
# Baseline errors on the (scaled) test targets.
naive_scores <- c(
  rmse_naive = rmse(w_test$y, yhat_naive_test),
  mae_naive = mae(w_test$y, yhat_naive_test)
)
naive_scores## rmse_naive mae_naive
## 0.2337917 0.1711627
We use a 1-D convolutional front-end followed by max pooling, an LSTM layer, and a dense head.
# One-step forecaster: a Conv1D front-end extracts local (sub-daily)
# patterns, max-pooling halves the sequence length, the LSTM summarizes
# the remaining sequence, and a linear unit emits the point forecast.
input <- layer_input(shape = c(L, 1), name = "sequence")

features <- layer_conv_1d(input, filters = 32, kernel_size = 3, activation = "relu")
pooled <- layer_max_pooling_1d(features, pool_size = 2)
summary_state <- layer_lstm(pooled, units = 16)
x <- layer_dense(summary_state, units = 1)

model <- keras_model(inputs = input, outputs = x)

# Compile in place: Adam optimizer, squared-error loss, MAE as a metric.
compile(
  model,
  optimizer = optimizer_adam(learning_rate = 1e-3),
  loss = "mse",
  metrics = list("mae")
)
model## Model: "functional_8"
## ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
## ┃ Layer (type) ┃ Output Shape ┃ Param # ┃
## ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
## │ sequence (InputLayer) │ (None, 168, 1) │ 0 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ conv1d_1 (Conv1D) │ (None, 166, 32) │ 128 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ max_pooling1d_1 (MaxPooling1D) │ (None, 83, 32) │ 0 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ lstm_1 (LSTM) │ (None, 16) │ 3,136 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dense_14 (Dense) │ (None, 1) │ 17 │
## └─────────────────────────────────────────────────┴──────────────────────────────────────┴──────────────────────┘
## Total params: 3,281 (12.82 KB)
## Trainable params: 3,281 (12.82 KB)
## Non-trainable params: 0 (0.00 B)
# Stop training once validation loss has not improved for 15 epochs and
# roll back to the best weights observed.
cb_es <- callback_early_stopping(
  monitor = "val_loss",
  patience = 15,
  restore_best_weights = TRUE
)

# Halve the learning rate after 7 stagnant epochs, never below 1e-5.
cb_rlr <- callback_reduce_lr_on_plateau(
  monitor = "val_loss",
  patience = 7,
  factor = 0.5,
  min_lr = 1e-5
)
# NOTE(review): set.seed() seeds only R's RNG; TensorFlow's weight
# initialization uses its own RNG, so runs may still differ — confirm
# whether full reproducibility is required (keras/tensorflow ship a
# dedicated random-seed helper for that).
set.seed(123)

# Train with early stopping and LR reduction on the validation split.
history <- fit(
  model,
  x = w_train$X,
  y = w_train$y,
  validation_data = list(w_valid$X, w_valid$y),
  epochs = 300,
  batch_size = 32,
  callbacks = list(cb_es, cb_rlr),
  verbose = 2
)
# Learning curves.
plot(history) +
  theme_minimal() +
  ggtitle("LSTM learning curves — hourly electricity")

# One-step-ahead predictions on the held-out test windows (scaled units).
pred_test_scaled <- predict(model, w_test$X)

rmse_lstm <- rmse(w_test$y, pred_test_scaled)
mae_lstm <- mae(w_test$y, pred_test_scaled)
c(rmse_lstm = rmse_lstm, mae_lstm = mae_lstm)## rmse_lstm mae_lstm
## 0.07929336 0.04774591
Invert scaling to MW units:
# NOTE(review): inv_scale(), mu, and sdv are defined in the (separate)
# scaling chunk — confirm it runs before this one.
y_true <- inv_scale(w_test$y, mu, sdv)
y_pred <- inv_scale(as.numeric(pred_test_scaled), mu, sdv)
y_naive <- inv_scale(yhat_naive_test, mu, sdv)

# Compare both models in the original units (MW).
metrics_mw <- c(
  rmse_naive = rmse(y_true, y_naive),
  rmse_lstm = rmse(y_true, y_pred),
  mae_naive = mae(y_true, y_naive),
  mae_lstm = mae(y_true, y_pred)
)
metrics_mw## rmse_naive rmse_lstm mae_naive mae_lstm
## 1541.5438 522.8337 1128.5895 314.8204
# Assemble truth and both forecasts for plotting.
plot_df <- data.frame(
  # seq_along() instead of 1:length(): safe even if y_true were empty
  # (1:0 would yield c(1, 0)).
  t = seq_along(y_true),
  truth = y_true,
  lstm = y_pred,
  naive = y_naive
)

ggplot(plot_df, aes(t, truth)) +
  geom_line(linewidth = 1.2, color = "black") +
  geom_line(aes(y = lstm), linewidth = 1, color = "steelblue") +
  geom_line(aes(y = naive), linewidth = 1, color = "grey60", linetype = 2) +
  theme_minimal() +
  labs(
    title = "Test forecasts (one-step ahead)",
    subtitle = "LSTM vs naive persistence",
    x = "Test time index (hours)",
    y = "Electricity consumption (MW)"
  )

You learned how to:
This pipeline generalizes to other business time series (sales, website traffic, call volumes) by adjusting the window length, architecture depth, and evaluation horizon.
A work by Gianluca Sottile
gianluca.sottile@unipa.it