LSTM for Time Series Forecasting

Many business problems are sequential: sales, demand, traffic, call volumes, inventory levels, and financial series.
A sequence model aims to predict a future value \(y_{t+1}\) (or a horizon) from a window of past values \((y_{t-L+1}, \dots, y_t)\).

A standard supervised formulation for one-step forecasting is:

  • Input sequence \(x_t = (y_{t-L+1}, \dots, y_t)\)
  • Target \(y_{t+1}\)

We will use an LSTM, which maintains a hidden state and a memory cell to model long dependencies. In compact form:

  • Hidden update depends on gates and memory \(c_t\)
  • Output prediction is a function of the last hidden state \(h_t\)

In this lesson we:

  • build sliding windows and scaling correctly (train-only statistics),
  • fit an LSTM regression model,
  • regularize training with learning-rate reduction on plateau,
  • validate with early stopping,
  • compare to a naive baseline, and
  • visualize forecast performance.

Step 1: Load electricity consumption data

We use hourly electricity consumption data (typical business use case: demand forecasting, energy trading, facility management).

local_path <- "raw_data/PJME_hourly.csv"
url_path <- "https://raw.githubusercontent.com/archd3sai/Hourly-Energy-Consumption-Prediction/master/PJME_hourly.csv"

# Download the CSV once and cache it locally; later runs reuse the file.
if (!file.exists(local_path)) {
  # recursive = TRUE also creates missing parent directories.
  dir.create("raw_data", showWarnings = FALSE, recursive = TRUE)
  # mode = "wb" prevents newline translation on Windows.
  status <- download.file(url_path, local_path, mode = "wb")
  if (status != 0) {
    # Remove any partial file so the cache check above stays reliable.
    unlink(local_path)
    stop("Failed to download ", url_path, call. = FALSE)
  }
}

electricity <- read_csv(local_path, show_col_types = FALSE) |>
  mutate(
    # Parse timestamps as UTC so every comparison below is
    # timezone-independent.
    Datetime = as.POSIXct(Datetime, tz = "UTC"),
    PJME_MW  = as.numeric(PJME_MW)
  ) |>
  filter(!is.na(PJME_MW)) |>
  rename(y = PJME_MW, date = Datetime) |>
  # BUG FIX: compare against an explicit UTC POSIXct. The original
  # compared to a bare string ("2016-01-01 00:00:00.0001"), which R
  # coerces using the *system* timezone, so the cutoff shifted from
  # machine to machine.
  filter(date >= as.POSIXct("2016-01-01 00:00:00", tz = "UTC")) |>
  # BUG FIX: the raw CSV is not chronologically ordered (the glimpse
  # output shows rows jumping from Jan 1 to Dec 31). Sliding-window
  # construction assumes a time-sorted series, so sort explicitly.
  arrange(date) |>
  select(date, y)

glimpse(electricity)
## Rows: 22,680
## Columns: 2
## $ date <dttm> 2016-01-01 00:00:00, 2016-12-31 01:00:00, 2016-12-31 02:00:00, 2016-12-31 03:00:00, 2016-12-31 04…
## $ y    <dbl> 26686, 29627, 28744, 28274, 28162, 28434, 29132, 30252, 31056, 31787, 32116, 31807, 31084, 30461, …
# Visual sanity check: full hourly series reveals yearly seasonality
# and the daily/weekly cycles the LSTM should learn.
ggplot(electricity, aes(x = date, y = y)) +
  geom_line(linewidth = 0.8, color = "steelblue") +
  theme_minimal() +
  labs(
    title = "Hourly electricity consumption (PJME)",
    subtitle = "From 2016 to 2018",
    x = "Date",
    y = "Power (MW)"
  )

Step 2: Train/validation/test split (time-aware)

For time series, use contiguous blocks to avoid data leakage:

# Contiguous 70/15/15 split preserves temporal order (no shuffling),
# so validation and test always lie strictly after the training block.
n <- nrow(electricity)
train_end <- floor(0.70 * n)
valid_end <- floor(0.85 * n)

y_train_raw <- electricity$y[seq_len(train_end)]
y_valid_raw <- electricity$y[seq(train_end + 1, valid_end)]
y_test_raw  <- electricity$y[seq(valid_end + 1, n)]

# Report the three split sizes as a sanity check.
c(
  train = length(y_train_raw),
  valid = length(y_valid_raw),
  test  = length(y_test_raw)
)
## train valid  test 
## 15875  3403  3402

Step 3: Scaling using training statistics only

# Standardization statistics come from the TRAINING split only, so no
# information from validation/test leaks into preprocessing.
mu <- mean(y_train_raw)
sdv <- sd(y_train_raw)

# Standardize a vector with the supplied (training) mean and sd.
scale_with <- function(x, mu, sdv) {
  (x - mu) / sdv
}
# Inverse transform: map standardized values back to original units.
inv_scale <- function(z, mu, sdv) {
  z * sdv + mu
}

# Apply the training-derived statistics to all three splits.
y_train <- scale_with(y_train_raw, mu, sdv)
y_valid <- scale_with(y_valid_raw, mu, sdv)
y_test  <- scale_with(y_test_raw,  mu, sdv)

Step 4: Sliding-window dataset construction

We create supervised examples with window length \(L=168\) (1 week of hourly data):

make_windows <- function(series, L = 168) {
  # Turn a numeric series into supervised (X, y) pairs: row i of X
  # holds L consecutive values starting at position i, and y[i] is the
  # single value that immediately follows that window.
  #
  # series : numeric vector, strictly longer than L
  # L      : window length (>= 2)
  # Returns list(X = array [n-L, L, 1], y = numeric of length n-L).
  stopifnot(is.numeric(series), L >= 2, length(series) > L)

  n_windows <- length(series) - L

  # Preallocate; Keras expects a 3-D input (samples, timesteps, features).
  X <- array(0, dim = c(n_windows, L, 1))
  y <- numeric(n_windows)

  for (start in seq_len(n_windows)) {
    X[start, , 1] <- series[start:(start + L - 1)]
    y[start] <- series[start + L]
  }

  list(X = X, y = y)
}

L <- 168  # window length: 1 week of hourly observations

# Training windows are built from the training series alone.
w_train <- make_windows(y_train, L)

# Validation/test windows prepend the last L observations of the
# preceding split(s) so the first target of each split still has a
# complete history window (no gap, no look-ahead).
w_valid <- make_windows(c(tail(y_train, L), y_valid), L)

# BUG FIX: the original built the test windows from
# c(c(y_train, y_valid), tail(c(y_train, y_valid), L)) — y_test was
# never used, so all "test" metrics were computed on data the model
# had already seen (19278 = 15875 + 3403 samples). The correct
# construction mirrors w_valid: seed with the last L train+valid
# points, then append the actual test series.
w_test  <- make_windows(c(tail(c(y_train, y_valid), L), y_test), L)

dim(w_train$X); length(w_train$y)
## [1] 15707   168     1
## [1] 15707
dim(w_valid$X); length(w_valid$y)
## [1] 3403  168    1
## [1] 3403
dim(w_test$X);  length(w_test$y)
## [1] 3402  168    1
## [1] 3402

Notes:

  • Validation windows prepend the last \(L\) training points to ensure proper sequence history.
  • The same applies to test windows.

Step 5: Naive baseline (persistence)

\(\hat{y}_{t+1} = y_t\).

# Persistence baseline: the forecast for t+1 is simply the last value
# (index L) of each input window.
yhat_naive_test <- w_test$X[, L, 1]
# Root-mean-squared error between observed and predicted values.
rmse <- function(y, yhat) {
  sqrt(mean((y - yhat)^2))
}
# Mean absolute error between observed and predicted values.
mae <- function(y, yhat) {
  mean(abs(y - yhat))
}

# Baseline error on the standardized scale — the reference any learned
# model must beat to be worth deploying.
c(
  rmse_naive = rmse(w_test$y, yhat_naive_test),
  mae_naive  = mae(w_test$y, yhat_naive_test)
)
## rmse_naive  mae_naive 
##  0.2337917  0.1711627

Step 6: LSTM architecture

We use a 1-D convolutional front-end (Conv1D + max pooling) to extract local patterns and shorten the sequence, followed by an LSTM layer and a dense output head.

# Functional-API model: Conv1D extracts local (sub-daily) patterns and,
# together with max pooling, halves the sequence length before the LSTM
# summarizes the remaining temporal structure.
input <- layer_input(shape = c(L, 1), name = "sequence")

output <- input |>
  layer_conv_1d(filters = 32, kernel_size = 3, activation = "relu") |>
  layer_max_pooling_1d(pool_size = 2) |>
  layer_lstm(units = 16) |>
  layer_dense(units = 1)

model <- keras_model(inputs = input, outputs = output)

# MSE is the training loss for regression; MAE is tracked as a more
# interpretable companion metric.
model |>
  compile(
    optimizer = optimizer_adam(learning_rate = 1e-3),
    loss = "mse",
    metrics = list("mae")
  )

# Print the layer-by-layer summary: output shapes and parameter counts.
model
## Model: "functional_8"
## ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
## ┃ Layer (type)                                    ┃ Output Shape                         ┃              Param # ┃
## ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
## │ sequence (InputLayer)                           │ (None, 168, 1)                       │                    0 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ conv1d_1 (Conv1D)                               │ (None, 166, 32)                      │                  128 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ max_pooling1d_1 (MaxPooling1D)                  │ (None, 83, 32)                       │                    0 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ lstm_1 (LSTM)                                   │ (None, 16)                           │                3,136 │
## ├─────────────────────────────────────────────────┼──────────────────────────────────────┼──────────────────────┤
## │ dense_14 (Dense)                                │ (None, 1)                            │                   17 │
## └─────────────────────────────────────────────────┴──────────────────────────────────────┴──────────────────────┘
##  Total params: 3,281 (12.82 KB)
##  Trainable params: 3,281 (12.82 KB)
##  Non-trainable params: 0 (0.00 B)

Step 7: Training with early stopping and LR scheduling

# Stop when validation loss has not improved for 15 epochs and roll
# back to the best weights seen so far.
cb_es <- callback_early_stopping(
  monitor = "val_loss",
  patience = 15,
  restore_best_weights = TRUE
)

# Halve the learning rate after 7 stagnant epochs, floored at 1e-5.
cb_rlr <- callback_reduce_lr_on_plateau(
  monitor = "val_loss",
  factor = 0.5,
  patience = 7,
  min_lr = 1e-5
)

# NOTE(review): set.seed() seeds only R's RNG; it does not seed the
# TensorFlow backend, so fits are not exactly reproducible — consider
# keras3::set_random_seed() / tensorflow::set_random_seed(). Verify.
set.seed(123)
# epochs = 300 is an upper bound; early stopping ends training sooner.
history <- model |>
  fit(
    x = w_train$X, y = w_train$y,
    validation_data = list(w_valid$X, w_valid$y),
    epochs = 300,
    batch_size = 32,
    callbacks = list(cb_es, cb_rlr),
    verbose = 2
  )

# Learning curves: training vs validation loss/MAE per epoch.
plot(history) +
  theme_minimal() +
  ggtitle("LSTM learning curves — hourly electricity")

Step 8: Test evaluation (back to original scale)

# One-step-ahead predictions over all test windows, still on the
# standardized scale. predict() returns an (n x 1) matrix; the metric
# functions work elementwise because the lengths match.
pred_test_scaled <- predict(model, w_test$X)

rmse_lstm <- rmse(w_test$y, pred_test_scaled)
mae_lstm  <- mae(w_test$y, pred_test_scaled)

c(rmse_lstm = rmse_lstm, mae_lstm = mae_lstm)
##  rmse_lstm   mae_lstm 
## 0.07929336 0.04774591

Invert scaling to MW units:

# Undo the standardization so errors are in interpretable MW units.
y_true <- inv_scale(w_test$y, mu, sdv)
y_pred <- inv_scale(as.numeric(pred_test_scaled), mu, sdv)
y_naive <- inv_scale(yhat_naive_test, mu, sdv)

# Side-by-side comparison of the LSTM against the persistence baseline.
c(
  rmse_naive = rmse(y_true, y_naive),
  rmse_lstm  = rmse(y_true, y_pred),
  mae_naive  = mae(y_true, y_naive),
  mae_lstm   = mae(y_true, y_pred)
)
## rmse_naive  rmse_lstm  mae_naive   mae_lstm 
##  1541.5438   522.8337  1128.5895   314.8204

Step 9: Forecast visualization

# Assemble the truth and both forecasts into one frame for plotting.
plot_df <- data.frame(
  t = seq_along(y_true),  # seq_along() instead of 1:length(): safe even for length 0
  truth = y_true,
  lstm  = y_pred,
  naive = y_naive
)

# Black = actual demand, blue = LSTM, dashed grey = persistence baseline.
ggplot(plot_df, aes(t, truth)) +
  geom_line(linewidth = 1.2, color = "black") +
  geom_line(aes(y = lstm), linewidth = 1, color = "steelblue") +
  geom_line(aes(y = naive), linewidth = 1, color = "grey60", linetype = 2) +
  theme_minimal() +
  labs(
    title = "Test forecasts (one-step ahead)",
    subtitle = "LSTM vs naive persistence",
    x = "Test time index (hours)",
    y = "Electricity consumption (MW)"
  )

Summary

You learned how to:

  • Frame time series forecasting as a supervised sequence-to-one regression problem using sliding windows.
  • Implement proper time-aware splits and train-only scaling to avoid data leakage.
  • Design a Conv1D + LSTM architecture for sequence-to-one regression.
  • Use early stopping and learning rate reduction to optimize training.
  • Evaluate against naive baselines and interpret improvements in business terms.

This pipeline generalizes to other business time series (sales, website traffic, call volumes) by adjusting the window length, architecture depth, and evaluation horizon.

 

A work by Gianluca Sottile

gianluca.sottile@unipa.it