
By forecasting errors, I mean the differences between predicted and actual values.

I am doing a time series analysis using a deep learning model called the long short-term memory (LSTM), based on this great article. The author split the data set into 11 samples to train the model and then make future predictions. The keras package is required to run this model; it uses the TensorFlow backend.

What I am trying to do is get a confidence level for any predicted value. For example, let's say the model predicts that there will be 56 sunspots on Friday. I'd like to find out the probability that the number of sunspots is more than the average of 50 (this is just an arbitrary number).

A possible solution I can think of for this question (please let me know if there is a better way to solve it) is to get the distribution of the errors (the differences between predicted and actual values), then calculate the Z-score and look up the probability, assuming a normal distribution. In my example, the distance between the prediction and the threshold is 6 (56 - 50).
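To make the math concrete, here is the calculation I have in mind as a minimal sketch (the `errors` vector is a hypothetical stand-in for the pooled forecasting errors I am trying to extract below):

# Hypothetical stand-in for the pooled errors (predicted - actual) from all
# 11 samples; in practice this would come from the resampling results
set.seed(123)
errors <- rnorm(120, mean = 0, sd = 8)

mu    <- mean(errors)   # close to 0 if the model is unbiased
sigma <- sd(errors)     # standard deviation of the forecasting errors

pred      <- 56         # Friday's predicted number of sunspots
threshold <- 50         # the arbitrary cutoff

# With error = predicted - actual, actual = pred - error, so
# P(actual > threshold) = P(error < pred - threshold)
z       <- (pred - threshold - mu) / sigma
p_above <- pnorm(z)
p_above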

In the above-mentioned article, the 11 sample predictions (sample_predictions_lstm_tbl) are stored in a tibble with classes "rolling_origin", "rset", "tbl_df", "tbl", and "data.frame". I'd like to know if there is a way to extract the errors (predicted values - actual values) from all of the samples and combine them into a single data frame so that I can plot a histogram of the errors (a sketch of the kind of extraction I have in mind is after the full code below).

# Core Tidyverse
library(tidyverse)
library(glue)
library(forcats)

# Time Series
library(timetk)
library(tidyquant)
library(tibbletime)

# Visualization
library(cowplot)

# Preprocessing
library(recipes)

# Sampling / Accuracy
library(rsample)
library(yardstick)

# Modeling
library(keras)
# Install Keras if you have not installed before
install_keras()

sun_spots <- datasets::sunspot.month %>%
  tk_tbl() %>%
  mutate(index = as_date(index)) %>%
  as_tbl_time(index = index)

# Distribute the samples into 11 sets
periods_train <- 12 * 50
periods_test  <- 12 * 10
skip_span     <- 12 * 20

rolling_origin_resamples <- rolling_origin(
  sun_spots,
  initial    = periods_train,
  assess     = periods_test,
  cumulative = FALSE,
  skip       = skip_span
)
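
# (Not in the article: a quick sanity check. This should print a
# rolling-origin resampling object with 11 slices, one row per sample)
rolling_origin_resamples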

# $splits is a list of splits; take a single one (the most recent slice)
# for the single-sample run below, otherwise training(split) fails
split <- rolling_origin_resamples$splits[[11]]

# Backtesting on all samples
predict_keras_lstm <- function(split, epochs = 300, ...) {

  lstm_prediction <- function(split, epochs, ...) {

    # 5.1.2 Data Setup
    df_trn <- training(split)
    df_tst <- testing(split)

    df <- bind_rows(
      df_trn %>% add_column(key = "training"),
      df_tst %>% add_column(key = "testing")
    ) %>% 
      as_tbl_time(index = index)

    # 5.1.3 Preprocessing
    rec_obj <- recipe(value ~ ., df) %>%
      step_sqrt(value) %>%
      step_center(value) %>%
      step_scale(value) %>%
      prep()

    df_processed_tbl <- bake(rec_obj, df)

    center_history <- rec_obj$steps[[2]]$means["value"]
    scale_history  <- rec_obj$steps[[3]]$sds["value"]

    # 5.1.4 LSTM Plan
    lag_setting  <- 120 # = nrow(df_tst)
    batch_size   <- 40
    train_length <- 440
    tsteps       <- 1
    # epochs is taken from the function argument, so it is not reset here
    # (hard-coding epochs <- 300 would silently override epochs = 10 below)

    # 5.1.5 Train/Test Setup
    lag_train_tbl <- df_processed_tbl %>%
      mutate(value_lag = lag(value, n = lag_setting)) %>%
      filter(!is.na(value_lag)) %>%
      filter(key == "training") %>%
      tail(train_length)

    x_train_vec <- lag_train_tbl$value_lag
    x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))

    y_train_vec <- lag_train_tbl$value
    y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))

    lag_test_tbl <- df_processed_tbl %>%
      mutate(
        value_lag = lag(value, n = lag_setting)
      ) %>%
      filter(!is.na(value_lag)) %>%
      filter(key == "testing")

    x_test_vec <- lag_test_tbl$value_lag
    x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))

    y_test_vec <- lag_test_tbl$value
    y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))

    # 5.1.6 LSTM Model
    model <- keras_model_sequential()

    model %>%
      layer_lstm(units            = 50, 
                 input_shape      = c(tsteps, 1), 
                 batch_size       = batch_size,
                 return_sequences = TRUE, 
                 stateful         = TRUE) %>% 
      layer_lstm(units            = 50, 
                 return_sequences = FALSE, 
                 stateful         = TRUE) %>% 
      layer_dense(units = 1)

    model %>% 
      compile(loss = 'mae', optimizer = 'adam')

    # 5.1.7 Fitting LSTM
    for (i in 1:epochs) {
      model %>% fit(x          = x_train_arr, 
                    y          = y_train_arr, 
                    batch_size = batch_size,
                    epochs     = 1, 
                    verbose    = 1, 
                    shuffle    = FALSE)

      model %>% reset_states()
      cat("Epoch:", i, "\n")

    }

    # 5.1.8 Predict and Return Tidy Data
    # Make Predictions
    pred_out <- model %>% 
      predict(x_test_arr, batch_size = batch_size) %>%
      .[,1] 

    # Retransform values
    pred_tbl <- tibble(
      index   = lag_test_tbl$index,
      value   = (pred_out * scale_history + center_history)^2
    ) 

    # Combine actual data with predictions
    tbl_1 <- df_trn %>%
      add_column(key = "actual")

    tbl_2 <- df_tst %>%
      add_column(key = "actual")

    tbl_3 <- pred_tbl %>%
      add_column(key = "predict")

    # Create time_bind_rows() to solve dplyr issue
    time_bind_rows <- function(data_1, data_2, index) {
      index_expr <- enquo(index)
      bind_rows(data_1, data_2) %>%
        as_tbl_time(index = !! index_expr)
    }

    ret <- list(tbl_1, tbl_2, tbl_3) %>%
      reduce(time_bind_rows, index = index) %>%
      arrange(key, index) %>%
      mutate(key = as_factor(key))

    return(ret)

  }

  safe_lstm <- possibly(lstm_prediction, otherwise = NA)

  safe_lstm(split, epochs, ...)

}

# Modified epochs to 10 to reduce processing time
predict_keras_lstm(split, epochs = 10)

# Map to all samples
sample_predictions_lstm_tbl <- rolling_origin_resamples %>%
  mutate(predict = map(splits, predict_keras_lstm, epochs = 5))
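
For reference, here is the kind of extraction I am picturing, as an untested sketch. It assumes each element of the `predict` list column is the tibble returned by `predict_keras_lstm()` above, with columns `index`, `value`, and `key` ("actual"/"predict"):

# Keep only the samples where possibly() did not fall back to NA
errors_tbl <- sample_predictions_lstm_tbl %>%
  select(id, predict) %>%
  filter(map_lgl(predict, is.data.frame)) %>%
  mutate(errors = map(predict, function(tbl) {
    pred_tbl   <- tbl %>% filter(key == "predict") %>% select(index, predicted = value)
    actual_tbl <- tbl %>% filter(key == "actual")  %>% select(index, actual = value)
    # Predictions exist only for the test window, so the join keeps that window
    inner_join(pred_tbl, actual_tbl, by = "index") %>%
      mutate(error = predicted - actual)
  })) %>%
  select(id, errors) %>%
  unnest(errors)

# Histogram of the pooled forecasting errors
ggplot(errors_tbl, aes(x = error)) +
  geom_histogram(bins = 30)

`sd(errors_tbl$error)` would then feed the Z-score calculation sketched at the top of the question.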
  • A confidence interval and the probability that the actual number is above/below some arbitrary number are two different problems. Which are you trying to get? Look up standard deviation vs standard error. – hmhensen Feb 10 '19 at 03:02
  • @hmhensen I am trying to get the probability above or below some arbitrary number. In other words, I need the standard deviations of the forecasting errors. The math is relatively easy. What I am having trouble with is getting all the forecasting errors into a data frame from all the samples that are in the `tibble` structure with classes `"rolling_origin" "rset" "tbl_df" "tbl" "data.frame"`. – T-T Feb 11 '19 at 15:12
  • Sorry, been busy. I tried to replicate your code but `predict_keras_lstm(split, epochs = 10)` only produced `NA` for me. Was that supposed to happen? – hmhensen Feb 15 '19 at 04:37
  • @hmhensen No worries. Did you `install_keras()`? That line of code generates predictions for each of the 11 samples. If you go to the LSTM article, the very first plot, named Keras Stateful LSTM: Backtested Predictions, is the visualization of the results for this code. – T-T Feb 15 '19 at 21:35
