Load packages and data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
df <- readr::read_rds("df.rds")
step_2_b_df <- readr::read_rds("step_2_b_df.rds")
step_2_a_df <- readr::read_rds("step_2_a_df.rds")
iii_models <- readr::read_rds("iii_models.rds")
iv_models <- readr::read_rds("iv_models.rds")

Overview

  1. Exploration
  2. Regression models
  3. Binary classification option b
  4. Binary classification option a
  5. Interpretation and optimization

This R Markdown file tackles part v: evaluating the results from parts iii and iv, identifying the most important variables in the best performing models, visualizing the probability of failure as a function of those variables, and determining, from the visualizations, the input settings that minimize the failure probability.

General

Evaluating response_1 as a feature

ROC

First, combine the resampling results from the two best performing models from each of parts iii and iv so they can be compared on AUC.

results <- resamples(list(iii_rf = iii_models$iii_rf,
                          iii_xgb = iii_models$iii_xgb,
                          iv_rf = iv_models$iv_rf,
                          iv_xgb = iv_models$iv_xgb))

Then we visually compare the performance metrics.

dotplot(results)

dotplot(results, metric = "ROC")

dotplot(results, metric = "Sens")

dotplot(results, metric = "Spec")

Based on AUC, the models that include response_1 as a feature yield better performance in predicting outcome_2, as shown by the higher ROC values for both iii_rf and iii_xgb.
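The same comparison can be read off numerically; as a minimal check (output not shown here), summary() prints the resampled distribution of every metric for each model, including ROC.

# Resampled metric distributions (min / quartiles / mean / max) per model
summary(results)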

Accuracy

calc_accuracy <- function(model) {
  # confusionMatrix.train() averages the resampled confusion matrices and,
  # by default, reports the cells as percentages of the data
  cf <- confusionMatrix.train(model)
  
  # accuracy = sum of the diagonal cells; divide by 100 to convert percent to proportion
  return( (cf$table[1,1] + cf$table[2,2]) / 100 )
}

models <- list(iii_rf = iii_models$iii_rf,
               iii_xgb = iii_models$iii_xgb,
               iv_rf = iv_models$iv_rf,
               iv_xgb = iv_models$iv_xgb)

accuracy_results <- purrr::map_dbl(models, calc_accuracy)

accuracy_results %>% sort(decreasing = TRUE)
##     iv_rf   iii_xgb    iii_rf    iv_xgb 
## 0.7335320 0.7318430 0.7303527 0.7223050

Based on accuracy, excluding response_1 as a feature yields slightly better performance for the random forest (iv_rf edges out iii_rf), but slightly worse performance for xgboost (iv_xgb trails iii_xgb), so the accuracy comparison is less clear-cut than the AUC comparison.
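As a small sketch (not part of the original analysis), the same accuracies can be laid out side by side, pairing each model type with and without response_1:

# Reshape the named accuracy vector into a model-type by part table
tibble::enframe(accuracy_results, name = "model", value = "accuracy") %>% 
  tidyr::separate(model, into = c("part", "type"), sep = "_") %>% 
  tidyr::pivot_wider(names_from = part, values_from = accuracy)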

Variable importance

Models with response_1 as a feature

Plot variable importance based on the rf model.

plot(varImp(iii_models$iii_rf))

Plot variable importance based on xgb model.

plot(varImp(iii_models$iii_xgb))

x07, x08 and response_1 seem to be the three most important inputs.
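As a quick cross-check (a sketch, assuming caret's usual "Overall" importance column), the same ranking can be pulled out numerically:

# Top three inputs by scaled importance from the xgb model
varImp(iii_models$iii_xgb)$importance %>% 
  tibble::rownames_to_column("variable") %>% 
  arrange(desc(Overall)) %>% 
  head(3)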

Models without response_1 as a feature

Plot variable importance based on the rf model.

plot(varImp(iv_models$iv_rf))

Plot variable importance based on xgb model.

plot(varImp(iv_models$iv_xgb))

x07 and x08 seem to be the two most important inputs.

Summary

In general, inputs x07 and x08 seem to be the most important variables. response_1 is also an important variable for the models that include it as a feature.

Partial dependence - visualizing Fail probability against the most important variables

library(pdp)
## 
## Attaching package: 'pdp'
## The following object is masked from 'package:purrr':
## 
##     partial
# Custom prediction function wrapper
# pdp_pred <- function(object, newdata)  {
#   results <- mean(as.vector(predict(object, newdata)))
#   return(results)
# }
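# A working version of that wrapper (a sketch, not used below): for a caret
# classification model, predict(..., type = "prob") returns class
# probabilities, so averaging the Fail column gives the partial dependence
# value. The "Fail" column name is assumed from the outcome coding.
# pdp_pred <- function(object, newdata) {
#   mean(predict(object, newdata = newdata, type = "prob")[["Fail"]])
# }
# It would be passed to partial() via the pred.fun argument.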

# Compute partial dependence values
pd_values_x07 <- partial(
  iii_models$iii_rf,
  train = step_2_b_df, 
  pred.var = "x07"
)

head(pd_values_x07)  # take a peek
##        x07       yhat
## 1 140.3138 0.09749985
## 2 140.4843 0.09718262
## 3 140.6547 0.08530345
## 4 140.8251 0.08752993
## 5 140.9955 0.18662981
## 6 141.1660 0.20324699
# Partial dependence plot
autoplot(pd_values_x07)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.

# Compute partial dependence values
pd_values_x08 <- partial(
  iii_models$iii_rf,
  train = step_2_b_df, 
  pred.var = "x08"
)

head(pd_values_x08)  # take a peek
##        x08      yhat
## 1 74.03725 0.1086069
## 2 74.21489 0.1235168
## 3 74.39253 0.1235168
## 4 74.57017 0.1264428
## 5 74.74781 0.1307514
## 6 74.92545 0.1931101
# Partial dependence plot
autoplot(pd_values_x08)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.

Input settings to minimize Fail probability

Find the settings of x07 and x08 that minimize the Fail probability, based on the partial dependence calculations.

pd_values_x07 %>% as_tibble() %>% filter(yhat == min(yhat)) %>% select(x07)
## # A tibble: 1 x 1
##     x07
##   <dbl>
## 1  144.
pd_values_x08 %>% as_tibble() %>% filter(yhat == min(yhat)) %>% select(x08)
## # A tibble: 1 x 1
##     x08
##   <dbl>
## 1  78.3
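
The two minima above treat x07 and x08 one at a time. A natural extension (a sketch only, not run here, so no output is shown) would compute a two-way partial dependence surface and take the grid point with the lowest predicted Fail probability:

# Joint partial dependence over x07 and x08 (same model and training data)
pd_values_x07_x08 <- partial(
  iii_models$iii_rf,
  train = step_2_b_df,
  pred.var = c("x07", "x08")
)

# Grid point with the lowest partial dependence value
pd_values_x07_x08 %>% 
  as_tibble() %>% 
  slice_min(yhat, n = 1)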