20  Adjusted R2 vs. Root Mean Squared Error - formulas on resamples

20.1 Setup

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  select(-seqn)

glimpse(diab_pop)
Rows: 5,719
Columns: 9
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi   <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu   <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…

20.1.1 Let’s try to predict lbxglu:

df <- diab_pop %>% 
  na.omit()

my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

df_as_nums <- df %>%
  mutate_at(vars(all_of(my_factor_vars)), as.integer) %>%  # factor levels -> integer codes
  mutate_at(vars(all_of(my_factor_vars)), as.factor)       # back to factors, now with numeric labels
glimpse(df_as_nums)
Rows: 1,876
Columns: 9
$ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2…
$ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4…
$ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1…
$ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …

20.1.2 preProcess

pP <- preProcess(df_as_nums, method = c('center','scale')) 

df_as_nums <- predict(pP,df_as_nums) 

glimpse(df_as_nums)
Rows: 1,876
Columns: 9
$ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1…
$ ridageyr <dbl> 0.15617810, 1.59749118, 1.25157604, -0.30504208, 0.96331342, …
$ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2…
$ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4…
$ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1…
$ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4…
$ bmxbmi   <dbl> 0.20545760, -0.08208648, -0.11084088, -0.75781505, 2.06011687…
$ diq010   <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1…
$ lbxglu   <dbl> -0.30006288, -0.70905532, -0.15571260, -0.70905532, 0.3976301…
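preProcess only transforms the numeric columns (ridageyr, bmxbmi, and lbxglu here); the factor columns pass through predict() untouched. A minimal sketch of what 'center' and 'scale' do to each numeric column:

center_scale <- function(x) (x - mean(x)) / sd(x)  # the z-score, e.g. center_scale(df$lbxglu)

Next, dummyVars() expands each factor into indicator columns: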
dV.df <- dummyVars( ~ . , 
                   data = df_as_nums, 
                   fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) 



features <- colnames(df_dV)[!colnames(df_dV) %in% c('seqn' , 'lbxglu')]
target <- 'lbxglu'
length(features)
[1] 28

We have 28 features.
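Those 28 columns are the 2 numeric predictors (ridageyr and bmxbmi) plus one dummy column per non-reference factor level, since fullRank=TRUE drops each factor's first level. A quick sanity check (a sketch):

n_levels <- df_as_nums %>% select_if(is.factor) %>% map_int(nlevels)
sum(n_levels - 1) + 2  # 26 dummy columns + 2 numeric predictors = 28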

20.1.3 test sampling function

sample_features <- sample(features, 4, replace = FALSE)

current_formula <- paste0('lbxglu ~ ', paste0(sample_features, collapse = " + "))

as.formula(current_formula)
lbxglu ~ dmdeduc2.3 + dmdmartl.6 + indhhin2.11 + dmdeduc2.2

20.2 make_model_order_num

This function returns a formula with the requested number of features, selected at random:

make_model_order_num <- function(num_features){
  
  set.seed(NULL)  # re-seed the RNG so each call draws a fresh random sample
  
  sample_features <- sample(features, num_features, replace = FALSE)

  current_formula <- paste0('lbxglu ~ ', paste0(sample_features, collapse = " + "))

  return(as.formula(current_formula))
}

20.2.1 test

make_model_order_num(3)
lbxglu ~ dmdmartl.3 + riagendr.2 + diq010.2
<environment: 0x0000023b2120d008>
make_model_order_num(3)
lbxglu ~ dmdmartl.3 + dmdeduc2.2 + ridreth1.3
<environment: 0x0000023b22296880>
make_model_order_num(6)
lbxglu ~ dmdeduc2.3 + indhhin2.6 + dmdmartl.5 + dmdmartl.4 + 
    indhhin2.8 + indhhin2.2
<environment: 0x0000023b232527d0>

20.3 df_model

This is a data frame to hold the model options; model_id doubles as the number of features each model will draw, keyed by fold name:

df_model <- tribble(
    ~model_name, ~model_id,
#    "zero", make_model(0)
    "Fold1", 1,
    "Fold2", 2,
    "Fold3", 3,
    "Fold4", 4,
    "Fold5", 5,
    "Fold6", 6,
    "Fold7", 7,
    "Fold8", 8,
  )

df_model
# A tibble: 8 × 2
  model_name model_id
  <chr>         <dbl>
1 Fold1             1
2 Fold2             2
3 Fold3             3
4 Fold4             4
5 Fold5             5
6 Fold6             6
7 Fold7             7
8 Fold8             8

20.4 Split data

library(rsample)

train_test <- initial_split(df_dV, prop = .6)  # 60% train / 40% test
TRAIN <- training(train_test)
TEST <- testing(train_test)

TRAIN.v_fold <- vfold_cv(TRAIN, v = 8, 
                         repeats = 321)  # 321 repeats of 8-fold CV

glimpse(TRAIN.v_fold)
Rows: 2,568
Columns: 3
$ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x 14…
$ id     <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat001"…
$ id2    <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7", …
TRAIN.v_fold %>% select(id2) %>% distinct()
# A tibble: 8 × 1
  id2  
  <chr>
1 Fold1
2 Fold2
3 Fold3
4 Fold4
5 Fold5
6 Fold6
7 Fold7
8 Fold8
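Every repeat contributes one split per fold, which accounts for the row count of TRAIN.v_fold (a quick sanity check, as a sketch):

nrow(TRAIN.v_fold) == 321 * 8  # TRUE: 2,568 resamples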

20.4.1 Let’s take a look at the 6th fold

TRAIN.v_fold %>%
  filter(id2=='Fold6') %>%
  glimpse()
Rows: 321
Columns: 3
$ splits <list> [<vfold_split[985 x 140 x 1125 x 29]>], [<vfold_split[985 x 14…
$ id     <chr> "Repeat001", "Repeat002", "Repeat003", "Repeat004", "Repeat005"…
$ id2    <chr> "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", "Fold6", …

20.4.1.1 This is the TRAINing data from the 56th repeat of the 6th Fold

analysis() returns the in-sample portion of the split: 985 of the 1,125 TRAIN rows.

TRAIN.56.6 <- (TRAIN.v_fold %>%
  filter(id2=='Fold6'))$splits[[56]] %>% 
  analysis()

TRAIN.56.6
# A tibble: 985 × 29
   riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
        <dbl>    <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
 1          0  -1.52            1          0          0          0          0
 2          1  -1.57            0          1          0          0          0
 3          1  -0.882           1          0          0          0          0
 4          1   0.675           0          0          0          0          0
 5          1   1.54            0          0          0          0          0
 6          1  -0.0168          1          0          0          0          0
 7          1  -1.17            0          0          0          0          0
 8          1   1.60            0          1          0          0          0
 9          1  -0.593           0          0          0          0          0
10          1  -0.363           0          1          0          0          0
# ℹ 975 more rows
# ℹ 22 more variables: dmdeduc2.3 <dbl>, dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>,
#   dmdmartl.2 <dbl>, dmdmartl.3 <dbl>, dmdmartl.4 <dbl>, dmdmartl.5 <dbl>,
#   dmdmartl.6 <dbl>, indhhin2.2 <dbl>, indhhin2.3 <dbl>, indhhin2.4 <dbl>,
#   indhhin2.5 <dbl>, indhhin2.6 <dbl>, indhhin2.8 <dbl>, indhhin2.10 <dbl>,
#   indhhin2.11 <dbl>, indhhin2.12 <dbl>, indhhin2.13 <dbl>, indhhin2.14 <dbl>,
#   bmxbmi <dbl>, diq010.2 <dbl>, lbxglu <dbl>

20.4.1.2 This is the TESTing data from the 56th repeat of the 6th Fold

assessment() returns the held-out portion of the same split: the remaining 140 rows.

TEST.56.6 <- (TRAIN.v_fold %>%
  filter(id2=='Fold6'))$splits[[56]] %>% 
  assessment()

TEST.56.6
# A tibble: 140 × 29
   riagendr.2 ridageyr ridreth1.2 ridreth1.3 ridreth1.4 ridreth1.5 dmdeduc2.2
        <dbl>    <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
 1          1  -1.17            0          0          1          0          0
 2          1   0.271           0          0          1          0          0
 3          0   1.02            0          0          0          0          0
 4          0   0.906           0          0          1          0          0
 5          1  -0.0168          0          1          0          0          0
 6          0  -1.52            0          1          0          0          0
 7          1   0.387           0          0          0          0          1
 8          0   1.08            0          0          0          1          0
 9          0  -1.46            0          0          0          0          0
10          1  -0.420           0          0          0          1          0
# ℹ 130 more rows
# ℹ 22 more variables: dmdeduc2.3 <dbl>, dmdeduc2.4 <dbl>, dmdeduc2.5 <dbl>,
#   dmdmartl.2 <dbl>, dmdmartl.3 <dbl>, dmdmartl.4 <dbl>, dmdmartl.5 <dbl>,
#   dmdmartl.6 <dbl>, indhhin2.2 <dbl>, indhhin2.3 <dbl>, indhhin2.4 <dbl>,
#   indhhin2.5 <dbl>, indhhin2.6 <dbl>, indhhin2.8 <dbl>, indhhin2.10 <dbl>,
#   indhhin2.11 <dbl>, indhhin2.12 <dbl>, indhhin2.13 <dbl>, indhhin2.14 <dbl>,
#   bmxbmi <dbl>, diq010.2 <dbl>, lbxglu <dbl>

20.5 Join df_model to folds

We join the model grid to the folds by fold name, so every resample row carries the model_id (the number of features) it will be fit with:

glimpse(TRAIN.v_fold)
Rows: 2,568
Columns: 3
$ splits <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x 14…
$ id     <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat001"…
$ id2    <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7", …
glimpse(df_model)
Rows: 8
Columns: 2
$ model_name <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold…
$ model_id   <dbl> 1, 2, 3, 4, 5, 6, 7, 8
df_model <- TRAIN.v_fold %>% 
  left_join(df_model, by = c('id2'="model_name")) 

glimpse(df_model)
Rows: 2,568
Columns: 4
$ splits   <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x …
$ id       <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00…
$ id2      <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"…
$ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5…

20.6 Adjusted R2

The lm_model function below will take in:

  • an rsample split, from which it extracts the analysis() (training) data set
  • a formula that we will pass in later via ...

From those inputs it returns only the Adjusted R2, a single numeric value.
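For reference, adjusted R2 penalizes R2 for the number of predictors p given n observations. A minimal sketch of the formula summary() applies:

# Adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
adj_r2 <- function(R2, n, p) 1 - (1 - R2) * (n - 1) / (n - p - 1)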

lm_model <- function(data, ...){
  
  # fit on the analysis (training) portion of the split
  LM <- lm(..., data = analysis(data))
  # keep only the adjusted R-squared
  R2 <- summary(LM)$adj.r.squared 
  return(R2)
}
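A quick spot check on a single resample (a sketch; the value will vary, because the formula is drawn at random):

split_56_6 <- (TRAIN.v_fold %>% filter(id2 == 'Fold6'))$splits[[56]]
lm_model(split_56_6, make_model_order_num(6))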

20.6.1 Get Estimates

The purrr library in R is mysterious and powerful; here, map2_dbl is going to return a numeric value, a double, from each computation.

If you run ?map2_dbl it will display:

map2_dbl(.x, .y, .f, ...)

In the context of our current status:

  • df_model$splits gives us a list of data, the .x
  • map() is a function that returns a list, the .y
    • map maps the values of df_model$model_id into the function make_model_order_num
  • lm_model is the function .f
    • this function takes in two values, data and ...
  • The value returned for each row will be the Adj_R2 from lm_model

We will now run all the models and store the results in Adj_R2:

tic <- Sys.time()

df_model$Adj_R2 <- map2_dbl(
  df_model$splits,
  map(df_model$model_id, make_model_order_num),
  lm_model
)

toc <- Sys.time()

print(paste0("Adj R2 estimates in ", round(toc - tic , 4 ) , " seconds " ))
[1] "Adj R2 estimates in 3.2142 seconds "

We just ran a bunch of models and computed an Adjusted R2 for each of them:

glimpse(df_model)
Rows: 2,568
Columns: 5
$ splits   <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x …
$ id       <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00…
$ id2      <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"…
$ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5…
$ Adj_R2   <dbl> 0.3107468825, 0.0133126990, 0.0091401534, 0.0141579446, 0.295…

20.6.1.1 Display Results

df_model %>%
  ggplot(aes(x=id, 
         y=Adj_R2,
         fill = Adj_R2)) +
  geom_bar(stat = 'identity') +
  scale_fill_gradient(low = "yellow", high = "red", na.value = NA) +
  coord_flip() +
  facet_wrap( ~ model_id) 

20.7 RMSE

To compute RMSE we need to know how the model performs on its holdout set.
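RMSE is the square root of the mean squared prediction error on the holdout rows. A minimal sketch of what yardstick::rmse() computes:

rmse_manual <- function(truth, estimate) sqrt(mean((truth - estimate)^2))

The holdout_results function below fits on the analysis() rows and scores on the assessment() rows: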

holdout_results <- function(splits, ...) {
  
  # fit on the analysis (training) portion of the split
  mod <- lm(..., data = analysis(splits))
  
  # score on the assessment (holdout) portion
  holdout <- assessment(splits)
  
  holdout$estimate <- predict(mod, holdout)
  
  yardstick::rmse(holdout,
                  truth = lbxglu, estimate)$.estimate
}
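The same spot check as before, now scored on the holdout rows (a sketch; again the value varies with the random formula):

holdout_results(split_56_6, make_model_order_num(6))  # split_56_6 from the earlier spot check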

20.7.1 Compute Errors

Note that make_model_order_num draws a fresh random formula on every call, so these RMSE values come from newly drawn models of each size, not from the exact formulas scored for Adj_R2 above (see the sketch at the end of the chapter for how to reuse one set of formulas for both metrics).

tic <- Sys.time()

df_model$RMSE <- map2_dbl(
  df_model$splits,
  map(df_model$model_id, make_model_order_num),
  holdout_results
)

toc <- Sys.time()

print(paste0("RMSE estimates in ", round(toc - tic , 4 ) , " seconds " ))
[1] "RMSE estimates in 8.434 seconds "

20.8 Compare Results

glimpse(df_model)
Rows: 2,568
Columns: 6
$ splits   <list> [<vfold_split[984 x 141 x 1125 x 29]>], [<vfold_split[984 x …
$ id       <chr> "Repeat001", "Repeat001", "Repeat001", "Repeat001", "Repeat00…
$ id2      <chr> "Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Fold6", "Fold7"…
$ model_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5…
$ Adj_R2   <dbl> 0.3107468825, 0.0133126990, 0.0091401534, 0.0141579446, 0.295…
$ RMSE     <dbl> 0.9253085, 0.9474441, 0.8968000, 1.4262460, 1.1056944, 0.5020…

20.8.1 Graphs

While models with a higher number of features do show larger Adjusted R2 values, there appears to be little correlation between Adjusted R2 and holdout performance:

df_model %>%
  ggplot(aes(x=Adj_R2,
             y=RMSE,
             color=id)) +
  geom_point() +
  facet_wrap(~model_id) + 
  theme(legend.position = "none")

# Normalize RMSE and R2, remove outliers
COR <- df_model %>%
  mutate_at(vars(Adj_R2, RMSE), ~ as.numeric(scale(.))) %>%  # flatten scale()'s 1-column matrix
  filter(abs(Adj_R2) < 2) %>%
  filter(abs(RMSE) < 2)
COR %>%
  ggplot(aes(x=Adj_R2,
             y=RMSE,
             color=id)) +
  geom_point() +
  facet_wrap(~model_id) + 
  theme(legend.position = "none")

cor.test(COR$Adj_R2, COR$RMSE, method=c("pearson"))

    Pearson's product-moment correlation

data:  COR$Adj_R2 and COR$RMSE
t = 1.2373, df = 2067, p-value = 0.2161
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.01590667  0.07021708
sample estimates:
       cor 
0.02720569 
t.test(COR$Adj_R2, COR$RMSE,paired=TRUE)

    Paired t-test

data:  COR$Adj_R2 and COR$RMSE
t = -18.599, df = 2068, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -0.4275935 -0.3460223
sample estimates:
mean difference 
     -0.3868079 

20.9 Top 5 Formulas

20.9.1 RMSE

(df_model %>%
  arrange(RMSE) %>%
  filter(row_number() < 5))$RMSE  # note: < 5 keeps only the 4 lowest; use <= 5 for a true top 5
[1] 0.4716807 0.4891897 0.5019929 0.5020018
# Top_5_RMSE_formulas <- (df_model %>%
#   arrange(RMSE) %>%
#   filter(row_number() < 5))$model_creator
# 
# Top_5_RMSE_formulas

20.9.2 Adj_R2

(df_model %>%
  arrange(Adj_R2) %>%  # ascending, so these are the *lowest* Adj_R2 values
  filter(row_number() < 5))$Adj_R2  # use arrange(desc(Adj_R2)) and <= 5 for the top 5
[1] -0.003155760 -0.003132371 -0.002973281 -0.002399214
# Top_5_AdjR2_formulas <- (df_model %>%
#   arrange(Adj_R2) %>%
#   filter(row_number() < 5))$model_creator
# 
# Top_5_AdjR2_formulas
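The commented code above expects a model_creator column that this version of df_model never stored, which is also why each row's Adj_R2 and RMSE came from different random formulas. A sketch of the fix: generate each formula once, keep it in a column, and reuse it for both metrics, so the winning formulas can be recovered afterward:

df_model$model_creator <- map(df_model$model_id, make_model_order_num)

df_model$Adj_R2 <- map2_dbl(df_model$splits, df_model$model_creator, lm_model)
df_model$RMSE   <- map2_dbl(df_model$splits, df_model$model_creator, holdout_results)

Top_5_RMSE_formulas <- (df_model %>%
  arrange(RMSE) %>%
  filter(row_number() <= 5))$model_creator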