16  Recursive Feature Elimination

16.1 Setup

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the NHANES diabetes-population extract.
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# here::here() so the script is portable. TODO confirm project layout.
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

# Quick structural look: 5,719 rows x 10 columns per the printed output below.
glimpse(diab_pop)
Rows: 5,719
Columns: 10
$ seqn     <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi   <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu   <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…

16.1.1 Let’s try to predict lbxglu:

# Keep complete cases only (1,876 rows remain, per the glimpse output below).
df <- diab_pop %>%
  na.omit()

# Names of the factor columns; these are re-coded next.
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

# Re-code each factor's labels to its underlying integer level and back to a
# factor, so levels are plain numbers (convenient for dummy coding later).
# `across(all_of(...))` replaces the superseded mutate_at() and the deprecated
# bare-external-vector selection that produced the tidyselect warning here.
df_as_nums <- df %>%
  mutate(across(all_of(my_factor_vars), ~ as.factor(as.integer(.x))))
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(my_factor_vars)

  # Now:
  data %>% select(all_of(my_factor_vars))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Confirm the re-coding: factor columns now have numeric level labels.
glimpse(df_as_nums)
Rows: 1,876
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2…
$ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4…
$ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1…
$ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …
Loading required package: lattice

Attaching package: 'caret'
The following object is masked from 'package:purrr':

    lift
# dummyVars() comes from caret; load it explicitly rather than relying on an
# earlier hidden chunk (the attach messages above show it was loaded there).
library(caret)

# One-hot encode the predictors. fullRank = TRUE drops one level per factor
# to avoid perfect collinearity in the linear model.
dV.df <- dummyVars(lbxglu ~ ., data = df_as_nums, fullRank = TRUE)

# Apply the encoding, then re-attach the numeric outcome column.
df_dV <- as_tibble(predict(dV.df, df_as_nums))
df_dV$lbxglu <- df$lbxglu

16.2 Split data

library(rsample)

# Fix the RNG state so the 60/40 split is reproducible.
set.seed(8675309)

# Partition the dummy-coded data: 60% training, 40% held out for testing.
train_test <- initial_split(df_dV, prop = 0.6)
TRAIN <- training(train_test)
TEST  <- testing(train_test)

16.3 Create features and outcome

# Predictor table (everything except the outcome) for rfe().
X <- TRAIN %>% select(-lbxglu)

# Outcome vector: fasting glucose.
y <- TRAIN %>% pull(lbxglu)

16.4 The rfeControl function

# RFE control: linear-model helper functions (lmFuncs) evaluated with
# 10-fold cross-validation repeated 10 times (100 resamples per subset size).
ctrl <- rfeControl(functions = lmFuncs,
                   method = "repeatedcv",
                   number = 10,
                   repeats = 10,
                   verbose = FALSE)

16.5 The rfe function

# Run recursive feature elimination over every subset size 1..ncol(X).
# seq_len() replaces the redundant c(1:ncol(X)) — same vector here, but the
# idiomatic form and safe if X ever had zero columns.
lmProfile <- rfe(X, y,
                 sizes = seq_len(ncol(X)),
                 rfeControl = ctrl)

16.6 Results

# Print the resampling performance table and the selected subset size.
lmProfile

Recursive feature selection

Outer resampling method: Cross-Validated (10 fold, repeated 10 times) 

Resampling performance over subset size:

 Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
         1 35.25   0.2882 18.40  7.704    0.09247 2.294        *
         2 35.29   0.2864 18.44  7.668    0.08825 2.290         
         3 35.35   0.2843 18.50  7.589    0.08825 2.280         
         4 35.35   0.2843 18.58  7.556    0.08648 2.284         
         5 35.39   0.2826 18.64  7.538    0.08605 2.291         
         6 35.42   0.2811 18.70  7.531    0.08555 2.298         
         7 35.43   0.2811 18.79  7.526    0.08538 2.302         
         8 35.44   0.2806 18.85  7.519    0.08470 2.294         
         9 35.45   0.2807 18.89  7.523    0.08389 2.306         
        10 35.51   0.2788 18.98  7.490    0.08430 2.293         
        11 35.54   0.2771 19.05  7.488    0.08375 2.314         
        12 35.57   0.2762 19.12  7.476    0.08272 2.298         
        13 35.61   0.2747 19.19  7.455    0.08243 2.277         
        14 35.59   0.2755 19.21  7.468    0.08227 2.278         
        15 35.56   0.2771 19.22  7.495    0.08284 2.286         
        16 35.54   0.2778 19.20  7.504    0.08307 2.278         
        17 35.56   0.2776 19.22  7.482    0.08331 2.274         
        18 35.55   0.2780 19.23  7.469    0.08342 2.242         
        19 35.56   0.2777 19.23  7.466    0.08283 2.250         
        20 35.56   0.2776 19.25  7.478    0.08337 2.263         
        21 35.55   0.2778 19.26  7.462    0.08347 2.262         
        22 35.53   0.2786 19.26  7.464    0.08395 2.267         
        23 35.52   0.2789 19.25  7.449    0.08349 2.254         
        24 35.52   0.2789 19.26  7.441    0.08364 2.248         
        25 35.53   0.2783 19.26  7.459    0.08333 2.242         
        26 35.54   0.2783 19.22  7.486    0.08314 2.245         
        27 35.52   0.2791 19.16  7.502    0.08287 2.240         
        28 35.52   0.2790 19.08  7.531    0.08221 2.237         
        29 35.55   0.2781 19.09  7.540    0.08227 2.231         

The top 1 variables (out of 1):
   diq010.2
# RMSE profile across subset sizes; "g" adds a grid, "o" overplots points+lines.
plot(lmProfile, type = c("g", "o"))

# Variables retained in the winning (1-variable) subset.
lmProfile$optVariables
[1] "diq010.2"
# The final lm refit on the selected variables.
lmProfile$fit

Call:
lm(formula = y ~ ., data = tmp)

Coefficients:
(Intercept)     diq010.2  
     163.85       -59.56  
# Score the held-out set with the winning model.
y_hat <- predict(lmProfile$fit, TEST)

# Attach predictions alongside the observed values.
TEST.scored <- cbind(TEST, y_hat)

# Held-out RMSE; naming truth/estimate makes the column roles explicit.
yardstick::rmse(TEST.scored, truth = lbxglu, estimate = y_hat)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        32.5