16  Recursive Feature Elimination

16.1 Setup

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the NHANES diabetes-population extract.
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# here::here() so the script is portable. TODO confirm project layout.
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

# Quick structural look: 5,719 rows x 10 columns per the printed output below.
glimpse(diab_pop)
Rows: 5,719
Columns: 10
$ seqn     <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi   <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu   <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…

16.1.1 Let’s try to predict lbxglu:

# Keep complete cases only (1,876 rows remain, per the glimpse output below).
df <- diab_pop %>%
  na.omit()

# Names of the factor columns; these are re-coded next.
my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

# Re-code each factor's labels to its underlying integer level and back to a
# factor, so levels are plain numbers (convenient for dummy coding later).
# `across(all_of(...))` replaces the superseded mutate_at() and the deprecated
# bare-external-vector selection that produced the tidyselect warning here.
df_as_nums <- df %>%
  mutate(across(all_of(my_factor_vars), ~ as.factor(as.integer(.x))))
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(my_factor_vars)

  # Now:
  data %>% select(all_of(my_factor_vars))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Confirm the re-coding: factor columns now have numeric level labels.
glimpse(df_as_nums)
Rows: 1,876
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2…
$ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4…
$ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1…
$ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …
Loading required package: lattice

Attaching package: 'caret'
The following object is masked from 'package:purrr':

    lift
# dummyVars() comes from caret; load it explicitly rather than relying on an
# earlier hidden chunk (the attach messages above show it was loaded there).
library(caret)

# One-hot encode the predictors. fullRank = TRUE drops one level per factor
# to avoid perfect collinearity in the linear model.
dV.df <- dummyVars(lbxglu ~ ., data = df_as_nums, fullRank = TRUE)

# Apply the encoding, then re-attach the numeric outcome column.
df_dV <- as_tibble(predict(dV.df, df_as_nums))
df_dV$lbxglu <- df$lbxglu

16.2 Split data

library(rsample)

# Fix the RNG state so the 60/40 split is reproducible.
set.seed(8675309)

# Partition the dummy-coded data: 60% training, 40% held out for testing.
train_test <- initial_split(df_dV, prop = 0.6)
TRAIN <- training(train_test)
TEST  <- testing(train_test)

16.3 Create features and outcome

# Predictor table (everything except the outcome) for rfe().
X <- TRAIN %>% select(-lbxglu)

# Outcome vector: fasting glucose.
y <- TRAIN %>% pull(lbxglu)

16.4 The rfeControl function

# RFE control: linear-model helper functions (lmFuncs) evaluated with
# 10-fold cross-validation repeated 10 times (100 resamples per subset size).
ctrl <- rfeControl(functions = lmFuncs,
                   method = "repeatedcv",
                   number = 10,
                   repeats = 10,
                   verbose = FALSE)

16.5 The rfe function

# Run recursive feature elimination over every subset size 1..ncol(X).
# seq_len() replaces the redundant c(1:ncol(X)) — same vector here, but the
# idiomatic form and safe if X ever had zero columns.
lmProfile <- rfe(X, y,
                 sizes = seq_len(ncol(X)),
                 rfeControl = ctrl)

16.6 Results

# Print the resampling performance table and the selected subset size.
lmProfile

Recursive feature selection

Outer resampling method: Cross-Validated (10 fold, repeated 10 times) 

Resampling performance over subset size:

 Variables  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
         1 35.25   0.2882 18.40  7.704    0.09247 2.294        *
         2 35.29   0.2864 18.44  7.668    0.08825 2.290         
         3 35.35   0.2843 18.50  7.589    0.08825 2.280         
         4 35.35   0.2843 18.58  7.556    0.08648 2.284         
         5 35.39   0.2826 18.64  7.538    0.08605 2.291         
         6 35.42   0.2811 18.70  7.531    0.08555 2.298         
         7 35.43   0.2811 18.79  7.526    0.08538 2.302         
         8 35.44   0.2806 18.85  7.519    0.08470 2.294         
         9 35.45   0.2807 18.89  7.523    0.08389 2.306         
        10 35.51   0.2788 18.98  7.490    0.08430 2.293         
        11 35.54   0.2771 19.05  7.488    0.08375 2.314         
        12 35.57   0.2762 19.12  7.476    0.08272 2.298         
        13 35.61   0.2747 19.19  7.455    0.08243 2.277         
        14 35.59   0.2755 19.21  7.468    0.08227 2.278         
        15 35.56   0.2771 19.22  7.495    0.08284 2.286         
        16 35.54   0.2778 19.20  7.504    0.08307 2.278         
        17 35.56   0.2776 19.22  7.482    0.08331 2.274         
        18 35.55   0.2780 19.23  7.469    0.08342 2.242         
        19 35.56   0.2777 19.23  7.466    0.08283 2.250         
        20 35.56   0.2776 19.25  7.478    0.08337 2.263         
        21 35.55   0.2778 19.26  7.462    0.08347 2.262         
        22 35.53   0.2786 19.26  7.464    0.08395 2.267         
        23 35.52   0.2789 19.25  7.449    0.08349 2.254         
        24 35.52   0.2789 19.26  7.441    0.08364 2.248         
        25 35.53   0.2783 19.26  7.459    0.08333 2.242         
        26 35.54   0.2783 19.22  7.486    0.08314 2.245         
        27 35.52   0.2791 19.16  7.502    0.08287 2.240         
        28 35.52   0.2790 19.08  7.531    0.08221 2.237         
        29 35.55   0.2781 19.09  7.540    0.08227 2.231         

The top 1 variables (out of 1):
   diq010.2
# RMSE profile across subset sizes; "g" adds a grid, "o" overplots points+lines.
plot(lmProfile, type = c("g", "o"))

# Variables retained in the winning (1-variable) subset.
lmProfile$optVariables
[1] "diq010.2"
# The final lm refit on the selected variables.
lmProfile$fit

Call:
lm(formula = y ~ ., data = tmp)

Coefficients:
(Intercept)     diq010.2  
     163.85       -59.56  
# Score the held-out set with the winning model.
y_hat <- predict(lmProfile$fit, TEST)

# Attach predictions alongside the observed values.
TEST.scored <- cbind(TEST, y_hat)

# Held-out RMSE; naming truth/estimate makes the column roles explicit.
yardstick::rmse(TEST.scored, truth = lbxglu, estimate = y_hat)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        32.5