15  Selection by Filter

15.1 install_if_not function

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}

install_if_not("gam") 
[1] "the package 'gam' is already installed"

15.2 Read in the data

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

glimpse(diab_pop)
Rows: 5,719
Columns: 10
$ seqn     <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi   <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu   <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…

15.2.1 Let’s try to predict lbxglu:

15.3 omit nas

df <- diab_pop %>% 
  na.omit()

15.4 select factors and transform to numeric

my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

df_as_nums <- df %>%
  mutate_at(vars(my_factor_vars), as.integer) %>%
  mutate_at(vars(my_factor_vars), as.factor)
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(my_factor_vars)

  # Now:
  data %>% select(all_of(my_factor_vars))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
glimpse(df_as_nums)
Rows: 1,876
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2…
$ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4…
$ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1…
$ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …

\(~\)

\(~\)


\(~\)

\(~\)

15.5 Load caret

Loading required package: lattice

Attaching package: 'caret'
The following object is masked from 'package:purrr':

    lift

15.6 dummyVars

dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) 
df_dV$lbxglu <- df$lbxglu 

15.7 split data

set.seed(8675309)

trainIndex <- createDataPartition(df_dV$lbxglu, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)
TRAIN <- df_dV[trainIndex, ]  
TEST <-  df_dV[-trainIndex, ] 

15.8 features versus target

X <- TRAIN %>% select(-lbxglu) 

dim(X)
[1] 1127   29
y <- TRAIN$lbxglu

\(~\)

\(~\)


\(~\)

\(~\)

15.9 sbfControl function

ctrl <- sbfControl(functions = lmSBF,
                   method = "repeatedcv",
                   number = 7,
                   repeats = 5,
                   verbose = FALSE)

15.10 sbf function

lmProfile <- sbf(X, y, 
                 sbfControl  = ctrl)

\(~\)

\(~\)


\(~\)

\(~\)

15.11 Results

lmProfile

Selection By Filter

Outer resampling method: Cross-Validated (7 fold, repeated 5 times) 

Resampling performance:

  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD
 34.44   0.3292 17.85  7.101    0.09527 2.506

Using the training set, 8 variables were selected:
   ridageyr, ridreth1.3, dmdmartl.2, dmdmartl.4, dmdmartl.5...

During resampling, the top 5 selected variables (out of a possible 16):
   bmxbmi (100%), diq010.2 (100%), dmdmartl.5 (100%), ridageyr (100%), ridreth1.3 (100%)

On average, 7.3 variables were selected (min = 5, max = 11)

15.12 Optimal Variables

lmProfile$optVariables
[1] "ridageyr"    "ridreth1.3"  "dmdmartl.2"  "dmdmartl.4"  "dmdmartl.5" 
[6] "indhhin2.14" "bmxbmi"      "diq010.2"   

15.13 Best Fit Model

lmProfile$fit

Call:
lm(formula = y ~ ., data = tmp)

Coefficients:
(Intercept)     ridageyr   ridreth1.3   dmdmartl.2   dmdmartl.4   dmdmartl.5  
  150.57672      0.08535     -2.50795      2.07176      8.96023     -1.90338  
indhhin2.14       bmxbmi     diq010.2  
   -2.63139      0.43097    -62.46384  

15.14 Score Test Data

y_hat <- predict(lmProfile$fit, TEST)

TEST.scored <- cbind(TEST, y_hat)

15.15 yardstick

yardstick::rmse(TEST.scored, lbxglu, y_hat)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        33.9