15 Selection by Filter

15.1 `install_if_not` function

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}

install_if_not("gam")

[1] "the package 'gam' is already installed"

15.2 Read in the data

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

glimpse(diab_pop)

Rows: 5,719
Columns: 10
$ seqn     <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi   <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010   <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu   <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…

15.2.1 Let’s try to predict `lbxglu`:

15.3 omit nas

df <- diab_pop %>% 
  na.omit()

15.4 select factors and transform to numeric

my_factor_vars <- df %>% select_if(is.factor) %>% colnames()

df_as_nums <- df %>%
  mutate_at(vars(my_factor_vars), as.integer) %>%
  mutate_at(vars(my_factor_vars), as.factor)

Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(my_factor_vars)

  # Now:
  data %>% select(all_of(my_factor_vars))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

glimpse(df_as_nums)

Rows: 1,876
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> 3, 3, 1, 5, 2, 4, 2, 5, 1, 3, 3, 2, 4, 3, 2, 3, 4, 1, 1, 4, 2…
$ dmdeduc2 <fct> 3, 3, 2, 2, 5, 5, 1, 5, 1, 5, 1, 4, 3, 4, 1, 5, 4, 1, 3, 3, 4…
$ dmdmartl <fct> 3, 1, 4, 5, 1, 2, 4, 5, 3, 6, 1, 1, 5, 3, 2, 6, 5, 5, 1, 5, 1…
$ indhhin2 <fct> 4, 5, 13, 10, 6, 5, 5, 1, 4, 10, 4, 13, 13, 6, 3, 10, 6, 3, 4…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …

\(~\)

15.5 Load `caret`

library('caret')

Loading required package: lattice


Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

15.6 `dummyVars`

dV.df <- dummyVars(lbxglu ~ . , data = df_as_nums, fullRank=TRUE)

df_dV <- as_tibble(predict(dV.df,df_as_nums)) 
df_dV$lbxglu <- df$lbxglu

15.7 split data

set.seed(8675309)

trainIndex <- createDataPartition(df_dV$lbxglu, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)
TRAIN <- df_dV[trainIndex, ]  
TEST <-  df_dV[-trainIndex, ]

15.8 features versus target

X <- TRAIN %>% select(-lbxglu) 

dim(X)

[1] 1127   29

y <- TRAIN$lbxglu

\(~\)

15.9 `sbfControl` function

ctrl <- sbfControl(functions = lmSBF,
                   method = "repeatedcv",
                   number = 7,
                   repeats = 5,
                   verbose = FALSE)

15.10 `sbf` function

lmProfile <- sbf(X, y, 
                 sbfControl  = ctrl)

\(~\)

15.11 Results

lmProfile


Selection By Filter

Outer resampling method: Cross-Validated (7 fold, repeated 5 times) 

Resampling performance:

  RMSE Rsquared   MAE RMSESD RsquaredSD MAESD
 34.44   0.3292 17.85  7.101    0.09527 2.506

Using the training set, 8 variables were selected:
   ridageyr, ridreth1.3, dmdmartl.2, dmdmartl.4, dmdmartl.5...

During resampling, the top 5 selected variables (out of a possible 16):
   bmxbmi (100%), diq010.2 (100%), dmdmartl.5 (100%), ridageyr (100%), ridreth1.3 (100%)

On average, 7.3 variables were selected (min = 5, max = 11)

15.12 Optimal Variables

lmProfile$optVariables

[1] "ridageyr"    "ridreth1.3"  "dmdmartl.2"  "dmdmartl.4"  "dmdmartl.5" 
[6] "indhhin2.14" "bmxbmi"      "diq010.2"

15.13 Best Fit Model

lmProfile$fit


Call:
lm(formula = y ~ ., data = tmp)

Coefficients:
(Intercept)     ridageyr   ridreth1.3   dmdmartl.2   dmdmartl.4   dmdmartl.5  
  150.57672      0.08535     -2.50795      2.07176      8.96023     -1.90338  
indhhin2.14       bmxbmi     diq010.2  
   -2.63139      0.43097    -62.46384

15.14 Score Test Data

y_hat <- predict(lmProfile$fit, TEST)

TEST.scored <- cbind(TEST, y_hat)

15.15 `yardstick`

yardstick::rmse(TEST.scored, lbxglu, y_hat)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        33.9

15.1 install_if_not function

15.2 Read in the data

15.2.1 Let’s try to predict lbxglu:

15.3 omit nas

15.4 select factors and transform to numeric

15.5 Load caret

15.6 dummyVars

15.7 split data

15.8 features versus target

15.9 sbfControl function

15.10 sbf function

15.11 Results

15.12 Optimal Variables

15.13 Best Fit Model

15.14 Score Test Data

15.15 yardstick

15.1 `install_if_not` function

15.2.1 Let’s try to predict `lbxglu`:

15.5 Load `caret`

15.6 `dummyVars`

15.9 `sbfControl` function

15.10 `sbf` function

15.15 `yardstick`