31 Modeling with PCA - Diab_Pop

31.1 SET UP

library(data.table)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()     masks data.table::between()
✖ dplyr::filter()      masks stats::filter()
✖ dplyr::first()       masks data.table::first()
✖ lubridate::hour()    masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag()         masks stats::lag()
✖ dplyr::last()        masks data.table::last()
✖ lubridate::mday()    masks data.table::mday()
✖ lubridate::minute()  masks data.table::minute()
✖ lubridate::month()   masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second()  masks data.table::second()
✖ purrr::transpose()   masks data.table::transpose()
✖ lubridate::wday()    masks data.table::wday()
✖ lubridate::week()    masks data.table::week()
✖ lubridate::yday()    masks data.table::yday()
✖ lubridate::year()    masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggbiplot)
library('caret')

Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  na.omit() 

numeric_features <- diab_pop %>%
  select_if(is.numeric) %>%
  select(-seqn) %>%
  colnames()

numeric_features

[1] "ridageyr" "bmxbmi"   "lbxglu"

31.2 Split Data

TrainInd <- createDataPartition(diab_pop$diq010,
                                p=.7,
                                list=FALSE)

TRAIN <- diab_pop[TrainInd,]

pP.TRAIN <- preProcess(TRAIN, c('center','scale'))

31.2.1 How many Clusters should we make ?

TRAIN_scaled <- predict(pP.TRAIN, TRAIN)

TRAIN.PCA_Model <- prcomp(TRAIN_scaled %>% select(bmxbmi, lbxglu, ridageyr))

summary(TRAIN.PCA_Model)

Importance of components:
                          PC1    PC2    PC3
Standard deviation     1.1338 0.9858 0.8618
Proportion of Variance 0.4285 0.3239 0.2476
Cumulative Proportion  0.4285 0.7524 1.0000

ggbiplot(TRAIN.PCA_Model)

32 How can we determine the number of principal components?

We aim to find the components with the maximum variance so we can retain as much information about the original dataset as possible.

To determine the number of principal components to retain in our analysis we need to compute the proportion of variance explained.

We can plot the cumulative proportion of variance explained in a scree plot to determine the number of principal components to retain.

# Pull SD
sd.TRAIN <- TRAIN.PCA_Model$sdev

# Compute variance
var.TRAIN <- sd.TRAIN^2

# Proportion of variance explained
prop.TRAIN <- var.TRAIN/sum(var.TRAIN)

# Plot the cumulative proportion of variance explained
plot(cumsum(prop.TRAIN), 
     xlab='Principal components', 
     ylab='Proportion of variance explained', 
     type='b')

TRAIN.PCA <- cbind(TRAIN, predict(TRAIN.PCA_Model, predict(pP.TRAIN, TRAIN)))


glimpse(TRAIN.PCA)

Rows: 1,314
Columns: 13
$ seqn     <dbl> 83734, 83750, 83754, 83757, 83789, 83790, 83799, 83809, 83813…
$ riagendr <fct> Male, Male, Female, Female, Male, Male, Female, Female, Male,…
$ ridageyr <dbl> 78, 45, 67, 57, 66, 56, 37, 20, 24, 80, 29, 39, 35, 40, 54, 7…
$ ridreth1 <fct> Non-Hispanic White, Other, Other Hispanic, Other Hispanic, No…
$ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, College grad or abov…
$ dmdmartl <fct> Married, Never married, Married, Separated, Living with partn…
$ indhhin2 <fct> "$20,000-$24,999", "$65,000-$74,999", "$25,000-$34,999", "$20…
$ bmxbmi   <dbl> 28.8, 24.1, 43.7, 35.4, 34.0, 24.4, 25.5, 26.2, 26.9, 28.5, 2…
$ diq010   <fct> Diabetes, No Diabetes, No Diabetes, Diabetes, No Diabetes, No…
$ lbxglu   <dbl> 84, 84, 130, 398, 113, 397, 100, 94, 105, 119, 102, 101, 97, …
$ PC1      <dbl> 0.39364202, -1.01945675, 1.71575141, 5.60741419, 0.80332766, …
$ PC2      <dbl> -0.8830738811, -0.4475439639, 1.2070666916, 0.1554659373, 0.0…
$ PC3      <dbl> -1.465045765, -0.092975735, -0.945046444, 4.782660350, -0.759…

features <- colnames(TRAIN.PCA)[!colnames(TRAIN.PCA) %in% c('seqn', 'diq010', 'PC3')] 

features

 [1] "riagendr" "ridageyr" "ridreth1" "dmdeduc2" "dmdmartl" "indhhin2"
 [7] "bmxbmi"   "lbxglu"   "PC1"      "PC2"

paste0('diq010 ~ ', paste( features , collapse = " + " ))

[1] "diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu + PC1 + PC2"

formula1 <- as.formula('diq010 ~ PC1 + PC2')

formula2 <- as.formula('diq010 ~ bmxbmi + lbxglu + ridageyr')
  
formula3 <- as.formula(paste0('diq010 ~ ', paste( features , collapse = " + " )))

train_ctrl <- trainControl(method="boot", # type of resampling, in this case bootstrap
                                number = 5, # number of resamplings     
                                search = "random" # we are performing a "random" search
                                )

model_boot_glm_1 <- train(formula1,
                         data = TRAIN.PCA,
                         method = "glm", 
                         trControl = train_ctrl,
                         # options to be passed to glm
                         family = 'binomial',
                         preProcess = c('center','scale')
                         )

model_boot_glm_2 <- train(formula2,
                         data = TRAIN.PCA,
                         method = "glm", 
                         trControl = train_ctrl,
                         # options to be passed to glm
                         family = 'binomial',
                         preProcess = c('center','scale'))

model_boot_glm_3 <- train(formula3,
                         data = TRAIN.PCA,
                         method = "glm", 
                         trControl = train_ctrl,
                         # options to be passed to glm
                         family = 'binomial',
                         preProcess = c('center','scale'))

Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999

Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999

Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999

Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999

Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999

Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999

32.1 Dealing with the Test data

saveRDS(model_boot_glm_1,'model_boot_glm_1.RDS')
saveRDS(pP.TRAIN,'pP.TRAIN.RDS')
saveRDS(TRAIN.PCA_Model,'TRAIN.PCA_Model.RDS')

rm(model_boot_glm_1)
rm(pP.TRAIN)
rm(TRAIN.PCA_Model)

model_boot_glm_1

Error in eval(expr, envir, enclos): object 'model_boot_glm_1' not found

model_boot_glm_1 <- readRDS('model_boot_glm_1.RDS')
pP.TRAIN <- readRDS('pP.TRAIN.RDS')
TRAIN.PCA_Model <- readRDS('TRAIN.PCA_Model.RDS')

TEST <- diab_pop[-TrainInd,]

# this should be right 
str( predict(TRAIN.PCA_Model, predict(pP.TRAIN, TEST)) , 1 )

 num [1:562, 1:3] -0.0455 0.5816 3.5456 -1.4748 0.8069 ...
 - attr(*, "dimnames")=List of 2

str( TEST %>% predict(pP.TRAIN, .) %>% predict(TRAIN.PCA_Model, . ) , 1)

 num [1:562, 1:3] -0.0455 0.5816 3.5456 -1.4748 0.8069 ...
 - attr(*, "dimnames")=List of 2

# this should be wrong 
predict(pP.TRAIN, predict(TRAIN.PCA_Model, TEST))

Error in newdata[, object$method$center, drop = FALSE]: subscript out of bounds

TEST.PCA <- cbind(TEST , predict(TRAIN.PCA_Model, predict(pP.TRAIN, TEST)))

glimpse(TEST.PCA)

Rows: 562
Columns: 13
$ seqn     <dbl> 83733, 83737, 83755, 83761, 83787, 83820, 83822, 83834, 83849…
$ riagendr <fct> Male, Female, Male, Female, Female, Male, Female, Male, Male,…
$ ridageyr <dbl> 53, 72, 67, 24, 68, 70, 20, 69, 71, 37, 49, 41, 80, 38, 80, 2…
$ ridreth1 <fct> Non-Hispanic White, MexicanAmerican, Non-Hispanic Black, Othe…
$ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, College grad or abov…
$ dmdmartl <fct> Divorced, Separated, Widowed, Never married, Divorced, Living…
$ indhhin2 <fct> "$15,000-$19,999", "$75,000-$99,999", "$20,000-$24,999", "$0-…
$ bmxbmi   <dbl> 30.8, 28.6, 28.8, 25.3, 33.5, 27.0, 22.2, 28.2, 27.6, 35.3, 2…
$ diq010   <fct> No Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,…
$ lbxglu   <dbl> 101, 107, 284, 95, 111, 94, 80, 105, 76, 79, 126, 110, 99, 89…
$ PC1      <dbl> -0.04548605, 0.58162473, 3.54555630, -1.47481529, 0.80690079,…
$ PC2      <dbl> 0.10176882, -0.75395613, -0.80585151, 0.32553646, -0.04052202…
$ PC3      <dbl> -0.37719175, -0.81959143, 2.63415932, 0.78295503, -0.84211868…

32.1.1 Predicting on the TEST data

TEST_1 <- TEST.PCA

TEST_1$estimate <- predict(model_boot_glm_1, TEST.PCA, 'raw')
TEST_1$Diabetes <- predict(model_boot_glm_1, TEST.PCA, 'prob')$Diabetes

TEST_2 <- TEST.PCA

TEST_2$estimate <- predict(model_boot_glm_2, TEST.PCA, 'raw')
TEST_2$Diabetes <- predict(model_boot_glm_2, TEST.PCA, 'prob')$Diabetes

TEST_3 <- TEST.PCA

TEST_3$estimate <- predict(model_boot_glm_3, TEST.PCA, 'raw')
TEST_3$Diabetes <- predict(model_boot_glm_3, TEST.PCA, 'prob')$Diabetes



TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_PCAs_only'),
                          TEST_2 %>% mutate(model = 'w_nums'),
                          TEST_3 %>% mutate(model = "full_model")
                          )
library('yardstick')


Attaching package: 'yardstick'

The following objects are masked from 'package:caret':

    precision, recall, sensitivity, specificity

The following object is masked from 'package:readr':

    spec

library('glue')


Aucs <- TEST.stacked %>%
  group_by(model) %>%
  roc_auc(truth = diq010, Diabetes) %>%
  arrange(-.estimate) %>%
  glue_data("{model} AUC : {round(.estimate, 3)}") 
  


TEST.stacked %>%
  group_by(model) %>%
  roc_curve(truth = diq010, Diabetes) %>%
  autoplot() +
  labs(caption  = paste(Aucs, collapse = " \n "))