31  Modeling with PCA - Diab_Pop

31.1 SET UP

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()     masks data.table::between()
✖ dplyr::filter()      masks stats::filter()
✖ dplyr::first()       masks data.table::first()
✖ lubridate::hour()    masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag()         masks stats::lag()
✖ dplyr::last()        masks data.table::last()
✖ lubridate::mday()    masks data.table::mday()
✖ lubridate::minute()  masks data.table::minute()
✖ lubridate::month()   masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second()  masks data.table::second()
✖ purrr::transpose()   masks data.table::transpose()
✖ lubridate::wday()    masks data.table::wday()
✖ lubridate::week()    masks data.table::week()
✖ lubridate::yday()    masks data.table::yday()
✖ lubridate::year()    masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  na.omit() 

numeric_features <- diab_pop %>%
  select_if(is.numeric) %>%
  select(-seqn) %>%
  colnames()

numeric_features
[1] "ridageyr" "bmxbmi"   "lbxglu"  

31.2 Split Data

TrainInd <- createDataPartition(diab_pop$diq010,
                                p=.7,
                                list=FALSE)

TRAIN <- diab_pop[TrainInd,]
pP.TRAIN <- preProcess(TRAIN, c('center','scale'))

31.2.1 How many Clusters should we make ?

TRAIN_scaled <- predict(pP.TRAIN, TRAIN)

TRAIN.PCA_Model <- prcomp(TRAIN_scaled %>% select(bmxbmi, lbxglu, ridageyr))

summary(TRAIN.PCA_Model)
Importance of components:
                          PC1    PC2    PC3
Standard deviation     1.1338 0.9858 0.8618
Proportion of Variance 0.4285 0.3239 0.2476
Cumulative Proportion  0.4285 0.7524 1.0000
ggbiplot(TRAIN.PCA_Model)

32 How can we determine the number of principal components?

We aim to find the components with the maximum variance so we can retain as much information about the original dataset as possible.

To determine the number of principal components to retain in our analysis we need to compute the proportion of variance explained.

We can plot the cumulative proportion of variance explained in a scree plot to determine the number of principal components to retain.

# Pull SD
sd.TRAIN <- TRAIN.PCA_Model$sdev

# Compute variance
var.TRAIN <- sd.TRAIN^2

# Proportion of variance explained
prop.TRAIN <- var.TRAIN/sum(var.TRAIN)

# Plot the cumulative proportion of variance explained
plot(cumsum(prop.TRAIN), 
     xlab='Principal components', 
     ylab='Proportion of variance explained', 
     type='b')

TRAIN.PCA <- cbind(TRAIN, predict(TRAIN.PCA_Model, predict(pP.TRAIN, TRAIN)))


glimpse(TRAIN.PCA) 
Rows: 1,314
Columns: 13
$ seqn     <dbl> 83734, 83750, 83754, 83757, 83789, 83790, 83799, 83809, 83813…
$ riagendr <fct> Male, Male, Female, Female, Male, Male, Female, Female, Male,…
$ ridageyr <dbl> 78, 45, 67, 57, 66, 56, 37, 20, 24, 80, 29, 39, 35, 40, 54, 7…
$ ridreth1 <fct> Non-Hispanic White, Other, Other Hispanic, Other Hispanic, No…
$ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, College grad or abov…
$ dmdmartl <fct> Married, Never married, Married, Separated, Living with partn…
$ indhhin2 <fct> "$20,000-$24,999", "$65,000-$74,999", "$25,000-$34,999", "$20…
$ bmxbmi   <dbl> 28.8, 24.1, 43.7, 35.4, 34.0, 24.4, 25.5, 26.2, 26.9, 28.5, 2…
$ diq010   <fct> Diabetes, No Diabetes, No Diabetes, Diabetes, No Diabetes, No…
$ lbxglu   <dbl> 84, 84, 130, 398, 113, 397, 100, 94, 105, 119, 102, 101, 97, …
$ PC1      <dbl> 0.39364202, -1.01945675, 1.71575141, 5.60741419, 0.80332766, …
$ PC2      <dbl> -0.8830738811, -0.4475439639, 1.2070666916, 0.1554659373, 0.0…
$ PC3      <dbl> -1.465045765, -0.092975735, -0.945046444, 4.782660350, -0.759…
features <- colnames(TRAIN.PCA)[!colnames(TRAIN.PCA) %in% c('seqn', 'diq010', 'PC3')] 

features
 [1] "riagendr" "ridageyr" "ridreth1" "dmdeduc2" "dmdmartl" "indhhin2"
 [7] "bmxbmi"   "lbxglu"   "PC1"      "PC2"     
paste0('diq010 ~ ', paste( features , collapse = " + " ))
[1] "diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + lbxglu + PC1 + PC2"
formula1 <- as.formula('diq010 ~ PC1 + PC2')

formula2 <- as.formula('diq010 ~ bmxbmi + lbxglu + ridageyr')
  
formula3 <- as.formula(paste0('diq010 ~ ', paste( features , collapse = " + " )))
train_ctrl <- trainControl(method="boot", # type of resampling, in this case bootstrap
                                number = 5, # number of resamplings     
                                search = "random" # we are performing a "random" search
                                )

model_boot_glm_1 <- train(formula1,
                         data = TRAIN.PCA,
                         method = "glm", 
                         trControl = train_ctrl,
                         # options to be passed to glm
                         family = 'binomial',
                         preProcess = c('center','scale')
                         )

model_boot_glm_2 <- train(formula2,
                         data = TRAIN.PCA,
                         method = "glm", 
                         trControl = train_ctrl,
                         # options to be passed to glm
                         family = 'binomial',
                         preProcess = c('center','scale'))

model_boot_glm_3 <- train(formula3,
                         data = TRAIN.PCA,
                         method = "glm", 
                         trControl = train_ctrl,
                         # options to be passed to glm
                         family = 'binomial',
                         preProcess = c('center','scale'))
Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999
Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999
Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999
Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999
Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999
Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999

32.1 Dealing with the Test data

saveRDS(model_boot_glm_1,'model_boot_glm_1.RDS')
saveRDS(pP.TRAIN,'pP.TRAIN.RDS')
saveRDS(TRAIN.PCA_Model,'TRAIN.PCA_Model.RDS')

rm(model_boot_glm_1)
rm(pP.TRAIN)
rm(TRAIN.PCA_Model)
model_boot_glm_1
Error in eval(expr, envir, enclos): object 'model_boot_glm_1' not found
model_boot_glm_1 <- readRDS('model_boot_glm_1.RDS')
pP.TRAIN <- readRDS('pP.TRAIN.RDS')
TRAIN.PCA_Model <- readRDS('TRAIN.PCA_Model.RDS')

TEST <- diab_pop[-TrainInd,]

# this should be right 
str( predict(TRAIN.PCA_Model, predict(pP.TRAIN, TEST)) , 1 )
 num [1:562, 1:3] -0.0455 0.5816 3.5456 -1.4748 0.8069 ...
 - attr(*, "dimnames")=List of 2
str( TEST %>% predict(pP.TRAIN, .) %>% predict(TRAIN.PCA_Model, . ) , 1)
 num [1:562, 1:3] -0.0455 0.5816 3.5456 -1.4748 0.8069 ...
 - attr(*, "dimnames")=List of 2
#
# this should be wrong 
predict(pP.TRAIN, predict(TRAIN.PCA_Model, TEST))
Error in newdata[, object$method$center, drop = FALSE]: subscript out of bounds
TEST.PCA <- cbind(TEST , predict(TRAIN.PCA_Model, predict(pP.TRAIN, TEST)))

glimpse(TEST.PCA)
Rows: 562
Columns: 13
$ seqn     <dbl> 83733, 83737, 83755, 83761, 83787, 83820, 83822, 83834, 83849…
$ riagendr <fct> Male, Female, Male, Female, Female, Male, Female, Male, Male,…
$ ridageyr <dbl> 53, 72, 67, 24, 68, 70, 20, 69, 71, 37, 49, 41, 80, 38, 80, 2…
$ ridreth1 <fct> Non-Hispanic White, MexicanAmerican, Non-Hispanic Black, Othe…
$ dmdeduc2 <fct> High school graduate/GED, Grades 9-11th, College grad or abov…
$ dmdmartl <fct> Divorced, Separated, Widowed, Never married, Divorced, Living…
$ indhhin2 <fct> "$15,000-$19,999", "$75,000-$99,999", "$20,000-$24,999", "$0-…
$ bmxbmi   <dbl> 30.8, 28.6, 28.8, 25.3, 33.5, 27.0, 22.2, 28.2, 27.6, 35.3, 2…
$ diq010   <fct> No Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes,…
$ lbxglu   <dbl> 101, 107, 284, 95, 111, 94, 80, 105, 76, 79, 126, 110, 99, 89…
$ PC1      <dbl> -0.04548605, 0.58162473, 3.54555630, -1.47481529, 0.80690079,…
$ PC2      <dbl> 0.10176882, -0.75395613, -0.80585151, 0.32553646, -0.04052202…
$ PC3      <dbl> -0.37719175, -0.81959143, 2.63415932, 0.78295503, -0.84211868…

32.1.1 Predicting on the TEST data

TEST_1 <- TEST.PCA

TEST_1$estimate <- predict(model_boot_glm_1, TEST.PCA, 'raw')
TEST_1$Diabetes <- predict(model_boot_glm_1, TEST.PCA, 'prob')$Diabetes

TEST_2 <- TEST.PCA

TEST_2$estimate <- predict(model_boot_glm_2, TEST.PCA, 'raw')
TEST_2$Diabetes <- predict(model_boot_glm_2, TEST.PCA, 'prob')$Diabetes

TEST_3 <- TEST.PCA

TEST_3$estimate <- predict(model_boot_glm_3, TEST.PCA, 'raw')
TEST_3$Diabetes <- predict(model_boot_glm_3, TEST.PCA, 'prob')$Diabetes



TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_PCAs_only'),
                          TEST_2 %>% mutate(model = 'w_nums'),
                          TEST_3 %>% mutate(model = "full_model")
                          )
library('yardstick')

Attaching package: 'yardstick'
The following objects are masked from 'package:caret':

    precision, recall, sensitivity, specificity
The following object is masked from 'package:readr':

    spec
library('glue')


Aucs <- TEST.stacked %>%
  group_by(model) %>%
  roc_auc(truth = diq010, Diabetes) %>%
  arrange(-.estimate) %>%
  glue_data("{model} AUC : {round(.estimate, 3)}") 
  


TEST.stacked %>%
  group_by(model) %>%
  roc_curve(truth = diq010, Diabetes) %>%
  autoplot() +
  labs(caption  = paste(Aucs, collapse = " \n "))