29  Modeling with k-means clustering - diab_pop

29.1 Set Up

library(tidyverse)  # dplyr::filter() and dplyr::lag() mask their stats counterparts
library(factoextra) # fviz_nbclust(), fviz_cluster()
library(caret)      # createDataPartition(), preProcess(), train(); masks purrr::lift()
# Load the NHANES-derived diabetes data; drop rows with missing values
diab_pop <- readRDS(here::here('DATA/diab_pop.RDS')) %>%
  na.omit() 

# Names of the numeric features, excluding the seqn respondent id
# (here: ridageyr, bmxbmi, lbxglu)
numeric_features <- diab_pop %>%
  select(where(is.numeric)) %>%
  select(-seqn) %>%
  colnames()

29.2 Split Data

# Stratified 70/30 split on the diabetes outcome
# (call set.seed() first if you need the same partition every run)
TrainInd <- createDataPartition(diab_pop$diq010,
                                p = .7,
                                list = FALSE)

TRAIN <- diab_pop[TrainInd, ]
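Because createDataPartition() stratifies on diq010, the outcome balance in TRAIN should mirror the full data. A quick sanity check, assuming the objects above:

# Verify the split preserved the diabetes / no-diabetes proportions
prop.table(table(diab_pop$diq010))
prop.table(table(TRAIN$diq010))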

29.2.1 How many clusters should we make?

# Elbow plot: total within-cluster sum of squares by number of clusters.
# all_of() avoids the tidyselect deprecation warning that passing a bare
# external vector to select() triggers.
fviz_nbclust(TRAIN %>%
               select(all_of(numeric_features)) %>%
               scale(),
             kmeans,
             method = 'wss')
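Under the hood, method = 'wss' just refits kmeans() for each candidate k and records the total within-cluster sum of squares. A minimal hand-rolled sketch of the same computation (the range 1:10 and nstart = 25 are illustrative choices, not values used above):

# Hand-rolled elbow computation: tot.withinss for k = 1..10
scaled_train <- TRAIN %>%
  select(all_of(numeric_features)) %>%
  scale()

wss <- purrr::map_dbl(1:10, function(k) {
  kmeans(scaled_train, centers = k, nstart = 25)$tot.withinss
})

plot(1:10, wss, type = 'b',
     xlab = 'Number of clusters k',
     ylab = 'Total within-cluster sum of squares')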

29.3 Train Clustering Model

# Set seed for reproducible results
set.seed(1984)

# Learn centering/scaling constants from the training data
pP.TRAIN <- preProcess(TRAIN, c('center','scale'))

# Run the k-means algorithm on the scaled numeric features.
# nstart = number of random starting configurations; I chose 523,
# and kmeans() keeps the best of them.
km <- kmeans(predict(pP.TRAIN, TRAIN) %>% select(all_of(numeric_features)),
             centers = 5,
             nstart = 523)

glimpse(km)
List of 9
 $ cluster     : Named int [1:1314] 1 1 4 2 4 3 3 2 4 4 ...
  ..- attr(*, "names")= chr [1:1314] "2" "3" "11" "15" ...
 $ centers     : num [1:5, 1:3] 0.894 0.254 0.833 -0.893 -0.735 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:5] "1" "2" "3" "4" ...
  .. ..$ : chr [1:3] "ridageyr" "bmxbmi" "lbxglu"
 $ totss       : num 3939
 $ withinss    : num [1:5] 317 172 273 264 208
 $ tot.withinss: num 1234
 $ betweenss   : num 2705
 $ size        : int [1:5] 437 44 179 444 210
 $ iter        : int 4
 $ ifault      : int 0
 - attr(*, "class")= chr "kmeans"
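One quick diagnostic from this printout: betweenss / totss is the share of total variance the five clusters capture, here roughly 2705 / 3939 ≈ 69%.

# Proportion of total variance explained by the clustering
km$betweenss / km$totss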
# Attach the cluster assignments to the training data
TRAIN.cluster <- cbind(TRAIN, cluster = km$cluster)

head(TRAIN.cluster)
    seqn riagendr ridageyr           ridreth1                 dmdeduc2
2  83733     Male       53 Non-Hispanic White High school graduate/GED
3  83734     Male       78 Non-Hispanic White High school graduate/GED
11 83750     Male       45              Other            Grades 9-11th
15 83757   Female       57     Other Hispanic      Less than 9th grade
17 83761   Female       24              Other    College grad or above
29 83787   Female       68    MexicanAmerican      Less than 9th grade
        dmdmartl        indhhin2 bmxbmi      diq010 lbxglu cluster
2       Divorced $15,000-$19,999   30.8 No Diabetes    101       1
3        Married $20,000-$24,999   28.8    Diabetes     84       1
11 Never married $65,000-$74,999   24.1 No Diabetes     84       4
15     Separated $20,000-$24,999   35.4    Diabetes    398       2
17 Never married       $0-$4,999   25.3 No Diabetes     95       4
29      Divorced $15,000-$19,999   33.5 No Diabetes    111       3
# Visualize the clusters on the first two principal components.
# Pass only the features used for clustering (not the cluster column).
fviz_cluster(km,
             data = TRAIN.cluster %>%
               select(all_of(numeric_features)) %>%
               scale())
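fviz_cluster() plots the observations on their first two principal components. A hand-rolled sketch of the same projection, assuming the objects above (axis signs may differ from the factoextra plot):

# Equivalent projection by hand: PCA on the scaled features,
# colored by k-means cluster assignment
pca <- prcomp(TRAIN.cluster %>%
                select(all_of(numeric_features)) %>%
                scale())

as_tibble(pca$x[, 1:2]) %>%
  mutate(cluster = factor(km$cluster)) %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster)) +
  geom_point()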

km$centers
    ridageyr     bmxbmi     lbxglu
1  0.8939479 -0.4320271 -0.1059434
2  0.2544713  0.1864569  4.3200213
3  0.8325803  1.2355374  0.4141108
4 -0.8931606 -0.6454971 -0.3694405
5 -0.7348597  1.1715773 -0.2565617
The centers are on the standardized scale, and cluster 2 stands out: its mean glucose (lbxglu) sits more than four standard deviations above average. Plotting age against glucose makes the separation visible.

TRAIN.cluster <- TRAIN.cluster %>%
  mutate(cluster = as.factor(cluster))

TRAIN.cluster %>%
  ggplot(aes(x=ridageyr,
             y=lbxglu,
             color=cluster)) +
  geom_point()

# Helper: scatter plot of y vs. x, colored by c and faceted by z.
# Column names are passed as strings and resolved with the .data pronoun.
Plot_Clusters_by_Facet <- function(dat, x, y, c, z) {
  ggplot(dat,
         aes(x = .data[[x]],
             y = .data[[y]],
             color = .data[[c]])) +
    geom_point() +
    facet_wrap(vars(.data[[z]]))
}


Plot_Clusters_by_Facet(TRAIN.cluster, 
            x='ridageyr',
            y='lbxglu',
            c='cluster',
            z='diq010')

Plot_Clusters_by_Facet(TRAIN.cluster, 
            x='bmxbmi',
            y='lbxglu',
            c='diq010',
            z='cluster')


library(arsenal)    # tableby(); masks lubridate::is.Date
library(kableExtra) # masks dplyr::group_rows
# Cross-tabulate diabetes status by cluster assignment
table_arsenal <- tableby(diq010 ~ cluster,
                         data = TRAIN.cluster)

table_arsenal
tableby Object

Function Call:
tableby(formula = diq010 ~ cluster, data = TRAIN.cluster)

Variable(s):
diq010 ~ cluster
summary(table_arsenal)


|                    | Diabetes (N=197) | No Diabetes (N=1117) | Total (N=1314) | p value|
|:-------------------|:----------------:|:--------------------:|:--------------:|-------:|
|**cluster**         |                  |                      |                | < 0.001|
|&nbsp;&nbsp;&nbsp;1 |    76 (38.6%)    |     361 (32.3%)      |  437 (33.3%)   |        |
|&nbsp;&nbsp;&nbsp;2 |    39 (19.8%)    |       5 (0.4%)       |   44 (3.3%)    |        |
|&nbsp;&nbsp;&nbsp;3 |    63 (32.0%)    |     116 (10.4%)      |  179 (13.6%)   |        |
|&nbsp;&nbsp;&nbsp;4 |     7 (3.6%)     |     437 (39.1%)      |  444 (33.8%)   |        |
|&nbsp;&nbsp;&nbsp;5 |    12 (6.1%)     |     198 (17.7%)      |  210 (16.0%)   |        |
# Store the summary for reuse; printing it repeats the table above
sum.table_arsenal <- summary(table_arsenal)
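The same association can be checked without arsenal; a base-R sketch of the underlying contingency table and chi-squared test:

# Base-R version of the cluster-by-diagnosis association
tab <- table(TRAIN.cluster$cluster, TRAIN.cluster$diq010)
tab
chisq.test(tab)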

29.4 Train logit models

# Model 1: the three numeric features only.
# (train() accepts a plain formula; no as.formula() needed.)
model1 <- train(diq010 ~ ridageyr + bmxbmi + lbxglu,
                data = TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')

# Model 2: the k-means cluster assignment only
model2 <- train(diq010 ~ cluster,
                data = TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')

# Model 3: all predictors plus the cluster assignment
model3 <- train(diq010 ~ riagendr + ridageyr + ridreth1 + dmdeduc2 +
                  dmdmartl + indhhin2 + bmxbmi + lbxglu + cluster,
                data = TRAIN.cluster,
                preProcess = c('center','scale'),
                method = 'glm',
                family = 'binomial')
Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
10, : These variables have zero variances: indhhin2$35,000-$44,999,
indhhin2$55,000-$64,999

(This warning appeared 26 times, once per bootstrap resample plus the final fit: two income brackets are rare enough that their dummy-coded columns are constant, i.e. all zero, within some resamples.)
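Since caret stores each model's bootstrap results (25 resamples by default), the three fits can be compared before touching the test set. A sketch using caret::resamples(); note the comparison is only strictly apples-to-apples if the models share resampling indices, e.g. via a common trainControl with fixed seeds:

# Compare resampled Accuracy / Kappa across the three fits
model_comparison <- resamples(list(nums_only    = model1,
                                   cluster_only = model2,
                                   full_model   = model3))
summary(model_comparison)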

29.5 Dealing with the Test data

# Hold-out rows not used in training
TEST <- diab_pop[-TrainInd, ]

library('clue')  # cl_predict() assigns new observations to existing clusters

# Assign each preprocessed TEST observation to its nearest centroid
cluster_predict <- clue::cl_predict(km,
                                    predict(pP.TRAIN, TEST) %>%
                                      select(all_of(numeric_features)))

str(cluster_predict, 1)
 'cl_class_ids' int [1:562] 1 3 2 1 1 1 5 1 1 1 ...
TEST$cluster <- cluster_predict
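For a k-means fit, cl_predict() is nearest-centroid assignment: each preprocessed TEST row goes to the centroid with the smallest Euclidean distance. A hand-rolled sketch that should reproduce it:

# Nearest-centroid assignment by hand
scaled_test <- predict(pP.TRAIN, TEST %>% select(-cluster)) %>%
  select(all_of(numeric_features)) %>%
  as.matrix()

nearest_centroid <- apply(scaled_test, 1, function(row) {
  # squared Euclidean distance from this row to each of the 5 centers
  which.min(colSums((t(km$centers) - row)^2))
})

all(nearest_centroid == cluster_predict)  # should be TRUE if the sketch matches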

29.5.1 Predicting on the TEST data

TEST <- TEST %>%
  mutate(cluster = as.factor(cluster))

TEST_1 <- TEST

# For each model: hard class predictions and the predicted
# probability of Diabetes
TEST_1$estimate <- predict(model1, TEST, type = 'raw')
TEST_1$Diabetes <- predict(model1, TEST, type = 'prob')$Diabetes

TEST_2 <- TEST

TEST_2$estimate <- predict(model2, TEST, type = 'raw')
TEST_2$Diabetes <- predict(model2, TEST, type = 'prob')$Diabetes

TEST_3 <- TEST

TEST_3$estimate <- predict(model3, TEST, type = 'raw')
TEST_3$Diabetes <- predict(model3, TEST, type = 'prob')$Diabetes



# Stack the three prediction sets for side-by-side evaluation
TEST.stacked <- bind_rows(TEST_1 %>% mutate(model = 'w_nums_only'),
                          TEST_2 %>% mutate(model = 'w_cluster'),
                          TEST_3 %>% mutate(model = 'full_model'))
library('yardstick') # masks caret's precision, recall, sensitivity,
                     # specificity, and readr::spec
library('glue')


# AUC per model, formatted as caption strings
Aucs <- TEST.stacked %>%
  group_by(model) %>%
  roc_auc(truth = diq010, Diabetes) %>%
  glue_data("{model} AUC : {round(.estimate, 3)}")

# Overlaid ROC curves, one per model, with the AUCs in the caption
TEST.stacked %>%
  group_by(model) %>%
  roc_curve(truth = diq010, Diabetes) %>%
  autoplot() +
  labs(caption = paste(Aucs, collapse = " \n "))
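AUC summarizes ranking quality; for a threshold-level view, caret's confusionMatrix() on the hard class predictions is a natural follow-up. A sketch for the full model:

# Confusion matrix for the full model's class predictions
caret::confusionMatrix(data = TEST_3$estimate,
                       reference = TEST_3$diq010,
                       positive = 'Diabetes')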