18 Resampling Samples - Classification with `caret` `glm` logit

18.1 SET UP

set.seed(1701)

library('tidyverse')

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library('caret')

Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

library('yardstick')


Attaching package: 'yardstick'

The following objects are masked from 'package:caret':

    precision, recall, sensitivity, specificity

The following object is masked from 'package:readr':

    spec

library('ggplot2')


diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  na.omit() 

glimpse(diab_pop)

Rows: 1,876
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Femal…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, Othe…
$ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades 9-…
$ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Widowed…
$ indhhin2 <fct> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "$65…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …

levels(diab_pop$indhhin2)

 [1] "$0-$4,999"         "$5,000-$9,999"     "$10,000-$14,999"  
 [4] "$15,000-$19,999"   "$20,000-$24,999"   "$25,000-$34,999"  
 [7] "$35,000-$44,999"   "$45,000-$54,999"   "$55,000-$64,999"  
[10] "$65,000-$74,999"   "20,000+"           "less than $20,000"
[13] "$75,000-$99,999"   "$100,000+"

income_levels <- levels(diab_pop$indhhin2)


# Desired ordering of the household-income brackets; the two catch-all
# brackets ("less than $20,000" and "20,000+") sit between the sub-$20k and
# the $20k-and-above ranges.
levels <- c(
  "$0-$4,999", "$5,000-$9,999", "$10,000-$14,999", "$15,000-$19,999",
  "less than $20,000", "20,000+",
  "$20,000-$24,999", "$25,000-$34,999", "$35,000-$44,999",
  "$45,000-$54,999", "$55,000-$64,999", "$65,000-$74,999",
  "$75,000-$99,999", "$100,000+"
)

setdiff(income_levels, levels)

character(0)

diab_pop$indhhin2 <- factor(diab_pop$indhhin2 ,
                            levels=levels,
                            ordered = TRUE)

odered_levels <- levels(diab_pop$indhhin2)

glimpse(diab_pop)

Rows: 1,876
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Femal…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, Othe…
$ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades 9-…
$ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Widowed…
$ indhhin2 <ord> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "$65…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …

feature_names <- c('riagendr' , 'ridreth1' , 'dmdeduc2' , 'dmdmartl' , 'indhhin2' , 'lbxglu','bmxbmi')

feature_names_plus <- paste(feature_names, collapse = ' + ' )

feature_names_plus

[1] "riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + lbxglu + bmxbmi"

formula_1 <- as.formula(paste0('diq010 ~ ',feature_names_plus))

formula_1

diq010 ~ riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + 
    lbxglu + bmxbmi

18.2 WARNING - THIS IS A BAD OPTION

# WARNING: THIS IS NOT A GREAT IDEA.
# A negative `warn` value tells R to ignore every warning, not only the one
# that is expected here.

options(warn=-1)

# Warnings are switched off because there is an expected warning:
## "prediction from a rank-deficient fit may be misleading"
## Without this option the output is very difficult to read.

18.3 `caret` `glm` logit `train` `function`

#' Fit a bootstrapped `caret` logistic regression on a train/test split
#'
#' Splits `data` with `createDataPartition()` on `diq010`, fits a binomial
#' `glm` through `train()` (predictors centred and scaled, bootstrap
#' resampling), extracts the fitted coefficients, and scores the held-out
#' rows with both the predicted class and the class probabilities.
#'
#' @param data Data frame containing `diq010` and every variable referenced
#'   by `model_formula`.
#' @param model_formula Model formula (or a string coercible by
#'   `as.formula()`). Defaults to the `formula_1` object defined earlier in
#'   the script, which is what the function previously read implicitly.
#' @param p Proportion of rows placed in the training partition.
#' @param n_boot Number of bootstrap resamples passed to `trainControl()`.
#' @return A named list with
#'   `Training_Data` (training rows),
#'   `gml.model` (the object returned by `train()`),
#'   `CoEff` (tibble with columns `feature` and `coeff`), and
#'   `TEST.scored` (held-out rows plus `estimate` and the probability
#'   columns returned by `predict(type = "prob")`).
Train_Glm_Iteration <- function(data,
                                model_formula = formula_1,
                                p = 0.7,
                                n_boot = 42) {
  stopifnot(
    is.data.frame(data),
    "diq010" %in% names(data),
    is.numeric(p), length(p) == 1L, p > 0, p < 1
  )

  # `list = FALSE` returns a one-column matrix; flatten it so row indexing
  # behaves the same for base data frames and tibbles.
  train_idx <- as.vector(
    createDataPartition(data$diq010, p = p, list = FALSE)
  )

  TRAIN <- data[train_idx, ]
  TEST <- data[-train_idx, ]

  bootstrap <- trainControl(method = "boot", number = n_boot)

  gml.model <- train(
    as.formula(model_formula),
    data = TRAIN,
    method = "glm",
    family = "binomial",
    preProcess = c("center", "scale"),
    trControl = bootstrap
  )

  # Named coefficient vector -> two-column tibble (feature, coeff).
  CoEff <- tibble::enframe(
    coef(gml.model$finalModel),
    name = "feature",
    value = "coeff"
  )

  estimate <- predict(gml.model, newdata = TEST, type = "raw")
  prob <- predict(gml.model, newdata = TEST, type = "prob")

  # Name `estimate` explicitly so the column name does not depend on the
  # local variable name.
  TEST.scored <- cbind(TEST, estimate = estimate, prob)

  list(
    Training_Data = TRAIN,
    gml.model = gml.model,
    CoEff = CoEff,
    TEST.scored = TEST.scored
  )
}

18.4 Make Samples

18.4.1 SAMPLE 1

# Draw 30% of the participant ids without replacement; `floor()` makes the
# truncation of the fractional sample size explicit.
Id <- sample(diab_pop$seqn,
             size = floor(nrow(diab_pop) * 0.3),
             replace = FALSE)
length(Id)

[1] 562

t1 <- diab_pop %>% 
  filter(seqn %in% Id)

dim(t1)

[1] 562  10

X1 <- Train_Glm_Iteration(t1)

str(X1,1)

List of 4
 $ Training_Data:'data.frame':  394 obs. of  10 variables:
  ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
 $ gml.model    :List of 25
  ..- attr(*, "class")= chr [1:2] "train" "train.formula"
 $ CoEff        : tibble [30 × 2] (S3: tbl_df/tbl/data.frame)
 $ TEST.scored  :'data.frame':  168 obs. of  13 variables:

X1$TEST.scored %>%
  roc_auc(truth= diq010 , Diabetes)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.739

X1$TEST.scored %>%
  roc_curve(truth= diq010 , Diabetes) %>%
  autoplot()

conf_matX1 <- X1$TEST.scored %>%
  conf_mat(truth= diq010 , estimate)

conf_matX1

             Truth
Prediction    Diabetes No Diabetes
  Diabetes          11           8
  No Diabetes       15         134

sum.conf_matX1  <- summary(conf_matX1)

sum.conf_matX1

# A tibble: 13 × 3
   .metric              .estimator .estimate
   <chr>                <chr>          <dbl>
 1 accuracy             binary         0.863
 2 kap                  binary         0.412
 3 sens                 binary         0.423
 4 spec                 binary         0.944
 5 ppv                  binary         0.579
 6 npv                  binary         0.899
 7 mcc                  binary         0.419
 8 j_index              binary         0.367
 9 bal_accuracy         binary         0.683
10 detection_prevalence binary         0.113
11 precision            binary         0.579
12 recall               binary         0.423
13 f_meas               binary         0.489

nrow(X1$Training_Data) + nrow(X1$TEST.scored) == nrow(t1)

[1] TRUE

X1.comparedf <- arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn')) 

sum.X1.comparedf <- summary(X1.comparedf)

sum.X1.comparedf$comparison.summary.table

                                                     statistic value
1                                       Number of by-variables     1
2                         Number of non-by variables in common     9
3                                 Number of variables compared     9
4                           Number of variables in x but not y     0
5                           Number of variables in y but not x     3
6        Number of variables compared with some values unequal     0
7           Number of variables compared with all values equal     9
8                             Number of observations in common     0
9                        Number of observations in x but not y   394
10                       Number of observations in y but not x   168
11 Number of observations with some compared variables unequal     0
12    Number of observations with all compared variables equal     0
13                                    Number of values unequal     0

18.4.2 SAMPLE 2

# Draw 50% of the participant ids without replacement; `floor()` makes the
# truncation of the fractional sample size explicit.
Id2 <- sample(diab_pop$seqn,
              size = floor(nrow(diab_pop) * 0.5),
              replace = FALSE)

t2 <- diab_pop %>% 
  filter(seqn %in% Id2)

X2 <- Train_Glm_Iteration(t2)

18.4.3 SAMPLE 3 - “black swan”

18.4.3.1 Income == ‘$75,000-$99,999’ & Gender == ‘Female’ & ridreth1 == ‘Non-Hispanic White’

# "Black swan" subgroup: a single income bracket, sex and ethnicity.
Swan <- diab_pop %>%
  filter(
    indhhin2 == "$75,000-$99,999",
    riagendr == "Female",
    ridreth1 == "Non-Hispanic White"
  )

# Keep 80% of the subgroup's ids (sample size truncated to a whole number).
Id3 <- sample(Swan$seqn,
              size = floor(nrow(Swan) * 0.8),
              replace = FALSE)

# Reuse `Swan` instead of repeating the same three-condition filter.
t3 <- Swan %>%
  filter(seqn %in% Id3)

t3 %>% summary()

      seqn         riagendr     ridageyr                   ridreth1 
 Min.   :84166   Male  : 0   Min.   :21.00   MexicanAmerican   : 0  
 1st Qu.:85920   Female:29   1st Qu.:32.00   Other Hispanic    : 0  
 Median :89041               Median :48.00   Non-Hispanic White:29  
 Mean   :88741               Mean   :50.14   Non-Hispanic Black: 0  
 3rd Qu.:91064               3rd Qu.:61.00   Other             : 0  
 Max.   :92970               Max.   :80.00                          
                                                                    
                       dmdeduc2                 dmdmartl 
 Less than 9th grade       : 0   Married            :19  
 Grades 9-11th             : 1   Widowed            : 3  
 High school graduate/GED  : 4   Divorced           : 2  
 Some college or AA degrees:14   Separated          : 0  
 College grad or above     :10   Never married      : 1  
                                 Living with partner: 4  
                                                         
              indhhin2      bmxbmi              diq010       lbxglu     
 $75,000-$99,999  :29   Min.   :16.70   Diabetes   : 1   Min.   : 80.0  
 $0-$4,999        : 0   1st Qu.:23.70   No Diabetes:28   1st Qu.: 92.0  
 $5,000-$9,999    : 0   Median :26.80                    Median : 99.0  
 $10,000-$14,999  : 0   Mean   :30.05                    Mean   :104.3  
 $15,000-$19,999  : 0   3rd Qu.:33.30                    3rd Qu.:105.0  
 less than $20,000: 0   Max.   :63.60                    Max.   :207.0  
 (Other)          : 0

X3 <- Train_Glm_Iteration(t3)

18.4.4 SAMPLE 4

# Draw 90% of the participant ids without replacement; `floor()` makes the
# truncation of the fractional sample size explicit.
Id4 <- sample(diab_pop$seqn,
              size = floor(nrow(diab_pop) * 0.9),
              replace = FALSE)

t4 <- diab_pop %>% 
  filter(seqn %in% Id4)

X4 <- Train_Glm_Iteration(t4)

18.4.5 SAMPLE 5

M_union <- union(Id2,Id3)

Id5 <- setdiff(diab_pop$seqn, M_union)


t5 <- diab_pop %>% 
  filter(seqn %in% Id5)


X5 <- Train_Glm_Iteration(t5)

18.4.6 Compare the Training Data of SAMPLES 2, 3, and 5

str(X2$Training_Data)

'data.frame':   657 obs. of  10 variables:
 $ seqn    : num  83757 83809 83813 83834 83851 ...
 $ riagendr: Factor w/ 2 levels "Male","Female": 2 2 1 1 2 1 2 2 1 2 ...
 $ ridageyr: num  57 20 24 69 37 74 80 80 75 33 ...
 $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 2 4 3 4 3 3 3 3 4 3 ...
 $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 1 3 4 3 3 5 3 4 4 4 ...
 $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 4 5 3 5 1 1 2 1 6 1 ...
 $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 7 13 8 3 10 8 4 10 4 12 ...
 $ bmxbmi  : num  35.4 26.2 26.9 28.2 35.3 27.2 23.5 26.9 30.8 25.9 ...
 $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 1 2 2 2 2 1 2 2 1 2 ...
 $ lbxglu  : num  398 94 105 105 79 123 137 110 145 83 ...
 - attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...

str(X3$Training_Data)

'data.frame':   21 obs. of  10 variables:
 $ seqn    : num  84166 84511 84517 84786 84816 ...
 $ riagendr: Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 2 2 2 2 ...
 $ ridageyr: num  67 78 80 50 28 61 61 40 73 68 ...
 $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 5 4 4 4 2 5 5 5 4 ...
 $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 2 2 1 1 3 1 1 1 6 ...
 $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 13 13 13 13 13 13 13 13 13 13 ...
 $ bmxbmi  : num  26.1 23.1 26.6 28.9 18.4 36.2 42.7 23.5 28.3 29.6 ...
 $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 2 2 2 ...
 $ lbxglu  : num  134 99 83 87 93 92 108 104 105 107 ...
 - attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...

str(X5$Training_Data)

'data.frame':   647 obs. of  10 variables:
 $ seqn    : num  83733 83737 83754 83755 83761 ...
 $ riagendr: Factor w/ 2 levels "Male","Female": 1 2 2 1 2 2 2 1 2 2 ...
 $ ridageyr: num  53 72 67 67 24 68 37 70 20 39 ...
 $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 1 2 4 5 1 2 3 4 1 ...
 $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 3 2 5 5 5 1 4 5 4 3 ...
 $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 3 4 1 2 5 3 1 6 5 1 ...
 $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 4 13 8 7 1 4 13 12 8 4 ...
 $ bmxbmi  : num  30.8 28.6 43.7 28.8 25.3 33.5 25.5 27 22.2 27.2 ...
 $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 1 2 2 2 2 2 2 ...
 $ lbxglu  : num  101 107 130 284 95 111 100 94 80 101 ...
 - attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...

arsenal::comparedf(X3$Training_Data,
                   X5$Training_Data)

Compare Object

Function Call: 
arsenal::comparedf(x = X3$Training_Data, y = X5$Training_Data)

Shared: 10 non-by variables and 21 observations.
Not shared: 0 variables and 626 observations.

Differences found in 10/10 variables compared.
0 variables compared have non-identical attributes.

18.5 Compare Coefficients across all samples

X1$CoEff

# A tibble: 30 × 2
   feature                                coeff
   <chr>                                  <dbl>
 1 (Intercept)                           2.68  
 2 riagendrFemale                        0.223 
 3 `ridreth1Other Hispanic`             -0.451 
 4 `ridreth1Non-Hispanic White`         -0.0250
 5 `ridreth1Non-Hispanic Black`         -0.483 
 6 ridreth1Other                        -0.337 
 7 `dmdeduc2Grades 9-11th`               0.420 
 8 `dmdeduc2High school graduate/GED`    0.207 
 9 `dmdeduc2Some college or AA degrees`  0.630 
10 `dmdeduc2College grad or above`       0.980 
# ℹ 20 more rows

CoEff_compare <- bind_rows(X1$CoEff %>% mutate(strat = 't1'),
          X2$CoEff %>% mutate(strat = 't2'),
          X3$CoEff %>% mutate(strat = 't3'),
          X4$CoEff %>% mutate(strat = 't4'),
          X5$CoEff %>% mutate(strat = 't5'))


glimpse(CoEff_compare)

Rows: 150
Columns: 3
$ feature <chr> "(Intercept)", "riagendrFemale", "`ridreth1Other Hispanic`", "…
$ coeff   <dbl> 2.67925964, 0.22277284, -0.45124437, -0.02501180, -0.48257023,…
$ strat   <chr> "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t…

# One panel per sample; the faceting already separates the samples, so no
# `group_by()` is needed before plotting.
CoEff_compare %>%
  ggplot(aes(x = feature, y = coeff)) +
  geom_point() +
  coord_flip() +
  facet_wrap(~ strat)

CoEff_compare %>%
  ggplot(aes(x=feature, y=coeff)) +
  geom_boxplot() + 
  coord_flip()

18.6 Review Output

str(X1,1)

List of 4
 $ Training_Data:'data.frame':  394 obs. of  10 variables:
  ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
 $ gml.model    :List of 25
  ..- attr(*, "class")= chr [1:2] "train" "train.formula"
 $ CoEff        : tibble [30 × 2] (S3: tbl_df/tbl/data.frame)
 $ TEST.scored  :'data.frame':  168 obs. of  13 variables:

glimpse(X1$TEST.scored)

Rows: 168
Columns: 13
$ seqn          <dbl> 83734, 83755, 83790, 83849, 83908, 83947, 83963, 84151, …
$ riagendr      <fct> Male, Male, Male, Male, Male, Female, Female, Female, Fe…
$ ridageyr      <dbl> 78, 67, 56, 71, 51, 33, 44, 53, 44, 34, 64, 55, 23, 47, …
$ ridreth1      <fct> Non-Hispanic White, Non-Hispanic Black, Non-Hispanic Whi…
$ dmdeduc2      <fct> High school graduate/GED, College grad or above, Less th…
$ dmdmartl      <fct> Married, Widowed, Married, Married, Married, Married, Di…
$ indhhin2      <ord> "$20,000-$24,999", "$20,000-$24,999", "$15,000-$19,999",…
$ bmxbmi        <dbl> 28.8, 28.8, 24.4, 27.6, 24.7, 25.9, 52.1, 25.3, 32.6, 25…
$ diq010        <fct> Diabetes, Diabetes, No Diabetes, Diabetes, No Diabetes, …
$ lbxglu        <dbl> 84, 284, 397, 76, 102, 83, 109, 86, 102, 98, 134, 116, 1…
$ estimate      <fct> No Diabetes, Diabetes, Diabetes, No Diabetes, No Diabete…
$ Diabetes      <dbl> 0.040939395, 0.998934295, 0.999998287, 0.019392257, 0.14…
$ `No Diabetes` <dbl> 9.590606e-01, 1.065705e-03, 1.713443e-06, 9.806077e-01, …

look <- as_tibble(X1$TEST.scored)

look %>%
  roc_auc(truth=diq010, Diabetes)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.739

look %>%
  roc_curve(truth=diq010, Diabetes) %>%
  autoplot()

look %>%
  conf_mat(truth=diq010, estimate)

             Truth
Prediction    Diabetes No Diabetes
  Diabetes          11           8
  No Diabetes       15         134

look %>%
  conf_mat(truth=diq010, estimate) %>%
  summary()

# A tibble: 13 × 3
   .metric              .estimator .estimate
   <chr>                <chr>          <dbl>
 1 accuracy             binary         0.863
 2 kap                  binary         0.412
 3 sens                 binary         0.423
 4 spec                 binary         0.944
 5 ppv                  binary         0.579
 6 npv                  binary         0.899
 7 mcc                  binary         0.419
 8 j_index              binary         0.367
 9 bal_accuracy         binary         0.683
10 detection_prevalence binary         0.113
11 precision            binary         0.579
12 recall               binary         0.423
13 f_meas               binary         0.489

18.7 create `Get_Errors_Function`

#' Compute classification error metrics for a scored data set
#'
#' @param data Data frame, optionally grouped with `group_by()`, holding the
#'   observed class in `diq010`, the predicted class in `estimate`, and the
#'   predicted probability of the `Diabetes` class in `Diabetes`.
#' @return A named list with
#'   `AUC` (result of `roc_auc()`),
#'   `ROC_CURVE` (result of `roc_curve()`),
#'   `CONF_MAT` (result of `conf_mat()`), and
#'   `SUM.CONF_MAT` (metrics derived from the confusion matrix; for grouped
#'   input, one set of metrics per group).
Get_Errors_Function <- function(data) {
  required_cols <- c("diq010", "estimate", "Diabetes")
  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "`data` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  AUC <- roc_auc(data, truth = diq010, Diabetes)

  ROC_CURVE <- roc_curve(data, truth = diq010, Diabetes)

  # Computed once and reused for the summary below.
  CONF_MAT <- conf_mat(data, truth = diq010, estimate = estimate)

  SUM.CONF_MAT <- if (inherits(CONF_MAT, "conf_mat")) {
    # Ungrouped input: a single confusion matrix.
    summary(CONF_MAT)
  } else {
    # Grouped input: `conf_mat()` returns a tibble with a `conf_mat` list
    # column. Calling `summary()` on that tibble would give the base
    # data-frame summary, so summarise each matrix and unnest instead.
    CONF_MAT %>%
      dplyr::ungroup() %>%
      dplyr::mutate(.metrics = purrr::map(.data$conf_mat, summary)) %>%
      dplyr::select(-dplyr::all_of("conf_mat")) %>%
      tidyr::unnest(dplyr::all_of(".metrics"))
  }

  list(
    AUC = AUC,
    ROC_CURVE = ROC_CURVE,
    CONF_MAT = CONF_MAT,
    SUM.CONF_MAT = SUM.CONF_MAT
  )
}

$~$

18.8 Test on Sample 3

f3 <- diab_pop %>% 
  anti_join(t3 %>% select(seqn))

Joining with `by = join_by(seqn)`

glimpse(f3)

Rows: 1,847
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Femal…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, Othe…
$ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades 9-…
$ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Widowed…
$ indhhin2 <ord> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "$65…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …

nrow(diab_pop) #1876

[1] 1876

nrow(t3) #

[1] 29

nrow(f3) #

[1] 1847

nrow(f3) + nrow(t3) == nrow(diab_pop)

[1] TRUE

arsenal::comparedf(t3,f3,by=c('seqn'))

Compare Object

Function Call: 
arsenal::comparedf(x = t3, y = f3, by = c("seqn"))

Shared: 9 non-by variables and 0 observations.
Not shared: 0 variables and 1876 observations.

Differences found in 0/9 variables compared.
0 variables compared have non-identical attributes.

compare_df_obj <- arsenal::comparedf(t3,f3,by=c('seqn'))

summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table

                                                     statistic value
1                                       Number of by-variables     1
2                         Number of non-by variables in common     9
3                                 Number of variables compared     9
4                           Number of variables in x but not y     0
5                           Number of variables in y but not x     0
6        Number of variables compared with some values unequal     0
7           Number of variables compared with all values equal     9
8                             Number of observations in common     0
9                        Number of observations in x but not y    29
10                       Number of observations in y but not x  1847
11 Number of observations with some compared variables unequal     0
12    Number of observations with all compared variables equal     0
13                                    Number of values unequal     0

nrow(f3) + ( nrow(X3$TEST.scored) + nrow(X3$Training_Data) ) == nrow(diab_pop)

[1] TRUE

arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))

Compare Object

Function Call: 
arsenal::comparedf(x = X3$TEST.scored, y = f3, by = c("seqn"))

Shared: 9 non-by variables and 0 observations.
Not shared: 3 variables and 1855 observations.

Differences found in 0/9 variables compared.
0 variables compared have non-identical attributes.

compare_df_obj <- arsenal::comparedf(X3$TEST.scored,f3,by=c('seqn'))

summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj$comparison.summary.table

                                                     statistic value
1                                       Number of by-variables     1
2                         Number of non-by variables in common     9
3                                 Number of variables compared     9
4                           Number of variables in x but not y     3
5                           Number of variables in y but not x     0
6        Number of variables compared with some values unequal     0
7           Number of variables compared with all values equal     9
8                             Number of observations in common     0
9                        Number of observations in x but not y     8
10                       Number of observations in y but not x  1847
11 Number of observations with some compared variables unequal     0
12    Number of observations with all compared variables equal     0
13                                    Number of values unequal     0

# Add the held-out rows of sample 3 back to `f3`, keeping only the original
# `diab_pop` columns (this drops the score columns added during training).
f3 <- bind_rows(
  X3$TEST.scored %>% select(all_of(colnames(diab_pop))),
  f3
)

18.9 Predict

18.9.1 Probs

str(predict(X3$gml.model, f3,'prob'),1)

'data.frame':   1855 obs. of  2 variables:
 $ Diabetes   : num  7.88e-12 7.88e-12 7.88e-12 7.88e-12 7.88e-12 ...
 $ No Diabetes: num  1 1 1 1 1 ...

f3$Diabetes <- predict(X3$gml.model, f3,'prob')$Diabetes

glimpse(f3)

Rows: 1,855
Columns: 11
$ seqn     <dbl> 85443, 87007, 87510, 90121, 90384, 90814, 92358, 92970, 83733…
$ riagendr <fct> Female, Female, Female, Female, Female, Female, Female, Femal…
$ ridageyr <dbl> 48, 45, 29, 44, 47, 80, 21, 25, 53, 78, 72, 45, 67, 67, 57, 2…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> High school graduate/GED, Some college or AA degrees, College…
$ dmdmartl <fct> Married, Married, Married, Married, Living with partner, Marr…
$ indhhin2 <ord> "$75,000-$99,999", "$75,000-$99,999", "$75,000-$99,999", "$75…
$ bmxbmi   <dbl> 27.1, 39.4, 22.6, 45.2, 28.1, 33.3, 21.8, 26.8, 30.8, 28.8, 2…
$ diq010   <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Diabet…
$ lbxglu   <dbl> 97, 207, 97, 104, 107, 102, 80, 92, 101, 84, 107, 84, 130, 28…
$ Diabetes <dbl> 7.884915e-12, 7.884915e-12, 7.884915e-12, 7.884915e-12, 7.884…

18.9.2 Predict Class

str(predict(X3$gml.model, f3,'raw'),1)

 Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 2 2 2 ...

f3$estimate <- predict(X3$gml.model, f3,'raw')

glimpse(f3)

Rows: 1,855
Columns: 12
$ seqn     <dbl> 85443, 87007, 87510, 90121, 90384, 90814, 92358, 92970, 83733…
$ riagendr <fct> Female, Female, Female, Female, Female, Female, Female, Femal…
$ ridageyr <dbl> 48, 45, 29, 44, 47, 80, 21, 25, 53, 78, 72, 45, 67, 67, 57, 2…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> High school graduate/GED, Some college or AA degrees, College…
$ dmdmartl <fct> Married, Married, Married, Married, Living with partner, Marr…
$ indhhin2 <ord> "$75,000-$99,999", "$75,000-$99,999", "$75,000-$99,999", "$75…
$ bmxbmi   <dbl> 27.1, 39.4, 22.6, 45.2, 28.1, 33.3, 21.8, 26.8, 30.8, 28.8, 2…
$ diq010   <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Diabet…
$ lbxglu   <dbl> 97, 207, 97, 104, 107, 102, 80, 92, 101, 84, 107, 84, 130, 28…
$ Diabetes <dbl> 7.884915e-12, 7.884915e-12, 7.884915e-12, 7.884915e-12, 7.884…
$ estimate <fct> No Diabetes, No Diabetes, No Diabetes, No Diabetes, No Diabet…

18.10 Test Function

18.10.1 Get AUC

Get_Errors_Function(f3)

$AUC
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.497

$ROC_CURVE
# A tibble: 5 × 3
   .threshold specificity sensitivity
        <dbl>       <dbl>       <dbl>
1 -Inf              0          1     
2    7.88e-12       0          1     
3    7.89e-12       0.803      0.214 
4    1.00e+ 0       0.806      0.0964
5  Inf              1          0     

$CONF_MAT
             Truth
Prediction    Diabetes No Diabetes
  Diabetes          27         306
  No Diabetes      253        1269

$SUM.CONF_MAT
# A tibble: 13 × 3
   .metric              .estimator .estimate
   <chr>                <chr>          <dbl>
 1 accuracy             binary        0.699 
 2 kap                  binary       -0.0908
 3 sens                 binary        0.0964
 4 spec                 binary        0.806 
 5 ppv                  binary        0.0811
 6 npv                  binary        0.834 
 7 mcc                  binary       -0.0913
 8 j_index              binary       -0.0979
 9 bal_accuracy         binary        0.451 
10 detection_prevalence binary        0.180 
11 precision            binary        0.0811
12 recall               binary        0.0964
13 f_meas               binary        0.0881

SAMPLE_3.ERRORS <- Get_Errors_Function(f3) 

str(SAMPLE_3.ERRORS,1)

List of 4
 $ AUC         : tibble [1 × 3] (S3: tbl_df/tbl/data.frame)
 $ ROC_CURVE   : roc_df [5 × 3] (S3: roc_df/tbl_df/tbl/data.frame)
 $ CONF_MAT    :List of 1
  ..- attr(*, "class")= chr "conf_mat"
 $ SUM.CONF_MAT: tibble [13 × 3] (S3: tbl_df/tbl/data.frame)

SAMPLE_3.ERRORS$AUC

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.497

18.11 Apply `Get_Errors_Function`

18.11.1 Model 3 Error By Classes

(f3 %>% 
  group_by(riagendr) %>%
  Get_Errors_Function())$AUC

# A tibble: 2 × 4
  riagendr .metric .estimator .estimate
  <fct>    <chr>   <chr>          <dbl>
1 Male     roc_auc binary         0.487
2 Female   roc_auc binary         0.509

18.11.2 Sex

  # AUC of model 3 by sex, drawn as horizontal bars.
  (f3 %>%
    group_by(riagendr) %>%
    Get_Errors_Function())$AUC %>%
    rename(AUC_EST = .estimate) %>%
    ggplot(aes(x = riagendr, y = AUC_EST, fill = riagendr)) +
    geom_col(position = "dodge") +
    coord_flip()

18.11.3 Sex and Income

(f3 %>% 
  group_by(riagendr , indhhin2) %>%
  Get_Errors_Function())$AUC

# A tibble: 24 × 5
   riagendr indhhin2          .metric .estimator .estimate
   <fct>    <ord>             <chr>   <chr>          <dbl>
 1 Male     $0-$4,999         roc_auc binary         0.416
 2 Male     $5,000-$9,999     roc_auc binary         0.406
 3 Male     $10,000-$14,999   roc_auc binary         0.456
 4 Male     $15,000-$19,999   roc_auc binary         0.483
 5 Male     less than $20,000 roc_auc binary         0.643
 6 Male     20,000+           roc_auc binary         0.370
 7 Male     $20,000-$24,999   roc_auc binary         0.502
 8 Male     $25,000-$34,999   roc_auc binary         0.485
 9 Male     $45,000-$54,999   roc_auc binary         0.472
10 Male     $65,000-$74,999   roc_auc binary         0.504
# ℹ 14 more rows

# AUC of model 3 by income bracket, with side-by-side bars for each sex.
(f3 %>%
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  ggplot(aes(x = indhhin2, y = AUC_EST, fill = riagendr)) +
  geom_col(position = "dodge") +
  coord_flip()

(f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$ROC_CURVE %>%
  autoplot() +
  labs( title = "ROC Curves by Sex and Income")

18.11.4 Sex and Ethnicity

# AUC of model 3 by ethnicity, with side-by-side bars for each sex.
(f3 %>%
  group_by(riagendr, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  ggplot(aes(x = ridreth1, y = AUC_EST, fill = riagendr)) +
  geom_col(position = "dodge") +
  coord_flip()

18.11.4.1 Not all levels may be available

(f3 %>% 
  group_by(riagendr, indhhin2, ridreth1) %>%
  Get_Errors_Function())$AUC %>%
  rename(AUC_EST = .estimate) %>%
  group_by(riagendr) %>%
  ggplot(aes(x=indhhin2,
             y=AUC_EST,
             fill=riagendr)) +
   geom_bar(stat = "identity",
            position = "dodge") + 
  coord_flip() +
  facet_wrap( ~ ridreth1)

Error in `roc_curve()`:
! No event observations were detected in `truth` with event level
  'Diabetes'.

18.11.5 Confusion Matrices

(f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$CONF_MAT

# A tibble: 24 × 3
   riagendr indhhin2          conf_mat  
   <fct>    <ord>             <list>    
 1 Male     $0-$4,999         <conf_mat>
 2 Male     $5,000-$9,999     <conf_mat>
 3 Male     $10,000-$14,999   <conf_mat>
 4 Male     $15,000-$19,999   <conf_mat>
 5 Male     less than $20,000 <conf_mat>
 6 Male     20,000+           <conf_mat>
 7 Male     $20,000-$24,999   <conf_mat>
 8 Male     $25,000-$34,999   <conf_mat>
 9 Male     $45,000-$54,999   <conf_mat>
10 Male     $65,000-$74,999   <conf_mat>
# ℹ 14 more rows

Sum_Conf_T3_Example <- (f3 %>% 
  group_by(riagendr, indhhin2) %>%
  Get_Errors_Function())$SUM.CONF_MAT

Sum_Conf_T3_Example

   riagendr               indhhin2 
 Male  :12   $0-$4,999        : 2  
 Female:12   $5,000-$9,999    : 2  
             $10,000-$14,999  : 2  
             $15,000-$19,999  : 2  
             less than $20,000: 2  
             20,000+          : 2  
             (Other)          :12  
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
                                   
 conf_mat.Length  conf_mat.Class  conf_mat.Mode
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list                      
 1         conf_mat  list

18.12 Random Error Model

# Rows of diab_pop whose seqn is NOT listed in Id2; this is the data set
# that gets scored in this section.
f2 <- dplyr::filter(diab_pop, !(seqn %in% Id2))


nrow(diab_pop) # 1876 rows after na.omit()

[1] 1876

nrow(t2) # 938

[1] 938

nrow(f2) # 938

[1] 938

# Sanity check: t2 and f2 together should account for every row of diab_pop.
nrow(diab_pop) == nrow(t2) + nrow(f2)

[1] TRUE

# Row-level comparison keyed on seqn; the printed summary reports how many
# observations t2 and f2 have in common.
arsenal::comparedf(x = t2, y = f2, by = c("seqn"))

Compare Object

Function Call: 
arsenal::comparedf(x = t2, y = f2, by = c("seqn"))

Shared: 9 non-by variables and 0 observations.
Not shared: 0 variables and 1876 observations.

Differences found in 0/9 variables compared.
0 variables compared have non-identical attributes.

# Keep the comparison object so its summary tables can be pulled out.
compare_df_obj <- arsenal::comparedf(x = t2, y = f2, by = c("seqn"))

# Print only the headline comparison table from the summary.
summ.compare_df_obj <- summary(compare_df_obj)
summ.compare_df_obj[["comparison.summary.table"]]

                                                     statistic value
1                                       Number of by-variables     1
2                         Number of non-by variables in common     9
3                                 Number of variables compared     9
4                           Number of variables in x but not y     0
5                           Number of variables in y but not x     0
6        Number of variables compared with some values unequal     0
7           Number of variables compared with all values equal     9
8                             Number of observations in common     0
9                        Number of observations in x but not y   938
10                       Number of observations in y but not x   938
11 Number of observations with some compared variables unequal     0
12    Number of observations with all compared variables equal     0
13                                    Number of values unequal     0

# Score f2 with the model stored in X2 (assumed to be a caret `train` fit --
# confirm upstream): first the "raw" prediction, then the Diabetes column of
# the "prob" prediction.
f2$estimate <- predict(X2$gml.model, newdata = f2, type = "raw")
f2$Diabetes <- predict(X2$gml.model, newdata = f2, type = "prob")$Diabetes

18.13 Compare Random to Swan

# Stack the two scored data sets, tagging each row with the model it came
# from so later summaries can group on `model`.
TEST.Scored_stacked <- bind_rows(
  mutate(f2, model = "random"),
  mutate(f3, model = "black_swan")
)


# AUC for each model label: take the AUC element of the error helper's result.
TEST.Scored_stacked %>%
  group_by(model) %>%
  Get_Errors_Function() %>%
  .$AUC

# A tibble: 2 × 4
  model      .metric .estimator .estimate
  <chr>      <chr>   <chr>          <dbl>
1 black_swan roc_auc binary         0.497
2 random     roc_auc binary         0.835

18.13.1 By Model by Sex

  # AUC by sex, one dodged bar per model; axes flipped so bars run
  # horizontally.
  TEST.Scored_stacked %>%
    group_by(model, riagendr) %>%
    Get_Errors_Function() %>%
    .$AUC %>%
    rename(AUC_EST = .estimate) %>%
    group_by(model, riagendr) %>%
    ggplot(aes(x = riagendr, y = AUC_EST, fill = model)) +
    geom_col(position = "dodge") +
    coord_flip()

18.13.2 By Model by Sex by Income

  # AUC by sex with one dodged bar per income level, one facet per model;
  # axes flipped so bars run horizontally.
  TEST.Scored_stacked %>%
    group_by(model, riagendr, indhhin2) %>%
    Get_Errors_Function() %>%
    .$AUC %>%
    rename(AUC_EST = .estimate) %>%
    group_by(model, riagendr, indhhin2) %>%
    ggplot(aes(x = riagendr, y = AUC_EST, fill = indhhin2)) +
    geom_col(position = "dodge") +
    coord_flip() +
    facet_wrap(model ~ .)

18.13.3 By Model, Sex, & Ethnicity

  # AUC by sex with one dodged bar per ethnicity, one facet per model;
  # axes flipped so bars run horizontally.
  TEST.Scored_stacked %>%
    group_by(model, riagendr, ridreth1) %>%
    Get_Errors_Function() %>%
    .$AUC %>%
    rename(AUC_EST = .estimate) %>%
    group_by(model, riagendr, ridreth1) %>%
    ggplot(aes(x = riagendr, y = AUC_EST, fill = ridreth1)) +
    geom_col(position = "dodge") +
    coord_flip() +
    facet_wrap(model ~ .)

  # Same AUC table as the previous chart, arranged the other way round: one
  # facet per ethnicity and one dodged bar per model within each sex.
  TEST.Scored_stacked %>%
    group_by(model, riagendr, ridreth1) %>%
    Get_Errors_Function() %>%
    .$AUC %>%
    rename(AUC_EST = .estimate) %>%
    group_by(model, riagendr, ridreth1) %>%
    ggplot(aes(x = riagendr, y = AUC_EST, fill = model)) +
    geom_col(position = "dodge") +
    coord_flip() +
    facet_wrap(ridreth1 ~ .)

18.1 STEP UP

18.2 WARNING - THIS IS A BAD OPTION

18.3 caret glm logit train function

18.4 Make Samples

18.4.1 SAMPLE 1

18.4.2 SAMPLE 2

18.4.3 SAMPLE 3 - “black swan”

18.4.3.1 Income == ‘$75,000-$99,999’ & Gender == ‘Female’ & ridreth1 == ‘Non-Hispanic White’

18.4.4 SAMPLE 4

18.4.5 SAMPLE 5

18.4.6 Compare SAMPLE 1 to SAMPLE 5

18.5 Compare Coefficients across all samples

18.6 Review Output

18.7 create Get_Errors_Function

18.8 Test on Sample 3

18.9 Predict

18.9.1 Probs

18.9.2 Predict Class

18.10 Test Function

18.10.1 Get AUC

18.11 Apply Get_Errors_Function

18.11.1 Model 3 Error By Classes

18.11.2 Sex

18.11.3 Sex and Income

18.11.4 Income and Ethnicity

18.11.4.1 Not all levels may be available

18.11.5 Confusion Matrices

18.12 Random Error Model

18.13 Compare Random to Swan

18.13.1 By Model by Sex

18.13.2 By Model by Sex by Income

18.13.3 By Model, Sex, & Ethnicity

18.3 `caret` `glm` logit `train` `function`

18.7 create `Get_Errors_Function`

18.11 Apply `Get_Errors_Function`