17 Resampling Samples

17.1 STEP UP

library('tidyverse')

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library('caret')

Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

library('yardstick')


Attaching package: 'yardstick'

The following objects are masked from 'package:caret':

    precision, recall, sensitivity, specificity

The following object is masked from 'package:readr':

    spec

library('ggplot2')

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS') %>%
  na.omit()

glimpse(diab_pop)

Rows: 1,876
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Femal…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, Othe…
$ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades 9-…
$ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Widowed…
$ indhhin2 <fct> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "$65…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …

levels(diab_pop$indhhin2)

 [1] "$0-$4,999"         "$5,000-$9,999"     "$10,000-$14,999"  
 [4] "$15,000-$19,999"   "$20,000-$24,999"   "$25,000-$34,999"  
 [7] "$35,000-$44,999"   "$45,000-$54,999"   "$55,000-$64,999"  
[10] "$65,000-$74,999"   "20,000+"           "less than $20,000"
[13] "$75,000-$99,999"   "$100,000+"

income_levels <- levels(diab_pop$indhhin2)


levels = c("$0-$4,999", 
           "$5,000-$9,999", 
           "$10,000-$14,999",
           "$15,000-$19,999",
           "less than $20,000",
           "20,000+", 
           "$20,000-$24,999",
           "$25,000-$34,999",
           "$35,000-$44,999",
           "$45,000-$54,999",
           "$55,000-$64,999",
           "$65,000-$74,999",
           "$75,000-$99,999",
           "$100,000+"
          )

setdiff(income_levels, levels)

character(0)

diab_pop$indhhin2 <- factor(diab_pop$indhhin2 ,
                            levels=levels,
                            ordered = TRUE)

odered_levels <- levels(diab_pop$indhhin2)

glimpse(diab_pop)

Rows: 1,876
Columns: 10
$ seqn     <dbl> 83733, 83734, 83737, 83750, 83754, 83755, 83757, 83761, 83787…
$ riagendr <fct> Male, Male, Female, Male, Female, Male, Female, Female, Femal…
$ ridageyr <dbl> 53, 78, 72, 45, 67, 67, 57, 24, 68, 66, 56, 37, 20, 24, 80, 7…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, MexicanAmerican, Othe…
$ dmdeduc2 <fct> High school graduate/GED, High school graduate/GED, Grades 9-…
$ dmdmartl <fct> Divorced, Married, Separated, Never married, Married, Widowed…
$ indhhin2 <ord> "$15,000-$19,999", "$20,000-$24,999", "$75,000-$99,999", "$65…
$ bmxbmi   <dbl> 30.8, 28.8, 28.6, 24.1, 43.7, 28.8, 35.4, 25.3, 33.5, 34.0, 2…
$ diq010   <fct> No Diabetes, Diabetes, No Diabetes, No Diabetes, No Diabetes,…
$ lbxglu   <dbl> 101, 84, 107, 84, 130, 284, 398, 95, 111, 113, 397, 100, 94, …

feature_names <- c('riagendr' , 'ridreth1' , 'dmdeduc2' , 'dmdmartl' , 'indhhin2' , 'lbxglu', 'diq010')

feature_names_plus <- paste(feature_names, collapse = ' + ' )

feature_names_plus

[1] "riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + lbxglu + diq010"

formula_1 <- as.formula(paste0('bmxbmi ~ ',feature_names_plus))

formula_1

bmxbmi ~ riagendr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + 
    lbxglu + diq010

17.1.1 WARNING - THIS IS A BAD OPTION

# THIS IS NOT A GREAT IDEA 

options(warn=-1)

# I have this on, there is an expected warning 
## "prediction from a rank-deficient fit may be misleading"
## without this option on the output is very difficult to read

17.2 `caret` Train `glm` Function

Train_Glm_Iteration <- function(data){
  
  TrainInd <- createDataPartition(data$bmxbmi,
                                  p =.7,
                                  list=FALSE)

  TRAIN <- data[TrainInd, ] 
  
  gml_control <- trainControl(
    method = 'cv',
    number = 22,
    preProcOptions = c("zv","corr",'center','scale',"conditionalX")
  )
  
  gml.model <- train(as.formula(formula_1) ,
                     method='glm',
                     data =TRAIN,
                     trControl=gml_control,
                     family = "gaussian"
                     )
  
  
  CoEff <-  as_tibble(gml.model$finalModel$coefficients,
                      rownames="feature") %>%
    rename(coeff = value)
  
  TEST <- data[-TrainInd,]
  
  estimate <- as_tibble(predict(gml.model, TEST,'raw')) %>%
    rename(estimate= value)
  
  TEST.scored <- cbind(TEST, estimate)
  
  RMSE <- TEST.scored %>%
    rmse(truth=bmxbmi , estimate)
  
  return(list(Training_Data = TRAIN,
              gml.model = gml.model,
              CoEff = CoEff,
              TEST.scored =TEST.scored,
              RMSE_TEST = RMSE))
  
}

17.3 Make Samples

17.3.1 SAMPLE 1

Id <- sample(diab_pop$seqn, nrow(diab_pop)*.3, replace=F)
length(Id)

[1] 562

t1 <- diab_pop %>% 
  filter(seqn %in% Id)

dim(t1)

[1] 562  10

X1 <- Train_Glm_Iteration(t1)


str(X1,1)

List of 5
 $ Training_Data:'data.frame':  395 obs. of  10 variables:
  ..- attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  .. ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...
 $ gml.model    :List of 25
  ..- attr(*, "class")= chr [1:2] "train" "train.formula"
 $ CoEff        : tibble [30 × 2] (S3: tbl_df/tbl/data.frame)
 $ TEST.scored  :'data.frame':  167 obs. of  11 variables:
 $ RMSE_TEST    : tibble [1 × 3] (S3: tbl_df/tbl/data.frame)

nrow(X1$Training_Data) + nrow(X1$TEST.scored) == nrow(t1)

[1] TRUE

arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn'))

Compare Object

Function Call: 
arsenal::comparedf(x = X1$Training_Data, y = X1$TEST.scored, 
    by = c("seqn"))

Shared: 9 non-by variables and 0 observations.
Not shared: 1 variables and 562 observations.

Differences found in 0/9 variables compared.
0 variables compared have non-identical attributes.

X1.comparedf <- arsenal::comparedf(X1$Training_Data, X1$TEST.scored, by=c('seqn')) 

sum.X1.comparedf <- summary(X1.comparedf)

sum.X1.comparedf$comparison.summary.table

                                                     statistic value
1                                       Number of by-variables     1
2                         Number of non-by variables in common     9
3                                 Number of variables compared     9
4                           Number of variables in x but not y     0
5                           Number of variables in y but not x     1
6        Number of variables compared with some values unequal     0
7           Number of variables compared with all values equal     9
8                             Number of observations in common     0
9                        Number of observations in x but not y   395
10                       Number of observations in y but not x   167
11 Number of observations with some compared variables unequal     0
12    Number of observations with all compared variables equal     0
13                                    Number of values unequal     0

17.3.1.1 R²

rsq(X1$TEST.scored, 
    truth =bmxbmi , estimate)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard      0.0361

rsq_trad(X1$TEST.scored, 
         truth =bmxbmi , estimate)

# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 rsq_trad standard     -0.0985

summary(lm(formula_1, 
           X1$Training_Data %>% 
             mutate_if(is.numeric , scale)
           ))$r.squared

[1] 0.1793975

17.3.1.2 Adjusted R²

Adjusted_R²

Adj_R2_est <- function(r2_est,
                       num_features,
                       num_records){
  n <- num_records
  p <- num_features
  
  X <- 1-r2_est
  Y <- (n-1)
  Z <- (n-p-1)
  
  adj_r2_est = 1 - X*(Y/Z)
  
  return(adj_r2_est)
}

X1_estimate <- predict(X1$gml.model , X1$Training_Data, "raw")

X1$Training_Data_scored <- cbind(X1$Training_Data, X1_estimate) 

r2_est <- (rsq(X1$Training_Data_scored, 
                    truth =bmxbmi ,
                    X1_estimate))$.estimate

r2_est

[1] 0.1793975

Adj_R2_est(r2_est,
           length(feature_names), 
           nrow(X1$Training_Data))

[1] 0.1645545

summary(lm(formula_1, 
           X1$Training_Data %>% 
             mutate_if(is.numeric ,scale)
           ))$adj.r.squared

[1] 0.1190262

X1.glm <- glm(formula_1, 
              X1$Training_Data %>% mutate_if(is.numeric ,scale),
              family = "gaussian")


#install.packages('rsq')
rsq::rsq(X1.glm, adj=TRUE)

[1] 0.1190262

rsq::rsq(X1.glm, adj=FALSE)

[1] 0.1793975

17.4 SAMPLE 2

Id2 <- sample(diab_pop$seqn, nrow(diab_pop)*.5, replace=F)

t2 <- diab_pop %>% 
  filter(seqn %in% Id2)

X2 <- Train_Glm_Iteration(t2)

17.5 SAMPLE 3 - “black swan”

17.5.1 Income == ‘$75,000-$99,999’ & Gender == ‘Female’ & ridreth1 == ‘Non-Hispanic White’

Swan <- diab_pop %>% 
  filter(indhhin2 == '$75,000-$99,999' & riagendr == 'Female' &  ridreth1 == 'Non-Hispanic White') 

Id3 <- sample(Swan$seqn, nrow(Swan)*.8, replace=F)

t3 <- diab_pop %>% 
  filter(indhhin2 == '$75,000-$99,999' & 
         riagendr == 'Female' &  
         ridreth1 == 'Non-Hispanic White') %>%
  filter(seqn %in% Id3)

t3 %>%
  group_by(indhhin2,riagendr) %>%
  summary(cnt=n_distinct(seqn))

      seqn         riagendr     ridageyr                  ridreth1 
 Min.   :84511   Male  : 0   Min.   :22.0   MexicanAmerican   : 0  
 1st Qu.:86172   Female:29   1st Qu.:35.0   Other Hispanic    : 0  
 Median :88903               Median :47.0   Non-Hispanic White:29  
 Mean   :88572               Mean   :51.1   Non-Hispanic Black: 0  
 3rd Qu.:90745               3rd Qu.:64.0   Other             : 0  
 Max.   :93258               Max.   :80.0                          
                                                                   
                       dmdeduc2                 dmdmartl 
 Less than 9th grade       : 2   Married            :19  
 Grades 9-11th             : 1   Widowed            : 3  
 High school graduate/GED  : 2   Divorced           : 2  
 Some college or AA degrees:13   Separated          : 0  
 College grad or above     :11   Never married      : 2  
                                 Living with partner: 3  
                                                         
              indhhin2      bmxbmi             diq010       lbxglu     
 $75,000-$99,999  :29   Min.   :16.7   Diabetes   : 3   Min.   : 83.0  
 $0-$4,999        : 0   1st Qu.:25.3   No Diabetes:26   1st Qu.: 93.0  
 $5,000-$9,999    : 0   Median :28.9                    Median :100.0  
 $10,000-$14,999  : 0   Mean   :32.0                    Mean   :105.3  
 $15,000-$19,999  : 0   3rd Qu.:39.3                    3rd Qu.:107.0  
 less than $20,000: 0   Max.   :63.6                    Max.   :207.0  
 (Other)          : 0

X3 <- Train_Glm_Iteration(t3)

17.6 SAMPLE 4

Id4 <- sample(diab_pop$seqn, nrow(diab_pop)*.9, replace=F)

t4 <- diab_pop %>% 
  filter(seqn %in% Id4)

X4 <- Train_Glm_Iteration(t4)

17.7 SAMPLE 5

M_union <- union(Id2,Id3)

Id5 <- setdiff(diab_pop$seqn, M_union)


t5 <- diab_pop %>% 
  filter(seqn %in% Id5)


X5 <- Train_Glm_Iteration(t5)

17.7.1 Compare SAMPLE 1 to SAMPLE 5

str(X2$Training_Data)

'data.frame':   659 obs. of  10 variables:
 $ seqn    : num  83737 83755 83761 83813 83823 ...
 $ riagendr: Factor w/ 2 levels "Male","Female": 2 1 2 1 2 2 1 2 1 1 ...
 $ ridageyr: num  72 67 24 24 29 49 41 38 55 51 ...
 $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 1 4 5 3 1 3 4 5 3 4 ...
 $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 2 5 5 4 1 3 4 4 4 3 ...
 $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 4 2 5 3 5 1 1 1 1 1 ...
 $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 13 7 1 8 3 14 14 13 10 8 ...
 $ bmxbmi  : num  28.6 28.8 25.3 26.9 29.7 27.4 40.7 21.8 25 24.7 ...
 $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 2 1 2 2 2 2 2 2 2 2 ...
 $ lbxglu  : num  107 284 95 105 102 126 110 89 109 102 ...
 - attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...

str(X3$Training_Data)

'data.frame':   22 obs. of  10 variables:
 $ seqn    : num  84511 84517 84816 85093 85920 ...
 $ riagendr: Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 2 2 2 2 ...
 $ ridageyr: num  78 80 28 61 61 33 35 45 73 68 ...
 $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 4 4 2 5 1 5 4 5 4 ...
 $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 2 2 1 3 1 1 1 1 1 6 ...
 $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 13 13 13 13 13 13 13 13 13 13 ...
 $ bmxbmi  : num  23.1 26.6 18.4 36.2 42.7 37.7 28.9 39.4 28.3 29.6 ...
 $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 2 2 2 2 2 2 2 2 ...
 $ lbxglu  : num  99 83 93 92 108 101 100 207 105 107 ...
 - attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...

str(X5$Training_Data)

'data.frame':   648 obs. of  10 variables:
 $ seqn    : num  83733 83734 83787 83789 83790 ...
 $ riagendr: Factor w/ 2 levels "Male","Female": 1 1 2 1 1 2 2 2 1 1 ...
 $ ridageyr: num  53 78 68 66 56 20 80 20 40 54 ...
 $ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 1 3 3 4 2 4 4 1 ...
 $ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 3 3 1 5 1 3 1 4 4 2 ...
 $ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 3 1 3 6 1 5 2 5 5 1 ...
 $ indhhin2: Ord.factor w/ 14 levels "$0-$4,999"<"$5,000-$9,999"<..: 4 7 4 12 4 13 3 8 13 7 ...
 $ bmxbmi  : num  30.8 28.8 33.5 34 24.4 26.2 28.5 22.2 30.7 30.2 ...
 $ diq010  : Factor w/ 2 levels "Diabetes","No Diabetes": 2 1 2 2 2 2 2 2 2 2 ...
 $ lbxglu  : num  101 84 111 113 397 94 119 80 90 99 ...
 - attr(*, "na.action")= 'omit' Named int [1:3843] 1 4 5 7 8 9 10 12 16 18 ...
  ..- attr(*, "names")= chr [1:3843] "1" "4" "5" "7" ...

arsenal::comparedf(X3$Training_Data,
                   X5$Training_Data)

Compare Object

Function Call: 
arsenal::comparedf(x = X3$Training_Data, y = X5$Training_Data)

Shared: 10 non-by variables and 22 observations.
Not shared: 0 variables and 626 observations.

Differences found in 10/10 variables compared.
0 variables compared have non-identical attributes.

17.8 Compare Coefficents across all samples

CoEff_compare <- bind_rows(X1$CoEff %>% mutate(strat = 't1'),
          X2$CoEff %>% mutate(strat = 't2'),
          X3$CoEff %>% mutate(strat = 't3'),
          X4$CoEff %>% mutate(strat = 't4'),
          X5$CoEff %>% mutate(strat = 't5'))


glimpse(CoEff_compare)

Rows: 150
Columns: 3
$ feature <chr> "(Intercept)", "riagendrFemale", "`ridreth1Other Hispanic`", "…
$ coeff   <dbl> 30.382340838, 1.472028677, 0.536913847, -1.159127856, 1.857424…
$ strat   <chr> "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t1", "t…

library('ggplot2')

CoEff_compare %>%
  group_by(strat) %>%
  ggplot(aes(x=feature, y=coeff)) +
  geom_point() + 
  coord_flip() +
  facet_wrap(.~strat)

CoEff_compare %>%
  ggplot(aes(x=feature, y=coeff)) +
  geom_boxplot() + 
  coord_flip()

RMSE <- bind_rows(X1$RMSE_TEST %>% mutate(strat = 't1'),
                  X2$RMSE_TEST %>% mutate(strat = 't2'),
                  X3$RMSE_TEST %>% mutate(strat = 't3'),
                  X4$RMSE_TEST %>% mutate(strat = 't4'),
                  X5$RMSE_TEST %>% mutate(strat = 't5'))

RMSE

# A tibble: 5 × 4
  .metric .estimator .estimate strat
  <chr>   <chr>          <dbl> <chr>
1 rmse    standard        6.70 t1   
2 rmse    standard        7.03 t2   
3 rmse    standard       13.7  t3   
4 rmse    standard        6.50 t4   
5 rmse    standard        7.46 t5

mean(RMSE$.estimate)

[1] 8.284141

var(RMSE$.estimate)

[1] 9.374032

17.9 A Closer Look at SAMPLE 2

f2 <- diab_pop %>% 
  anti_join(t2 %>% select(seqn))

Joining with `by = join_by(seqn)`

nrow(diab_pop) #1876

[1] 1876

nrow(t2) #938

[1] 938

nrow(f2) #938

[1] 938

arsenal::comparedf(t2,f2,by='seqn')

Compare Object

Function Call: 
arsenal::comparedf(x = t2, y = f2, by = "seqn")

Shared: 9 non-by variables and 0 observations.
Not shared: 0 variables and 1876 observations.

Differences found in 0/9 variables compared.
0 variables compared have non-identical attributes.

Test2.estimate <- predict(X2$gml.model, f2)

Test2.Scored <- cbind(f2,Test2.estimate)

Test2.Scored %>%
  rmse(truth=bmxbmi, Test2.estimate)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        6.77

Test2.Scored %>%
  group_by(indhhin2) %>%
  rmse(truth=bmxbmi, Test2.estimate)

# A tibble: 12 × 4
   indhhin2          .metric .estimator .estimate
   <ord>             <chr>   <chr>          <dbl>
 1 $0-$4,999         rmse    standard        8.42
 2 $5,000-$9,999     rmse    standard        6.34
 3 $10,000-$14,999   rmse    standard        5.90
 4 $15,000-$19,999   rmse    standard        5.70
 5 less than $20,000 rmse    standard        5.78
 6 20,000+           rmse    standard        6.33
 7 $20,000-$24,999   rmse    standard        7.22
 8 $25,000-$34,999   rmse    standard        7.96
 9 $45,000-$54,999   rmse    standard        7.33
10 $65,000-$74,999   rmse    standard        6.80
11 $75,000-$99,999   rmse    standard        6.66
12 $100,000+         rmse    standard        6.12

17.10 Review SAMPLE 3

17.10.1 f5 data that model 3 has never seen

f5 <- diab_pop %>% 
  anti_join(t5 %>% select(seqn))

Joining with `by = join_by(seqn)`

nrow(diab_pop) #1876

[1] 1876

nrow(t5)

[1] 923

nrow(f5)

[1] 953

arsenal::comparedf(t5,f5,by='seqn')

Compare Object

Function Call: 
arsenal::comparedf(x = t5, y = f5, by = "seqn")

Shared: 9 non-by variables and 0 observations.
Not shared: 0 variables and 1876 observations.

Differences found in 0/9 variables compared.
0 variables compared have non-identical attributes.

Test3.estimate <- predict(X3$gml.model, f5)
Test3.Scored <- cbind(f5, Test3.estimate)

17.10.2 Score Model 2 on data on f5

arsenal::comparedf(t2,f5,by='seqn')

Compare Object

Function Call: 
arsenal::comparedf(x = t2, y = f5, by = "seqn")

Shared: 9 non-by variables and 938 observations.
Not shared: 0 variables and 15 observations.

Differences found in 0/9 variables compared.
0 variables compared have non-identical attributes.

Test3.estimate <- predict(X2$gml.model, f5)
Test3.Scored.2 <- cbind(f5, Test3.estimate)

17.11 Now we’re going to Compare model 2 and model 3

Test3.Scored.Stack <- rbind(Test3.Scored %>% mutate(strat=3), 
                            Test3.Scored.2 %>% mutate(strat=2))

Test3.Scored.Stack %>%
  group_by(strat) %>%
  rmse(truth=bmxbmi, Test3.estimate)

# A tibble: 2 × 4
  strat .metric .estimator .estimate
  <dbl> <chr>   <chr>          <dbl>
1     2 rmse    standard        6.70
2     3 rmse    standard       21.6

17.11.1 Error rates vary by class

Error_Rates_by_model_by_class <- Test3.Scored.Stack %>%
  mutate(strat =  case_when(
    strat ==3 ~ "black_swan",
    strat ==2 ~ "random"
  )) %>%
  group_by(strat, indhhin2, diq010, riagendr) %>%
  rmse(truth=bmxbmi, Test3.estimate) %>%
  arrange(desc(.estimate)) %>%
  rename(RMSE_est = .estimate)

Error_Rates_by_model_by_class

# A tibble: 96 × 7
   strat      indhhin2          diq010      riagendr .metric .estimator RMSE_est
   <chr>      <ord>             <fct>       <fct>    <chr>   <chr>         <dbl>
 1 black_swan less than $20,000 Diabetes    Male     rmse    standard       62.8
 2 black_swan $15,000-$19,999   Diabetes    Female   rmse    standard       59.5
 3 black_swan 20,000+           Diabetes    Male     rmse    standard       53.6
 4 black_swan $20,000-$24,999   Diabetes    Male     rmse    standard       37.0
 5 black_swan $0-$4,999         Diabetes    Female   rmse    standard       36.1
 6 black_swan $10,000-$14,999   Diabetes    Male     rmse    standard       31.4
 7 black_swan $15,000-$19,999   Diabetes    Male     rmse    standard       30.5
 8 black_swan $25,000-$34,999   Diabetes    Female   rmse    standard       29.1
 9 black_swan $65,000-$74,999   Diabetes    Female   rmse    standard       28.9
10 black_swan $0-$4,999         No Diabetes Female   rmse    standard       27.6
# ℹ 86 more rows

summary(Error_Rates_by_model_by_class$RMSE_est)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.662   6.025  12.768  15.615  22.603  62.845

options(warn=0)

17.11.2 Plot The Data

Error_Rates_by_model_by_class %>%
  group_by(strat, diq010, indhhin2, riagendr) %>%
  ggplot(aes(x=strat, 
             y=RMSE_est, 
             fill=indhhin2,
             label=diq010)) +
  geom_bar(stat = "identity",
           position = "dodge") + 
  coord_flip() +
  facet_wrap( ~ diq010 + riagendr)

Test3.Scored.Stack %>%
  mutate(strat =  case_when(
    strat ==3 ~ "black_swan",
    strat ==2 ~ "random"
  )) %>%
  group_by(strat, indhhin2, diq010, ridreth1) %>%
  rmse(truth=bmxbmi, Test3.estimate) %>%
  arrange(desc(.estimate)) %>%
  rename(RMSE_est = .estimate) %>%
  group_by(strat, diq010, indhhin2, ridreth1) %>%
  ggplot(aes(x=strat, 
             y=RMSE_est, 
             fill=indhhin2,
             label=diq010)) +
  geom_bar(stat = "identity",
           position = "dodge") + 
  coord_flip() +
  facet_wrap( ~ ridreth1 + diq010)

17.1 STEP UP

17.1.1 WARNING - THIS IS A BAD OPTION

17.2 caret Train glm Function

17.3 Make Samples

17.3.1 SAMPLE 1

17.3.1.1 R2

17.3.1.2 Adjusted R2

17.4 SAMPLE 2

17.5 SAMPLE 3 - “black swan”

17.5.1 Income == ‘$75,000-$99,999’ & Gender == ‘Female’ & ridreth1 == ‘Non-Hispanic White’

17.6 SAMPLE 4

17.7 SAMPLE 5

17.7.1 Compare SAMPLE 1 to SAMPLE 5

17.8 Compare Coefficents across all samples

17.9 A Closer Look at SAMPLE 2

17.10 Review SAMPLE 3

17.10.1 f5 data that model 3 has never seen

17.10.2 Score Model 2 on data on f5

17.11 Now we’re going to Compare model 2 and model 3

17.11.1 Error rates vary by class

17.11.2 Plot The Data

17.2 `caret` Train `glm` Function

17.3.1.1 R²

17.3.1.2 Adjusted R²