12  Logistic Regression with tidymodels

12.1 NHANES Dataset

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("NHANES")

data <- NHANES %>% 
  select(-DiabetesAge) %>%
  filter(!is.na(Diabetes)) 

dim(data)
[1] 9858   75
glimpse(data)
Rows: 9,858
Columns: 75
$ ID               <int> 51624, 51624, 51624, 51625, 51630, 51638, 51646, 5164…
$ SurveyYr         <fct> 2009_10, 2009_10, 2009_10, 2009_10, 2009_10, 2009_10,…
$ Gender           <fct> male, male, male, male, female, male, male, female, f…
$ Age              <int> 34, 34, 34, 4, 49, 9, 8, 45, 45, 45, 66, 58, 54, 10, …
$ AgeDecade        <fct>  30-39,  30-39,  30-39,  0-9,  40-49,  0-9,  0-9,  40…
$ AgeMonths        <int> 409, 409, 409, 49, 596, 115, 101, 541, 541, 541, 795,…
$ Race1            <fct> White, White, White, Other, White, White, White, Whit…
$ Race3            <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ Education        <fct> High School, High School, High School, NA, Some Colle…
$ MaritalStatus    <fct> Married, Married, Married, NA, LivePartner, NA, NA, M…
$ HHIncome         <fct> 25000-34999, 25000-34999, 25000-34999, 20000-24999, 3…
$ HHIncomeMid      <int> 30000, 30000, 30000, 22500, 40000, 87500, 60000, 8750…
$ Poverty          <dbl> 1.36, 1.36, 1.36, 1.07, 1.91, 1.84, 2.33, 5.00, 5.00,…
$ HomeRooms        <int> 6, 6, 6, 9, 5, 6, 7, 6, 6, 6, 5, 10, 6, 10, 10, 4, 3,…
$ HomeOwn          <fct> Own, Own, Own, Own, Rent, Rent, Own, Own, Own, Own, O…
$ Work             <fct> NotWorking, NotWorking, NotWorking, NA, NotWorking, N…
$ Weight           <dbl> 87.4, 87.4, 87.4, 17.0, 86.7, 29.8, 35.2, 75.7, 75.7,…
$ Length           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ HeadCirc         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ Height           <dbl> 164.7, 164.7, 164.7, 105.4, 168.4, 133.1, 130.6, 166.…
$ BMI              <dbl> 32.22, 32.22, 32.22, 15.30, 30.57, 16.82, 20.64, 27.2…
$ BMICatUnder20yrs <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ BMI_WHO          <fct> 30.0_plus, 30.0_plus, 30.0_plus, 12.0_18.5, 30.0_plus…
$ Pulse            <int> 70, 70, 70, NA, 86, 82, 72, 62, 62, 62, 60, 62, 76, 8…
$ BPSysAve         <int> 113, 113, 113, NA, 112, 86, 107, 118, 118, 118, 111, …
$ BPDiaAve         <int> 85, 85, 85, NA, 75, 47, 37, 64, 64, 64, 63, 74, 85, 6…
$ BPSys1           <int> 114, 114, 114, NA, 118, 84, 114, 106, 106, 106, 124, …
$ BPDia1           <int> 88, 88, 88, NA, 82, 50, 46, 62, 62, 62, 64, 76, 86, 6…
$ BPSys2           <int> 114, 114, 114, NA, 108, 84, 108, 118, 118, 118, 108, …
$ BPDia2           <int> 88, 88, 88, NA, 74, 50, 36, 68, 68, 68, 62, 72, 88, 6…
$ BPSys3           <int> 112, 112, 112, NA, 116, 88, 106, 118, 118, 118, 114, …
$ BPDia3           <int> 82, 82, 82, NA, 76, 44, 38, 60, 60, 60, 64, 76, 82, 7…
$ Testosterone     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ DirectChol       <dbl> 1.29, 1.29, 1.29, NA, 1.16, 1.34, 1.55, 2.12, 2.12, 2…
$ TotChol          <dbl> 3.49, 3.49, 3.49, NA, 6.70, 4.86, 4.09, 5.82, 5.82, 5…
$ UrineVol1        <int> 352, 352, 352, NA, 77, 123, 238, 106, 106, 106, 113, …
$ UrineFlow1       <dbl> NA, NA, NA, NA, 0.094, 1.538, 1.322, 1.116, 1.116, 1.…
$ UrineVol2        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ UrineFlow2       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ Diabetes         <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N…
$ HealthGen        <fct> Good, Good, Good, NA, Good, NA, NA, Vgood, Vgood, Vgo…
$ DaysPhysHlthBad  <int> 0, 0, 0, NA, 0, NA, NA, 0, 0, 0, 10, 0, 4, NA, NA, 0,…
$ DaysMentHlthBad  <int> 15, 15, 15, NA, 10, NA, NA, 3, 3, 3, 0, 0, 0, NA, NA,…
$ LittleInterest   <fct> Most, Most, Most, NA, Several, NA, NA, None, None, No…
$ Depressed        <fct> Several, Several, Several, NA, Several, NA, NA, None,…
$ nPregnancies     <int> NA, NA, NA, NA, 2, NA, NA, 1, 1, 1, NA, NA, NA, NA, N…
$ nBabies          <int> NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Age1stBaby       <int> NA, NA, NA, NA, 27, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ SleepHrsNight    <int> 4, 4, 4, NA, 8, NA, NA, 8, 8, 8, 7, 5, 4, NA, 5, 7, N…
$ SleepTrouble     <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, No, No, Y…
$ PhysActive       <fct> No, No, No, NA, No, NA, NA, Yes, Yes, Yes, Yes, Yes, …
$ PhysActiveDays   <int> NA, NA, NA, NA, NA, NA, NA, 5, 5, 5, 7, 5, 1, NA, 2, …
$ TVHrsDay         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ CompHrsDay       <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ TVHrsDayChild    <int> NA, NA, NA, 4, NA, 5, 1, NA, NA, NA, NA, NA, NA, 4, N…
$ CompHrsDayChild  <int> NA, NA, NA, 1, NA, 0, 6, NA, NA, NA, NA, NA, NA, 3, N…
$ Alcohol12PlusYr  <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, Yes, Y…
$ AlcoholDay       <int> NA, NA, NA, NA, 2, NA, NA, 3, 3, 3, 1, 2, 6, NA, NA, …
$ AlcoholYear      <int> 0, 0, 0, NA, 20, NA, NA, 52, 52, 52, 100, 104, 364, N…
$ SmokeNow         <fct> No, No, No, NA, Yes, NA, NA, NA, NA, NA, No, NA, NA, …
$ Smoke100         <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, Yes, No, …
$ Smoke100n        <fct> Smoker, Smoker, Smoker, NA, Smoker, NA, NA, Non-Smoke…
$ SmokeAge         <int> 18, 18, 18, NA, 38, NA, NA, NA, NA, NA, 13, NA, NA, N…
$ Marijuana        <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, NA, Ye…
$ AgeFirstMarij    <int> 17, 17, 17, NA, 18, NA, NA, 13, 13, 13, NA, 19, 15, N…
$ RegularMarij     <fct> No, No, No, NA, No, NA, NA, No, No, No, NA, Yes, Yes,…
$ AgeRegMarij      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20, 15, N…
$ HardDrugs        <fct> Yes, Yes, Yes, NA, Yes, NA, NA, No, No, No, No, Yes, …
$ SexEver          <fct> Yes, Yes, Yes, NA, Yes, NA, NA, Yes, Yes, Yes, Yes, Y…
$ SexAge           <int> 16, 16, 16, NA, 12, NA, NA, 13, 13, 13, 17, 22, 12, N…
$ SexNumPartnLife  <int> 8, 8, 8, NA, 10, NA, NA, 20, 20, 20, 15, 7, 100, NA, …
$ SexNumPartYear   <int> 1, 1, 1, NA, 1, NA, NA, 0, 0, 0, NA, 1, 1, NA, NA, 1,…
$ SameSex          <fct> No, No, No, NA, Yes, NA, NA, Yes, Yes, Yes, No, No, N…
$ SexOrientation   <fct> Heterosexual, Heterosexual, Heterosexual, NA, Heteros…
$ PregnantNow      <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

12.2 Missing data

data |> 
  as.data.frame() |> 
  Amelia::missmap()

SumNa <- function(col){sum(is.na(col))}

data.sum <- data %>% 
    summarise_all(SumNa) %>%
    tidyr::gather(key='feature', value='SumNa') %>%
    arrange(-SumNa) %>%
    mutate(PctNa = SumNa/nrow(data))

data.sum
# A tibble: 75 × 3
   feature          SumNa PctNa
   <chr>            <int> <dbl>
 1 HeadCirc          9858 1    
 2 Length            9452 0.959
 3 TVHrsDayChild     9205 0.934
 4 CompHrsDayChild   9205 0.934
 5 BMICatUnder20yrs  8587 0.871
 6 AgeRegMarij       8492 0.861
 7 UrineFlow2        8382 0.850
 8 UrineVol2         8380 0.850
 9 PregnantNow       8162 0.828
10 Age1stBaby        7976 0.809
# ℹ 65 more rows
data.sum2 <- data.sum %>% 
  filter(! (feature %in% c('ID','Diabetes'))) %>%
  filter(PctNa < .45)


data.sum2$feature
 [1] "SexAge"          "SexNumPartnLife" "HardDrugs"       "SexEver"        
 [5] "SameSex"         "AlcoholYear"     "Alcohol12PlusYr" "LittleInterest" 
 [9] "Depressed"       "Education"       "MaritalStatus"   "Smoke100"       
[13] "Smoke100n"       "DaysPhysHlthBad" "DaysMentHlthBad" "HealthGen"      
[17] "SleepHrsNight"   "Work"            "SleepTrouble"    "BPSys1"         
[21] "BPDia1"          "PhysActive"      "BPSys2"          "BPDia2"         
[25] "BPSys3"          "BPDia3"          "UrineFlow1"      "DirectChol"     
[29] "TotChol"         "BPSysAve"        "BPDiaAve"        "Pulse"          
[33] "UrineVol1"       "HHIncome"        "HHIncomeMid"     "Poverty"        
[37] "AgeDecade"       "BMI_WHO"         "BMI"             "Height"         
[41] "Weight"          "HomeRooms"       "HomeOwn"         "SurveyYr"       
[45] "Gender"          "Age"             "Race1"          
data_F <- data %>% 
  select(ID, Diabetes, data.sum2$feature) %>%
  filter(!is.na(Diabetes))

data_F |>
  as.data.frame() |>
  Amelia::missmap()

features <- colnames(data_F)[!colnames(data_F) %in% c('diq010', 'seqn')]
features_plus <- paste0(features, collapse = " + ")
my_formula <- paste0("Diabetes ~ ", features_plus) |> as.formula()

my_formula
Diabetes ~ ID + Diabetes + SexAge + SexNumPartnLife + HardDrugs + 
    SexEver + SameSex + AlcoholYear + Alcohol12PlusYr + LittleInterest + 
    Depressed + Education + MaritalStatus + Smoke100 + Smoke100n + 
    DaysPhysHlthBad + DaysMentHlthBad + HealthGen + SleepHrsNight + 
    Work + SleepTrouble + BPSys1 + BPDia1 + PhysActive + BPSys2 + 
    BPDia2 + BPSys3 + BPDia3 + UrineFlow1 + DirectChol + TotChol + 
    BPSysAve + BPDiaAve + Pulse + UrineVol1 + HHIncome + HHIncomeMid + 
    Poverty + AgeDecade + BMI_WHO + BMI + Height + Weight + HomeRooms + 
    HomeOwn + SurveyYr + Gender + Age + Race1

12.2.1 Install if not Function

install_if_not <- function( list.of.packages ) {
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) { install.packages(new.packages) } else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}
install_if_not('tidymodels')
[1] "the package 'tidymodels' is already installed"
splits <- initial_split(data,
                        prop = 3/4)
set.seed(1001)
diab_folds <- vfold_cv(training(splits), v = 10)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.6      ✔ recipes      1.0.10
✔ dials        1.2.1      ✔ tune         1.2.1 
✔ infer        1.0.7      ✔ workflows    1.1.4 
✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Learn how to get started at https://www.tidymodels.org/start/
glmnet_rec <- recipe(my_formula, data = training(splits)) %>% 
  step_impute_knn(all_predictors(), neighbors = 3) %>%
  step_range(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) 
glmnet_rec %>% 
  prep() %>% 
  bake(new_data = NULL) |>
  as.data.frame() |>
  Amelia::missmap()

diab_folds$splits[[1]] %>%  analysis()
# A tibble: 6,653 × 75
      ID SurveyYr Gender   Age AgeDecade AgeMonths Race1   Race3 Education  
   <int> <fct>    <fct>  <int> <fct>         <int> <fct>   <fct> <fct>      
 1 66165 2011_12  male      24 " 20-29"         NA White   White High School
 2 60907 2009_10  male       3 " 0-9"           44 Other   <NA>  <NA>       
 3 61328 2009_10  female    25 " 20-29"        307 White   <NA>  High School
 4 53753 2009_10  male      35 " 30-39"        420 White   <NA>  High School
 5 69857 2011_12  male      16 " 10-19"         NA Other   Other <NA>       
 6 53512 2009_10  female    10 " 10-19"        129 White   <NA>  <NA>       
 7 52908 2009_10  female    12 " 10-19"        153 Mexican <NA>  <NA>       
 8 65664 2011_12  female    13 " 10-19"         NA White   White <NA>       
 9 57835 2009_10  female    18 " 10-19"        227 White   <NA>  <NA>       
10 67584 2011_12  male      38 " 30-39"         NA Black   Black High School
# ℹ 6,643 more rows
# ℹ 66 more variables: MaritalStatus <fct>, HHIncome <fct>, HHIncomeMid <int>,
#   Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>, Work <fct>, Weight <dbl>,
#   Length <dbl>, HeadCirc <dbl>, Height <dbl>, BMI <dbl>,
#   BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>, BPSysAve <int>,
#   BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>,
#   BPSys3 <int>, BPDia3 <int>, Testosterone <dbl>, DirectChol <dbl>, …
glmnet_rec %>% 
  prep() %>% 
  bake(new_data = diab_folds$splits[[1]] %>% analysis()) |>
  as.data.frame() |>
  Amelia::missmap()

glmnet_spec <- logistic_reg(penalty = tune() ,
                             mixture = tune()) %>% 
  set_engine("glmnet") %>% 
  set_mode("classification")
glmnet_wflow <- workflow() %>% 
  add_model(glmnet_spec) %>% 
  add_recipe(glmnet_rec)
glmnet_param <- glmnet_wflow %>% 
  extract_parameter_set_dials()

The mixture parameter can bet set between 0 and 1 where: - ridge model (mixture = alpha = 0) - lasso model (mixture = alpha = 1)

start_grid <- extract_parameter_set_dials(glmnet_spec) %>% 
  grid_latin_hypercube(size = 25, original = TRUE)
  
#expand.grid(
#  penalty = seq(10^(-10), 10^0, length.out = 5),
#  mixture = seq(0, 1, length.out =5)
#)
# All operating systems
library(doParallel)
Loading required package: foreach

Attaching package: 'foreach'
The following objects are masked from 'package:purrr':

    accumulate, when
Loading required package: iterators
Loading required package: parallel
tic()

glmnet_initial <- glmnet_wflow %>% 
  tune_grid(resamples = diab_folds, 
            grid = start_grid, 
            metrics = metric_set(yardstick::pr_auc))

toc(log = TRUE, quiet = FALSE)
431.6 sec elapsed
collect_metrics(glmnet_initial) %>%
  ggplot(aes(x = penalty, y=mixture, z = mean)) + 
  geom_contour_filled() +
  scale_fill_gradient(low = "green", high = "red")
Warning: `stat_contour()`: Zero contours were generated
Warning in min(x): no non-missing arguments to min; returning Inf
Warning in max(x): no non-missing arguments to max; returning -Inf

tic()

glmnet_tune <- glmnet_wflow %>%
  tune_bayes(
    resamples = diab_folds,
    metrics = metric_set(yardstick::pr_auc),
    initial = glmnet_initial,
    param_info = glmnet_param,
    iter = 25,
    control = control_bayes(verbose = TRUE)
  )
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
i Gaussian process model
✓ Gaussian process model
i Generating 5000 candidates
i Predicted candidates
i Estimating performance
✓ Estimating performance
! No improvement for 10 iterations; returning current results.
toc(log = TRUE, quiet = FALSE)
1028.25 sec elapsed
collect_metrics(glmnet_tune) %>%
  knitr::kable()
penalty mixture .metric .estimator mean n std_err .config .iter
0.0001604 0.0534481 pr_auc binary 0.9867551 10 0.0009349 Preprocessor1_Model01 0
0.0000001 0.1201741 pr_auc binary 0.9867993 10 0.0009181 Preprocessor1_Model02 0
0.0000000 0.1603847 pr_auc binary 0.9868346 10 0.0009108 Preprocessor1_Model03 0
0.0000108 0.1861176 pr_auc binary 0.9867873 10 0.0008943 Preprocessor1_Model04 0
0.0000000 0.2073130 pr_auc binary 0.9868743 10 0.0008858 Preprocessor1_Model05 0
0.0013584 0.2660645 pr_auc binary 0.9866762 10 0.0009631 Preprocessor1_Model06 0
0.0000000 0.3090274 pr_auc binary 0.9868466 10 0.0009000 Preprocessor1_Model07 0
0.0002595 0.3357708 pr_auc binary 0.9867741 10 0.0009520 Preprocessor1_Model08 0
0.0000455 0.3824599 pr_auc binary 0.9868250 10 0.0009116 Preprocessor1_Model09 0
0.0000000 0.4275603 pr_auc binary 0.9868424 10 0.0009075 Preprocessor1_Model10 0
0.0000000 0.4351937 pr_auc binary 0.9868333 10 0.0009071 Preprocessor1_Model11 0
0.0000053 0.4703896 pr_auc binary 0.9869031 10 0.0008966 Preprocessor1_Model12 0
0.0000229 0.5329401 pr_auc binary 0.9868057 10 0.0009139 Preprocessor1_Model13 0
0.3591664 0.5746618 pr_auc binary 0.9617886 10 0.0011877 Preprocessor1_Model14 0
0.0017089 0.6138430 pr_auc binary 0.9865322 10 0.0009730 Preprocessor1_Model15 0
0.0189272 0.6266940 pr_auc binary 0.9856033 10 0.0007715 Preprocessor1_Model16 0
0.0313534 0.6693337 pr_auc binary 0.9846103 10 0.0007117 Preprocessor1_Model17 0
0.0000000 0.7321192 pr_auc binary 0.9867915 10 0.0009167 Preprocessor1_Model18 0
0.0000000 0.7426910 pr_auc binary 0.9868037 10 0.0009143 Preprocessor1_Model19 0
0.6246937 0.8056136 pr_auc binary 0.9617886 10 0.0011877 Preprocessor1_Model20 0
0.0000005 0.8389666 pr_auc binary 0.9868221 10 0.0009010 Preprocessor1_Model21 0
0.0000012 0.8482231 pr_auc binary 0.9869230 10 0.0009060 Preprocessor1_Model22 0
0.1050023 0.8867248 pr_auc binary 0.9617886 10 0.0011877 Preprocessor1_Model23 0
0.0000002 0.9497602 pr_auc binary 0.9868720 10 0.0008986 Preprocessor1_Model24 0
0.0090437 0.9744071 pr_auc binary 0.9859341 10 0.0008247 Preprocessor1_Model25 0
0.0000024 0.1528006 pr_auc binary 0.9868094 10 0.0009256 Iter1 1
0.0000000 0.5857375 pr_auc binary 0.9866851 10 0.0009106 Iter2 2
0.0005858 0.3685729 pr_auc binary 0.9866868 10 0.0009407 Iter3 3
0.0000833 0.1050921 pr_auc binary 0.9868059 10 0.0008620 Iter4 4
0.0036980 0.0878247 pr_auc binary 0.9864247 10 0.0009972 Iter5 5
0.0000000 0.7287737 pr_auc binary 0.9866792 10 0.0009409 Iter6 6
0.0000000 0.2582396 pr_auc binary 0.9868311 10 0.0008817 Iter7 7
0.0000000 0.6822934 pr_auc binary 0.9866481 10 0.0009139 Iter8 8
0.0000000 0.8099529 pr_auc binary 0.9868063 10 0.0008905 Iter9 9
0.0000008 0.3358433 pr_auc binary 0.9867329 10 0.0009258 Iter10 10
collect_metrics(glmnet_tune) %>%
  ggplot(aes(x = penalty, y=mixture, z = mean)) + 
  geom_contour_filled() +
  scale_fill_gradient(low = "green", high = "red")
Warning: `stat_contour()`: Zero contours were generated
Warning in min(x): no non-missing arguments to min; returning Inf
Warning in max(x): no non-missing arguments to max; returning -Inf

autoplot(glmnet_tune, type = "performance")

autoplot(glmnet_tune, type = "parameters")

best_params <- select_best(glmnet_tune) %>%
  glimpse()
Warning in select_best(glmnet_tune): No value of `metric` was given; "pr_auc"
will be used.
Rows: 1
Columns: 3
$ penalty <dbl> 1.154583e-06
$ mixture <dbl> 0.8482231
$ .config <chr> "Preprocessor1_Model22"
glmnet_spec_final <- tibble(
    penalty = best_params$penalty,
    mixture = best_params$mixture
  )
#update(glmnet_spec,
                      #penalty = best_params$penalty,
                      #mixture = best_params$mixture)
glmnet_spec_final 
# A tibble: 1 × 2
     penalty mixture
       <dbl>   <dbl>
1 0.00000115   0.848
final_glmnet_wflow <- glmnet_wflow %>% 
  finalize_workflow(glmnet_spec_final)
final_glmnet_fit <- final_glmnet_wflow %>% 
  fit(training(splits))
final_glmnet_fit
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
3 Recipe Steps

• step_impute_knn()
• step_range()
• step_dummy()

── Model ───────────────────────────────────────────────────────────────────────

Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~0.84822312461352) 

   Df  %Dev   Lambda
1   0  0.00 0.092040
2   1  2.24 0.083860
3   1  4.18 0.076410
4   1  5.85 0.069620
5   1  7.30 0.063440
6   2  9.02 0.057800
7   2 10.66 0.052670
8   2 12.08 0.047990
9   3 13.44 0.043730
10  3 14.77 0.039840
11  5 15.96 0.036300
12  6 17.30 0.033080
13  6 18.45 0.030140
14  6 19.42 0.027460
15  6 20.24 0.025020
16  9 21.04 0.022800
17 10 21.77 0.020770
18 11 22.44 0.018930
19 13 23.23 0.017250
20 13 23.92 0.015710
21 14 24.53 0.014320
22 15 25.12 0.013050
23 15 25.63 0.011890
24 15 26.07 0.010830
25 17 26.46 0.009869
26 18 26.84 0.008992
27 20 27.19 0.008193
28 21 27.52 0.007465
29 23 27.83 0.006802
30 25 28.13 0.006198
31 29 28.43 0.005647
32 35 28.77 0.005146
33 37 29.10 0.004688
34 39 29.39 0.004272
35 40 29.66 0.003892
36 41 29.89 0.003547
37 43 30.09 0.003232
38 45 30.28 0.002945
39 47 30.47 0.002683
40 49 30.63 0.002445
41 51 30.79 0.002227
42 52 30.93 0.002030
43 52 31.06 0.001849
44 53 31.16 0.001685
45 55 31.26 0.001535
46 57 31.35 0.001399

...
and 41 more lines.
final_glmnet_fit %>% 
  # This returns the parsnip object:
  extract_fit_parsnip() %>% 
  # Now tidy the linear model object:
  tidy() 

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Loaded glmnet 4.1-8
# A tibble: 84 × 3
   term            estimate    penalty
   <chr>              <dbl>      <dbl>
 1 (Intercept)      -8.09   0.00000115
 2 ID                0.447  0.00000115
 3 SexAge           -0.382  0.00000115
 4 SexNumPartnLife  -0.187  0.00000115
 5 AlcoholYear      -0.795  0.00000115
 6 DaysPhysHlthBad   0.306  0.00000115
 7 DaysMentHlthBad   0.372  0.00000115
 8 SleepHrsNight     0.573  0.00000115
 9 BPSys1           -0.0623 0.00000115
10 BPDia1           -3.41   0.00000115
# ℹ 74 more rows
extract_fit_parsnip(final_glmnet_fit)
parsnip model object


Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~0.84822312461352) 

   Df  %Dev   Lambda
1   0  0.00 0.092040
2   1  2.24 0.083860
3   1  4.18 0.076410
4   1  5.85 0.069620
5   1  7.30 0.063440
6   2  9.02 0.057800
7   2 10.66 0.052670
8   2 12.08 0.047990
9   3 13.44 0.043730
10  3 14.77 0.039840
11  5 15.96 0.036300
12  6 17.30 0.033080
13  6 18.45 0.030140
14  6 19.42 0.027460
15  6 20.24 0.025020
16  9 21.04 0.022800
17 10 21.77 0.020770
18 11 22.44 0.018930
19 13 23.23 0.017250
20 13 23.92 0.015710
21 14 24.53 0.014320
22 15 25.12 0.013050
23 15 25.63 0.011890
24 15 26.07 0.010830
25 17 26.46 0.009869
26 18 26.84 0.008992
27 20 27.19 0.008193
28 21 27.52 0.007465
29 23 27.83 0.006802
30 25 28.13 0.006198
31 29 28.43 0.005647
32 35 28.77 0.005146
33 37 29.10 0.004688
34 39 29.39 0.004272
35 40 29.66 0.003892
36 41 29.89 0.003547
37 43 30.09 0.003232
38 45 30.28 0.002945
39 47 30.47 0.002683
40 49 30.63 0.002445
41 51 30.79 0.002227
42 52 30.93 0.002030
43 52 31.06 0.001849
44 53 31.16 0.001685
45 55 31.26 0.001535
46 57 31.35 0.001399
47 57 31.42 0.001275
48 58 31.49 0.001161
49 58 31.55 0.001058
50 59 31.60 0.000964
51 62 31.65 0.000879
52 62 31.69 0.000800
53 62 31.72 0.000729
54 62 31.74 0.000665
55 62 31.77 0.000606
56 64 31.79 0.000552
57 64 31.81 0.000503
58 67 31.84 0.000458
59 70 31.87 0.000417
60 71 31.90 0.000380
61 72 31.93 0.000347
62 73 31.95 0.000316
63 73 31.98 0.000288
64 73 32.00 0.000262
65 75 32.02 0.000239
66 74 32.03 0.000218
67 74 32.05 0.000198
68 74 32.06 0.000181
69 76 32.07 0.000165
70 76 32.08 0.000150
71 77 32.09 0.000137
72 78 32.10 0.000124
73 78 32.11 0.000113
74 78 32.12 0.000103
75 78 32.13 0.000094
76 79 32.14 0.000086
77 79 32.15 0.000078
78 78 32.15 0.000071
79 78 32.16 0.000065
80 79 32.16 0.000059
81 79 32.17 0.000054
82 78 32.17 0.000049
83 79 32.17 0.000045
84 80 32.17 0.000041
85 81 32.18 0.000037
86 80 32.18 0.000034
87 81 32.18 0.000031
#library('modelr')

pred <- predict(final_glmnet_fit, testing(splits))
prob <- predict(final_glmnet_fit, testing(splits), 'prob' )
test_scored <- testing(splits) %>% cbind(pred, prob)
library('yardstick')

test_scored %>%
  conf_mat(Diabetes, .pred_class)
          Truth
Prediction   No  Yes
       No  2239  151
       Yes   42   33
test_scored %>%
  conf_mat(Diabetes, .pred_class) %>%
  summary() %>%
  ggplot(aes(y=.metric, x=.estimate, fill=.metric)) +
  geom_bar(stat="identity")

test_scored %>%
  roc_auc(Diabetes, .pred_Yes, event_level = 'second')
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.875
test_scored %>%
  roc_curve(Diabetes, .pred_Yes, event_level = 'second') %>%
  autoplot() 

test_scored %>%
  pr_auc(Diabetes, .pred_Yes, event_level = 'second')
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 pr_auc  binary         0.358
test_scored %>%
  pr_curve(Diabetes, .pred_Yes, event_level = 'second') %>%
  autoplot()