33  Random Forest Imputation and SVM Multi-class Classifier

\(~\)

\(~\)

33.1 NHANES data

# NOTE: 'svmLinear2'/'svmLinear3' are caret *method* names, not CRAN packages;
# the backing packages are e1071 (svmLinear2) and LiblineaR (svmLinear3).
#install.packages('e1071')
#install.packages('LiblineaR')

library('tidyverse')
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(NHANES)

# Keep only respondents with a recorded Depressed status.
NHANES_DATA_12 <- filter(NHANES, !is.na(Depressed))

33.1.1 Note that Depressed has 3 potential classes

# List the unique levels of the outcome variable.
NHANES_DATA_12 %>%
  distinct(Depressed)
# A tibble: 3 × 1
  Depressed
  <fct>    
1 Several  
2 None     
3 Most     

\(~\)

\(~\)


\(~\)

\(~\)

33.2 Some data prep

SumNa <- function(col){sum(is.na(col))}

# Per-feature missingness summary: NA count and share of rows missing,
# sorted with the most incomplete features first.
# summarise(across(...)) and pivot_longer() replace the superseded
# summarise_all() and tidyr::gather().
data.sum <- NHANES_DATA_12 %>%
  summarise(across(everything(), SumNa)) %>%
  tidyr::pivot_longer(everything(), names_to = 'feature', values_to = 'SumNa') %>%
  arrange(desc(SumNa)) %>%
  mutate(PctNa = SumNa / nrow(NHANES_DATA_12))

# Drop the identifier and the outcome, then keep only features that are
# less than 85% missing.
data.sum2 <- data.sum %>%
  filter(!(feature %in% c('ID', 'Depressed')),
         PctNa < .85)

data.sum2$feature
 [1] "UrineFlow2"      "UrineVol2"       "AgeRegMarij"     "PregnantNow"    
 [5] "Age1stBaby"      "nBabies"         "nPregnancies"    "SmokeAge"       
 [9] "AgeFirstMarij"   "SmokeNow"        "Testosterone"    "AgeMonths"      
[13] "TVHrsDay"        "Race3"           "CompHrsDay"      "PhysActiveDays" 
[17] "SexOrientation"  "AlcoholDay"      "SexNumPartYear"  "Marijuana"      
[21] "RegularMarij"    "SexAge"          "SexNumPartnLife" "HardDrugs"      
[25] "SexEver"         "SameSex"         "AlcoholYear"     "HHIncome"       
[29] "HHIncomeMid"     "Poverty"         "UrineFlow1"      "BPSys1"         
[33] "BPDia1"          "AgeDecade"       "DirectChol"      "TotChol"        
[37] "BPSys2"          "BPDia2"          "Education"       "BPSys3"         
[41] "BPDia3"          "MaritalStatus"   "Smoke100"        "Smoke100n"      
[45] "Alcohol12PlusYr" "BPSysAve"        "BPDiaAve"        "Pulse"          
[49] "BMI_WHO"         "BMI"             "Weight"          "HomeRooms"      
[53] "Height"          "HomeOwn"         "UrineVol1"       "SleepHrsNight"  
[57] "LittleInterest"  "DaysPhysHlthBad" "DaysMentHlthBad" "Diabetes"       
[61] "Work"            "SurveyYr"        "Gender"          "Age"            
[65] "Race1"           "HealthGen"       "SleepTrouble"    "PhysActive"     
# Build the modeling frame: ID, outcome, and the retained features.
# all_of() makes the external character vector an explicit, strict
# selection — a bare data.sum2$feature inside select() triggers a
# tidyselect deprecation warning and is ambiguous against column names.
data_F <- NHANES_DATA_12 %>%
  select(ID, Depressed, all_of(data.sum2$feature)) %>%
  filter(!is.na(Depressed))  # already filtered upstream; kept as a safeguard

33.2.1 Note that data_F still has missing values

Amelia::missmap(as.data.frame(data_F))

\(~\)

\(~\)


\(~\)

\(~\)

33.3 Random Forest Impute with rfImpute

randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'
The following object is masked from 'package:dplyr':

    combine
The following object is masked from 'package:ggplot2':

    margin
# Impute missing predictor values using random-forest proximities,
# with Depressed as the (complete) response.
# NOTE(review): rfImpute is stochastic and no seed is set immediately
# before this call, so the imputed values (and the OOB error printout
# below) will differ across runs — confirm whether that matters here.
data_F.imputed <- rfImpute(Depressed ~ . , 
                           data_F, 
                           iter=2, 
                           ntree=300)
ntree      OOB      1      2      3
  300:   8.33%  0.91% 36.37% 33.73%
ntree      OOB      1      2      3
  300:   8.59%  1.16% 36.77% 33.73%

33.3.1 Note we no longer have missing data

Amelia::missmap(as.data.frame(data_F.imputed))

\(~\)

\(~\)


\(~\)

\(~\)

33.4 Split Data

Loading required package: lattice

Attaching package: 'caret'
The following object is masked from 'package:purrr':

    lift
# Fix the RNG so the train/test split is reproducible.
set.seed(8576309)

# Stratified 60/40 split on the outcome so all three Depressed classes
# keep their relative proportions in both partitions.
trainIndex <- createDataPartition(data_F.imputed$Depressed, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

TRAIN <- data_F.imputed[trainIndex, ]
TEST <- data_F.imputed[-trainIndex, ]

\(~\)

\(~\)


\(~\)

\(~\)

33.5 Train model

To see the list of compatible models and their tuneable hyperparameters within the caret package, you can visit: https://topepo.github.io/caret/available-models.html

# Candidate values for the SVM cost (regularization) hyperparameter.
SVMGrid <- expand.grid(cost = c(1,2,5))

train_ctrl <- trainControl(method="cv", # type of resampling in this case Cross-Validated
                           number=3)    # 3 folds

# Time the model fit.
# NOTE(review): the names are swapped relative to the usual convention
# (tic = start, toc = stop); `tic - toc` still yields the positive
# elapsed time because toc here is the start timestamp.
toc <- Sys.time()
model_svm <- train(Depressed ~ .,
                       data = TRAIN,
                       method = "svmLinear2", # linear-kernel SVM backed by the e1071 package (svmLinear3 is the LiblineaR-backed method)
                       metric = "Accuracy", # which metric should be optimized for 
                       trControl = train_ctrl,
                       tuneGrid = SVMGrid,
                       probability = TRUE) # fit with probability model so class probabilities can be predicted later
tic <- Sys.time()

tic - toc
Time difference of 2.558912 mins

33.5.1 Model output

model_svm
Support Vector Machines with Linear Kernel 

4005 samples
  69 predictor
   3 classes: 'None', 'Several', 'Most' 

No pre-processing
Resampling: Cross-Validated (3 fold) 
Summary of sample sizes: 2670, 2669, 2671 
Resampling results across tuning parameters:

  cost  Accuracy   Kappa    
  1     0.8409540  0.5138785
  2     0.8409546  0.5151339
  5     0.8392062  0.5136547

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cost = 2.
varImp(model_svm)
ROC curve variable importance

  variables are sorted by maximum importance across the classes
  only 20 most important variables shown (out of 69)

                  None Several   Most
DaysMentHlthBad 100.00   70.36 100.00
LittleInterest   90.65   57.66  90.65
Age1stBaby       78.02   50.93  78.02
SmokeNow         57.62   35.72  57.62
HealthGen        51.07   30.52  51.07
HHIncome         44.78   25.85  44.78
Poverty          43.72   25.48  43.72
nPregnancies     43.26   29.41  43.26
TVHrsDay         42.76   25.09  42.76
HHIncomeMid      39.32   23.75  39.32
Education        38.95   25.72  38.95
DaysPhysHlthBad  34.66   19.15  34.66
SleepTrouble     33.61   19.76  33.61
Work             33.05   23.59  33.05
PregnantNow      32.42   20.25  32.42
SleepHrsNight    32.07   17.37  32.07
AgeRegMarij      31.97   24.04  31.97
Testosterone     31.42   21.28  31.42
HomeRooms        31.39   16.93  31.39
nBabies          30.61   20.07  30.61
plot(varImp(model_svm), top = 20)

# Score the hold-out set with class probabilities. Naming the `type`
# argument (rather than passing 'prob' positionally) makes the intent
# explicit and robust to argument-order changes in predict.train().
TEST_probs <- predict(model_svm, TEST, type = 'prob')

# Attach the per-class probability columns (None/Several/Most) to TEST.
TEST_scored <- cbind(TEST, TEST_probs)

Attaching package: 'yardstick'
The following objects are masked from 'package:caret':

    precision, recall, sensitivity, specificity
The following object is masked from 'package:readr':

    spec
# One-vs-rest AUC for the "Most" class: binarize the outcome (1 = Most)
# and score with the predicted Pr(Most) column. event_level = 'second'
# tells yardstick the factor's second level ("1") is the positive class.
TEST_scored %>%
  mutate(Most_binary = as.factor(if_else(Depressed == "Most",1,0))) %>%
  roc_auc(Most_binary, Most, event_level = 'second')
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.943