# read in the data
diab_pop <- readRDS(here::here('DATA/diab_pop.RDS'))4 Exploratory Data Analysis
4.1 Packages, Libraries, and Functions
The first step is to read in the data. This can be done with a number of functions. Some common ones in base R include:
- for reading in R RDS files:
readRDS - for reading in csv files:
read.csv
Help for functions can be found within the R terminal by use of the question mark: ?readRDS
Sometimes you may want to use newer or more specialized packages and libraries to read in or handle data. First you will need to make sure the library is installed with install.packages('readr') then you can load the library with library('readr') , and you will have access to all of the functionality withing the package.
Alternatively, you can access individual functions from libraries by using notation such as: readr::read_csv; this tells R to look in the readr package for the function read_csv. This can be useful in programming functions and packages as well as if multiple packages contain functions with the same name.
For excel files two useful packages::functions to read in excel are:
xlsx::read.xlsxreadxl::read_excel
4.2 Read in the Data
Use the arrow <- for assignments:
4.2.1 Reminders about the Data
tibble::tribble(
~"Variable in Data", ~"Definition", ~"Data Type",
'seqn', 'Respondent sequence number', 'Identifier',
'riagendr', 'Gender', 'Categorical',
'ridageyr', 'Age in years at screening', 'Continuous / Numerical',
'ridreth1', 'Race/Hispanic origin', 'Categorical',
'dmdeduc2', 'Education level', 'Adults 20+ - Categorical',
'dmdmartl', 'Marital status', 'Categorical',
'indhhin2', 'Annual household income', 'Categorical',
'bmxbmi', 'Body Mass Index (kg/m**2)', 'Continuous / Numerical',
'diq010', 'Doctor diagnosed diabetes', 'Categorical',
'lbxglu', 'Fasting Glucose (mg/dL)', 'Continuous / Numerical'
) |>
knitr::kable()| Variable in Data | Definition | Data Type |
|---|---|---|
| seqn | Respondent sequence number | Identifier |
| riagendr | Gender | Categorical |
| ridageyr | Age in years at screening | Continuous / Numerical |
| ridreth1 | Race/Hispanic origin | Categorical |
| dmdeduc2 | Education level | Adults 20+ - Categorical |
| dmdmartl | Marital status | Categorical |
| indhhin2 | Annual household income | Categorical |
| bmxbmi | Body Mass Index (kg/m**2) | Continuous / Numerical |
| diq010 | Doctor diagnosed diabetes | Categorical |
| lbxglu | Fasting Glucose (mg/dL) | Continuous / Numerical |
4.3 Structure
The R Structure function will compactly display the internal structure of an R object, a diagnostic function and an alternative to summary.
To see help page for a function use ? before the function name, for example try: ?str
#structure command
str(diab_pop)'data.frame': 5719 obs. of 10 variables:
$ seqn : num 83732 83733 83734 83735 83736 ...
$ riagendr: Factor w/ 2 levels "Male","Female": 1 1 1 2 2 2 1 2 1 1 ...
$ ridageyr: num 62 53 78 56 42 72 22 32 56 46 ...
$ ridreth1: Factor w/ 5 levels "MexicanAmerican",..: 3 3 3 3 4 1 4 1 4 3 ...
$ dmdeduc2: Factor w/ 5 levels "Less than 9th grade",..: 5 3 3 5 4 2 4 4 3 5 ...
$ dmdmartl: Factor w/ 6 levels "Married","Widowed",..: 1 3 1 6 3 4 5 1 3 6 ...
$ indhhin2: Factor w/ 14 levels "$0-$4,999","$5,000-$9,999",..: 10 4 5 10 NA 13 NA 6 3 3 ...
$ bmxbmi : num 27.8 30.8 28.8 42.4 20.3 28.6 28 28.2 33.6 27.6 ...
$ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 1 2 1 2 2 2 2 2 1 2 ...
$ lbxglu : num NA 101 84 NA 84 107 95 NA NA NA ...
4.4 Show the data
We can view the first 10 records in the data with the head command:
# show data
head(diab_pop,10) seqn riagendr ridageyr ridreth1 dmdeduc2
1 83732 Male 62 Non-Hispanic White College grad or above
2 83733 Male 53 Non-Hispanic White High school graduate/GED
3 83734 Male 78 Non-Hispanic White High school graduate/GED
4 83735 Female 56 Non-Hispanic White College grad or above
5 83736 Female 42 Non-Hispanic Black Some college or AA degrees
6 83737 Female 72 MexicanAmerican Grades 9-11th
7 83741 Male 22 Non-Hispanic Black Some college or AA degrees
8 83742 Female 32 MexicanAmerican Some college or AA degrees
9 83744 Male 56 Non-Hispanic Black High school graduate/GED
10 83747 Male 46 Non-Hispanic White College grad or above
dmdmartl indhhin2 bmxbmi diq010 lbxglu
1 Married $65,000-$74,999 27.8 Diabetes NA
2 Divorced $15,000-$19,999 30.8 No Diabetes 101
3 Married $20,000-$24,999 28.8 Diabetes 84
4 Living with partner $65,000-$74,999 42.4 No Diabetes NA
5 Divorced <NA> 20.3 No Diabetes 84
6 Separated $75,000-$99,999 28.6 No Diabetes 107
7 Never married <NA> 28.0 No Diabetes 95
8 Married $25,000-$34,999 28.2 No Diabetes NA
9 Divorced $10,000-$14,999 33.6 Diabetes NA
10 Living with partner $10,000-$14,999 27.6 No Diabetes NA
4.5 Libraries
To check which packages are loaded you can use:
# loaded packages
(.packages())[1] "stats" "graphics" "grDevices" "datasets" "utils" "methods"
[7] "base"
Make sure that the tidyverse and dplyr packages are installed. You can run install.packages(c('tidyverse','dplyr')) to install both.
To see which packages are installed you can use:
# which packages are installed
library()$results[,1] [1] "abind" "Amelia" "anytime" "arsenal"
[5] "askpass" "backports" "base64enc" "BH"
[9] "bit" "bit64" "bitops" "blob"
[13] "brew" "brio" "broom" "broom.helpers"
[17] "bslib" "cachem" "callr" "car"
[21] "carData" "caret" "caTools" "cellranger"
[25] "checkmate" "chron" "classInt" "cli"
[29] "clipr" "clock" "clue" "colorspace"
[33] "combinat" "commonmark" "compare" "conflicted"
[37] "corrplot" "cowplot" "cpp11" "crayon"
[41] "credentials" "crosstalk" "curl" "data.table"
[45] "DBI" "dbplyr" "deming" "dendextend"
[49] "DEoptimR" "Deriv" "desc" "devtools"
[53] "diagram" "dials" "DiceDesign" "diffobj"
[57] "digest" "doFuture" "doParallel" "downlit"
[61] "dplyr" "DT" "dtplyr" "e1071"
[65] "ellipse" "ellipsis" "emmeans" "estimability"
[69] "evaluate" "factoextra" "FactoMineR" "fansi"
[73] "farver" "fastmap" "flashClust" "fontawesome"
[77] "forcats" "foreach" "Formula" "fs"
[81] "furrr" "future" "future.apply" "gam"
[85] "gargle" "gcookbook" "gdata" "generics"
[89] "gert" "GGally" "ggbernie" "ggbiplot"
[93] "ggdendro" "ggplot2" "ggpubr" "ggrepel"
[97] "ggsci" "ggsignif" "ggstats" "gh"
[101] "gitcreds" "glmnet" "globals" "glue"
[105] "gmodels" "googledrive" "googlesheets4" "gower"
[109] "GPfit" "gplots" "grateful" "gridExtra"
[113] "gsubfn" "gtable" "gtools" "gtrendsR"
[117] "hardhat" "haven" "here" "highr"
[121] "Hmisc" "hms" "htmlTable" "htmltools"
[125] "htmlwidgets" "httpuv" "httr" "httr2"
[129] "ids" "igraph" "infer" "ini"
[133] "ipred" "ISLR" "isoband" "iterators"
[137] "jomo" "jquerylib" "jsonlite" "kableExtra"
[141] "klaR" "knitr" "labeling" "labelled"
[145] "laeken" "later" "lava" "lazyeval"
[149] "leaps" "lhs" "LiblineaR" "lifecycle"
[153] "listenv" "lme4" "lmtest" "lubridate"
[157] "magrittr" "MatrixModels" "memoise" "mice"
[161] "mime" "miniUI" "minqa" "mitml"
[165] "mitools" "mnormt" "modeldata" "modelenv"
[169] "ModelMetrics" "modelr" "multcompView" "munsell"
[173] "mvtnorm" "networkD3" "NHANES" "nloptr"
[177] "numDeriv" "openssl" "ordinal" "packrat"
[181] "pan" "parallelly" "parsnip" "patchwork"
[185] "pbkrtest" "pillar" "pkgbuild" "pkgconfig"
[189] "pkgdown" "pkgload" "PKI" "plogr"
[193] "plotrix" "plyr" "png" "polynom"
[197] "praise" "prettyunits" "pROC" "processx"
[201] "prodlim" "profvis" "progress" "progressr"
[205] "promises" "proto" "proxy" "ps"
[209] "psych" "purrr" "quantreg" "quarto"
[213] "questionr" "R.cache" "R.methodsS3" "R.oo"
[217] "R.utils" "R6" "ragg" "randomForest"
[221] "ranger" "RANN" "rappdirs" "rcmdcheck"
[225] "RColorBrewer" "Rcpp" "RcppArmadillo" "RcppEigen"
[229] "readr" "readxl" "recipes" "rematch"
[233] "rematch2" "remotes" "renv" "repr"
[237] "reprex" "reprtree" "reshape" "reshape2"
[241] "rlang" "rmarkdown" "robustbase" "roxygen2"
[245] "rpart.plot" "rprojroot" "rsample" "rsconnect"
[249] "rsq" "RSQLite" "rstatix" "rstudioapi"
[253] "rversions" "rvest" "sass" "scales"
[257] "scatterplot3d" "selectr" "sessioninfo" "shape"
[261] "shiny" "skimr" "slider" "sourcetools"
[265] "sp" "SparseM" "sqldf" "SQUAREM"
[269] "stringi" "stringr" "styler" "survey"
[273] "svglite" "sys" "systemfonts" "tableone"
[277] "testthat" "textshaping" "tibble" "tictoc"
[281] "tidymodels" "tidyr" "tidyselect" "tidyverse"
[285] "timechange" "timeDate" "tinytex" "tree"
[289] "tune" "tzdb" "ucminf" "urlchecker"
[293] "usethis" "utf8" "uuid" "vcd"
[297] "vctrs" "VIM" "viridis" "viridisLite"
[301] "vroom" "waldo" "warp" "webshot"
[305] "whisker" "withr" "workflows" "workflowsets"
[309] "xfun" "xml2" "xopen" "xtable"
[313] "yaml" "yardstick" "zip" "zoo"
[317] "base" "boot" "class" "cluster"
[321] "codetools" "compiler" "datasets" "foreign"
[325] "graphics" "grDevices" "grid" "KernSmooth"
[329] "lattice" "MASS" "Matrix" "methods"
[333] "mgcv" "nlme" "nnet" "parallel"
[337] "rpart" "spatial" "splines" "stats"
[341] "stats4" "survival" "tcltk" "tools"
[345] "utils"
# some additional informaion on the package including the version
tibble::as_tibble(installed.packages())# A tibble: 345 × 16
Package LibPath Version Priority Depends Imports LinkingTo Suggests Enhances
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 abind C:/Use… 1.4-5 <NA> R (>= … method… <NA> <NA> <NA>
2 Amelia C:/Use… 1.8.2 <NA> R (>= … foreig… Rcpp (>=… "tcltk,… <NA>
3 anytime C:/Use… 0.3.9 <NA> R (>= … Rcpp (… Rcpp (>=… "tinyte… <NA>
4 arsenal C:/Use… 3.6.3 <NA> R (>= … knitr … <NA> "broom … <NA>
5 askpass C:/Use… 1.2.0 <NA> <NA> sys (>… <NA> "testth… <NA>
6 backpor… C:/Use… 1.4.1 <NA> R (>= … <NA> <NA> <NA> <NA>
7 base64e… C:/Use… 0.1-3 <NA> R (>= … <NA> <NA> <NA> png
8 BH C:/Use… 1.84.0… <NA> <NA> <NA> <NA> <NA> <NA>
9 bit C:/Use… 4.0.5 <NA> R (>= … <NA> <NA> "testth… <NA>
10 bit64 C:/Use… 4.0.5 <NA> R (>= … <NA> <NA> <NA> <NA>
# ℹ 335 more rows
# ℹ 7 more variables: License <chr>, License_is_FOSS <chr>,
# License_restricts_use <chr>, OS_type <chr>, MD5sum <chr>,
# NeedsCompilation <chr>, Built <chr>
The following function will check if a list of packages are already installed, if not then it will install them:
# install package if missing
install_if_not <- function( list.of.packages ) {
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) { install.packages(new.packages) }
else { print(paste0("the package '", list.of.packages , "' is already installed")) }
}You can use the function like this:
# test function
install_if_not(c("caret","randomForest"))[1] "the package 'caret' is already installed"
[2] "the package 'randomForest' is already installed"
We make sections of code accessible to installed packages by using library command:
loaded_package_before <- (.packages())
# everthing below here can call functions in dplyr package
library('dplyr')
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
glimpse(diab_pop)Rows: 5,719
Columns: 10
$ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…
(.packages())[1] "dplyr" "stats" "graphics" "grDevices" "datasets" "utils"
[7] "methods" "base"
loaded_package_after <- (.packages())
setdiff(loaded_package_after, loaded_package_before)[1] "dplyr"
detach(package:dplyr)
#the dplyr package has now been detached. calls to functions may have errorsAdditionally, we can access a function from a library without loading the entire library, this can be done by using a command such as package::function. This notation is needed in any instance where two or more loaded packages have at least one function with the same name. This notation is also useful in development of functions and packages.
For instance, the glimpse function from the dplyr package can also be accessed by using the following command
# glimpse the data
dplyr::glimpse(diab_pop)Rows: 5,719
Columns: 10
$ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…
And just notice that without the library loaded or the dplyr:: in front we can error:
# error
glimpse(diab_pop)Error in glimpse(diab_pop): could not find function "glimpse"
A Tibble is an more modern R version of a dataframe, you can easily convert a dataframe to a tibble.
tibble::as_tibble(diab_pop)# A tibble: 5,719 × 10
seqn riagendr ridageyr ridreth1 dmdeduc2 dmdmartl indhhin2 bmxbmi diq010
<dbl> <fct> <dbl> <fct> <fct> <fct> <fct> <dbl> <fct>
1 83732 Male 62 Non-Hispani… College… Married $65,000… 27.8 Diabe…
2 83733 Male 53 Non-Hispani… High sc… Divorced $15,000… 30.8 No Di…
3 83734 Male 78 Non-Hispani… High sc… Married $20,000… 28.8 Diabe…
4 83735 Female 56 Non-Hispani… College… Living … $65,000… 42.4 No Di…
5 83736 Female 42 Non-Hispani… Some co… Divorced <NA> 20.3 No Di…
6 83737 Female 72 MexicanAmer… Grades … Separat… $75,000… 28.6 No Di…
7 83741 Male 22 Non-Hispani… Some co… Never m… <NA> 28 No Di…
8 83742 Female 32 MexicanAmer… Some co… Married $25,000… 28.2 No Di…
9 83744 Male 56 Non-Hispani… High sc… Divorced $10,000… 33.6 Diabe…
10 83747 Male 46 Non-Hispani… College… Living … $10,000… 27.6 No Di…
# ℹ 5,709 more rows
# ℹ 1 more variable: lbxglu <dbl>
4.6 summary
The summary is a generic function used to produce result summaries of the results of various model fitting functions. See ?summary for more information.
summary(diab_pop) seqn riagendr ridageyr ridreth1
Min. :83732 Male :2747 Min. :20.00 MexicanAmerican : 995
1st Qu.:86171 Female:2972 1st Qu.:34.00 Other Hispanic : 768
Median :88651 Median :49.00 Non-Hispanic White:1863
Mean :88672 Mean :49.54 Non-Hispanic Black:1198
3rd Qu.:91166 3rd Qu.:64.00 Other : 895
Max. :93702 Max. :80.00
dmdeduc2 dmdmartl
Less than 9th grade : 688 Married :2886
Grades 9-11th : 676 Widowed : 421
High school graduate/GED :1236 Divorced : 614
Some college or AA degrees:1692 Separated : 192
College grad or above :1422 Never married :1048
NA's : 5 Living with partner: 555
NA's : 3
indhhin2 bmxbmi diq010 lbxglu
$100,000+ : 910 Min. :14.50 Diabetes : 844 Min. : 21.0
$25,000-$34,999: 596 1st Qu.:24.50 No Diabetes:4875 1st Qu.: 95.0
$75,000-$99,999: 524 Median :28.50 Median :103.0
$45,000-$54,999: 451 Mean :29.54 Mean :113.6
$20,000-$24,999: 350 3rd Qu.:33.20 3rd Qu.:114.0
(Other) :1590 Max. :67.30 Max. :479.0
NA's :1298 NA's :313 NA's :3267
by(diab_pop, diab_pop$diq010, summary)diab_pop$diq010: Diabetes
seqn riagendr ridageyr ridreth1
Min. :83732 Male :456 Min. :21.00 MexicanAmerican :200
1st Qu.:86069 Female:388 1st Qu.:53.00 Other Hispanic :119
Median :88610 Median :63.00 Non-Hispanic White:217
Mean :88574 Mean :61.34 Non-Hispanic Black:203
3rd Qu.:90995 3rd Qu.:71.00 Other :105
Max. :93689 Max. :80.00
dmdeduc2 dmdmartl
Less than 9th grade :161 Married :471
Grades 9-11th :123 Widowed :100
High school graduate/GED :185 Divorced :106
Some college or AA degrees:229 Separated : 29
College grad or above :145 Never married : 83
NA's : 1 Living with partner: 55
indhhin2 bmxbmi diq010 lbxglu
$100,000+ : 89 Min. :16.0 Diabetes :844 Min. : 21
$25,000-$34,999: 86 1st Qu.:27.4 No Diabetes: 0 1st Qu.:117
$10,000-$14,999: 81 Median :31.4 Median :147
$20,000-$24,999: 68 Mean :32.6 Mean :166
$45,000-$54,999: 65 3rd Qu.:36.3 3rd Qu.:192
(Other) :261 Max. :67.3 Max. :479
NA's :194 NA's :55 NA's :462
------------------------------------------------------------
diab_pop$diq010: No Diabetes
seqn riagendr ridageyr ridreth1
Min. :83733 Male :2291 Min. :20.00 MexicanAmerican : 795
1st Qu.:86190 Female:2584 1st Qu.:32.00 Other Hispanic : 649
Median :88654 Median :46.00 Non-Hispanic White:1646
Mean :88689 Mean :47.49 Non-Hispanic Black: 995
3rd Qu.:91193 3rd Qu.:61.00 Other : 790
Max. :93702 Max. :80.00
dmdeduc2 dmdmartl
Less than 9th grade : 527 Married :2415
Grades 9-11th : 553 Widowed : 321
High school graduate/GED :1051 Divorced : 508
Some college or AA degrees:1463 Separated : 163
College grad or above :1277 Never married : 965
NA's : 4 Living with partner: 500
NA's : 3
indhhin2 bmxbmi diq010 lbxglu
$100,000+ : 821 Min. :14.50 Diabetes : 0 Min. : 64.0
$25,000-$34,999: 510 1st Qu.:24.10 No Diabetes:4875 1st Qu.: 94.0
$75,000-$99,999: 472 Median :28.00 Median :101.0
$45,000-$54,999: 386 Mean :29.02 Mean :103.9
$15,000-$19,999: 290 3rd Qu.:32.50 3rd Qu.:109.0
(Other) :1292 Max. :64.60 Max. :397.0
NA's :1104 NA's :258 NA's :2805
4.7 Missing Data
4.7.1 Amelia
install_if_not("Amelia")[1] "the package 'Amelia' is already installed"
Amelia::missmap(diab_pop)
4.7.2 Mice
install_if_not("mice")[1] "the package 'mice' is already installed"
install_if_not("VIM")[1] "the package 'VIM' is already installed"
install_if_not("lattice")[1] "the package 'lattice' is already installed"
library(mice)
Attaching package: 'mice'
The following object is masked from 'package:stats':
filter
The following objects are masked from 'package:base':
cbind, rbind
library(VIM)Loading required package: colorspace
Loading required package: grid
VIM is ready to use.
Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
Attaching package: 'VIM'
The following object is masked from 'package:datasets':
sleep
library(lattice)
#understand the missing value pattern
md.pattern(diab_pop)
seqn riagendr ridageyr ridreth1 diq010 dmdmartl dmdeduc2 bmxbmi indhhin2
1876 1 1 1 1 1 1 1 1 1
2332 1 1 1 1 1 1 1 1 1
551 1 1 1 1 1 1 1 1 0
642 1 1 1 1 1 1 1 1 0
18 1 1 1 1 1 1 1 0 1
192 1 1 1 1 1 1 1 0 1
6 1 1 1 1 1 1 1 0 0
95 1 1 1 1 1 1 1 0 0
2 1 1 1 1 1 1 0 1 1
1 1 1 1 1 1 1 0 1 0
1 1 1 1 1 1 1 0 0 1
1 1 1 1 1 1 0 1 1 0
1 1 1 1 1 1 0 1 1 0
1 1 1 1 1 1 0 0 0 0
0 0 0 0 0 3 5 313 1298
lbxglu
1876 1 0
2332 0 1
551 1 1
642 0 2
18 1 1
192 0 2
6 1 2
95 0 3
2 0 2
1 0 3
1 0 3
1 1 2
1 0 3
1 0 5
3267 4886
#plot the missing values
diab_pop_miss = aggr(diab_pop,
numbers=TRUE,
sortVars=TRUE,
labels=colnames(diab_pop),
cex.axis=.7,
gap=3,
ylab=c("Proportion of missingness","Missingness Pattern"))Warning in plot.aggr(res, ...): not enough horizontal space to display
frequencies

Variables sorted by number of missings:
Variable Count
lbxglu 0.5712537157
indhhin2 0.2269627557
bmxbmi 0.0547298479
dmdeduc2 0.0008742787
dmdmartl 0.0005245672
seqn 0.0000000000
riagendr 0.0000000000
ridageyr 0.0000000000
ridreth1 0.0000000000
diq010 0.0000000000
#Drawing margin plot
marginplot(diab_pop[, c("lbxglu", "diq010")],
cex.numbers = 1.2,
pch = 19)
4.8 sqldf
You can also create summary tables with sql and the sqldf:
install_if_not('sqldf')[1] "the package 'sqldf' is already installed"
# Average age by Diabetes
sqldf::sqldf('select diq010,
avg(ridageyr) as mean_age
from diab_pop
group by diq010
order by diq010') diq010 mean_age
1 Diabetes 61.33768
2 No Diabetes 47.49313
# Notice we can save our tables with assignments
# Counts by Diabetic and Age
T_sqldf <- sqldf::sqldf('select diq010,
riagendr,
count(*) as cnt
from diab_pop
group by diq010, riagendr
order by cnt desc')
str(T_sqldf)'data.frame': 4 obs. of 3 variables:
$ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 1 1
$ riagendr: Factor w/ 2 levels "Male","Female": 2 1 1 2
$ cnt : int 2584 2291 456 388
T_sqldf diq010 riagendr cnt
1 No Diabetes Female 2584
2 No Diabetes Male 2291
3 Diabetes Male 456
4 Diabetes Female 388
4.9 dplyr
dplyr is a popular package to manage many common data manipulation tasks and data summaries.
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
### Average age by Diabetes
diab_pop %>%
group_by(diq010) %>%
summarise(mean_age = mean(ridageyr)) %>%
arrange(diq010)# A tibble: 2 × 2
diq010 mean_age
<fct> <dbl>
1 Diabetes 61.3
2 No Diabetes 47.5
### Counts by Diabetic and Age
T_dplyr_dm2_gender <- diab_pop %>%
group_by(diq010,riagendr) %>%
summarise(cnt= n()) %>%
ungroup() %>%
arrange(-cnt)`summarise()` has grouped output by 'diq010'. You can override using the
`.groups` argument.
str(T_dplyr_dm2_gender)tibble [4 × 3] (S3: tbl_df/tbl/data.frame)
$ diq010 : Factor w/ 2 levels "Diabetes","No Diabetes": 2 2 1 1
$ riagendr: Factor w/ 2 levels "Male","Female": 2 1 1 2
$ cnt : int [1:4] 2584 2291 456 388
T_dplyr_dm2_gender # A tibble: 4 × 3
diq010 riagendr cnt
<fct> <fct> <int>
1 No Diabetes Female 2584
2 No Diabetes Male 2291
3 Diabetes Male 456
4 Diabetes Female 388
4.9.1 Additional dplyr resources
- What is
dplyr: https://dplyr.tidyverse.org/ - Introduction to
dplyrvignette: https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html dplyrcheatsheet: https://github.com/rstudio/cheatsheets/blob/master/data-transformation.pdf
4.10 comparedf
The comparedf function in the arsenal package can be used to compare two data-frames:
install_if_not('arsenal')[1] "the package 'arsenal' is already installed"
my_comparison <- arsenal::comparedf(T_sqldf , T_dplyr_dm2_gender)
my_comparisonCompare Object
Function Call:
arsenal::comparedf(x = T_sqldf, y = T_dplyr_dm2_gender)
Shared: 3 non-by variables and 4 observations.
Not shared: 0 variables and 0 observations.
Differences found in 0/3 variables compared.
0 variables compared have non-identical attributes.
summary(my_comparison)
Table: Summary of data.frames
version arg ncol nrow
-------- ------------------- ----- -----
x T_sqldf 3 4
y T_dplyr_dm2_gender 3 4
Table: Summary of overall comparison
statistic value
------------------------------------------------------------ ------
Number of by-variables 0
Number of non-by variables in common 3
Number of variables compared 3
Number of variables in x but not y 0
Number of variables in y but not x 0
Number of variables compared with some values unequal 0
Number of variables compared with all values equal 3
Number of observations in common 4
Number of observations in x but not y 0
Number of observations in y but not x 0
Number of observations with some compared variables unequal 0
Number of observations with all compared variables equal 4
Number of values unequal 0
Table: Variables not shared
------------------------
No variables not shared
------------------------
Table: Other variables not compared
--------------------------------
No other variables not compared
--------------------------------
Table: Observations not shared
---------------------------
No observations not shared
---------------------------
Table: Differences detected by variable
var.x var.y n NAs
--------- --------- --- ----
diq010 diq010 0 0
riagendr riagendr 0 0
cnt cnt 0 0
Table: Differences detected
------------------------
No differences detected
------------------------
Table: Non-identical attributes
----------------------------
No non-identical attributes
----------------------------
str(my_comparison)List of 4
$ frame.summary:Classes 'comparedf.frame.summary' and 'data.frame': 2 obs. of 8 variables:
..$ version : chr [1:2] "x" "y"
..$ arg : chr [1:2] "T_sqldf" "T_dplyr_dm2_gender"
..$ ncol : int [1:2] 3 3
..$ nrow : int [1:2] 4 4
..$ by :List of 2
.. ..$ : chr "..row.names.."
.. ..$ : chr "..row.names.."
.. ..- attr(*, "byrow")= logi TRUE
..$ attrs :List of 2
.. ..$ :List of 3
.. .. ..$ names : chr [1:3] "diq010" "riagendr" "cnt"
.. .. ..$ row.names: int [1:4] 1 2 3 4
.. .. ..$ class : chr "data.frame"
.. ..$ :List of 3
.. .. ..$ class : chr [1:3] "tbl_df" "tbl" "data.frame"
.. .. ..$ row.names: int [1:4] 1 2 3 4
.. .. ..$ names : chr [1:3] "diq010" "riagendr" "cnt"
..$ unique :List of 2
.. ..$ :'data.frame': 0 obs. of 2 variables:
.. .. ..$ ..row.names..: int(0)
.. .. ..$ observation : int(0)
.. ..$ :'data.frame': 0 obs. of 2 variables:
.. .. ..$ ..row.names..: int(0)
.. .. ..$ observation : int(0)
..$ n.shared: int [1:2] 4 4
$ vars.summary :Classes 'comparedf.vars.summary' and 'data.frame': 4 obs. of 8 variables:
..$ var.x : chr [1:4] "diq010" "riagendr" "cnt" "..row.names.."
..$ pos.x : int [1:4] 1 2 3 4
..$ class.x:List of 4
.. ..$ : chr "factor"
.. ..$ : chr "factor"
.. ..$ : chr "integer"
.. ..$ : chr "integer"
..$ var.y : chr [1:4] "diq010" "riagendr" "cnt" "..row.names.."
..$ pos.y : int [1:4] 1 2 3 4
..$ class.y:List of 4
.. ..$ : chr "factor"
.. ..$ : chr "factor"
.. ..$ : chr "integer"
.. ..$ : chr "integer"
..$ values :List of 4
.. ..$ :'data.frame': 0 obs. of 5 variables:
.. .. ..$ ..row.names..: int(0)
.. .. ..$ values.x : Factor w/ 2 levels "Diabetes","No Diabetes":
.. .. ..$ values.y : Factor w/ 2 levels "Diabetes","No Diabetes":
.. .. ..$ row.x : int(0)
.. .. ..$ row.y : int(0)
.. ..$ :'data.frame': 0 obs. of 5 variables:
.. .. ..$ ..row.names..: int(0)
.. .. ..$ values.x : Factor w/ 2 levels "Male","Female":
.. .. ..$ values.y : Factor w/ 2 levels "Male","Female":
.. .. ..$ row.x : int(0)
.. .. ..$ row.y : int(0)
.. ..$ :'data.frame': 0 obs. of 5 variables:
.. .. ..$ ..row.names..: int(0)
.. .. ..$ values.x : 'AsIs' int(0)
.. .. ..$ values.y : 'AsIs' int(0)
.. .. ..$ row.x : int(0)
.. .. ..$ row.y : int(0)
.. ..$ : chr "by-variable"
..$ attrs :List of 4
.. ..$ :'data.frame': 0 obs. of 3 variables:
.. .. ..$ name : chr(0)
.. .. ..$ attr.x: Named list()
.. .. ..$ attr.y: Named list()
.. ..$ :'data.frame': 0 obs. of 3 variables:
.. .. ..$ name : chr(0)
.. .. ..$ attr.x: Named list()
.. .. ..$ attr.y: Named list()
.. ..$ : NULL
.. ..$ : NULL
$ control :List of 16
..$ tol.logical :List of 1
.. ..$ :function (x, y)
..$ tol.num :List of 1
.. ..$ :function (x, y, tol)
..$ tol.num.val : num 1.49e-08
..$ int.as.num : logi FALSE
..$ tol.char :List of 1
.. ..$ :function (x, y)
..$ tol.factor :List of 1
.. ..$ :function (x, y)
..$ factor.as.char : logi FALSE
..$ tol.date :List of 1
.. ..$ :function (x, y, tol)
..$ tol.date.val : num 0
..$ tol.other :List of 1
.. ..$ :function (x, y)
..$ tol.vars : chr "none"
..$ max.print.vars : logi NA
..$ max.print.obs : logi NA
..$ max.print.diffs.per.var: num 10
..$ max.print.diffs : num 50
..$ max.print.attrs : logi NA
$ Call : language arsenal::comparedf(x = T_sqldf, y = T_dplyr_dm2_gender)
- attr(*, "class")= chr "comparedf"
4.11 Chi Square Test
# The table command can give us a table of counts of categorical columns:
table(diab_pop$diq010, diab_pop$riagendr)
Male Female
Diabetes 456 388
No Diabetes 2291 2584
table(diab_pop$dmdmartl, diab_pop$riagendr, diab_pop$diq010), , = Diabetes
Male Female
Married 299 172
Widowed 30 70
Divorced 46 60
Separated 11 18
Never married 39 44
Living with partner 31 24
, , = No Diabetes
Male Female
Married 1229 1186
Widowed 79 242
Divorced 200 308
Separated 58 105
Never married 471 494
Living with partner 253 247
# We can use similar call for the chi square test:
chisq.test(diab_pop$diq010, diab_pop$riagendr)
Pearson's Chi-squared test with Yates' continuity correction
data: diab_pop$diq010 and diab_pop$riagendr
X-squared = 13.978, df = 1, p-value = 0.0001849
# We can store the results and then use them:
table.sex_dm2 <- table(diab_pop$riagendr, diab_pop$diq010)
table.sex_dm2
Diabetes No Diabetes
Male 456 2291
Female 388 2584
chisq.test(table.sex_dm2)
Pearson's Chi-squared test with Yates' continuity correction
data: table.sex_dm2
X-squared = 13.978, df = 1, p-value = 0.0001849
4.11.1 Use the output
The R functions can have multiple outputs. We can also see other data associated with the output, the structure command can be used to help:
chisq_results <- chisq.test(diab_pop$diq010, diab_pop$riagendr)
str(chisq_results)List of 9
$ statistic: Named num 14
..- attr(*, "names")= chr "X-squared"
$ parameter: Named int 1
..- attr(*, "names")= chr "df"
$ p.value : num 0.000185
$ method : chr "Pearson's Chi-squared test with Yates' continuity correction"
$ data.name: chr "diab_pop$diq010 and diab_pop$riagendr"
$ observed : 'table' int [1:2, 1:2] 456 2291 388 2584
..- attr(*, "dimnames")=List of 2
.. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
.. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
$ expected : num [1:2, 1:2] 405 2342 439 2533
..- attr(*, "dimnames")=List of 2
.. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
.. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
$ residuals: 'table' num [1:2, 1:2] 2.51 -1.05 -2.42 1.01
..- attr(*, "dimnames")=List of 2
.. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
.. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
$ stdres : 'table' num [1:2, 1:2] 3.78 -3.78 -3.78 3.78
..- attr(*, "dimnames")=List of 2
.. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
.. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
- attr(*, "class")= chr "htest"
# We can access different portions of the output:
chisq_results$p.value[1] 0.0001849292
chisq_results$statisticX-squared
13.97834
chisq_results$method[1] "Pearson's Chi-squared test with Yates' continuity correction"
4.11.2 broom
The default output of the chisq.test test is a list, the tidy function in the broom package will take many results from other packages and convert into tibbles for easier handling:
str(chisq_results)List of 9
$ statistic: Named num 14
..- attr(*, "names")= chr "X-squared"
$ parameter: Named int 1
..- attr(*, "names")= chr "df"
$ p.value : num 0.000185
$ method : chr "Pearson's Chi-squared test with Yates' continuity correction"
$ data.name: chr "diab_pop$diq010 and diab_pop$riagendr"
$ observed : 'table' int [1:2, 1:2] 456 2291 388 2584
..- attr(*, "dimnames")=List of 2
.. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
.. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
$ expected : num [1:2, 1:2] 405 2342 439 2533
..- attr(*, "dimnames")=List of 2
.. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
.. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
$ residuals: 'table' num [1:2, 1:2] 2.51 -1.05 -2.42 1.01
..- attr(*, "dimnames")=List of 2
.. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
.. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
$ stdres : 'table' num [1:2, 1:2] 3.78 -3.78 -3.78 3.78
..- attr(*, "dimnames")=List of 2
.. ..$ diab_pop$diq010 : chr [1:2] "Diabetes" "No Diabetes"
.. ..$ diab_pop$riagendr: chr [1:2] "Male" "Female"
- attr(*, "class")= chr "htest"
install_if_not('broom')[1] "the package 'broom' is already installed"
tidy_chisq_results <- broom::tidy(chisq_results)
str(tidy_chisq_results)tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
$ statistic: Named num 14
..- attr(*, "names")= chr "X-squared"
$ p.value : num 0.000185
$ parameter: Named int 1
..- attr(*, "names")= chr "df"
$ method : chr "Pearson's Chi-squared test with Yates' continuity correction"
tidy_chisq_results$p.value[1] 0.0001849292
4.11.3 Some Example Plots
We can also make some basic graphics that are in base R:
plot(table.sex_dm2)
mosaicplot(table.sex_dm2,
main = "Diabetes by Sex Mosaic Plot",
xlab = "Sex",
ylab = "Diabetes",
las = 1,
border = "chocolate",
shade = TRUE) 
Other graphics are accessible by installing various packages:
4.11.4 ggplot2
install_if_not('ggplot2')[1] "the package 'ggplot2' is already installed"
library(ggplot2)
plot <- ggplot(diab_pop, aes(x=factor(riagendr))) +
geom_bar(stat="count") +
facet_wrap(~diq010)
plot
4.11.5 gplots
install_if_not('gplots')[1] "the package 'gplots' is already installed"
gplots::balloonplot(table.sex_dm2,
main ="Balloon Plot for Sex by Diabetes \n Area is proportional to Freq.")
4.11.6 corrplot
We can view the residuals from the chi-square test:
chisq_results$residuals diab_pop$riagendr
diab_pop$diq010 Male Female
Diabetes 2.513228 -2.416222
No Diabetes -1.045721 1.005358
install_if_not('corrplot')[1] "the package 'corrplot' is already installed"
corrplot::corrplot(chisq_results$residuals, is.cor = FALSE)
4.11.7 vcd
install_if_not('vcd')[1] "the package 'vcd' is already installed"
vcd::assoc(table.sex_dm2, shade = TRUE, las=3)
4.11.7.1 See also
4.12 T-Test
t.test(diab_pop$ridageyr ~ diab_pop$diq010)
Welch Two Sample t-test
data: diab_pop$ridageyr by diab_pop$diq010
t = 26.406, df = 1407.4, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Diabetes and group No Diabetes is not equal to 0
95 percent confidence interval:
12.81607 14.87303
sample estimates:
mean in group Diabetes mean in group No Diabetes
61.33768 47.49313
library(ggplot2)
# Basic box plot
p <- ggplot(diab_pop, aes(x=diq010 , y=ridageyr)) +
geom_boxplot()
p
diab_pop.age.NoDM2 <- (diab_pop %>%
filter(diq010 == "No Diabetes"))$ridageyr
diab_pop.age.DM2 <- (diab_pop %>%
filter(diq010 == "Diabetes"))$ridageyr
t.test(diab_pop.age.NoDM2 , diab_pop.age.DM2 )
Welch Two Sample t-test
data: diab_pop.age.NoDM2 and diab_pop.age.DM2
t = -26.406, df = 1407.4, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-14.87303 -12.81607
sample estimates:
mean of x mean of y
47.49313 61.33768
diab_pop %>%
ggplot( aes(riagendr,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010)))
ttest_reults <- t.test(diab_pop.age.NoDM2 , diab_pop.age.DM2 )
str(ttest_reults)List of 10
$ statistic : Named num -26.4
..- attr(*, "names")= chr "t"
$ parameter : Named num 1407
..- attr(*, "names")= chr "df"
$ p.value : num 3.83e-125
$ conf.int : num [1:2] -14.9 -12.8
..- attr(*, "conf.level")= num 0.95
$ estimate : Named num [1:2] 47.5 61.3
..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
$ null.value : Named num 0
..- attr(*, "names")= chr "difference in means"
$ stderr : num 0.524
$ alternative: chr "two.sided"
$ method : chr "Welch Two Sample t-test"
$ data.name : chr "diab_pop.age.NoDM2 and diab_pop.age.DM2"
- attr(*, "class")= chr "htest"
ttest_reults$p.value[1] 3.825137e-125
broom::tidy(ttest_reults)# A tibble: 1 × 10
estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -13.8 47.5 61.3 -26.4 3.83e-125 1407. -14.9 -12.8
# ℹ 2 more variables: method <chr>, alternative <chr>
broom::tidy(ttest_reults)$p.value[1] 3.825137e-125
4.13 ggplot2
library(ggplot2)
ggplot(diab_pop, aes(ridageyr, color=diq010) ) +
geom_freqpoly(binwidth = 1) +
labs(title="Age Distribution by Diabetes")
ggplot(diab_pop, aes(riagendr,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010))) +
labs(title="Box Plot of Age by Gender and Diabetes")
ggplot(diab_pop, aes(bmxbmi, fill=diq010) ) +
geom_histogram(binwidth = 1) +
labs(title="BMI Distribution by Diabetes")Warning: Removed 313 rows containing non-finite outside the scale range
(`stat_bin()`).

ggplot(diab_pop, aes(ridageyr)) +
geom_density(aes(fill=factor(diq010)), alpha=0.8) +
labs(title="Density Plot of Age by Diabetes")
ggplot(diab_pop, aes(x=ridageyr, y=bmxbmi, shape=diq010, color=diq010)) +
geom_point() +
geom_smooth(method=lm , aes(fill=diq010)) +
labs(title="Scatter Plot of Age by BMI by Diabetes")`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 313 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 313 rows containing missing values or values outside the scale range
(`geom_point()`).

4.13.1 ggplot - Additional Resources
- ggplot cookbook : http://www.cookbook-r.com/Graphs/
- Top 50 ggplot Visualizations : http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
4.14 dplyr Revisited
Let’s look at some more complicated summary tables that we can produce
diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~min(.), ~mean(.), ~max(.)))# A tibble: 12 × 12
# Groups: diq010, riagendr [4]
diq010 riagendr age_ntile ridageyr_min bmxbmi_min lbxglu_min ridageyr_mean
<fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
1 Diabetes Male 1 21 NA NA 31.5
2 Diabetes Male 2 39 NA NA 51.5
3 Diabetes Male 3 60 NA NA 69.9
4 Diabetes Female 1 21 NA NA 31.6
5 Diabetes Female 2 39 NA NA 50.2
6 Diabetes Female 3 60 NA NA 69.6
7 No Diabe… Male 1 20 NA NA 29.4
8 No Diabe… Male 2 39 NA NA 49.1
9 No Diabe… Male 3 59 NA NA 70.2
10 No Diabe… Female 1 20 NA NA 29.2
11 No Diabe… Female 2 39 NA NA 48.6
12 No Diabe… Female 3 59 NA NA 70.5
# ℹ 5 more variables: bmxbmi_mean <dbl>, lbxglu_mean <dbl>, ridageyr_max <dbl>,
# bmxbmi_max <dbl>, lbxglu_max <dbl>
# We can tell R to remove NA values from computation:
diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr), list(~min(.,na.rm=TRUE), ~mean(.,na.rm=TRUE), ~max(.,na.rm=TRUE)))# A tibble: 12 × 6
# Groups: diq010, riagendr [4]
diq010 riagendr age_ntile min mean max
<fct> <fct> <int> <dbl> <dbl> <dbl>
1 Diabetes Male 1 21 31.5 39
2 Diabetes Male 2 39 51.5 59
3 Diabetes Male 3 60 69.9 80
4 Diabetes Female 1 21 31.6 39
5 Diabetes Female 2 39 50.2 59
6 Diabetes Female 3 60 69.6 80
7 No Diabetes Male 1 20 29.4 39
8 No Diabetes Male 2 39 49.1 59
9 No Diabetes Male 3 59 70.2 80
10 No Diabetes Female 1 20 29.2 39
11 No Diabetes Female 2 39 48.6 59
12 No Diabetes Female 3 59 70.5 80
We can also store our results of our dplyr query as a new dataframe:
Table_Stats_1 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~min(.,na.rm=TRUE), ~mean(.,na.rm=TRUE), ~max(.,na.rm=TRUE))) %>%
ungroup()
Table_Stats_1# A tibble: 12 × 12
diq010 riagendr age_ntile ridageyr_min bmxbmi_min lbxglu_min ridageyr_mean
<fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
1 Diabetes Male 1 21 19.8 76 31.5
2 Diabetes Male 2 39 16 21 51.5
3 Diabetes Male 3 60 17.7 68 69.9
4 Diabetes Female 1 21 24 84 31.6
5 Diabetes Female 2 39 19.6 75 50.2
6 Diabetes Female 3 60 17.7 50 69.6
7 No Diabe… Male 1 20 16.2 70 29.4
8 No Diabe… Male 2 39 15.1 74 49.1
9 No Diabe… Male 3 59 16.4 64 70.2
10 No Diabe… Female 1 20 16.3 74 29.2
11 No Diabe… Female 2 39 14.5 77 48.6
12 No Diabe… Female 3 59 14.5 64 70.5
# ℹ 5 more variables: bmxbmi_mean <dbl>, lbxglu_mean <dbl>, ridageyr_max <dbl>,
# bmxbmi_max <dbl>, lbxglu_max <dbl>
We can pass in user defined functions where they make sense:
SumNa <- function(col){sum(is.na(col))}Table_Stats_2 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(~SumNa(.))) %>%
rename_at(vars(ridageyr,bmxbmi,lbxglu), list(~paste0(.,"_SumNa"))) %>%
ungroup()
Table_Stats_2# A tibble: 12 × 6
diq010 riagendr age_ntile ridageyr_SumNa bmxbmi_SumNa lbxglu_SumNa
<fct> <fct> <int> <int> <int> <int>
1 Diabetes Male 1 0 2 18
2 Diabetes Male 2 0 3 79
3 Diabetes Male 3 0 24 155
4 Diabetes Female 1 0 1 18
5 Diabetes Female 2 0 5 64
6 Diabetes Female 3 0 20 128
7 No Diabetes Male 1 0 57 531
8 No Diabetes Male 2 0 29 420
9 No Diabetes Male 3 0 49 360
10 No Diabetes Female 1 0 44 576
11 No Diabetes Female 2 0 43 515
12 No Diabetes Female 3 0 36 403
We can join our two tables by the ridageyr,bmxbmi, and lbxglu values:
Table_Stats_3 <- Table_Stats_1 %>%
full_join(Table_Stats_2, by=c('diq010','riagendr','age_ntile'))
Table_Stats_3# A tibble: 12 × 15
diq010 riagendr age_ntile ridageyr_min bmxbmi_min lbxglu_min ridageyr_mean
<fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
1 Diabetes Male 1 21 19.8 76 31.5
2 Diabetes Male 2 39 16 21 51.5
3 Diabetes Male 3 60 17.7 68 69.9
4 Diabetes Female 1 21 24 84 31.6
5 Diabetes Female 2 39 19.6 75 50.2
6 Diabetes Female 3 60 17.7 50 69.6
7 No Diabe… Male 1 20 16.2 70 29.4
8 No Diabe… Male 2 39 15.1 74 49.1
9 No Diabe… Male 3 59 16.4 64 70.2
10 No Diabe… Female 1 20 16.3 74 29.2
11 No Diabe… Female 2 39 14.5 77 48.6
12 No Diabe… Female 3 59 14.5 64 70.5
# ℹ 8 more variables: bmxbmi_mean <dbl>, lbxglu_mean <dbl>, ridageyr_max <dbl>,
# bmxbmi_max <dbl>, lbxglu_max <dbl>, ridageyr_SumNa <int>,
# bmxbmi_SumNa <int>, lbxglu_SumNa <int>
We can also get the same result with one query:
Table_Stats_3a <- Table_Stats_1 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup()Let’s compare to make sure things are the same:
arsenal::comparedf(Table_Stats_3,Table_Stats_3a)Compare Object
Function Call:
arsenal::comparedf(x = Table_Stats_3, y = Table_Stats_3a)
Shared: 15 non-by variables and 12 observations.
Not shared: 0 variables and 0 observations.
Differences found in 0/15 variables compared.
0 variables compared have non-identical attributes.
4.15 The tidyverse
Even when we create a summary table things can be complicated, this result will have 3+(3*6) = 21 columns. In this section we will see even more of the data wrangling capabilities in the tidyverse
Table_Stats_3b <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(cnt =~ n() ,
min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
sd = ~ sd(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup()
colnames(Table_Stats_3b) [1] "diq010" "riagendr" "age_ntile" "ridageyr_cnt"
[5] "bmxbmi_cnt" "lbxglu_cnt" "ridageyr_min" "bmxbmi_min"
[9] "lbxglu_min" "ridageyr_mean" "bmxbmi_mean" "lbxglu_mean"
[13] "ridageyr_sd" "bmxbmi_sd" "lbxglu_sd" "ridageyr_max"
[17] "bmxbmi_max" "lbxglu_max" "ridageyr_SumNa" "bmxbmi_SumNa"
[21] "lbxglu_SumNa"
ncol(Table_Stats_3b)[1] 21
Table_Stats_3b# A tibble: 12 × 21
diq010 riagendr age_ntile ridageyr_cnt bmxbmi_cnt lbxglu_cnt ridageyr_min
<fct> <fct> <int> <int> <int> <int> <dbl>
1 Diabetes Male 1 25 25 25 21
2 Diabetes Male 2 136 136 136 39
3 Diabetes Male 3 295 295 295 60
4 Diabetes Female 1 33 33 33 21
5 Diabetes Female 2 124 124 124 39
6 Diabetes Female 3 231 231 231 60
7 No Diabet… Male 1 889 889 889 20
8 No Diabet… Male 2 750 750 750 39
9 No Diabet… Male 3 652 652 652 59
10 No Diabet… Female 1 960 960 960 20
11 No Diabet… Female 2 896 896 896 39
12 No Diabet… Female 3 728 728 728 59
# ℹ 14 more variables: bmxbmi_min <dbl>, lbxglu_min <dbl>, ridageyr_mean <dbl>,
# bmxbmi_mean <dbl>, lbxglu_mean <dbl>, ridageyr_sd <dbl>, bmxbmi_sd <dbl>,
# lbxglu_sd <dbl>, ridageyr_max <dbl>, bmxbmi_max <dbl>, lbxglu_max <dbl>,
# ridageyr_SumNa <int>, bmxbmi_SumNa <int>, lbxglu_SumNa <int>
The gather command is used to “un-pivot” a dataframe, it makes a “wide” data-frame longer. Here I am gathering all columns except for diq010,riagendr,age_ntile, the column names will be placed in Feature_Type and the values will be placed in Value:
Table_Stats_4 <- Table_Stats_3b %>%
tidyr::gather(-diq010,-riagendr,-age_ntile, key="Feature_Type", value="Value")
Table_Stats_4# A tibble: 216 × 5
diq010 riagendr age_ntile Feature_Type Value
<fct> <fct> <int> <chr> <dbl>
1 Diabetes Male 1 ridageyr_cnt 25
2 Diabetes Male 2 ridageyr_cnt 136
3 Diabetes Male 3 ridageyr_cnt 295
4 Diabetes Female 1 ridageyr_cnt 33
5 Diabetes Female 2 ridageyr_cnt 124
6 Diabetes Female 3 ridageyr_cnt 231
7 No Diabetes Male 1 ridageyr_cnt 889
8 No Diabetes Male 2 ridageyr_cnt 750
9 No Diabetes Male 3 ridageyr_cnt 652
10 No Diabetes Female 1 ridageyr_cnt 960
# ℹ 206 more rows
The separate command can be used to separate a column into two columns using a delimiter. Here I am separating Feature_Type into Feature and Type:
Table_Stats_5 <- Table_Stats_4 %>%
tidyr::separate(Feature_Type, into=c('Feature','Type'), sep = "_")
Table_Stats_5# A tibble: 216 × 6
diq010 riagendr age_ntile Feature Type Value
<fct> <fct> <int> <chr> <chr> <dbl>
1 Diabetes Male 1 ridageyr cnt 25
2 Diabetes Male 2 ridageyr cnt 136
3 Diabetes Male 3 ridageyr cnt 295
4 Diabetes Female 1 ridageyr cnt 33
5 Diabetes Female 2 ridageyr cnt 124
6 Diabetes Female 3 ridageyr cnt 231
7 No Diabetes Male 1 ridageyr cnt 889
8 No Diabetes Male 2 ridageyr cnt 750
9 No Diabetes Male 3 ridageyr cnt 652
10 No Diabetes Female 1 ridageyr cnt 960
# ℹ 206 more rows
The spread function is the opposite of gather, it is used to pivot a dataframe and makes data go from a “wide” to “long” format:
Table_Stats_6 <- Table_Stats_5 %>%
tidyr::spread(key='Type',value='Value')
Table_Stats_6# A tibble: 36 × 10
diq010 riagendr age_ntile Feature cnt max mean min sd SumNa
<fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Diabetes Male 1 bmxbmi 25 54 32.5 19.8 8.87 2
2 Diabetes Male 1 lbxglu 25 369 209. 76 120. 18
3 Diabetes Male 1 ridageyr 25 39 31.5 21 5.36 0
4 Diabetes Male 2 bmxbmi 136 57.4 32.6 16 8.12 3
5 Diabetes Male 2 lbxglu 136 454 176. 21 78.8 79
6 Diabetes Male 2 ridageyr 136 59 51.5 39 5.23 0
7 Diabetes Male 3 bmxbmi 295 54.2 30.7 17.7 6.22 24
8 Diabetes Male 3 lbxglu 295 479 163. 68 68.0 155
9 Diabetes Male 3 ridageyr 295 80 69.9 60 6.41 0
10 Diabetes Female 1 bmxbmi 33 59.4 36.6 24 9.07 1
# ℹ 26 more rows
We can now add new columns that we might be interested in:
Table_Stats_7 <- Table_Stats_6 %>%
mutate(pct_NA = SumNa/cnt)
Table_Stats_7# A tibble: 36 × 11
diq010 riagendr age_ntile Feature cnt max mean min sd SumNa pct_NA
<fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Diabe… Male 1 bmxbmi 25 54 32.5 19.8 8.87 2 0.08
2 Diabe… Male 1 lbxglu 25 369 209. 76 120. 18 0.72
3 Diabe… Male 1 ridage… 25 39 31.5 21 5.36 0 0
4 Diabe… Male 2 bmxbmi 136 57.4 32.6 16 8.12 3 0.0221
5 Diabe… Male 2 lbxglu 136 454 176. 21 78.8 79 0.581
6 Diabe… Male 2 ridage… 136 59 51.5 39 5.23 0 0
7 Diabe… Male 3 bmxbmi 295 54.2 30.7 17.7 6.22 24 0.0814
8 Diabe… Male 3 lbxglu 295 479 163. 68 68.0 155 0.525
9 Diabe… Male 3 ridage… 295 80 69.9 60 6.41 0 0
10 Diabe… Female 1 bmxbmi 33 59.4 36.6 24 9.07 1 0.0303
# ℹ 26 more rows
We can make R show us all the columns:
options(tibble.width = Inf)Table_Stats_7# A tibble: 36 × 11
diq010 riagendr age_ntile Feature cnt max mean min sd SumNa
<fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Diabetes Male 1 bmxbmi 25 54 32.5 19.8 8.87 2
2 Diabetes Male 1 lbxglu 25 369 209. 76 120. 18
3 Diabetes Male 1 ridageyr 25 39 31.5 21 5.36 0
4 Diabetes Male 2 bmxbmi 136 57.4 32.6 16 8.12 3
5 Diabetes Male 2 lbxglu 136 454 176. 21 78.8 79
6 Diabetes Male 2 ridageyr 136 59 51.5 39 5.23 0
7 Diabetes Male 3 bmxbmi 295 54.2 30.7 17.7 6.22 24
8 Diabetes Male 3 lbxglu 295 479 163. 68 68.0 155
9 Diabetes Male 3 ridageyr 295 80 69.9 60 6.41 0
10 Diabetes Female 1 bmxbmi 33 59.4 36.6 24 9.07 1
pct_NA
<dbl>
1 0.08
2 0.72
3 0
4 0.0221
5 0.581
6 0
7 0.0814
8 0.525
9 0
10 0.0303
# ℹ 26 more rows
Finally we can sort to see the feature with the highest percentage of missing values is lbxglu among the lowest aged diabetic male population:
Table_Stats_8 <- Table_Stats_7 %>%
arrange(desc(pct_NA))
Table_Stats_8# A tibble: 36 × 11
diq010 riagendr age_ntile Feature cnt max mean min sd SumNa
<fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Diabetes Male 1 lbxglu 25 369 209. 76 120. 18
2 No Diabetes Female 1 lbxglu 960 335 96.0 74 15.3 576
3 No Diabetes Male 1 lbxglu 889 322 101. 70 17.0 531
4 Diabetes Male 2 lbxglu 136 454 176. 21 78.8 79
5 No Diabetes Female 2 lbxglu 896 285 103. 77 20.5 515
6 No Diabetes Male 2 lbxglu 750 397 109. 74 27.5 420
7 Diabetes Female 3 lbxglu 231 461 160. 50 68.4 128
8 No Diabetes Female 3 lbxglu 728 309 109. 64 22.4 403
9 No Diabetes Male 3 lbxglu 652 234 108. 64 16.3 360
10 Diabetes Female 1 lbxglu 33 390 150. 84 79.9 18
pct_NA
<dbl>
1 0.72
2 0.6
3 0.597
4 0.581
5 0.575
6 0.56
7 0.554
8 0.554
9 0.552
10 0.545
# ℹ 26 more rows
Although we built the table one step at a time we can also string together our commands to achieve the same result:
library('tidyverse')── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
✔ readr 2.1.5
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks mice::filter(), stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Table_Stats_9 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3)) %>%
group_by(diq010,riagendr,age_ntile) %>%
summarise_at(vars(ridageyr,bmxbmi,lbxglu), list(cnt =~ n() ,
min = ~ min(.,na.rm=TRUE),
mean= ~ mean(.,na.rm=TRUE),
sd = ~ sd(.,na.rm=TRUE),
max = ~ max(.,na.rm=TRUE),
SumNa =~ SumNa(.))
) %>%
ungroup() %>%
gather(-diq010,-riagendr,-age_ntile, key="Feature_Type", value="Value") %>%
separate(Feature_Type,into=c('Feature','Type')) %>%
spread(key='Type',value='Value') %>%
mutate(pct_NA = SumNa/cnt) %>%
arrange(desc(pct_NA))
Table_Stats_9# A tibble: 36 × 11
diq010 riagendr age_ntile Feature cnt max mean min sd SumNa
<fct> <fct> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Diabetes Male 1 lbxglu 25 369 209. 76 120. 18
2 No Diabetes Female 1 lbxglu 960 335 96.0 74 15.3 576
3 No Diabetes Male 1 lbxglu 889 322 101. 70 17.0 531
4 Diabetes Male 2 lbxglu 136 454 176. 21 78.8 79
5 No Diabetes Female 2 lbxglu 896 285 103. 77 20.5 515
6 No Diabetes Male 2 lbxglu 750 397 109. 74 27.5 420
7 Diabetes Female 3 lbxglu 231 461 160. 50 68.4 128
8 No Diabetes Female 3 lbxglu 728 309 109. 64 22.4 403
9 No Diabetes Male 3 lbxglu 652 234 108. 64 16.3 360
10 Diabetes Female 1 lbxglu 33 390 150. 84 79.9 18
pct_NA
<dbl>
1 0.72
2 0.6
3 0.597
4 0.581
5 0.575
6 0.56
7 0.554
8 0.554
9 0.552
10 0.545
# ℹ 26 more rows
arsenal::comparedf(Table_Stats_8,Table_Stats_9)Compare Object
Function Call:
arsenal::comparedf(x = Table_Stats_8, y = Table_Stats_9)
Shared: 11 non-by variables and 36 observations.
Not shared: 0 variables and 0 observations.
Differences found in 0/11 variables compared.
0 variables compared have non-identical attributes.
4.15.1 See also
tidyverse: https://www.tidyverse.org/Rfor Data Science: https://r4ds.had.co.nz/- Modern
Rwith thetidyverse: https://b-rodrigues.github.io/modern_R/
4.16 One Hot Encoding
Often categorical variables are encoded into binary 0 or 1 flags of new features.
diab_pop %>%
group_by(diq010,riagendr) %>%
summarise(cnt = n())`summarise()` has grouped output by 'diq010'. You can override using the
`.groups` argument.
# A tibble: 4 × 3
# Groups: diq010 [2]
diq010 riagendr cnt
<fct> <fct> <int>
1 Diabetes Male 456
2 Diabetes Female 388
3 No Diabetes Male 2291
4 No Diabetes Female 2584
diab_pop.EncodeSex <- diab_pop %>%
mutate(SexMale = ifelse(riagendr == 'Male',1,0)) %>%
mutate(DM2 = ifelse(diq010 == 'Diabetes',1,0))
glimpse(diab_pop.EncodeSex)Rows: 5,719
Columns: 12
$ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living with partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…
$ SexMale <dbl> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0…
$ DM2 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
diab_pop.EncodeSex %>%
group_by(DM2) %>%
summarise(cnt = sum(SexMale))# A tibble: 2 × 2
DM2 cnt
<dbl> <dbl>
1 0 2291
2 1 456
diab_pop.NoDm2 <- diab_pop.EncodeSex %>%
filter(diq010 == 'No Diabetes')
diab_pop.Dm2 <- diab_pop.EncodeSex %>%
filter(diq010 == 'Diabetes')
hist(diab_pop.NoDm2$SexMale, col=rgb(1,0,0,0.5), main="Overlapping Histogram", xlab="Sex Male")
hist(diab_pop.Dm2$SexMale, col=rgb(0,0,1,0.5), add=T)
box()
heatmap(as.matrix(diab_pop.EncodeSex[,c('DM2','SexMale')]))
Of course encoding each categorical variable can take time so we can use the dummyVars function in the caret package:
library('caret')
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
# The return of dummyVars is a mapping which tells R which variables are factors and will need encoding
caret.dummyVars.diab_pop <- dummyVars(" ~ .", data = diab_pop)
str(caret.dummyVars.diab_pop)List of 9
$ call : language dummyVars.default(formula = " ~ .", data = diab_pop)
$ form :Class 'formula' language ~.
.. ..- attr(*, ".Environment")=<environment: 0x000001ee2a660f08>
$ vars : chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
$ facVars : chr [1:6] "riagendr" "ridreth1" "dmdeduc2" "dmdmartl" ...
$ lvls :List of 6
..$ riagendr: chr [1:2] "Male" "Female"
..$ ridreth1: chr [1:5] "MexicanAmerican" "Other Hispanic" "Non-Hispanic White" "Non-Hispanic Black" ...
..$ dmdeduc2: chr [1:5] "Less than 9th grade" "Grades 9-11th" "High school graduate/GED" "Some college or AA degrees" ...
..$ dmdmartl: chr [1:6] "Married" "Widowed" "Divorced" "Separated" ...
..$ indhhin2: chr [1:14] "$0-$4,999" "$5,000-$9,999" "$10,000-$14,999" "$15,000-$19,999" ...
..$ diq010 : chr [1:2] "Diabetes" "No Diabetes"
$ sep : chr "."
$ terms :Classes 'terms', 'formula' language ~seqn + riagendr + ridageyr + ridreth1 + dmdeduc2 + dmdmartl + indhhin2 + bmxbmi + diq010 + lbxglu
.. ..- attr(*, "variables")= language list(seqn, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, diq010, lbxglu)
.. ..- attr(*, "factors")= int [1:10, 1:10] 1 0 0 0 0 0 0 0 0 0 ...
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
.. .. .. ..$ : chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
.. ..- attr(*, "term.labels")= chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
.. ..- attr(*, "order")= int [1:10] 1 1 1 1 1 1 1 1 1 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 0
.. ..- attr(*, ".Environment")=<environment: 0x000001ee2a660f08>
.. ..- attr(*, "predvars")= language list(seqn, riagendr, ridageyr, ridreth1, dmdeduc2, dmdmartl, indhhin2, bmxbmi, diq010, lbxglu)
.. ..- attr(*, "dataClasses")= Named chr [1:10] "numeric" "factor" "numeric" "factor" ...
.. .. ..- attr(*, "names")= chr [1:10] "seqn" "riagendr" "ridageyr" "ridreth1" ...
$ levelsOnly: logi FALSE
$ fullRank : logi FALSE
- attr(*, "class")= chr "dummyVars"
# We have to apply the mapping to the set:
predict_dummyVars_diab_pop <- predict(caret.dummyVars.diab_pop, newdata = diab_pop)
str(predict_dummyVars_diab_pop) num [1:5719, 1:38] 83732 83733 83734 83735 83736 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:5719] "1" "2" "3" "4" ...
..$ : chr [1:38] "seqn" "riagendr.Male" "riagendr.Female" "ridageyr" ...
# And then we can turn back into a tibble:
encode.diab_pop <- as_tibble(predict_dummyVars_diab_pop)
str(encode.diab_pop)tibble [5,719 × 38] (S3: tbl_df/tbl/data.frame)
$ seqn : num [1:5719] 83732 83733 83734 83735 83736 ...
$ riagendr.Male : num [1:5719] 1 1 1 0 0 0 1 0 1 1 ...
$ riagendr.Female : num [1:5719] 0 0 0 1 1 1 0 1 0 0 ...
$ ridageyr : num [1:5719] 62 53 78 56 42 72 22 32 56 46 ...
$ ridreth1.MexicanAmerican : num [1:5719] 0 0 0 0 0 1 0 1 0 0 ...
$ ridreth1.Other Hispanic : num [1:5719] 0 0 0 0 0 0 0 0 0 0 ...
$ ridreth1.Non-Hispanic White : num [1:5719] 1 1 1 1 0 0 0 0 0 1 ...
$ ridreth1.Non-Hispanic Black : num [1:5719] 0 0 0 0 1 0 1 0 1 0 ...
$ ridreth1.Other : num [1:5719] 0 0 0 0 0 0 0 0 0 0 ...
$ dmdeduc2.Less than 9th grade : num [1:5719] 0 0 0 0 0 0 0 0 0 0 ...
$ dmdeduc2.Grades 9-11th : num [1:5719] 0 0 0 0 0 1 0 0 0 0 ...
$ dmdeduc2.High school graduate/GED : num [1:5719] 0 1 1 0 0 0 0 0 1 0 ...
$ dmdeduc2.Some college or AA degrees: num [1:5719] 0 0 0 0 1 0 1 1 0 0 ...
$ dmdeduc2.College grad or above : num [1:5719] 1 0 0 1 0 0 0 0 0 1 ...
$ dmdmartl.Married : num [1:5719] 1 0 1 0 0 0 0 1 0 0 ...
$ dmdmartl.Widowed : num [1:5719] 0 0 0 0 0 0 0 0 0 0 ...
$ dmdmartl.Divorced : num [1:5719] 0 1 0 0 1 0 0 0 1 0 ...
$ dmdmartl.Separated : num [1:5719] 0 0 0 0 0 1 0 0 0 0 ...
$ dmdmartl.Never married : num [1:5719] 0 0 0 0 0 0 1 0 0 0 ...
$ dmdmartl.Living with partner : num [1:5719] 0 0 0 1 0 0 0 0 0 1 ...
$ indhhin2.$0-$4,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
$ indhhin2.$5,000-$9,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
$ indhhin2.$10,000-$14,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 1 1 ...
$ indhhin2.$15,000-$19,999 : num [1:5719] 0 1 0 0 NA 0 NA 0 0 0 ...
$ indhhin2.$20,000-$24,999 : num [1:5719] 0 0 1 0 NA 0 NA 0 0 0 ...
$ indhhin2.$25,000-$34,999 : num [1:5719] 0 0 0 0 NA 0 NA 1 0 0 ...
$ indhhin2.$35,000-$44,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
$ indhhin2.$45,000-$54,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
$ indhhin2.$55,000-$64,999 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
$ indhhin2.$65,000-$74,999 : num [1:5719] 1 0 0 1 NA 0 NA 0 0 0 ...
$ indhhin2.20,000+ : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
$ indhhin2.less than $20,000 : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
$ indhhin2.$75,000-$99,999 : num [1:5719] 0 0 0 0 NA 1 NA 0 0 0 ...
$ indhhin2.$100,000+ : num [1:5719] 0 0 0 0 NA 0 NA 0 0 0 ...
$ bmxbmi : num [1:5719] 27.8 30.8 28.8 42.4 20.3 28.6 28 28.2 33.6 27.6 ...
$ diq010.Diabetes : num [1:5719] 1 0 1 0 0 0 0 0 1 0 ...
$ diq010.No Diabetes : num [1:5719] 0 1 0 1 1 1 1 1 0 1 ...
$ lbxglu : num [1:5719] NA 101 84 NA 84 107 95 NA NA NA ...
heatmap(as.matrix(encode.diab_pop[,c('diq010.Diabetes','riagendr.Male','dmdmartl.Married','dmdmartl.Widowed','dmdmartl.Divorced', 'dmdmartl.Separated','dmdmartl.Never married')]))
In our last example we left a space in the column name dmdmartl.Never married here is one way to remove it:
library('stringr')
diab_pop_s1 <- diab_pop %>%
mutate(dmdmartl = str_replace(dmdmartl ," ","_"))
glimpse(diab_pop_s1) Rows: 5,719
Columns: 10
$ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <chr> "Married", "Divorced", "Married", "Living_with partner", "Div…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…
The stringr::str_replace function only replaced the first whitespace so that “Living with partner” is now “Living_with partner”. We can use str_replace_all instead to get “Living_with_partner”.
diab_pop_s1 <- diab_pop %>%
mutate(dmdmartl = str_replace_all(dmdmartl ," ","_")) %>%
mutate(dmdmartl = as.factor(dmdmartl))
glimpse(diab_pop_s1) Rows: 5,719
Columns: 10
$ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic White, Non-Hispanic White, Non-Hispanic White, N…
$ dmdeduc2 <fct> College grad or above, High school graduate/GED, High school …
$ dmdmartl <fct> Married, Divorced, Married, Living_with_partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010 <fct> Diabetes, No Diabetes, Diabetes, No Diabetes, No Diabetes, No…
$ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…
diab_pop_f1 <- diab_pop %>%
mutate(dmdmartl = str_replace_all(dmdmartl," ","_")) %>%
mutate(dmdmartl = as.factor(dmdmartl)) %>%
mutate(ridreth1 = str_replace_all(ridreth1," ","_")) %>%
mutate(ridreth1 = as.factor(ridreth1)) %>%
mutate(diq010 = str_replace_all(ridreth1," ","_")) %>%
mutate(diq010 = as.factor(diq010)) %>%
mutate(dmdeduc2 = str_replace_all(dmdeduc2," ","_") ) %>%
mutate(dmdeduc2 = as.factor(dmdeduc2)) %>%
mutate(indhhin2 = str_replace_all(indhhin2," ","_") ) %>%
mutate(indhhin2 = as.factor(indhhin2))
caret.dummyVars.diab_pop_f1 <- dummyVars(" ~ .", data = diab_pop_f1)
predict_dummyVars_diab_pop <- predict(caret.dummyVars.diab_pop_f1, newdata = diab_pop_f1)
encode.diab_pop_f1 <- as_tibble(predict_dummyVars_diab_pop)
glimpse(encode.diab_pop_f1)Rows: 5,719
Columns: 39
$ seqn <dbl> 83732, 83733, 83734, 83735, 83736,…
$ riagendr.Male <dbl> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0…
$ riagendr.Female <dbl> 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56…
$ ridreth1.MexicanAmerican <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
$ `ridreth1.Non-Hispanic_Black` <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0…
$ `ridreth1.Non-Hispanic_White` <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0…
$ ridreth1.Other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ ridreth1.Other_Hispanic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
$ dmdeduc2.College_grad_or_above <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0…
$ `dmdeduc2.Grades_9-11th` <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0…
$ `dmdeduc2.High_school_graduate/GED` <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0…
$ dmdeduc2.Less_than_9th_grade <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ dmdeduc2.Some_college_or_AA_degrees <dbl> 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1…
$ dmdmartl.Divorced <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0…
$ dmdmartl.Living_with_partner <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1…
$ dmdmartl.Married <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0…
$ dmdmartl.Never_married <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0…
$ dmdmartl.Separated <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
$ dmdmartl.Widowed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `indhhin2.$0-$4,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,…
$ `indhhin2.$10,000-$14,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 1, 1, 0,…
$ `indhhin2.$100,000+` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,…
$ `indhhin2.$15,000-$19,999` <dbl> 0, 1, 0, 0, NA, 0, NA, 0, 0, 0, 0,…
$ `indhhin2.$20,000-$24,999` <dbl> 0, 0, 1, 0, NA, 0, NA, 0, 0, 0, 0,…
$ `indhhin2.$25,000-$34,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 1, 0, 0, 0,…
$ `indhhin2.$45,000-$54,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,…
$ `indhhin2.$5,000-$9,999` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,…
$ `indhhin2.$65,000-$74,999` <dbl> 1, 0, 0, 1, NA, 0, NA, 0, 0, 0, 1,…
$ `indhhin2.$75,000-$99,999` <dbl> 0, 0, 0, 0, NA, 1, NA, 0, 0, 0, 0,…
$ `indhhin2.20,000+` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,…
$ `indhhin2.less_than_$20,000` <dbl> 0, 0, 0, 0, NA, 0, NA, 0, 0, 0, 0,…
$ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6…
$ diq010.MexicanAmerican <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0…
$ `diq010.Non-Hispanic_Black` <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0…
$ `diq010.Non-Hispanic_White` <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0…
$ diq010.Other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
$ diq010.Other_Hispanic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
$ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, …
Notice that we are applying the same functions to the columns, so another way to solve this issue might be with a function:
my_replace_all_factor_function <- function(my_data_in , the_column){
enquo_the_column <- enquo(the_column)
my_data_out <- my_data_in %>%
mutate( !! enquo_the_column := str_replace_all(!! enquo_the_column," ","_")) %>%
mutate( !! enquo_the_column := as.factor(!! enquo_the_column))
}Then we can apply our function itteratively:
result1 <- my_replace_all_factor_function(diab_pop,dmdmartl)
result2 <- my_replace_all_factor_function(result1,ridreth1)
result3 <- my_replace_all_factor_function(result2,diq010)
result4 <- my_replace_all_factor_function(result3,dmdeduc2)
result5 <- my_replace_all_factor_function(result4,indhhin2)
arsenal::comparedf(diab_pop_f1 , result5)Compare Object
Function Call:
arsenal::comparedf(x = diab_pop_f1, y = result5)
Shared: 10 non-by variables and 5719 observations.
Not shared: 0 variables and 0 observations.
Differences found in 1/10 variables compared.
1 variables compared have non-identical attributes.
Here’s another way to apply our function:
result_f <- my_replace_all_factor_function(diab_pop,dmdmartl) %>%
my_replace_all_factor_function(ridreth1) %>%
my_replace_all_factor_function(diq010) %>%
my_replace_all_factor_function(dmdeduc2) %>%
my_replace_all_factor_function(indhhin2)
glimpse(result_f)Rows: 5,719
Columns: 10
$ seqn <dbl> 83732, 83733, 83734, 83735, 83736, 83737, 83741, 83742, 83744…
$ riagendr <fct> Male, Male, Male, Female, Female, Female, Male, Female, Male,…
$ ridageyr <dbl> 62, 53, 78, 56, 42, 72, 22, 32, 56, 46, 45, 30, 67, 67, 57, 8…
$ ridreth1 <fct> Non-Hispanic_White, Non-Hispanic_White, Non-Hispanic_White, N…
$ dmdeduc2 <fct> College_grad_or_above, High_school_graduate/GED, High_school_…
$ dmdmartl <fct> Married, Divorced, Married, Living_with_partner, Divorced, Se…
$ indhhin2 <fct> "$65,000-$74,999", "$15,000-$19,999", "$20,000-$24,999", "$65…
$ bmxbmi <dbl> 27.8, 30.8, 28.8, 42.4, 20.3, 28.6, 28.0, 28.2, 33.6, 27.6, 2…
$ diq010 <fct> Diabetes, No_Diabetes, Diabetes, No_Diabetes, No_Diabetes, No…
$ lbxglu <dbl> NA, 101, 84, NA, 84, 107, 95, NA, NA, NA, 84, NA, 130, 284, 3…
arsenal::comparedf(result5 , result_f)Compare Object
Function Call:
arsenal::comparedf(x = result5, y = result_f)
Shared: 10 non-by variables and 5719 observations.
Not shared: 0 variables and 0 observations.
Differences found in 0/10 variables compared.
0 variables compared have non-identical attributes.
4.16.1 addition resources: programming with dplyr
- programming with
dplyrvignette: https://cran.r-project.org/web/packages/dplyr/vignettes/programming.html
4.17 ks-test
library('tidyverse')
diab_pop.NoDM2.age <- (diab_pop %>%
filter(diq010 == "No Diabetes"))$ridageyr
diab_pop.DM2.age <- (diab_pop %>%
filter(diq010 == "Diabetes"))$ridageyr
ks.test(diab_pop.NoDM2.age , diab_pop.DM2.age)$p.valueWarning in ks.test.default(diab_pop.NoDM2.age, diab_pop.DM2.age): p-value will
be approximate in the presence of ties
[1] 1.334401e-92
Now let’s create a table of some ks-tests. We can write a function to assist us:
ks_wrapper <- function(df, factor, factor_level, continuous_feature){
F <- NULL
factor <- enquo(factor)
data.local <- df %>%
select(!! factor, continuous_feature)
X <- data.local %>%
filter(!! factor == factor_level)
Y <- data.local %>%
filter(!(!! factor == factor_level))
F$x <- X[,continuous_feature]
F$y <- Y[,continuous_feature]
ks_test_stat_T <- ks.test(F$x,F$y)
ks_test_stat <- ks_test_stat_T$p.value
result_row <- tibble::enframe(ks_test_stat)
result_row <- result_row %>%
mutate(Feature = continuous_feature)
return(result_row)
}A single call to the function will give us a table with the p-value of the ks-test:
diab_pop_f2 <- diab_pop %>%
mutate(DM2_num = ifelse(diq010 == "Diabetes",1,0))
ks_wrapper(diab_pop_f2, DM2_num,1,"ridageyr")Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(continuous_feature)
# Now:
data %>% select(all_of(continuous_feature))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
Warning in ks.test.default(F$x, F$y): p-value will be approximate in the
presence of ties
# A tibble: 1 × 3
name value Feature
<int> <dbl> <chr>
1 1 1.33e-92 ridageyr
We can iterate our function over other columns and use purrr::map_dfr to make a single table:
compute_ks_cols <- c('ridageyr','bmxbmi','lbxglu')
ks_test_results_table <- purrr::map_dfr(compute_ks_cols,
df = diab_pop_f2,
factor = DM2_num,
factor_level = 1,
ks_wrapper)Warning in ks.test.default(F$x, F$y): p-value will be approximate in the
presence of ties
Warning in ks.test.default(F$x, F$y): p-value will be approximate in the
presence of ties
Warning in ks.test.default(F$x, F$y): p-value will be approximate in the
presence of ties
ks_test_results_table# A tibble: 3 × 3
name value Feature
<int> <dbl> <chr>
1 1 1.33e- 92 ridageyr
2 1 1.00e- 27 bmxbmi
3 1 3.52e-111 lbxglu
ks_test_results_table <- ks_test_results_table %>%
rename(ks_test_p_value=value)
ks_test_results_table# A tibble: 3 × 3
name ks_test_p_value Feature
<int> <dbl> <chr>
1 1 1.33e- 92 ridageyr
2 1 1.00e- 27 bmxbmi
3 1 3.52e-111 lbxglu
We can add to the table, by finding some related values to showcase along:
means.diab_pop_f2 <- diab_pop_f2 %>%
group_by(DM2_num) %>%
summarise_at(vars(c('ridageyr','bmxbmi','lbxglu')), list(~mean(.,na.rm = TRUE),~sd(.,na.rm = TRUE))) %>%
gather(-DM2_num, key="Feature_Type", value="Value") %>%
separate(Feature_Type, into= c('Feature','Type'), sep='_') %>%
mutate(DM2_stats_char = ifelse(DM2_num == 1,"Diabetes","No_Diabetes")) %>%
mutate(DM2_stats_char_Type = paste0(DM2_stats_char,"_",Type)) %>%
select(-DM2_num,-Type,-DM2_stats_char) %>%
select(Feature, DM2_stats_char_Type, Value) %>%
spread(key='DM2_stats_char_Type', value = 'Value')
means.diab_pop_f2# A tibble: 3 × 5
Feature Diabetes_mean Diabetes_sd No_Diabetes_mean No_Diabetes_sd
<chr> <dbl> <dbl> <dbl> <dbl>
1 bmxbmi 32.6 7.73 29.0 6.82
2 lbxglu 166. 73.5 104. 20.7
3 ridageyr 61.3 13.3 47.5 17.6
Finally we can join our tables together:
ks_table_1 <- ks_test_results_table %>%
left_join(means.diab_pop_f2) %>%
select(Feature, Diabetes_mean, Diabetes_sd, ks_test_p_value, No_Diabetes_mean, No_Diabetes_sd)Joining with `by = join_by(Feature)`
ks_table_1# A tibble: 3 × 6
Feature Diabetes_mean Diabetes_sd ks_test_p_value No_Diabetes_mean
<chr> <dbl> <dbl> <dbl> <dbl>
1 ridageyr 61.3 13.3 1.33e- 92 47.5
2 bmxbmi 32.6 7.73 1.00e- 27 29.0
3 lbxglu 166. 73.5 3.52e-111 104.
No_Diabetes_sd
<dbl>
1 17.6
2 6.82
3 20.7
We can also use R to make the table more presentable:
install_if_not('kableExtra')[1] "the package 'kableExtra' is already installed"
library('kableExtra')
Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':
group_rows
ks_table_style <- kable(ks_table_1) %>%
kable_styling("striped") %>%
add_header_above(c(" " = 1, "Diabetes" = 2, "ks-test pvalue" = 1, "Non Diabetes" = 2))
ks_table_style| Feature | Diabetes_mean | Diabetes_sd | ks_test_p_value | No_Diabetes_mean | No_Diabetes_sd |
|---|---|---|---|---|---|
| ridageyr | 61.33768 | 13.347476 | 0 | 47.49313 | 17.635870 |
| bmxbmi | 32.59632 | 7.733781 | 0 | 29.01988 | 6.823482 |
| lbxglu | 165.98953 | 73.475333 | 0 | 103.94155 | 20.720142 |
ggplot(diab_pop, aes(diq010,ridageyr)) +
geom_boxplot(aes(fill=factor(diq010)))
ggplot(diab_pop, aes(x=ridageyr)) + geom_histogram() + facet_wrap(~diq010)`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

diab_pop.NoDm2 <- diab_pop %>% filter(diq010 == 'No Diabetes')
diab_pop.Dm2 <- diab_pop %>% filter(diq010 == 'Diabetes')
hist(diab_pop.NoDm2$ridageyr, col=rgb(1,0,0,0.5), main="Overlapping Histograms Diabetics by Age", xlab="Age")
hist(diab_pop.Dm2$ridageyr, col=rgb(0,0,1,0.5), add=T)
legend("topright", c("No Diabetes", "Diabetes"), fill=c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)))
box()
4.17.1 addition resources
- programming with
purrrvignette: https://purrr.tidyverse.org/ purrrcheatsheet: https://github.com/rstudio/cheatsheets/blob/master/purrr.pdf- kableExtra : https://cran.r-project.org/web/packages/kableExtra/vignettes/awesome_table_in_html.html
4.18 tableone
install_if_not('tableone')[1] "the package 'tableone' is already installed"
diab_pop2 <- diab_pop %>%
mutate(age_ntile = ntile(ridageyr,3))
my_table_one <- tableone::CreateTableOne(
vars = c('ridageyr','bmxbmi','lbxglu',"riagendr","age_ntile"),
strata = c("diq010"),
data = diab_pop2,
factorVars = c("riagendr","age_ntile")
)
my_table_one Stratified by diq010
Diabetes No Diabetes p test
n 844 4875
ridageyr (mean (SD)) 61.34 (13.35) 47.49 (17.64) <0.001
bmxbmi (mean (SD)) 32.60 (7.73) 29.02 (6.82) <0.001
lbxglu (mean (SD)) 165.99 (73.48) 103.94 (20.72) <0.001
riagendr = Female (%) 388 (46.0) 2584 (53.0) <0.001
age_ntile (%) <0.001
1 58 ( 6.9) 1849 (37.9)
2 260 (30.8) 1646 (33.8)
3 526 (62.3) 1380 (28.3)
4.18.1 tableone - Additional Resources
- tableone vignette: https://cran.r-project.org/web/packages/tableone/vignettes/introduction.html
4.19 tableby with arsenal
install_if_not(c('arsenal','knitr','kableExtra'))[1] "the package 'arsenal' is already installed"
[2] "the package 'knitr' is already installed"
[3] "the package 'kableExtra' is already installed"
library('arsenal')
Attaching package: 'arsenal'
The following object is masked from 'package:lubridate':
is.Date
library('knitr')
library('kableExtra')
table_arsenal <- tableby(diq010 ~ riagendr + dmdmartl, data = diab_pop)
table_arsenaltableby Object
Function Call:
tableby(formula = diq010 ~ riagendr + dmdmartl, data = diab_pop)
Variable(s):
diq010 ~ riagendr, dmdmartl
summary(table_arsenal, title="Diabetes by Sex and Marital Status")
Table: Diabetes by Sex and Marital Status
| | Diabetes (N=844) | No Diabetes (N=4875) | Total (N=5719) | p value|
|:-------------------------------------|:----------------:|:--------------------:|:--------------:|-------:|
|**riagendr** | | | | < 0.001|
| Male | 456 (54.0%) | 2291 (47.0%) | 2747 (48.0%) | |
| Female | 388 (46.0%) | 2584 (53.0%) | 2972 (52.0%) | |
|**dmdmartl** | | | | < 0.001|
| N-Miss | 0 | 3 | 3 | |
| Married | 471 (55.8%) | 2415 (49.6%) | 2886 (50.5%) | |
| Widowed | 100 (11.8%) | 321 (6.6%) | 421 (7.4%) | |
| Divorced | 106 (12.6%) | 508 (10.4%) | 614 (10.7%) | |
| Separated | 29 (3.4%) | 163 (3.3%) | 192 (3.4%) | |
| Never married | 83 (9.8%) | 965 (19.8%) | 1048 (18.3%) | |
| Living with partner | 55 (6.5%) | 500 (10.3%) | 555 (9.7%) | |
summary.table_arsenal <- summary(table_arsenal, title="Diabetes by Sex and Marital Status",results="asis") %>%
kable() %>%
kable_styling("striped")
summary.table_arsenal | Diabetes (N=844) | No Diabetes (N=4875) | Total (N=5719) | p value | |
|---|---|---|---|---|
| **riagendr** | < 0.001 | |||
| Male | 456 (54.0%) | 2291 (47.0%) | 2747 (48.0%) | |
| Female | 388 (46.0%) | 2584 (53.0%) | 2972 (52.0%) | |
| **dmdmartl** | < 0.001 | |||
| N-Miss | 0 | 3 | 3 | |
| Married | 471 (55.8%) | 2415 (49.6%) | 2886 (50.5%) | |
| Widowed | 100 (11.8%) | 321 (6.6%) | 421 (7.4%) | |
| Divorced | 106 (12.6%) | 508 (10.4%) | 614 (10.7%) | |
| Separated | 29 (3.4%) | 163 (3.3%) | 192 (3.4%) | |
| Never married | 83 (9.8%) | 965 (19.8%) | 1048 (18.3%) | |
| Living with partner | 55 (6.5%) | 500 (10.3%) | 555 (9.7%) |
4.19.1 tableby - Additional Resources
- tableby vignette: https://cran.r-project.org/web/packages/arsenal/vignettes/tableby.html
4.20 anova
res.aov <- aov(ridageyr ~ diq010, data=diab_pop)
summary(res.aov) Df Sum Sq Mean Sq F value Pr(>F)
diq010 1 137897 137897 473.2 <2e-16 ***
Residuals 5717 1666115 291
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
4.21 manova
res.mov <- manova(cbind(ridageyr,bmxbmi) ~ diq010, data=diab_pop)
summary(res.mov) Df Pillai approx F num Df den Df Pr(>F)
diq010 1 0.10247 308.43 2 5403 < 2.2e-16 ***
Residuals 5404
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(res.mov) Response ridageyr :
Df Sum Sq Mean Sq F value Pr(>F)
diq010 1 122870 122870 425.86 < 2.2e-16 ***
Residuals 5404 1559176 289
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Response bmxbmi :
Df Sum Sq Mean Sq F value Pr(>F)
diq010 1 8619 8619.1 177.74 < 2.2e-16 ***
Residuals 5404 262052 48.5
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
313 observations deleted due to missingness
4.22 psych package
install_if_not("psych")[1] "the package 'psych' is already installed"
psych::describe(diab_pop) vars n mean sd median trimmed mad min max
seqn 1 5719 88671.71 2877.82 88651.0 88662.57 3703.53 83732.0 93702.0
riagendr* 2 5719 1.52 0.50 2.0 1.52 0.00 1.0 2.0
ridageyr 3 5719 49.54 17.76 49.0 49.19 22.24 20.0 80.0
ridreth1* 4 5719 3.04 1.29 3.0 3.05 1.48 1.0 5.0
dmdeduc2* 5 5714 3.43 1.30 4.0 3.54 1.48 1.0 5.0
dmdmartl* 6 5716 2.61 1.89 1.0 2.39 0.00 1.0 6.0
indhhin2* 7 4421 8.47 4.32 8.0 8.59 5.93 1.0 14.0
bmxbmi 8 5406 29.54 7.08 28.5 28.86 6.38 14.5 67.3
diq010* 9 5719 1.85 0.35 2.0 1.94 0.00 1.0 2.0
lbxglu 10 2452 113.61 41.33 103.0 105.08 13.34 21.0 479.0
range skew kurtosis se
seqn 9970.0 0.02 -1.20 38.05
riagendr* 1.0 -0.08 -1.99 0.01
ridageyr 60.0 0.11 -1.14 0.23
ridreth1* 4.0 -0.12 -0.96 0.02
dmdeduc2* 4.0 -0.49 -0.84 0.02
dmdmartl* 5.0 0.63 -1.26 0.03
indhhin2* 13.0 -0.03 -1.46 0.06
bmxbmi 52.8 1.12 2.01 0.10
diq010* 1.0 -1.99 1.95 0.00
lbxglu 458.0 3.98 20.33 0.83
library(psych)
Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':
%+%, alpha
describeBy(diab_pop, diab_pop$diq010, skew=FALSE, ranges=TRUE)
Descriptive statistics by group
group: Diabetes
vars n mean sd median min max range se
seqn 1 844 88574.35 2845.20 88610.0 83732 93689.0 9957.0 97.94
riagendr 2 844 1.46 0.50 1.0 1 2.0 1.0 0.02
ridageyr 3 844 61.34 13.35 63.0 21 80.0 59.0 0.46
ridreth1 4 844 2.87 1.35 3.0 1 5.0 4.0 0.05
dmdeduc2 5 843 3.09 1.37 3.0 1 5.0 4.0 0.05
dmdmartl 6 844 2.19 1.65 1.0 1 6.0 5.0 0.06
indhhin2 7 650 7.24 4.23 6.0 1 14.0 13.0 0.17
bmxbmi 8 789 32.60 7.73 31.4 16 67.3 51.3 0.28
diq010 9 844 1.00 0.00 1.0 1 1.0 0.0 0.00
lbxglu 10 382 165.99 73.48 147.0 21 479.0 458.0 3.76
------------------------------------------------------------
group: No Diabetes
vars n mean sd median min max range se
seqn 1 4875 88688.57 2883.38 88654 83733.0 93702.0 9969.0 41.30
riagendr 2 4875 1.53 0.50 2 1.0 2.0 1.0 0.01
ridageyr 3 4875 47.49 17.64 46 20.0 80.0 60.0 0.25
ridreth1 4 4875 3.07 1.28 3 1.0 5.0 4.0 0.02
dmdeduc2 5 4871 3.49 1.28 4 1.0 5.0 4.0 0.02
dmdmartl 6 4872 2.68 1.92 2 1.0 6.0 5.0 0.03
indhhin2 7 3771 8.68 4.30 8 1.0 14.0 13.0 0.07
bmxbmi 8 4617 29.02 6.82 28 14.5 64.6 50.1 0.10
diq010 9 4875 2.00 0.00 2 2.0 2.0 0.0 0.00
lbxglu 10 2070 103.94 20.72 101 64.0 397.0 333.0 0.46
detach(package:psych)4.22.1 psych - Additional Resources
- Into to psych : https://rdrr.io/cran/psych/f/inst/doc/intro.pdf
4.23 GGally::ggpairs
install_if_not('GGally')[1] "the package 'GGally' is already installed"
data2 <- diab_pop %>% na.omit()
GGally::ggpairs(data2)Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4.24 skimr
install_if_not('skimr')[1] "the package 'skimr' is already installed"
library('skimr')
skim(diab_pop) %>%
summary() | Name | diab_pop |
| Number of rows | 5719 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| factor | 6 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
skim(diab_pop) | Name | diab_pop |
| Number of rows | 5719 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| factor | 6 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| riagendr | 0 | 1.00 | FALSE | 2 | Fem: 2972, Mal: 2747 |
| ridreth1 | 0 | 1.00 | FALSE | 5 | Non: 1863, Non: 1198, Mex: 995, Oth: 895 |
| dmdeduc2 | 5 | 1.00 | FALSE | 5 | Som: 1692, Col: 1422, Hig: 1236, Les: 688 |
| dmdmartl | 3 | 1.00 | FALSE | 6 | Mar: 2886, Nev: 1048, Div: 614, Liv: 555 |
| indhhin2 | 1298 | 0.77 | FALSE | 12 | $10: 910, $25: 596, $75: 524, $45: 451 |
| diq010 | 0 | 1.00 | FALSE | 2 | No : 4875, Dia: 844 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| seqn | 0 | 1.00 | 88671.71 | 2877.82 | 83732.0 | 86170.5 | 88651.0 | 91165.5 | 93702.0 | ▇▇▇▇▇ |
| ridageyr | 0 | 1.00 | 49.54 | 17.76 | 20.0 | 34.0 | 49.0 | 64.0 | 80.0 | ▇▇▇▇▆ |
| bmxbmi | 313 | 0.95 | 29.54 | 7.08 | 14.5 | 24.5 | 28.5 | 33.2 | 67.3 | ▅▇▂▁▁ |
| lbxglu | 3267 | 0.43 | 113.61 | 41.33 | 21.0 | 95.0 | 103.0 | 114.0 | 479.0 | ▇▂▁▁▁ |
diab_pop %>%
group_by(diq010) %>%
skim() | Name | Piped data |
| Number of rows | 5719 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| factor | 5 |
| numeric | 4 |
| ________________________ | |
| Group variables | diq010 |
Variable type: factor
| skim_variable | diq010 | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|---|
| riagendr | Diabetes | 0 | 1.00 | FALSE | 2 | Mal: 456, Fem: 388 |
| riagendr | No Diabetes | 0 | 1.00 | FALSE | 2 | Fem: 2584, Mal: 2291 |
| ridreth1 | Diabetes | 0 | 1.00 | FALSE | 5 | Non: 217, Non: 203, Mex: 200, Oth: 119 |
| ridreth1 | No Diabetes | 0 | 1.00 | FALSE | 5 | Non: 1646, Non: 995, Mex: 795, Oth: 790 |
| dmdeduc2 | Diabetes | 1 | 1.00 | FALSE | 5 | Som: 229, Hig: 185, Les: 161, Col: 145 |
| dmdeduc2 | No Diabetes | 4 | 1.00 | FALSE | 5 | Som: 1463, Col: 1277, Hig: 1051, Gra: 553 |
| dmdmartl | Diabetes | 0 | 1.00 | FALSE | 6 | Mar: 471, Div: 106, Wid: 100, Nev: 83 |
| dmdmartl | No Diabetes | 3 | 1.00 | FALSE | 6 | Mar: 2415, Nev: 965, Div: 508, Liv: 500 |
| indhhin2 | Diabetes | 194 | 0.77 | FALSE | 12 | $10: 89, $25: 86, $10: 81, $20: 68 |
| indhhin2 | No Diabetes | 1104 | 0.77 | FALSE | 12 | $10: 821, $25: 510, $75: 472, $45: 386 |
Variable type: numeric
| skim_variable | diq010 | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| seqn | Diabetes | 0 | 1.00 | 88574.35 | 2845.20 | 83732.0 | 86068.75 | 88610.0 | 90994.5 | 93689.0 | ▇▇▇▇▇ |
| seqn | No Diabetes | 0 | 1.00 | 88688.57 | 2883.38 | 83733.0 | 86189.50 | 88654.0 | 91193.0 | 93702.0 | ▇▇▇▇▇ |
| ridageyr | Diabetes | 0 | 1.00 | 61.34 | 13.35 | 21.0 | 53.00 | 63.0 | 71.0 | 80.0 | ▁▂▅▇▇ |
| ridageyr | No Diabetes | 0 | 1.00 | 47.49 | 17.64 | 20.0 | 32.00 | 46.0 | 61.0 | 80.0 | ▇▇▆▆▅ |
| bmxbmi | Diabetes | 55 | 0.93 | 32.60 | 7.73 | 16.0 | 27.40 | 31.4 | 36.3 | 67.3 | ▂▇▂▁▁ |
| bmxbmi | No Diabetes | 258 | 0.95 | 29.02 | 6.82 | 14.5 | 24.10 | 28.0 | 32.5 | 64.6 | ▃▇▂▁▁ |
| lbxglu | Diabetes | 462 | 0.45 | 165.99 | 73.48 | 21.0 | 117.00 | 147.0 | 192.0 | 479.0 | ▃▇▂▁▁ |
| lbxglu | No Diabetes | 2805 | 0.42 | 103.94 | 20.72 | 64.0 | 94.00 | 101.0 | 109.0 | 397.0 | ▇▁▁▁▁ |
4.25 Sankey Network
install_if_not('networkD3')[1] "the package 'networkD3' is already installed"
library(networkD3)
library(tidyverse)
by_sex <- diab_pop %>%
group_by(riagendr) %>%
summarise(cnt = n_distinct(seqn))
by_sex_by_eth <- diab_pop %>%
group_by(riagendr, ridreth1) %>%
summarise(cnt = n_distinct(seqn))`summarise()` has grouped output by 'riagendr'. You can override using the
`.groups` argument.
by_sex_by_dm2 <- diab_pop %>%
group_by(riagendr, diq010) %>%
summarise(cnt = n_distinct(seqn))`summarise()` has grouped output by 'riagendr'. You can override using the
`.groups` argument.
join1 <- by_sex %>%
left_join(by_sex_by_eth,
by= c('riagendr'='riagendr'),
suffix = c("_sex","_eth")) %>%
mutate(diff_sex_eth = cnt_sex - cnt_eth) %>%
mutate(source = riagendr,
target = ridreth1) %>%
mutate(diff = diff_sex_eth) %>%
select(source, target, diff)
join2 <- by_sex %>%
left_join(by_sex_by_dm2,
by= c('riagendr'='riagendr'),
suffix = c("_sex","_dm2")) %>%
mutate(diff_sex_dm2 = cnt_sex - cnt_dm2 )%>%
mutate(source = riagendr,
target = diq010) %>%
mutate(diff = diff_sex_dm2)%>%
select(source, target, diff)
by_dm2 <-diab_pop %>%
group_by(diq010)%>%
summarise(cnt = n_distinct(seqn))
by_dm2_by_eth <-diab_pop %>%
group_by(diq010, ridreth1)%>%
summarise(cnt = n_distinct(seqn))`summarise()` has grouped output by 'diq010'. You can override using the
`.groups` argument.
join3 <- by_dm2 %>%
left_join(by_dm2_by_eth,
by= c('diq010'='diq010'),
suffix = c("_dm2","_eth")) %>%
mutate(diff_dm2_eth = cnt_dm2 - cnt_eth) %>%
mutate(source = ridreth1,
target = diq010) %>%
mutate(diff = diff_dm2_eth) %>%
select(source, target, diff)
JOIN <- bind_rows(join1, join2, join3)
nodes <- data.frame(
name=c( as.character(JOIN$source),
as.character(JOIN$target))
%>% unique()
)
JOIN$IDsource <- match(JOIN$source, nodes$name)-1
JOIN$IDtarget <- match(JOIN$target, nodes$name)-1
p <- sankeyNetwork(Links = JOIN,
Nodes = nodes,
Source = "IDsource",
Target = "IDtarget",
Value = "diff",
NodeID = "name",
sinksRight=FALSE)Links is a tbl_df. Converting to a plain data frame.
p4.25.1 Additional Sankey Network resources:
- Wikipedia Sankey Diagram - https://en.wikipedia.org/wiki/Sankey_diagram
- Other resources - https://www.r-graph-gallery.com/sankey-diagram.html
- Automated Sankey on a Feature Set - https://rpubs.com/jkylearmstrong/Automated_Sankey_Network_Diagram_Example