8 Functional dbplyr, purrr, and furrr

Code

library('tidyverse')

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Code

source(here::here('Functions/connect_help.R'))

8.0.0.1 `source` current target

Code

source(here::here("DATA/Part_4/FEATURES/Store_TARGET_DM2.R"))

Joining with `by = join_by(yr_range)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`

8.0.0.2 `source` current features

Code

source(here::here('DATA/Part_4/FEATURES/Store_DEMO.R'))

Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`

Code

source(here::here('DATA/Part_4/FEATURES/Store_EXAM.R'))

Joining with `by = join_by(SEQN, yr_range)`
Joining with `by = join_by(SEQN, yr_range)`

Code

source(here::here('DATA/Part_4/FEATURES/Store_LABS.R'))

Joining with `by = join_by(SEQN, yr_range)`
Joining with `by = join_by(SEQN, yr_range)`
Joining with `by = join_by(SEQN, yr_range)`
Joining with `by = join_by(SEQN, yr_range)`
Joining with `by = join_by(SEQN, yr_range)`

Warning: Expected 2 pieces. Missing pieces filled with `NA` in 30 rows [1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, ...].

8.0.1 `A_DATA_TBL_2`

We want to make a new Analytic data-set from additional features that have been developed. We might begin to cultivate a Feature Store which might comprise of either code or data to replicate features.

Code

A_DATA_TBL_2 <- DEMO %>%
  select(SEQN) %>%
  left_join(OUTCOME_TBL) %>%
  left_join(DEMO_FEATURES) %>%
  left_join(EXAM_TBL) %>%
  left_join(LABS_TBL)

Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN)`
Joining with `by = join_by(SEQN, yr_range)`

Get the dimension from a connection

Code

nrow.table <- function(con_tbl){
  # number of rows
  Tot_N_Rows <- con_tbl %>%
    summarise(n = n()) %>%
    pull(n)  
  Tot_N_Rows
}

nrow.table(A_DATA_TBL_2)

[1] 101316

Code

Tot_N_Rows

Error in eval(expr, envir, enclos): object 'Tot_N_Rows' not found

Code

dim.table <- function(con_tbl){
  # number of rows
  Tot_N_Rows <- nrow.table(con_tbl)  

  # number of columns
  Tot_N_Cols <- tbl %>% 
    colnames() %>% 
    length()
  
  result <- c(Tot_N_Rows, Tot_N_Cols)
  result
} 



dim.table(A_DATA_TBL_2)

[1] 101316      0

Code

Tot_N_Rows <- nrow.table(A_DATA_TBL_2)

Code

#look at the data-set 
A_DATA_TBL_2 %>% 
  head()

# Source:   SQL [6 x 133]
# Database: sqlite 3.46.0 [C:\Users\jkyle\Documents\GitHub\Jeff_R_Data_Wrangling\DATA\sql_db\NHANES.sqlite]
   SEQN DIABETES AGE_AT_DIAG_DM2   Age Gender Race  USAF  Birth_Country
  <dbl>    <dbl> <lgl>           <dbl> <chr>  <chr> <chr> <chr>        
1     1        0 NA                  2 Female Black <NA>  USA          
2     2        0 NA                 77 Male   White Yes   USA          
3     3        0 NA                 10 Female White <NA>  <NA>         
4     4        0 NA                  1 Male   Black <NA>  USA          
5     5        0 NA                 49 Male   White Yes   USA          
6     6        0 NA                 19 Female Other No    USA          
# ℹ 125 more variables: Grade_Level <chr>, Grade_Range <chr>,
#   Marital_Status <chr>, Pregnant <chr>, Household_Icome <lgl>,
#   Family_Income <lgl>, Poverty_Income_Ratio <dbl>, yr_range <chr>,
#   PEASCST1 <dbl>, PEASCTM1 <dbl>, PEASCCT1 <dbl>, BPXCHR <dbl>,
#   BPQ150A <dbl>, BPQ150B <dbl>, BPQ150C <dbl>, BPQ150D <dbl>, BPAARM <dbl>,
#   BPACSZ <dbl>, BPXPLS <dbl>, BPXDB <dbl>, BPXPULS <dbl>, BPXPTY <dbl>,
#   BPXML1 <dbl>, BPXSY1 <dbl>, BPXDI1 <dbl>, BPAEN1 <dbl>, BPXSY2 <dbl>, …

8.1 Determining Categorical and Continuous Features

Code

Not_features <- c('SEQN', 'DIABETES', 'AGE_AT_DIAG_DM2')
All_featres <- setdiff(colnames(A_DATA_TBL_2), Not_features)

Code

N_levels_features <- A_DATA_TBL_2 %>%
  select(- all_of(Not_features)) %>%
  summarise(
    across(
      all_of(All_featres), 
      \(x){n_distinct(x)}
      )
  ) %>%
  collect() # note this collects all the data

N_levels_features %>%
  head()

# A tibble: 1 × 130
    Age Gender  Race  USAF Birth_Country Grade_Level Grade_Range Marital_Status
  <int>  <int> <int> <int>         <int>       <int>       <int>          <int>
1    86      2     5     2             2          20           5              6
# ℹ 122 more variables: Pregnant <int>, Household_Icome <int>,
#   Family_Income <int>, Poverty_Income_Ratio <int>, yr_range <int>,
#   PEASCST1 <int>, PEASCTM1 <int>, PEASCCT1 <int>, BPXCHR <int>,
#   BPQ150A <int>, BPQ150B <int>, BPQ150C <int>, BPQ150D <int>, BPAARM <int>,
#   BPACSZ <int>, BPXPLS <int>, BPXDB <int>, BPXPULS <int>, BPXPTY <int>,
#   BPXML1 <int>, BPXSY1 <int>, BPXDI1 <int>, BPAEN1 <int>, BPXSY2 <int>,
#   BPXDI2 <int>, BPAEN2 <int>, BPXSY3 <int>, BPXDI3 <int>, BPAEN3 <int>, …

Code

N_levels_features %>%
  pivot_longer( cols = all_of(All_featres), values_to = "N_Distinct_Values") %>%
  arrange(desc(N_Distinct_Values))

# A tibble: 130 × 2
   name     N_Distinct_Values
   <chr>                <int>
 1 WTSAF2YR             18496
 2 URDACT                6720
 3 BMXBMI                4245
 4 LBDINSI               3990
 5 LBXIN                 3989
 6 URXUMA                2498
 7 URXUMS                2498
 8 BMXWT                 2077
 9 URDACT2               2035
10 PEASCTM1              1474
# ℹ 120 more rows

Code

N_levels_features %>%
  pivot_longer( cols = all_of(All_featres), values_to = "N_Distinct_Values") %>%
  mutate(Fearure = reorder(name, -N_Distinct_Values)) %>%
  mutate(Percentage_Distinct_Features = N_Distinct_Values/Tot_N_Rows ) %>%
  ggplot(aes(x= Fearure, y=Percentage_Distinct_Features , fill=Percentage_Distinct_Features)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme(legend.position = "none")

Count of Distinct Number of Values Per Feature

Code

N_levels_features %>%
  pivot_longer( cols = all_of(All_featres), values_to = "N_Distinct_Values") %>%
  arrange(N_Distinct_Values) %>%
  filter(15 <  N_Distinct_Values &  N_Distinct_Values < 70)

# A tibble: 9 × 2
  name        N_Distinct_Values
  <chr>                   <int>
1 Grade_Level                20
2 PHAFSTHR                   44
3 BPXML1                     47
4 PHAFSTMN                   60
5 BPXDI4                     64
6 BPXPLS                     66
7 BPXDI1                     67
8 BPXDI3                     68
9 BPXDI2                     69

Code

categorical_features <- N_levels_features %>%
  pivot_longer( cols = all_of(All_featres), values_to = "N_Distinct_Values") %>%
  arrange(N_Distinct_Values) %>%
  filter(N_Distinct_Values <= 25) %>%
  pull(name)

names(categorical_features) <- categorical_features
categorical_features

         BMAUPREL          BMAUPLEL           BMIHEAD            BMILEG 
       "BMAUPREL"        "BMAUPLEL"         "BMIHEAD"          "BMILEG" 
          BMICALF           BMIARML           BMIARMC          BMIWAIST 
        "BMICALF"         "BMIARML"         "BMIARMC"        "BMIWAIST" 
         BMITHICR          BMAUREXT          BMAULEXT          BMALOREX 
       "BMITHICR"        "BMAUREXT"        "BMAULEXT"        "BMALOREX" 
         BMALORKN          BMALLKNE          BMDRECUF           BMDSUBF 
       "BMALORKN"        "BMALLKNE"        "BMDRECUF"         "BMDSUBF" 
         BMDTHICF           BMDLEGF          BMDARMLF          BMDCALFF 
       "BMDTHICF"         "BMDLEGF"        "BMDARMLF"        "BMDCALFF" 
           BMIHIP            Gender              USAF     Birth_Country 
         "BMIHIP"          "Gender"            "USAF"   "Birth_Country" 
         Pregnant           BPQ150A           BPQ150B           BPQ150C 
       "Pregnant"         "BPQ150A"         "BPQ150B"         "BPQ150C" 
          BPQ150D           BPXPULS            BPXPTY            BPAEN1 
        "BPQ150D"         "BPXPULS"          "BPXPTY"          "BPAEN1" 
           BPAEN2            BPAEN3            BPAEN4          BMIRECUM 
         "BPAEN2"          "BPAEN3"          "BPAEN4"        "BMIRECUM" 
            BMIHT            BMITRI            BMISUB            BMAAMP 
          "BMIHT"          "BMITRI"          "BMISUB"          "BMAAMP" 
         BMALLEXT          URDUMALC          URDUCRLC           LBDINLC 
       "BMALLEXT"        "URDUMALC"        "URDUCRLC"         "LBDINLC" 
         PEASCST1            BPAARM          BMAEXSTS             BMIWT 
       "PEASCST1"          "BPAARM"        "BMAEXSTS"           "BMIWT" 
          URXPREG          BMDSTATS           BMDBMIC              Race 
        "URXPREG"        "BMDSTATS"         "BMDBMIC"            "Race" 
      Grade_Range            BPACSZ          BMDSADCM    Marital_Status 
    "Grade_Range"          "BPACSZ"        "BMDSADCM"  "Marital_Status" 
         BMAEXCMT          yr_range             BPXDB   Household_Icome 
       "BMAEXCMT"        "yr_range"           "BPXDB" "Household_Icome" 
    Family_Income          PEASCCT1       Grade_Level 
  "Family_Income"        "PEASCCT1"     "Grade_Level"

Currently, there are 63 and

Code

numeric_features <- N_levels_features %>%
  pivot_longer( cols = all_of(All_featres), values_to = "N_Distinct_Values") %>%
  arrange(N_Distinct_Values) %>%
  filter(N_Distinct_Values > 25) %>%
  pull(name)

names(numeric_features) <- numeric_features 
numeric_features

              PHAFSTHR                 BPXML1               PHAFSTMN 
            "PHAFSTHR"               "BPXML1"             "PHAFSTMN" 
                BPXDI4                 BPXPLS                 BPXDI1 
              "BPXDI4"               "BPXPLS"               "BPXDI1" 
                BPXDI3                 BPXDI2                 BPXSY4 
              "BPXDI3"               "BPXDI2"               "BPXSY4" 
                BPXCHR                 BPXSY3                    Age 
              "BPXCHR"               "BPXSY3"                  "Age" 
                BPXSY2                 BPXSY1                 BPXDAR 
              "BPXSY2"               "BPXSY1"               "BPXDAR" 
               BMXHEAD                 LBDHDD               LBDHDDSI 
             "BMXHEAD"               "LBDHDD"             "LBDHDDSI" 
                BPXSAR                 LBXAPB               LBDAPBSI 
              "BPXSAR"               "LBXAPB"             "LBDAPBSI" 
               LBDLDLM               LBDLDMSI                LBDLDLN 
             "LBDLDLM"             "LBDLDMSI"              "LBDLDLN" 
              LBDLDNSI                BMXSAD3                BMXSAD4 
            "LBDLDNSI"              "BMXSAD3"              "BMXSAD4" 
                LBDLDL               LBDLDLSI                BMXSAD1 
              "LBDLDL"             "LBDLDLSI"              "BMXSAD1" 
               BMXSAD2               BMDAVSAD                 BMXLEG 
             "BMXSAD2"             "BMDAVSAD"               "BMXLEG" 
                 LBXTC                LBDTCSI                 LBXGLU 
               "LBXTC"              "LBDTCSI"               "LBXGLU" 
               BMXCALF                BMXARML                 BMXSUB 
             "BMXCALF"              "BMXARML"               "BMXSUB" 
               URXUCR2               URDUCR2S                 BMXTRI 
             "URXUCR2"             "URDUCR2S"               "BMXTRI" 
               BMXARMC   Poverty_Income_Ratio               BMAEXLEN 
             "BMXARMC" "Poverty_Income_Ratio"             "BMAEXLEN" 
                URXUCR               BMXTHICR               LBDGLUSI 
              "URXUCR"             "BMXTHICR"             "LBDGLUSI" 
                 LBXTR                LBDTRSI               BMXRECUM 
               "LBXTR"              "LBDTRSI"             "BMXRECUM" 
               URXUMA2               URDUMA2S                 BMXHIP 
             "URXUMA2"             "URDUMA2S"               "BMXHIP" 
                URXCRS                  BMXHT               BMXWAIST 
              "URXCRS"                "BMXHT"             "BMXWAIST" 
              PEASCTM1                URDACT2                  BMXWT 
            "PEASCTM1"              "URDACT2"                "BMXWT" 
                URXUMA                 URXUMS                  LBXIN 
              "URXUMA"               "URXUMS"                "LBXIN" 
               LBDINSI                 BMXBMI                 URDACT 
             "LBDINSI"               "BMXBMI"               "URDACT" 
              WTSAF2YR 
            "WTSAF2YR"

we have 67 numeric features to analyze.

We can save both of these lists into an new object:

Code

names(Not_features) <- Not_features

FEATURE_TYPE <- new.env()
FEATURE_TYPE$numeric_features <- numeric_features
FEATURE_TYPE$categorical_features <- categorical_features
FEATURE_TYPE$Not_features <- Not_features

We can still access the information, however, now it is grouped together:

Code

FEATURE_TYPE$categorical_features

         BMAUPREL          BMAUPLEL           BMIHEAD            BMILEG 
       "BMAUPREL"        "BMAUPLEL"         "BMIHEAD"          "BMILEG" 
          BMICALF           BMIARML           BMIARMC          BMIWAIST 
        "BMICALF"         "BMIARML"         "BMIARMC"        "BMIWAIST" 
         BMITHICR          BMAUREXT          BMAULEXT          BMALOREX 
       "BMITHICR"        "BMAUREXT"        "BMAULEXT"        "BMALOREX" 
         BMALORKN          BMALLKNE          BMDRECUF           BMDSUBF 
       "BMALORKN"        "BMALLKNE"        "BMDRECUF"         "BMDSUBF" 
         BMDTHICF           BMDLEGF          BMDARMLF          BMDCALFF 
       "BMDTHICF"         "BMDLEGF"        "BMDARMLF"        "BMDCALFF" 
           BMIHIP            Gender              USAF     Birth_Country 
         "BMIHIP"          "Gender"            "USAF"   "Birth_Country" 
         Pregnant           BPQ150A           BPQ150B           BPQ150C 
       "Pregnant"         "BPQ150A"         "BPQ150B"         "BPQ150C" 
          BPQ150D           BPXPULS            BPXPTY            BPAEN1 
        "BPQ150D"         "BPXPULS"          "BPXPTY"          "BPAEN1" 
           BPAEN2            BPAEN3            BPAEN4          BMIRECUM 
         "BPAEN2"          "BPAEN3"          "BPAEN4"        "BMIRECUM" 
            BMIHT            BMITRI            BMISUB            BMAAMP 
          "BMIHT"          "BMITRI"          "BMISUB"          "BMAAMP" 
         BMALLEXT          URDUMALC          URDUCRLC           LBDINLC 
       "BMALLEXT"        "URDUMALC"        "URDUCRLC"         "LBDINLC" 
         PEASCST1            BPAARM          BMAEXSTS             BMIWT 
       "PEASCST1"          "BPAARM"        "BMAEXSTS"           "BMIWT" 
          URXPREG          BMDSTATS           BMDBMIC              Race 
        "URXPREG"        "BMDSTATS"         "BMDBMIC"            "Race" 
      Grade_Range            BPACSZ          BMDSADCM    Marital_Status 
    "Grade_Range"          "BPACSZ"        "BMDSADCM"  "Marital_Status" 
         BMAEXCMT          yr_range             BPXDB   Household_Icome 
       "BMAEXCMT"        "yr_range"           "BPXDB" "Household_Icome" 
    Family_Income          PEASCCT1       Grade_Level 
  "Family_Income"        "PEASCCT1"     "Grade_Level"

Code

FEATURE_TYPE[['numeric_features']]

              PHAFSTHR                 BPXML1               PHAFSTMN 
            "PHAFSTHR"               "BPXML1"             "PHAFSTMN" 
                BPXDI4                 BPXPLS                 BPXDI1 
              "BPXDI4"               "BPXPLS"               "BPXDI1" 
                BPXDI3                 BPXDI2                 BPXSY4 
              "BPXDI3"               "BPXDI2"               "BPXSY4" 
                BPXCHR                 BPXSY3                    Age 
              "BPXCHR"               "BPXSY3"                  "Age" 
                BPXSY2                 BPXSY1                 BPXDAR 
              "BPXSY2"               "BPXSY1"               "BPXDAR" 
               BMXHEAD                 LBDHDD               LBDHDDSI 
             "BMXHEAD"               "LBDHDD"             "LBDHDDSI" 
                BPXSAR                 LBXAPB               LBDAPBSI 
              "BPXSAR"               "LBXAPB"             "LBDAPBSI" 
               LBDLDLM               LBDLDMSI                LBDLDLN 
             "LBDLDLM"             "LBDLDMSI"              "LBDLDLN" 
              LBDLDNSI                BMXSAD3                BMXSAD4 
            "LBDLDNSI"              "BMXSAD3"              "BMXSAD4" 
                LBDLDL               LBDLDLSI                BMXSAD1 
              "LBDLDL"             "LBDLDLSI"              "BMXSAD1" 
               BMXSAD2               BMDAVSAD                 BMXLEG 
             "BMXSAD2"             "BMDAVSAD"               "BMXLEG" 
                 LBXTC                LBDTCSI                 LBXGLU 
               "LBXTC"              "LBDTCSI"               "LBXGLU" 
               BMXCALF                BMXARML                 BMXSUB 
             "BMXCALF"              "BMXARML"               "BMXSUB" 
               URXUCR2               URDUCR2S                 BMXTRI 
             "URXUCR2"             "URDUCR2S"               "BMXTRI" 
               BMXARMC   Poverty_Income_Ratio               BMAEXLEN 
             "BMXARMC" "Poverty_Income_Ratio"             "BMAEXLEN" 
                URXUCR               BMXTHICR               LBDGLUSI 
              "URXUCR"             "BMXTHICR"             "LBDGLUSI" 
                 LBXTR                LBDTRSI               BMXRECUM 
               "LBXTR"              "LBDTRSI"             "BMXRECUM" 
               URXUMA2               URDUMA2S                 BMXHIP 
             "URXUMA2"             "URDUMA2S"               "BMXHIP" 
                URXCRS                  BMXHT               BMXWAIST 
              "URXCRS"                "BMXHT"             "BMXWAIST" 
              PEASCTM1                URDACT2                  BMXWT 
            "PEASCTM1"              "URDACT2"                "BMXWT" 
                URXUMA                 URXUMS                  LBXIN 
              "URXUMA"               "URXUMS"                "LBXIN" 
               LBDINSI                 BMXBMI                 URDACT 
             "LBDINSI"               "BMXBMI"               "URDACT" 
              WTSAF2YR 
            "WTSAF2YR"

8.1.1 distinct_N_levels_per_cols

Below is a helper function for the count of distinct column entries in a table per the first sample_n records queried

Code

distinct_N_levels_per_cols <- function(data){
  
  data.colnames <- colnames(data)
  
  N_levels_features <- data %>%
    summarise(
      across(
        all_of(data.colnames), 
        \(x){n_distinct(x)})
      ) %>%
    collect() %>%
    pivot_longer( cols = all_of(data.colnames), values_to = "N_Distinct_Values")
  
  plot <- N_levels_features %>%
    mutate(Fearure = reorder(name, -N_Distinct_Values)) %>%
    mutate(Percentage_Distinct_Features = N_Distinct_Values/nrow.table(data)) %>%
    ggplot(aes(x= Fearure, y=Percentage_Distinct_Features , fill=Percentage_Distinct_Features)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(title ="Percentage of Distinct Entries Per Column") +  
    theme(legend.position = "none")
  
  return( list( data = N_levels_features,
                plot = plot))
}

Code

distinct_N_levels_per_cols(A_DATA_TBL_2)$plot

Figure 8.1: Count of Distinct Number of Values Per Feature records

8.2 Categorical Features

Recall, we normally want to review categorical data by looking at frequency counts:

Code

A_DATA_TBL_2 %>%
  select(DIABETES, Gender) %>%
  group_by(DIABETES, Gender) %>%
  tally() %>%
  collect() %>%
  ungroup() %>%
  mutate(
    across(
      c("DIABETES", "Gender"),
      \(x){as.character(x)}
    )
    ) %>%
  mutate(
    across(
      c("DIABETES", "Gender"),
      \(x){if_else(is.na(x), "Missing", x)}
    )
    ) %>%
  pivot_wider(names_from = DIABETES, values_from = n)

# A tibble: 2 × 4
  Gender Missing   `0`   `1`
  <chr>    <int> <int> <int>
1 Female    2864 45188  3371
2 Male      2905 43552  3436

Code

A_DATA_TBL_2 %>%
  select(DIABETES, Race) %>%
  group_by(DIABETES, Race) %>%
  tally() %>%
  collect() %>%
  ungroup() %>%
  mutate(
    across(
      c("DIABETES", "Race"),
      \(x){as.character(x)}
    )
    ) %>%
  mutate(
    across(
      c("DIABETES", "Race"),
      \(x){if_else(is.na(x), "Missing", x)}
    )
    ) %>%
  pivot_wider(names_from = DIABETES, values_from = n)

# A tibble: 5 × 4
  Race             Missing   `0`   `1`
  <chr>              <int> <int> <int>
1 Black               1129 20692  1823
2 Mexican American    1650 19426  1373
3 Other                510  8376   611
4 Other Hispanic       501  7201   592
5 White               1979 33045  2408

8.2.1 `Count_Query` function

So to functionalize this process we create the following:

Code

Count_Query <- function(data, my_feature, outcome){
  enquo_feature <- enquo(my_feature)
  enquo_outcome <- enquo(outcome)
  
 tmp <- data %>%
    select(!!enquo_outcome, !!enquo_feature) %>%
    group_by(!!enquo_outcome, !!enquo_feature) %>%
    tally() %>%
    collect() %>%
    ungroup() %>%
    mutate(
      across(
        c(!!enquo_outcome, !!enquo_feature),
        \(x){as.character(x)}
      )
    ) %>%
    mutate(
      across(
        c(!!enquo_outcome, !!enquo_feature),
        \(x){if_else(is.na(x), "Missing", x)}
      )
    ) %>%
    pivot_wider(names_from = !!enquo_outcome, 
                values_from = n,
                values_fill = 0)
 
row_names <- if_else(is.na(tmp[[1]]),"NA", tmp[[1]]) 
tmp <- tmp[-1]
matrix_tmp <- as.matrix(tmp)

row.names(matrix_tmp) <- row_names
 
return(matrix_tmp)
}

What is enquo?

enquo() and enquos() delay the execution of one or several function arguments. enquo() returns a single quoted expression, which is like a blueprint for the delayed computation. enquos() returns a list of such quoted expressions. We can resolve an enquo with !!.

Code

Count_Query(A_DATA_TBL_2, Grade_Level, DIABETES)

                                   Missing     0    1
Missing                               5663 59497 6681
10th grade                               9  2177   11
11th grade                               8  2146   12
12th grade, no diploma                   2   438    0
1st grade                                2  2126    6
2nd grade                                1  2102    4
3rd grade                                6  2031    6
4th grade                               12  2036    6
5th grade                               10  2110    9
6th grade                                5  2219   14
7th grade                               11  2178    8
8th grade                               12  2292   11
9th grade                               15  2195   18
High school graduate                     7  1422    9
Less than 9th grade                      1   277    1
More than high school                    3   980    6
Never attended / kindergarten only       2  2362    5
Don't Know                               0     9    0
GED or equivalent                        0   115    0
Less than 5th grade                      0    26    0
Refused                                  0     2    0

Code

Count_Query(A_DATA_TBL_2, Household_Icome, DIABETES)

                   Missing     0    1
Missing               2548 39949 2422
$ 0 to $ 4,999         118  1349   96
$ 5,000 to $ 9,999     150  2012  268
$10,000 to $14,999     236  3169  452
$100,000 and Over      440  8226  460
$15,000 to $19,999     254  3427  375
$20,000 and Over       119  1781  194
$20,000 to $24,999     277  3902  417
$25,000 to $34,999     399  5994  526
$35,000 to $44,999     311  4738  449
$45,000 to $54,999     220  3779  311
$55,000 to $64,999     191  2878  268
$65,000 to $74,999     155  2323  205
$75,000 to $99,999     294  4580  297
Under $20,000           57   633   67

We also remark that there are some helpers for this practice if we don’t want to utilize the underlying enquo() and !! in the rlang package. This is known as the embrace operator {{}}. The embrace operator combines combines enquo() and !! in one step. For instance, out Count_Query function can be re-written as:

Code

Count_Query <- function(data, my_feature, outcome){
  my_feature_car <- data %>% 
    select({{my_feature}}) |> 
    colnames()
  
  my_outcome_car <- data %>% 
    select({{outcome}}) |> 
    colnames()
  
 tmp <- data %>%
    select({{outcome}}, {{my_feature}}) %>%
    group_by({{outcome}}, {{my_feature}}) %>%
    tally() %>%
    collect() %>%
    ungroup() %>%
    mutate(
      across(
        c({{outcome}}, {{my_feature}}),
        \(x){as.character(x)}
      )
    ) %>%
    mutate(
      across(
        c({{outcome}}, {{my_feature}}),
        \(x){if_else(is.na(x), "Missing", x)}
      )
    ) %>%
    pivot_wider(names_from = {{outcome}}, 
                values_from = n,
                values_fill = 0)
 
row_names <- if_else(is.na(tmp[[1]]),"NA", tmp[[1]]) 
tmp <- tmp[-1]
matrix_tmp <- as.matrix(tmp)

row.names(matrix_tmp) <- row_names

dimnames(matrix_tmp) <- list(row_names, colnames(matrix_tmp))

names(dimnames(matrix_tmp)) <- c(my_feature_car, my_outcome_car)
 
return(matrix_tmp)
}

8.2.2 Graphing Helpers

We can also make a function to display the results of the Count_Query function. This function will be a ggplot function that will display the results of the Count_Query function:

Code

plot.chisq.test.bar <- function(X){
  
  if("matrix" %in% class(X) | "table" %in% class(X)){
    tryCatch({
      X <- chisq.test(X)
    }, error = function(e) {
      stop("Could not run chisq.test on the input matrix or table")
    })
  }
  
  if(class(X) != "htest"){
    stop("Input must be a matrix or table")
  }

  row_sums_mat <- X$observed |> 
    rowSums()

  x_name <- names(dimnames(X$observed))[[1]]
  y_name <- names(dimnames(X$observed))[[2]]

  v <-  as.data.frame(X$observed/row_sums_mat) |>
    rownames_to_column(x_name) |>
    tidyr::pivot_longer(
      cols = -all_of(x_name),
      names_to = y_name, 
      values_to = 'Proportion') 
  
  v %>%
    ggplot(aes(x = Proportion, y = .data[[x_name]], fill = .data[[y_name]])) +
    geom_bar(stat = 'identity') +
    labs(title = "Chi-Square Test",
         caption = paste0("Chi-Square Test: p-value = ", round(X$p.value, 4))) 
}

Code

Count_Query(A_DATA_TBL_2, Household_Icome, DIABETES) |>
  plot.chisq.test.bar()

Or a function to help us with balloon plots:

Code

plot.chisq.test.balloon <- function(X){
  
  if("matrix" %in% class(X) | "table" %in% class(X)){
    tryCatch({
      X <- chisq.test(X)
    }, error = function(e) {
      stop("Could not run chisq.test on the input matrix or table")
    })
  }
  
  if(class(X) != "htest"){
    stop("Input must be a matrix or table")
  }
  
    row_sums_mat <- X$observed |> 
    rowSums()

  x_name <- names(dimnames(X$observed))[[1]]
  y_name <- names(dimnames(X$observed))[[2]]

  v <-  as.data.frame(X$observed/row_sums_mat) |>
    rownames_to_column(x_name) |>
    tidyr::pivot_longer(
      cols = -all_of(x_name),
      names_to = y_name, 
      values_to = 'Proportion') 
  
v %>%
  ggplot(aes(x = .data[[y_name]], y = .data[[x_name]], 
             size = Proportion, color=Proportion)) +
  geom_point() +
  scale_size_continuous(range = c(3, 10)) + 
  scale_color_gradient(low = "dodgerblue", high = "darkblue") +
  labs(title = "Chi-Square Test",
       caption = paste0("Chi-Square Test: p-value = ", round(X$p.value, 4)))
}

Code

Count_Query(A_DATA_TBL_2, Household_Icome, DIABETES) |>
  plot.chisq.test.balloon()

8.2.2.1 Combine functions to create new functions

We can combine functions:

Code

plot.chisq.test <- function(X, method){
  
  if(method == 'bar'){
    return(plot.chisq.test.bar(X))
  }
  
  if(method == 'balloon'){
    return(plot.chisq.test.balloon(X))
  }
  
  if(method == 'both'){
    return(
      gridExtra::grid.arrange(
        plot.chisq.test.bar(X), 
        plot.chisq.test.balloon(X), 
        nrow = 2))
  }
}

Code

Count_Query(A_DATA_TBL_2, Gender, DIABETES) %>%
  plot.chisq.test(method = 'both')

8.3 Additional Programming Refrences

8.4 Continuous Features

Previously we had the following algorithm for a t-test:

Code

## DONT RUN
{
dm2_age <- (A_DATA %>%
  filter(DIABETES == 1))$Age

non_dm2_Age <- (A_DATA %>%
  filter(DIABETES == 0))$Age

tt_age <- t.test(Non_DM2.Age, DM2.Age, 
                 alternative = 'two.sided', 
                 conf.level = 0.95)

tibble_tt_age <- tt_age %>%
  broom::tidy()
}

That process was for one t.test, for one continuous feature, Age.

We might want to functionalize the process of computing t-tests for other similar analytic data-sets, like the one we are currently working with.

Developing such functions or wrappers may take time and skill but over-time you will come to rely on a personal library of functions you develop or work with to iterate on projects faster. These functions might start in your personal library and eventually move into a team Git where you or other members of your team can manage their development.

For instance, upon inspection, we notice that t.test requires similar information to ks.test. It would make sense to bundle two tests together and save us some time later on.

8.4.1 t-test / ks-test wrapper function

In the function wrapper.t_ks_test below takes in:

df a tibble - data-frame or connection
factor a categorical variable in df
- Note in SQLlite there is no factor type but the user thinks of this variable as a factor
factor_level - sets the “first” class in a one-versus-rest t.test and ks.test analysis
continuous_feature represented by a character string of the feature name to perform t.test and ks.test on
verbose was useful in creating the function to identify variables causing errors, and create output for those types of cases.

Code

wrapper.t_ks_test <- function(df, factor, factor_level,  continuous_feature, verbose = FALSE){

  if(verbose == TRUE){
    cat(paste0('\n','Now on Feature ',continuous_feature,' \n'))
  }
  
  data.local <- df %>%
    select({{factor}}, matches(continuous_feature)) %>%
    collect() 
  
  X <- data.local %>%
    filter({{factor}} == factor_level) %>%
    pull(continuous_feature)
  
  Y <- data.local %>%
    filter({{factor}} != factor_level) %>%
    pull(continuous_feature)
  
  sd_x <- sd(X, na.rm=TRUE)
  sd_y <- sd(Y, na.rm=TRUE)

  if(sum(!is.na(X)) < 3 | sum(!is.na(Y)) < 3 | is.na(sd_x) | is.na(sd_y) | sd_x == 0 | sd_y == 0){
    err_return <- tibble(
      estimate1 = mean(X, na.rm = TRUE),
      estimate2 = mean(Y, na.rm = TRUE),
      N_Target = sum(!is.na(X)),
      N_Control = sum(!is.na(Y))
    ) %>%
      mutate(estimate = estimate1 - estimate2)
    return(err_return)
  }

  test_result <- t.test(X,Y) %>%
    broom::tidy() %>%
    rename(ttest.pvalue = p.value) %>%
    rename(t.statistic = statistic) 

  kstest_result <- ks.test(X,Y) %>%
    broom::tidy() %>%
    rename(kstest.pvalue = p.value) %>%
    rename(ks.statistic = statistic) %>%
    select(kstest.pvalue, ks.statistic)
  
  result <- cbind(test_result, kstest_result) %>%
    mutate(N_Target = sum(!is.na(X))) %>%
    mutate(N_Control = sum(!is.na(Y))) 

  return(result)

  if(verbose == TRUE){
    cat(paste0('\n','Finished Feature ',continuous_feature,' \n'))
  }
}

8.4.2 Test Function

Now we can test our function:

Code

wrapper.t_ks_test(df=A_DATA_TBL_2 , 
              factor = DIABETES, 
              factor_level = 1, 
              'Age') %>%
  select(-where(is.character)) %>%
  pivot_longer(everything(),
               names_to = 'Test',
               values_to = 'Value')

Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties

# A tibble: 12 × 2
   Test              Value
   <chr>             <dbl>
 1 estimate         31.5  
 2 estimate1        61.5  
 3 estimate2        30.0  
 4 t.statistic     161.   
 5 ttest.pvalue      0    
 6 parameter      9709.   
 7 conf.low         31.1  
 8 conf.high        31.9  
 9 kstest.pvalue     0    
10 ks.statistic      0.599
11 N_Target       6807    
12 N_Control     88740

You may want to put on additional safe-guards to ensure that the t.test and ks.test get valid samples, with the feature BMXRECUM we only had only 5 Diabetic member who had a non-missing value:

Code

wrapper.t_ks_test(df = A_DATA_TBL_2 , 
              factor = DIABETES , 
              factor_level = 1, 
              'BMXRECUM') %>%
  glimpse()

Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties

Rows: 1
Columns: 14
$ estimate      <dbl> -0.4785482
$ estimate1     <dbl> 89.64
$ estimate2     <dbl> 90.11855
$ t.statistic   <dbl> -0.1170228
$ ttest.pvalue  <dbl> 0.9124758
$ parameter     <dbl> 4.004916
$ conf.low      <dbl> -11.82694
$ conf.high     <dbl> 10.86984
$ method        <chr> "Welch Two Sample t-test"
$ alternative   <chr> "two.sided"
$ kstest.pvalue <dbl> 0.9444457
$ ks.statistic  <dbl> 0.2355309
$ N_Target      <int> 5
$ N_Control     <int> 7205

8.5 `purrr`

We will showcase how to now apply our function to every feature in our data-frame. First lets just look at the syntax for the first five numeric features: numeric_features[1:5] = PHAFSTHR, BPXML1, PHAFSTMN, BPXDI4, BPXPLS. We use the function map_dfr in the purrr library, map_dfr will attempt to return a data-frame by row-binding the results.

Below we create t_ks_result_sql with a call to map_dfr(.x, .f, ..., .id = NULL) where:

.x - A vector (numeric_features[1:5])
.f - A function (wrapper.t_ks_test)
- Additional arguments passed in to the mapped function .f = wrapper.t_ks_test
  - df = A_DATA_TBL_2
  - factor = DIABETES
  - factor_level = 1
  - x the feature from the vector .x
.id - An optional character vector that will be used to name the output column of identifiers.

Each entry in numeric_features[1:5] will get passed to our function wrapper.t_ks_test produce a row like we have already seen; these rows will then be stacked together to create a data-frame.

Code

toc <- Sys.time()

t_ks_result_sql <- map_dfr(
  numeric_features[1:5], # vector of columns  (.x)
  \(x){  # function to apply (.f)
    wrapper.t_ks_test(
      df = A_DATA_TBL_2,
      factor = DIABETES,
      factor_level = 1,
      x# feature from the vector .x
    )},
  .id = 'Feature' # name of the output column of identifiers 
) # end of map_dfr

Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties

Code

tic <- Sys.time()

sql_con_time <- difftime(tic, toc, units='secs')
sql_con_time

Time difference of 14.5852 secs

We can see this function works, however, the SQLlight connection is slow for this kind of process, it took 14.5852031707764 seconds to perform 10 tests and get 5 result records.

We might try to download the data:

Code

toc <- Sys.time()

A_DATA_2 <- A_DATA_TBL_2 %>% 
  collect()

tic <- Sys.time()
download_time <- difftime(tic , toc, units='secs')

Code

download_time

Time difference of 6.019896 secs

How large is A_DATA_2 ?

Code

dim(A_DATA_2)

[1] 101316    133

Code

format(object.size(A_DATA_2),"Mb")

[1] "102.8 Mb"

Now let’s test our function here, in R, but this time we’ll use get the results for all the features:

Code

toc <- Sys.time()


t_ks_result_purr <- map(
  numeric_features, # vector of columns 
  \(x){wrapper.t_ks_test( # function to apply
    df = A_DATA_2 , 
    factor = DIABETES , 
    factor_level = 1,
    x)}
  ) %>%  # end of map
  list_rbind(names_to = "Feature") # list_rbind turns into a data-frame

Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties

Code

tic <- Sys.time()

local_time <- difftime(tic , toc , units='secs')

Code

local_time

Time difference of 2.087457 secs

Code

as.numeric(local_time) / as.numeric(sql_con_time)

[1] 0.1431216

That’s quite a speed-up to get all 67 results in 2.08745694160461 seconds but we can most likely do even better with the furrr package:

8.6 `furrr`

Code

library('furrr')

Loading required package: future

Code

no_cores <- availableCores() 
no_cores

system 
    16

Code

future::plan(multicore, workers = no_cores)

Code

toc <- Sys.time()

t_ks_result_furr <- furrr::future_map_dfr(
  numeric_features, # vector of columns 
  function(x){
    wrapper.t_ks_test(
      df=A_DATA_2 ,
      factor = DIABETES ,
      factor_level = 1,
      x) 
    })

Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties
Warning in ks.test.default(X, Y): p-value will be approximate in the presence
of ties

Code

tic <- Sys.time()

furrr_time <- tic - toc

8.6.1 Compare Run Times

Code

sql_v_purrr_v_furrr <- bind_rows(
  tibble(time = sql_con_time , 
         n_records = nrow(t_ks_result_sql), 
         method = 'sqlite_time'),
  tibble(time = local_time , 
         n_records = nrow(t_ks_result_purr), 
         method = 'purrr_time'),
  tibble(time = furrr_time , 
         n_records = nrow(t_ks_result_purr), 
         method = 'furrr_time')
) %>% 
  mutate(records_per_second = n_records/as.double(time))

Code

#} label: 4-1-load-manual-run
#| echo: false

sql_v_purrr_v_furrr <- readRDS(here::here("DATA/Part_4/sql_v_purrr_v_furrr.RDS"))

Code

sql_v_purrr_v_furrr %>%
  ggplot(aes(x= method, y=records_per_second, fill=method)) +
  geom_bar(stat = "identity", position=position_dodge()) +
  coord_flip()

Code

sql_v_purrr_v_furrr

# A tibble: 3 × 4
  time          n_records method      records_per_second
  <drtn>            <int> <chr>                    <dbl>
1 60.69406 secs         5 sqlite_time             0.0824
2 11.29607 secs        67 purrr_time              5.93  
3 10.18687 secs        67 furrr_time              6.58

Even accounting for download times we can see that does not have much of an effect:

Code

sql_v_purrr_v_furrr %>%
  mutate(download_time = download_time) %>%
  mutate(time_plus_download = if_else( method != 'sqlite_time',
                                       time + download_time,
                                       time)) %>%
  mutate(records_per_second = n_records/as.numeric(time_plus_download)) %>%
  ggplot(aes(x= method, y=records_per_second, fill=method)) +
  geom_bar(stat = "identity", position=position_dodge()) +
  coord_flip()

Code

records_per_second_plus_download <- sql_v_purrr_v_furrr %>%
  mutate(download_time = download_time) %>%
  mutate(time_plus_download = if_else( method != 'sqlite_time',
                                       time + download_time,
                                       time)) %>%
  mutate(records_per_second = n_records/as.numeric(time_plus_download)) %>%
  select(method, records_per_second) %>%
  pivot_wider(names_from = method, values_from = records_per_second)

records_per_second_plus_download

# A tibble: 1 × 3
  sqlite_time purrr_time furrr_time
        <dbl>      <dbl>      <dbl>
1      0.0824       3.87       4.13

Even accounting for download speeds locally with purrr our function is

Code

records_per_second_plus_download$purrr_time / records_per_second_plus_download$sqlite_time

[1] 46.96824

times faster than with our SQLite connection and in this case furrr is

Code

records_per_second_plus_download$furrr_time / records_per_second_plus_download$purrr_time

[1] 1.068441

times faster than purrr.

8.6.1.1 Results Vary By Connection Type

We have already mentioned that these results are confined to our SQLite connection. In some cases, you might be able to write a t-test / ks-test wrapper that performs better when computed on a server rather than within a local R session.

8.7 Compairing Outputs

The arsenal package contains a handy function comparedf to compare two data-frames. We can see that t_ks_result_purr and t_ks_result_furr match:

Code

arsenal::comparedf(t_ks_result_purr,
                   t_ks_result_furr)

Compare Object

Function Call: 
arsenal::comparedf(x = t_ks_result_purr, y = t_ks_result_furr)

Shared: 14 non-by variables and 67 observations.
Not shared: 1 variables and 0 observations.

Differences found in 0/14 variables compared.
0 variables compared have non-identical attributes.

However, t_ks_result_sql and t_ks_result_purr do not:

Code

arsenal::comparedf(t_ks_result_purr,
                   t_ks_result_sql)

Compare Object

Function Call: 
arsenal::comparedf(x = t_ks_result_purr, y = t_ks_result_sql)

Shared: 15 non-by variables and 5 observations.
Not shared: 0 variables and 62 observations.

Differences found in 0/15 variables compared.
0 variables compared have non-identical attributes.

we can get a detailed summary of the outputs

Code

compare.details <- arsenal::comparedf(t_ks_result_purr,
                                      t_ks_result_sql) %>% 
                           summary()

there are several tables to review

Code

str(compare.details,1)

List of 9
 $ frame.summary.table     :'data.frame':   2 obs. of  4 variables:
 $ comparison.summary.table:'data.frame':   13 obs. of  2 variables:
 $ vars.ns.table           :'data.frame':   0 obs. of  4 variables:
 $ vars.nc.table           :'data.frame':   0 obs. of  6 variables:
 $ obs.table               :'data.frame':   62 obs. of  3 variables:
 $ diffs.byvar.table       :'data.frame':   15 obs. of  4 variables:
 $ diffs.table             :'data.frame':   0 obs. of  7 variables:
 $ attrs.table             :'data.frame':   0 obs. of  3 variables:
 $ control                 :List of 16
 - attr(*, "class")= chr "summary.comparedf"

this is the comparison.summary.table

Code

compare.details$comparison.summary.table

                                                     statistic value
1                                       Number of by-variables     0
2                         Number of non-by variables in common    15
3                                 Number of variables compared    15
4                           Number of variables in x but not y     0
5                           Number of variables in y but not x     0
6        Number of variables compared with some values unequal     0
7           Number of variables compared with all values equal    15
8                             Number of observations in common     5
9                        Number of observations in x but not y    62
10                       Number of observations in y but not x     0
11 Number of observations with some compared variables unequal     0
12    Number of observations with all compared variables equal     5
13                                    Number of values unequal     0

for instance compare.details$obs.table has the elements in t_ks_result_purr (x) that are not within t_ks_result_sql (y):

Code

compare.details$obs.table %>%
  head()

   version ..row.names.. observation
6        x             6           6
7        x             7           7
8        x             8           8
9        x             9           9
10       x            10          10
11       x            11          11

If we want the top 10 corresponding records we could do:

Code

t_ks_result_purr[compare.details$obs.table$observation[1:10],] %>%
  select(Feature, ttest.pvalue, kstest.pvalue) %>%
  knitr::kable()

Feature	ttest.pvalue	kstest.pvalue
BPXDI1	0.0000000	0.0000000
BPXDI3	0.0000000	0.0000000
BPXDI2	0.0000000	0.0000000
BPXSY4	0.0000000	0.0000000
BPXCHR	0.5185255	0.8169532
BPXSY3	0.0000000	0.0000000
Age	0.0000000	0.0000000
BPXSY2	0.0000000	0.0000000
BPXSY1	0.0000000	0.0000000
BPXDAR	0.0689078	0.0164992

8.8 Disscussion

How do the functions Count_Query, wrapper.t_ks_test, and logit_model_scorer from last chapter compare?

8.8.0.1 Save Data

Code

A_DATA_2 %>%
  saveRDS(here::here('DATA/Part_4/A_DATA_2.RDS'))

t_ks_result_furr %>%
  saveRDS(here::here('DATA/Part_4/t_ks_result_furr.RDS'))

FEATURE_TYPE %>% 
  saveRDS(here::here('DATA/Part_4/FEATURE_TYPE.RDS'))

Code

DBI::dbDisconnect(NHANES_DB)

8.0.0.1 source current target

8.0.0.2 source current features

8.0.1 A_DATA_TBL_2

8.1 Determining Categorical and Continuous Features

8.1.1 distinct_N_levels_per_cols

8.2 Categorical Features

8.2.1 Count_Query function

8.2.2 Graphing Helpers

8.2.2.1 Combine functions to create new functions

8.3 Additional Programming Refrences

8.4 Continuous Features

8.4.1 t-test / ks-test wrapper function

8.4.2 Test Function

8.5 purrr

8.6 furrr

8.6.1 Compare Run Times

8.6.1.1 Results Vary By Connection Type

8.7 Compairing Outputs

8.8 Disscussion

8.8.0.1 Save Data

8.0.0.1 `source` current target

8.0.0.2 `source` current features

8.0.1 `A_DATA_TBL_2`

8.2.1 `Count_Query` function

8.5 `purrr`

8.6 `furrr`