Module 10: Improving Model Fits

LOADING REQUIRED PACKAGES

library(here)
here() starts at /Users/shiwanisapkota/Desktop/MADA Course/shiwanisapkota-MADA-portfolio
library(tidyverse)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   0.3.4
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.2.0     ✔ stringr 1.4.1
✔ readr   2.1.2     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.1     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.3
✔ modeldata    1.1.0     ✔ workflowsets 1.0.0
✔ parsnip      1.0.4     ✔ yardstick    1.1.0
✔ recipes      1.0.5     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Search for functions across packages at https://www.tidymodels.org/find/
library(performance)

Attaching package: 'performance'

The following objects are masked from 'package:yardstick':

    mae, rmse
library(broom)

LOADING DATA

fludata_clean <- readRDS(here("fluanalysis", "data", "fludata_clean.rds"))
glimpse(fludata_clean)
Rows: 730
Columns: 32
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y…
$ ChestCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y…
$ ChillsSweats      <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, …
$ NasalCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y…
$ CoughYN           <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, …
$ Sneeze            <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, …
$ Fatigue           <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ SubjectiveFever   <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes…
$ Headache          <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes…
$ Weakness          <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi…
$ WeaknessYN        <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ CoughIntensity    <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve…
$ CoughYN2          <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes…
$ Myalgia           <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, …
$ MyalgiaYN         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ RunnyNose         <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No…
$ AbPain            <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N…
$ ChestPain         <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, …
$ Diarrhea          <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,…
$ EyePn             <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye…
$ Insomnia          <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y…
$ ItchyEye          <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,…
$ Nausea            <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y…
$ EarPn             <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y…
$ Hearing           <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,…
$ Pharyngitis       <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, …
$ Breathless        <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, …
$ ToothPn           <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N…
$ Vision            <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, …
$ Vomit             <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N…
$ Wheeze            <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N…
$ BodyTemp          <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, …

DATA SPLITTING

# Creating a new object to work
flu_module10 <- fludata_clean

# Setting seed
set.seed(123)

# Putting 3/4 of the data into the training set
data_split <- initial_split(flu_module10, prop = 3/4)

# Creating data frames for the two sets
train_data <- training(data_split)
test_data  <- testing(data_split)

FULL MODEL: CREATING RECIPE THAT FITS A LOGISTIC MODEL USING NAUSEA AS OUTCOME OF INTEREST AND ALL OTHER VARIABLES AS PREDICTORS

# Using Nausea as a categorical outcome of interest and all other variables as predictors
flu_module10_rec <- recipe(Nausea ~ ., data = train_data)

# Fitting the logistic model
flu_module10_mod <- logistic_reg() %>% 
                    set_engine("glm")

# Modelling workflow for pairing model and recipe 
flu_module10_wflow <- workflow() %>% 
  add_model(flu_module10_mod) %>% 
  add_recipe(flu_module10_rec)
flu_module10_wflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Computational engine: glm 
# Using the resulting predictors for preparing recipe and training the model
flu_module10_fit <- 
 flu_module10_wflow %>% 
  fit(data = train_data)

# Pulling the fitted model object and using tidy() function for getting a tidy tibble of model coefficients
flu_module10_fit %>% 
  extract_fit_parsnip() %>% 
  tidy()
# A tibble: 38 × 5
   term                  estimate std.error statistic p.value
   <chr>                    <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)          -2.42         9.14  -0.265     0.791 
 2 SwollenLymphNodesYes -0.440        0.236 -1.86      0.0624
 3 ChestCongestionYes    0.278        0.255  1.09      0.275 
 4 ChillsSweatsYes       0.314        0.356  0.881     0.378 
 5 NasalCongestionYes    0.213        0.302  0.707     0.480 
 6 CoughYNYes           -0.000535     0.708 -0.000755  0.999 
 7 SneezeYes             0.142        0.255  0.556     0.578 
 8 FatigueYes            0.287        0.462  0.621     0.535 
 9 SubjectiveFeverYes    0.434        0.276  1.57      0.116 
10 HeadacheYes           0.634        0.367  1.73      0.0843
# … with 28 more rows

FULL MODEL: USING TRAINED WORKFLOW TO PREDICT

# Using the trained workflow (flu_module10_fit) to predict with the unseen test data
predict(flu_module10_fit, test_data)
# A tibble: 183 × 1
   .pred_class
   <fct>      
 1 No         
 2 Yes        
 3 No         
 4 No         
 5 No         
 6 No         
 7 No         
 8 No         
 9 No         
10 Yes        
# … with 173 more rows
# Using argument() with the model plus test data for saving them together
flu_module10_aug <- 
  augment(flu_module10_fit, test_data)
  
flu_module10_aug %>%
  select(Nausea, .pred_No, .pred_Yes)
# A tibble: 183 × 3
   Nausea .pred_No .pred_Yes
   <fct>     <dbl>     <dbl>
 1 No        0.958    0.0417
 2 Yes       0.125    0.875 
 3 Yes       0.781    0.219 
 4 Yes       0.744    0.256 
 5 Yes       0.779    0.221 
 6 No        0.773    0.227 
 7 No        0.555    0.445 
 8 No        0.749    0.251 
 9 No        0.937    0.0626
10 Yes       0.176    0.824 
# … with 173 more rows
# Creating ROC curve and piping to the autoplot() method
flu_module10_aug %>% 
  roc_curve(truth = Nausea, .pred_No) %>% 
  autoplot()

# Using roc_auc() for estimating the area under the curve 
flu_module10_aug %>% 
  roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.706

ALTERNATIVE MODEL: USING NAUSEA AS OUTCOME OF INTEREST AND RUNNYNOSE AS MAIN PREDICTOR

# Using Nausea as a categorical outcome of interest and RunnyNose as main predictor
flu_module10_rec2 <- recipe(Nausea ~ RunnyNose, data = train_data)

# Fitting the logistic model
flu_module10_mod2 <- logistic_reg() %>% 
                    set_engine("glm")

# Modelling workflow for pairing model and recipe 
flu_module10_wflow2 <- workflow() %>% 
  add_model(flu_module10_mod2) %>% 
  add_recipe(flu_module10_rec2)
flu_module10_wflow2
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Computational engine: glm 
# Using the resulting predictors for preparing recipe and training the model
flu_module10_fit2 <- 
 flu_module10_wflow2 %>% 
  fit(data = train_data)

# Pulling the fitted model object and using tidy() function for getting a tidy tibble of model coefficients
flu_module10_fit2 %>% 
  extract_fit_parsnip() %>% 
  tidy()
# A tibble: 2 × 5
  term         estimate std.error statistic  p.value
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)   -0.646      0.168    -3.84  0.000121
2 RunnyNoseYes  -0.0706     0.200    -0.353 0.724   

ALTERNATIVE MODEL: USING TRAINED WORKFLOW TO PREDICT

# Using the trained workflow (flu_module10_fit) to predict with the unseen test data
predict(flu_module10_fit2, test_data)
# A tibble: 183 × 1
   .pred_class
   <fct>      
 1 No         
 2 No         
 3 No         
 4 No         
 5 No         
 6 No         
 7 No         
 8 No         
 9 No         
10 No         
# … with 173 more rows
# Using argument() with the model plus test data for saving them together
flu_module10_aug2 <- 
  augment(flu_module10_fit2, test_data)
  
flu_module10_aug2 %>%
  select(Nausea, .pred_No, .pred_Yes)
# A tibble: 183 × 3
   Nausea .pred_No .pred_Yes
   <fct>     <dbl>     <dbl>
 1 No        0.656     0.344
 2 Yes       0.672     0.328
 3 Yes       0.672     0.328
 4 Yes       0.672     0.328
 5 Yes       0.672     0.328
 6 No        0.672     0.328
 7 No        0.656     0.344
 8 No        0.656     0.344
 9 No        0.656     0.344
10 Yes       0.656     0.344
# … with 173 more rows
# Creating ROC curve and piping to the autoplot() method
flu_module10_aug2 %>% 
  roc_curve(truth = Nausea, .pred_No) %>% 
  autoplot()

# Using roc_auc() for estimating the area under the curve 
flu_module10_aug2 %>% 
  roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.460

From the above result, we see that the full model (using Nausea as the main outcome of interest and all variables as predictors) has ROC-AUC as 0.71 while the alternative model (using Nausea as the main outcome of interest and RunnyNose as the main predictor) has ROC-AUC as 0.46. So, we can say that for Nausea, the full model with all the predictors is better than the alternative model having only RunnyNose as the main predictor.

THIS SECTION ADDED BY IRENE CAVROS

We will first split 75% of the data into the training set.

set.seed(2)
flu_split_ic <- initial_split(fludata_clean, prop = 3/4)

Next we can make the two data sets we just split into objects.

train_data_ic <- training(flu_split_ic)
test_data_ic <- testing(flu_split_ic)

Full model

Let’s set up the recipe using body temperature as our continuous outcome.

flu_recipe1_ic <- recipe(BodyTemp ~ ., data = fludata_clean)

Prepping our model

model1_ic <- linear_reg() %>%
  set_engine("lm")

Setting up our workflow

workflow1_ic <- workflow() %>%
  add_model(model1_ic) %>%
  add_recipe(flu_recipe1_ic)

Training the model using training data

fit1_ic <- workflow1_ic %>%
  fit(data = train_data_ic)
fit1_ic %>%
  extract_fit_parsnip() %>%
  tidy()
# A tibble: 38 × 5
   term                 estimate std.error statistic  p.value
   <chr>                   <dbl>     <dbl>     <dbl>    <dbl>
 1 (Intercept)           98.2        0.351   280.    0       
 2 SwollenLymphNodesYes  -0.100      0.109    -0.923 0.356   
 3 ChestCongestionYes     0.0519     0.117     0.441 0.659   
 4 ChillsSweatsYes        0.163      0.151     1.08  0.282   
 5 NasalCongestionYes    -0.239      0.132    -1.82  0.0698  
 6 CoughYNYes             0.141      0.271     0.522 0.602   
 7 SneezeYes             -0.310      0.116    -2.67  0.00784 
 8 FatigueYes             0.197      0.185     1.06  0.288   
 9 SubjectiveFeverYes     0.476      0.124     3.83  0.000147
10 HeadacheYes            0.0282     0.151     0.187 0.852   
# … with 28 more rows

Predicting using the test data

predict(fit1_ic, test_data_ic)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
# A tibble: 183 × 1
   .pred
   <dbl>
 1  99.6
 2  99.8
 3  99.8
 4  98.4
 5  98.7
 6  98.6
 7  98.9
 8  99.0
 9  99.7
10  99.3
# … with 173 more rows

Augmenting

aug1_ic <- augment(fit1_ic, test_data_ic)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
aug1_ic %>%
  select(BodyTemp, .pred)
# A tibble: 183 × 2
   BodyTemp .pred
      <dbl> <dbl>
 1    100.   99.6
 2     98.2  99.8
 3     97.9  99.8
 4     98.1  98.4
 5     99.2  98.7
 6     98.1  98.6
 7     98.5  98.9
 8    100.   99.0
 9     98.9  99.7
10     98.8  99.3
# … with 173 more rows

Model with one predictor

The one predictor we will use for this exercise is RunnyNose

flu_recipe2_ic <- recipe(BodyTemp ~ RunnyNose, data = fludata_clean)

We do not need to set up our model again since the one we already did will work.

Setting up our workflow

workflow2_ic <- workflow() %>%
  add_model(model1_ic) %>%
  add_recipe(flu_recipe2_ic)

Training the model using training data

fit2_ic <- workflow2_ic %>%
  fit(data = train_data_ic)
fit2_ic %>%
  extract_fit_parsnip() %>%
  tidy()
# A tibble: 2 × 5
  term         estimate std.error statistic p.value
  <chr>           <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)    99.2      0.0968   1024.   0      
2 RunnyNoseYes   -0.366    0.114      -3.21 0.00139

Predicting using the test data

predict(fit2_ic, test_data_ic)
# A tibble: 183 × 1
   .pred
   <dbl>
 1  99.2
 2  99.2
 3  99.2
 4  98.8
 5  98.8
 6  98.8
 7  98.8
 8  98.8
 9  99.2
10  99.2
# … with 173 more rows

Augmenting

aug2_ic <- augment(fit2_ic, test_data_ic)

aug2_ic %>%
  select(BodyTemp, .pred)
# A tibble: 183 × 2
   BodyTemp .pred
      <dbl> <dbl>
 1    100.   99.2
 2     98.2  99.2
 3     97.9  99.2
 4     98.1  98.8
 5     99.2  98.8
 6     98.1  98.8
 7     98.5  98.8
 8    100.   98.8
 9     98.9  99.2
10     98.8  99.2
# … with 173 more rows