Fitting of Flu Data

LOADING REQUIRED PACKAGES

library(here)
here() starts at /Users/shiwanisapkota/Desktop/MADA Course/shiwanisapkota-MADA-portfolio
library(tidyverse)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   0.3.4
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.2.0     ✔ stringr 1.4.1
✔ readr   2.1.2     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.1     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.3
✔ modeldata    1.1.0     ✔ workflowsets 1.0.0
✔ parsnip      1.0.4     ✔ yardstick    1.1.0
✔ recipes      1.0.5     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Search for functions across packages at https://www.tidymodels.org/find/
library(performance)

Attaching package: 'performance'

The following objects are masked from 'package:yardstick':

    mae, rmse
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi

LOADING DATA

fludata_clean <- readRDS(here("fluanalysis", "data", "fludata_clean.rds"))
glimpse(fludata_clean)
Rows: 730
Columns: 32
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y…
$ ChestCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y…
$ ChillsSweats      <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, …
$ NasalCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y…
$ CoughYN           <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, …
$ Sneeze            <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, …
$ Fatigue           <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ SubjectiveFever   <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes…
$ Headache          <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes…
$ Weakness          <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi…
$ WeaknessYN        <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ CoughIntensity    <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve…
$ CoughYN2          <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes…
$ Myalgia           <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, …
$ MyalgiaYN         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
$ RunnyNose         <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No…
$ AbPain            <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N…
$ ChestPain         <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, …
$ Diarrhea          <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,…
$ EyePn             <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye…
$ Insomnia          <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y…
$ ItchyEye          <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,…
$ Nausea            <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y…
$ EarPn             <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y…
$ Hearing           <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,…
$ Pharyngitis       <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, …
$ Breathless        <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, …
$ ToothPn           <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N…
$ Vision            <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, …
$ Vomit             <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N…
$ Wheeze            <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N…
$ BodyTemp          <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, …

DATA FITTING

Fitting a linear model using continuous outcome (Body Temperature)

# Setting up the linear model
lm_model <- linear_reg() %>%
  set_engine("lm")

# Fitting a linear model to the continuous outcome (Body temperature) using only the main predictor of interest (RunnyNose)
lm_fit1 <- lm_model %>% fit(BodyTemp ~ RunnyNose, data = fludata_clean)
lm_fit1
parsnip model object


Call:
stats::lm(formula = BodyTemp ~ RunnyNose, data = data)

Coefficients:
 (Intercept)  RunnyNoseYes  
     99.1431       -0.2926  
# Looking at the results of lm_fit1
glance(lm_fit1)
# A tibble: 1 × 12
  r.squ…¹ adj.r…² sigma stati…³ p.value    df logLik   AIC   BIC devia…⁴ df.re…⁵
    <dbl>   <dbl> <dbl>   <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>   <dbl>   <int>
1  0.0123  0.0110  1.19    9.08 0.00268     1 -1162. 2329. 2343.   1031.     728
# … with 1 more variable: nobs <int>, and abbreviated variable names
#   ¹​r.squared, ²​adj.r.squared, ³​statistic, ⁴​deviance, ⁵​df.residual
# Looking at the additional summary of lm_fit1
tidy(lm_fit1)
# A tibble: 2 × 5
  term         estimate std.error statistic p.value
  <chr>           <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)    99.1      0.0819   1210.   0      
2 RunnyNoseYes   -0.293    0.0971     -3.01 0.00268
# Fitting a linear model to the continuous outcome (Body temperature) using all the predictors
lm_fit2 <- lm_model %>% fit(BodyTemp ~ ., data = fludata_clean)
lm_fit2
parsnip model object


Call:
stats::lm(formula = BodyTemp ~ ., data = data)

Coefficients:
           (Intercept)    SwollenLymphNodesYes      ChestCongestionYes  
             97.925243               -0.165302                0.087326  
       ChillsSweatsYes      NasalCongestionYes              CoughYNYes  
              0.201266               -0.215771                0.313893  
             SneezeYes              FatigueYes      SubjectiveFeverYes  
             -0.361924                0.264762                0.436837  
           HeadacheYes            WeaknessMild        WeaknessModerate  
              0.011453                0.018229                0.098944  
        WeaknessSevere           WeaknessYNYes      CoughIntensityMild  
              0.373435                      NA                0.084881  
CoughIntensityModerate    CoughIntensitySevere             CoughYN2Yes  
             -0.061384               -0.037272                      NA  
           MyalgiaMild         MyalgiaModerate           MyalgiaSevere  
              0.164242               -0.024064               -0.129263  
          MyalgiaYNYes            RunnyNoseYes               AbPainYes  
                    NA               -0.080485                0.031574  
          ChestPainYes             DiarrheaYes                EyePnYes  
              0.105071               -0.156806                0.131544  
           InsomniaYes             ItchyEyeYes               NauseaYes  
             -0.006824               -0.008016               -0.034066  
              EarPnYes              HearingYes          PharyngitisYes  
              0.093790                0.232203                0.317581  
         BreathlessYes              ToothPnYes               VisionYes  
              0.090526               -0.022876               -0.274625  
              VomitYes               WheezeYes  
              0.165272               -0.046665  
# Looking at the results of lm_fit2
glance(lm_fit2)
# A tibble: 1 × 12
  r.squ…¹ adj.r…² sigma stati…³ p.value    df logLik   AIC   BIC devia…⁴ df.re…⁵
    <dbl>   <dbl> <dbl>   <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>   <dbl>   <int>
1   0.129  0.0860  1.14    3.02 4.20e-8    34 -1116. 2304. 2469.    909.     695
# … with 1 more variable: nobs <int>, and abbreviated variable names
#   ¹​r.squared, ²​adj.r.squared, ³​statistic, ⁴​deviance, ⁵​df.residual
# Looking at the additional summary of lm_fit2
tidy(lm_fit2)
# A tibble: 38 × 5
   term                 estimate std.error statistic   p.value
   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
 1 (Intercept)           97.9       0.304   322.     0        
 2 SwollenLymphNodesYes  -0.165     0.0920   -1.80   0.0727   
 3 ChestCongestionYes     0.0873    0.0975    0.895  0.371    
 4 ChillsSweatsYes        0.201     0.127     1.58   0.114    
 5 NasalCongestionYes    -0.216     0.114    -1.90   0.0584   
 6 CoughYNYes             0.314     0.241     1.30   0.193    
 7 SneezeYes             -0.362     0.0983   -3.68   0.000249 
 8 FatigueYes             0.265     0.161     1.65   0.0996   
 9 SubjectiveFeverYes     0.437     0.103     4.22   0.0000271
10 HeadacheYes            0.0115    0.125     0.0913 0.927    
# … with 28 more rows
# Comparing lm_fit1 and lm_fit2 on the basis of model performance
compare_performance(lm_fit1, lm_fit2)
# Comparison of Model Performance Indices

Name    | Model |  AIC (weights) | AICc (weights) |  BIC (weights) |    R2 | R2 (adj.) |  RMSE | Sigma
------------------------------------------------------------------------------------------------------
lm_fit1 |   _lm | 2329.3 (<.001) | 2329.4 (<.001) | 2343.1 (>.999) | 0.012 |     0.011 | 1.188 | 1.190
lm_fit2 |   _lm | 2303.8 (>.999) | 2307.7 (>.999) | 2469.2 (<.001) | 0.129 |     0.086 | 1.116 | 1.144

Here, the model with only main predictor of interest (RunnyNose) seem to be a better fit compared to the full model including all predictors as the model with only main predictor of interest has lower AIC of 2329.3 and higher adj. R2 of 0.086.

Fitting a logistic model using categorical outcome (Nausea)

# Setting up the logistic model
lm_model1 <- logistic_reg() %>%
  set_engine("glm")

# Fitting a logistic model to the categorical outcome (Nausea) using only the main predictor of interest (RunnyNose)
lm_fit3 <- lm_model1 %>% fit(Nausea ~ RunnyNose, data = fludata_clean)
lm_fit3
parsnip model object


Call:  stats::glm(formula = Nausea ~ RunnyNose, family = stats::binomial, 
    data = data)

Coefficients:
 (Intercept)  RunnyNoseYes  
    -0.65781       0.05018  

Degrees of Freedom: 729 Total (i.e. Null);  728 Residual
Null Deviance:      944.7 
Residual Deviance: 944.6    AIC: 948.6
# Looking at the results of lm_fit3
glance(lm_fit3)
# A tibble: 1 × 8
  null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
          <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
1          945.     729  -472.  949.  958.     945.         728   730
# Looking at the additional summary of lm_fit3
tidy(lm_fit3)
# A tibble: 2 × 5
  term         estimate std.error statistic    p.value
  <chr>           <dbl>     <dbl>     <dbl>      <dbl>
1 (Intercept)   -0.658      0.145    -4.53  0.00000589
2 RunnyNoseYes   0.0502     0.172     0.292 0.770     
# Fitting a linear model to the continuous outcome (Body temperature) using all the predictors
lm_fit4 <- lm_model1 %>% fit(Nausea ~ ., data = fludata_clean)
lm_fit4
parsnip model object


Call:  stats::glm(formula = Nausea ~ ., family = stats::binomial, data = data)

Coefficients:
           (Intercept)    SwollenLymphNodesYes      ChestCongestionYes  
              0.222870               -0.251083                0.275554  
       ChillsSweatsYes      NasalCongestionYes              CoughYNYes  
              0.274097                0.425817               -0.140423  
             SneezeYes              FatigueYes      SubjectiveFeverYes  
              0.176724                0.229062                0.277741  
           HeadacheYes            WeaknessMild        WeaknessModerate  
              0.331259               -0.121606                0.310849  
        WeaknessSevere           WeaknessYNYes      CoughIntensityMild  
              0.823187                      NA               -0.220794  
CoughIntensityModerate    CoughIntensitySevere             CoughYN2Yes  
             -0.362678               -0.950544                      NA  
           MyalgiaMild         MyalgiaModerate           MyalgiaSevere  
             -0.004146                0.204743                0.120758  
          MyalgiaYNYes            RunnyNoseYes               AbPainYes  
                    NA                0.045324                0.939304  
          ChestPainYes             DiarrheaYes                EyePnYes  
              0.070777                1.063934               -0.341991  
           InsomniaYes             ItchyEyeYes                EarPnYes  
              0.084175               -0.063364               -0.181719  
            HearingYes          PharyngitisYes           BreathlessYes  
              0.323052                0.275364                0.526801  
            ToothPnYes               VisionYes                VomitYes  
              0.480649                0.125498                2.458466  
             WheezeYes                BodyTemp  
             -0.304435               -0.031246  

Degrees of Freedom: 729 Total (i.e. Null);  695 Residual
Null Deviance:      944.7 
Residual Deviance: 751.5    AIC: 821.5
# Looking at the results of lm_fit4
glance(lm_fit4)
# A tibble: 1 × 8
  null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
          <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
1          945.     729  -376.  821.  982.     751.         695   730
# Looking at the additional summary of lm_fit4
tidy(lm_fit4)
# A tibble: 38 × 5
   term                 estimate std.error statistic p.value
   <chr>                   <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)             0.223     7.83     0.0285  0.977 
 2 SwollenLymphNodesYes   -0.251     0.196   -1.28    0.200 
 3 ChestCongestionYes      0.276     0.213    1.30    0.195 
 4 ChillsSweatsYes         0.274     0.288    0.952   0.341 
 5 NasalCongestionYes      0.426     0.255    1.67    0.0944
 6 CoughYNYes             -0.140     0.519   -0.271   0.787 
 7 SneezeYes               0.177     0.210    0.840   0.401 
 8 FatigueYes              0.229     0.372    0.616   0.538 
 9 SubjectiveFeverYes      0.278     0.225    1.23    0.218 
10 HeadacheYes             0.331     0.285    1.16    0.245 
# … with 28 more rows
# Comparing lm_fit3 and lm_fit4 on the basis of model performance
compare_performance(lm_fit3, lm_fit4)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading

Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading

Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading

Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
# Comparison of Model Performance Indices

Name    | Model | AIC (weights) | AICc (weights) | BIC (weights) | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log | Score_spherical |   PCP
---------------------------------------------------------------------------------------------------------------------------------------------
lm_fit3 |  _glm | 948.6 (<.001) |  948.6 (<.001) | 957.8 (>.999) | 1.169e-04 | 0.477 | 1.139 |    0.647 |  -107.871 |           0.012 | 0.545
lm_fit4 |  _glm | 821.5 (>.999) |  825.1 (>.999) | 982.2 (<.001) |     0.247 | 0.414 | 1.040 |    0.515 |      -Inf |           0.002 | 0.658

Here, the full model including all predictors seem to be a better fit compared to the model with only main predictor of interest (RunnyNose) as the full model has lower AIC of 821.5.