R Coding Exercise

Loading required packages

# Loading dslabs and tidyverse packages
library("dslabs") 
library("tidyverse")
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   0.3.4
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.2.0     ✔ stringr 1.4.1
✔ readr   2.1.2     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Loading and checking gapminder data

# Looking at help file for gapminder data
help(gapminder)
# Getting an overview of data structure
str(gapminder)
'data.frame':   10545 obs. of  9 variables:
 $ country         : Factor w/ 185 levels "Albania","Algeria",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ year            : int  1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
 $ infant_mortality: num  115.4 148.2 208 NA 59.9 ...
 $ life_expectancy : num  62.9 47.5 36 63 65.4 ...
 $ fertility       : num  6.19 7.65 7.32 4.43 3.11 4.55 4.82 3.45 2.7 5.57 ...
 $ population      : num  1636054 11124892 5270844 54681 20619075 ...
 $ gdp             : num  NA 1.38e+10 NA NA 1.08e+11 ...
 $ continent       : Factor w/ 5 levels "Africa","Americas",..: 4 1 1 2 2 3 2 5 4 3 ...
 $ region          : Factor w/ 22 levels "Australia and New Zealand",..: 19 11 10 2 15 21 2 1 22 21 ...
# Getting summary of data from gapminder
summary(gapminder)
                country           year      infant_mortality life_expectancy
 Albania            :   57   Min.   :1960   Min.   :  1.50   Min.   :13.20  
 Algeria            :   57   1st Qu.:1974   1st Qu.: 16.00   1st Qu.:57.50  
 Angola             :   57   Median :1988   Median : 41.50   Median :67.54  
 Antigua and Barbuda:   57   Mean   :1988   Mean   : 55.31   Mean   :64.81  
 Argentina          :   57   3rd Qu.:2002   3rd Qu.: 85.10   3rd Qu.:73.00  
 Armenia            :   57   Max.   :2016   Max.   :276.90   Max.   :83.90  
 (Other)            :10203                  NA's   :1453                    
   fertility       population             gdp               continent   
 Min.   :0.840   Min.   :3.124e+04   Min.   :4.040e+07   Africa  :2907  
 1st Qu.:2.200   1st Qu.:1.333e+06   1st Qu.:1.846e+09   Americas:2052  
 Median :3.750   Median :5.009e+06   Median :7.794e+09   Asia    :2679  
 Mean   :4.084   Mean   :2.701e+07   Mean   :1.480e+11   Europe  :2223  
 3rd Qu.:6.000   3rd Qu.:1.523e+07   3rd Qu.:5.540e+10   Oceania : 684  
 Max.   :9.220   Max.   :1.376e+09   Max.   :1.174e+13                  
 NA's   :187     NA's   :185         NA's   :2972                       
             region    
 Western Asia   :1026  
 Eastern Africa : 912  
 Western Africa : 912  
 Caribbean      : 741  
 South America  : 684  
 Southern Europe: 684  
 (Other)        :5586  
# Determining the type of gapminder object
class(gapminder)
[1] "data.frame"

Processing data

Creating an object named ‘africadata’

# Keep only the gapminder rows whose continent is Africa
africadata <- dplyr::filter(gapminder, continent == "Africa")

# Looking at the structure and summary of object (africadata)
str(africadata)
'data.frame':   2907 obs. of  9 variables:
 $ country         : Factor w/ 185 levels "Albania","Algeria",..: 2 3 18 22 26 27 29 31 32 33 ...
 $ year            : int  1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
 $ infant_mortality: num  148 208 187 116 161 ...
 $ life_expectancy : num  47.5 36 38.3 50.3 35.2 ...
 $ fertility       : num  7.65 7.32 6.28 6.62 6.29 6.95 5.65 6.89 5.84 6.25 ...
 $ population      : num  11124892 5270844 2431620 524029 4829291 ...
 $ gdp             : num  1.38e+10 NA 6.22e+08 1.24e+08 5.97e+08 ...
 $ continent       : Factor w/ 5 levels "Africa","Americas",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ region          : Factor w/ 22 levels "Australia and New Zealand",..: 11 10 20 17 20 5 10 20 10 10 ...
summary(africadata)
         country          year      infant_mortality life_expectancy
 Algeria     :  57   Min.   :1960   Min.   : 11.40   Min.   :13.20  
 Angola      :  57   1st Qu.:1974   1st Qu.: 62.20   1st Qu.:48.23  
 Benin       :  57   Median :1988   Median : 93.40   Median :53.98  
 Botswana    :  57   Mean   :1988   Mean   : 95.12   Mean   :54.38  
 Burkina Faso:  57   3rd Qu.:2002   3rd Qu.:124.70   3rd Qu.:60.10  
 Burundi     :  57   Max.   :2016   Max.   :237.40   Max.   :77.60  
 (Other)     :2565                  NA's   :226                     
   fertility       population             gdp               continent   
 Min.   :1.500   Min.   :    41538   Min.   :4.659e+07   Africa  :2907  
 1st Qu.:5.160   1st Qu.:  1605232   1st Qu.:8.373e+08   Americas:   0  
 Median :6.160   Median :  5570982   Median :2.448e+09   Asia    :   0  
 Mean   :5.851   Mean   : 12235961   Mean   :9.346e+09   Europe  :   0  
 3rd Qu.:6.860   3rd Qu.: 13888152   3rd Qu.:6.552e+09   Oceania :   0  
 Max.   :8.450   Max.   :182201962   Max.   :1.935e+11                  
 NA's   :51      NA's   :51          NA's   :637                        
                       region   
 Eastern Africa           :912  
 Western Africa           :912  
 Middle Africa            :456  
 Northern Africa          :342  
 Southern Africa          :285  
 Australia and New Zealand:  0  
 (Other)                  :  0  
glimpse(africadata)
Rows: 2,907
Columns: 9
$ country          <fct> "Algeria", "Angola", "Benin", "Botswana", "Burkina Fa…
$ year             <int> 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960,…
$ infant_mortality <dbl> 148.2, 208.0, 186.9, 115.5, 161.3, 145.1, 166.9, NA, …
$ life_expectancy  <dbl> 47.50, 35.98, 38.29, 50.34, 35.21, 40.58, 43.46, 50.1…
$ fertility        <dbl> 7.65, 7.32, 6.28, 6.62, 6.29, 6.95, 5.65, 6.89, 5.84,…
$ population       <dbl> 11124892, 5270844, 2431620, 524029, 4829291, 2786740,…
$ gdp              <dbl> 13828152297, NA, 621797131, 124460933, 596612183, 341…
$ continent        <fct> Africa, Africa, Africa, Africa, Africa, Africa, Afric…
$ region           <fct> Northern Africa, Middle Africa, Western Africa, South…

Creating an object named ‘infantmort_lifeexp’

# Reduce africadata to the two columns used for the infant-mortality plot
infantmort_lifeexp <- dplyr::select(
  africadata,
  infant_mortality, life_expectancy
)

# Looking at the structure and summary of object (infantmort_lifeexp)
str(infantmort_lifeexp)
'data.frame':   2907 obs. of  2 variables:
 $ infant_mortality: num  148 208 187 116 161 ...
 $ life_expectancy : num  47.5 36 38.3 50.3 35.2 ...
summary(infantmort_lifeexp)
 infant_mortality life_expectancy
 Min.   : 11.40   Min.   :13.20  
 1st Qu.: 62.20   1st Qu.:48.23  
 Median : 93.40   Median :53.98  
 Mean   : 95.12   Mean   :54.38  
 3rd Qu.:124.70   3rd Qu.:60.10  
 Max.   :237.40   Max.   :77.60  
 NA's   :226                     
glimpse(infantmort_lifeexp)
Rows: 2,907
Columns: 2
$ infant_mortality <dbl> 148.2, 208.0, 186.9, 115.5, 161.3, 145.1, 166.9, NA, …
$ life_expectancy  <dbl> 47.50, 35.98, 38.29, 50.34, 35.21, 40.58, 43.46, 50.1…

Creating an object named ‘pop_lifeexp’

# Reduce africadata to the two columns used for the population plot
pop_lifeexp <- dplyr::select(
  africadata,
  population, life_expectancy
)

# Looking at the structure and summary of object (pop_lifeexp)
str(pop_lifeexp)
'data.frame':   2907 obs. of  2 variables:
 $ population     : num  11124892 5270844 2431620 524029 4829291 ...
 $ life_expectancy: num  47.5 36 38.3 50.3 35.2 ...
summary(pop_lifeexp)
   population        life_expectancy
 Min.   :    41538   Min.   :13.20  
 1st Qu.:  1605232   1st Qu.:48.23  
 Median :  5570982   Median :53.98  
 Mean   : 12235961   Mean   :54.38  
 3rd Qu.: 13888152   3rd Qu.:60.10  
 Max.   :182201962   Max.   :77.60  
 NA's   :51                         
glimpse(pop_lifeexp)
Rows: 2,907
Columns: 2
$ population      <dbl> 11124892, 5270844, 2431620, 524029, 4829291, 2786740, …
$ life_expectancy <dbl> 47.50, 35.98, 38.29, 50.34, 35.21, 40.58, 43.46, 50.12…

Plotting

# Scatter plot of life expectancy (y) against infant mortality (x), all years
infantmort_lifeexp %>%
  ggplot(aes(infant_mortality, life_expectancy)) +
  geom_point(size = 1) +
  labs(
    title = "Life expectancy as a function of infant mortality",
    x = "Infant Mortality",
    y = "Life Expectancy (In Years)"
  ) +
  theme_classic() +
  # Bold tick labels, centred title, 14 pt title and axis titles
  theme(
    axis.text = element_text(face = "bold"),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 14)
  )
Warning: Removed 226 rows containing missing values (`geom_point()`).

# Scatter plot of life expectancy (y) against population (x, log10 axis),
# all years
pop_lifeexp %>%
  ggplot(aes(population, life_expectancy)) +
  geom_point(size = 1) +
  # Population spans several orders of magnitude, so use a log10 x-axis
  scale_x_log10() +
  labs(
    title = "Life expectancy as a function of population size",
    x = "Population (In Logscale)",
    y = "Life Expectancy (In Years)"
  ) +
  theme_classic() +
  theme(
    axis.text = element_text(face = "bold"),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 14)
  )
Warning: Removed 51 rows containing missing values (`geom_point()`).

Checking data missingness

# Collect the africadata rows with no infant mortality value, then count how
# many such rows fall in each year
infant_mortality_yrs <- dplyr::filter(africadata, is.na(infant_mortality))
table(infant_mortality_yrs[["year"]])

1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 
  10   17   16   16   15   14   13   11   11    7    5    6    6    6    5    5 
1976 1977 1978 1979 1980 1981 2016 
   3    3    2    2    1    1   51 

Data processing using African countries data for the year 2000

# Subset africadata to the observations recorded for the year 2000
africadata_2000 <- dplyr::filter(africadata, year == 2000)
# Looking at the structure and summary of object (africadata_2000)
str(africadata_2000)
'data.frame':   51 obs. of  9 variables:
 $ country         : Factor w/ 185 levels "Albania","Algeria",..: 2 3 18 22 26 27 29 31 32 33 ...
 $ year            : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
 $ infant_mortality: num  33.9 128.3 89.3 52.4 96.2 ...
 $ life_expectancy : num  73.3 52.3 57.2 47.6 52.6 46.7 54.3 68.4 45.3 51.5 ...
 $ fertility       : num  2.51 6.84 5.98 3.41 6.59 7.06 5.62 3.7 5.45 7.35 ...
 $ population      : num  31183658 15058638 6949366 1736579 11607944 ...
 $ gdp             : num  5.48e+10 9.13e+09 2.25e+09 5.63e+09 2.61e+09 ...
 $ continent       : Factor w/ 5 levels "Africa","Americas",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ region          : Factor w/ 22 levels "Australia and New Zealand",..: 11 10 20 17 20 5 10 20 10 10 ...
summary(africadata_2000)
         country        year      infant_mortality life_expectancy
 Algeria     : 1   Min.   :2000   Min.   : 12.30   Min.   :37.60  
 Angola      : 1   1st Qu.:2000   1st Qu.: 60.80   1st Qu.:51.75  
 Benin       : 1   Median :2000   Median : 80.30   Median :54.30  
 Botswana    : 1   Mean   :2000   Mean   : 78.93   Mean   :56.36  
 Burkina Faso: 1   3rd Qu.:2000   3rd Qu.:103.30   3rd Qu.:60.00  
 Burundi     : 1   Max.   :2000   Max.   :143.30   Max.   :75.00  
 (Other)     :45                                                  
   fertility       population             gdp               continent 
 Min.   :1.990   Min.   :    81154   Min.   :2.019e+08   Africa  :51  
 1st Qu.:4.150   1st Qu.:  2304687   1st Qu.:1.274e+09   Americas: 0  
 Median :5.550   Median :  8799165   Median :3.238e+09   Asia    : 0  
 Mean   :5.156   Mean   : 15659800   Mean   :1.155e+10   Europe  : 0  
 3rd Qu.:5.960   3rd Qu.: 17391242   3rd Qu.:8.654e+09   Oceania : 0  
 Max.   :7.730   Max.   :122876723   Max.   :1.329e+11                
                                                                      
                       region  
 Eastern Africa           :16  
 Western Africa           :16  
 Middle Africa            : 8  
 Northern Africa          : 6  
 Southern Africa          : 5  
 Australia and New Zealand: 0  
 (Other)                  : 0  
glimpse(africadata_2000)
Rows: 51
Columns: 9
$ country          <fct> "Algeria", "Angola", "Benin", "Botswana", "Burkina Fa…
$ year             <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,…
$ infant_mortality <dbl> 33.9, 128.3, 89.3, 52.4, 96.2, 93.4, 91.9, 29.1, 113.…
$ life_expectancy  <dbl> 73.3, 52.3, 57.2, 47.6, 52.6, 46.7, 54.3, 68.4, 45.3,…
$ fertility        <dbl> 2.51, 6.84, 5.98, 3.41, 6.59, 7.06, 5.62, 3.70, 5.45,…
$ population       <dbl> 31183658, 15058638, 6949366, 1736579, 11607944, 67670…
$ gdp              <dbl> 54790058957, 9129180361, 2254838685, 5632391130, 2610…
$ continent        <fct> Africa, Africa, Africa, Africa, Africa, Africa, Afric…
$ region           <fct> Northern Africa, Middle Africa, Western Africa, South…

Plotting using African countries data for the year 2000

Plot 1

# Scatter plot of life expectancy (y) against infant mortality (x), year 2000
africadata_2000 %>%
  ggplot(aes(infant_mortality, life_expectancy)) +
  geom_point(size = 1) +
  labs(
    title = "Life expectancy as a function of infant mortality in the year 2000",
    x = "Infant Mortality",
    y = "Life Expectancy (In Years)"
  ) +
  theme_classic() +
  theme(
    axis.text = element_text(face = "bold"),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 14)
  )

Plot 2

# Scatter plot of life expectancy (y) against population (x, log10 axis),
# year 2000
africadata_2000 %>%
  ggplot(aes(population, life_expectancy)) +
  geom_point(size = 1) +
  # Population spans several orders of magnitude, so use a log10 x-axis
  scale_x_log10() +
  labs(
    title = "Life expectancy as a function of population size in the year 2000",
    x = "Population (In Logscale)",
    y = "Life Expectancy (In Years)"
  ) +
  theme_classic() +
  theme(
    axis.text = element_text(face = "bold"),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0.5, size = 14)
  )

A simple fit using African countries data for the year 2000

# Fit a simple linear regression on the year-2000 African data with life
# expectancy as the outcome and infant mortality as the only predictor
fit1 <- lm(life_expectancy ~ infant_mortality, data = africadata_2000)
# Print the residual summary, coefficient table and R-squared for fit1
summary(fit1)

Call:
lm(formula = life_expectancy ~ infant_mortality, data = africadata_2000)

Residuals:
     Min       1Q   Median       3Q      Max 
-22.6651  -3.7087   0.9914   4.0408   8.6817 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      71.29331    2.42611  29.386  < 2e-16 ***
infant_mortality -0.18916    0.02869  -6.594 2.83e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.221 on 49 degrees of freedom
Multiple R-squared:  0.4701,    Adjusted R-squared:  0.4593 
F-statistic: 43.48 on 1 and 49 DF,  p-value: 2.826e-08
# Fit a simple linear regression on the year-2000 African data with life
# expectancy as the outcome and population size as the only predictor
# NOTE(review): the population scatter plots use a log10 x-axis, but this model
# uses population on its raw scale -- confirm that this is intended.
fit2 <- lm(life_expectancy ~ population, data = africadata_2000)
# Print the residual summary, coefficient table and R-squared for fit2
summary(fit2)

Call:
lm(formula = life_expectancy ~ population, data = africadata_2000)

Residuals:
    Min      1Q  Median      3Q     Max 
-18.429  -4.602  -2.568   3.800  18.802 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 5.593e+01  1.468e+00  38.097   <2e-16 ***
population  2.756e-08  5.459e-08   0.505    0.616    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.524 on 49 degrees of freedom
Multiple R-squared:  0.005176,  Adjusted R-squared:  -0.01513 
F-statistic: 0.2549 on 1 and 49 DF,  p-value: 0.6159

Conclusion

Based on the p-values of the two fits, we found a statistically significant negative correlation between infant mortality and life expectancy in the year 2000 (p-value: 2.83e-08), whereas we found no statistically significant correlation between population size and life expectancy in the year 2000 (p-value: 0.616).

—————————————

THIS SECTION ADDED BY NICOLE LUISI

—————————————

#Create some plots

# Install and load new packages
#install.packages(c("plotly", "broom"))
library(plotly)
library(broom)
# Keep four of the African regions (Eastern Africa is not included)
africadata_limitregion <- dplyr::filter(
  africadata,
  region %in% c(
    "Northern Africa", "Southern Africa",
    "Western Africa", "Middle Africa"
  )
)
# Colours handed to plotly for the region colour scale
manualcolors <- c(
  "coral", "forestgreen", "darkorange1", "firebrick4", "cornflowerblue",
  "darkseagreen", "cyan3", "brown1", "blueviolet"
)
# Interactive scatter of life expectancy over time, coloured by region, with
# the title and axis titles set in the same pipeline
fig <- africadata_limitregion %>%
  plot_ly(
    x = ~year,
    y = ~life_expectancy,
    text = ~region,
    type = "scatter",
    mode = "markers",
    color = ~region,
    colors = manualcolors
  ) %>%
  layout(
    title = "Life Expectancy (years) by Year in 4 Regions of Africa",
    yaxis = list(title = "Life Expectancy (yrs)"),
    xaxis = list(title = "Year")
  )
# Display the figure; hovering over a point shows its data
fig
#library(ggplot2)
# Generate figure: infant mortality against GDP for every African
# country-year, with point colour showing region and point size showing
# fertility.
# theme_set() changes the session-wide default theme, so this and every later
# ggplot use a black-and-white theme with the legend on top.
theme_set(
  theme_bw() +
    theme(legend.position = "top")
)
ggplot(africadata, aes(x = gdp, y = infant_mortality)) +
  geom_point(aes(color = region, size = fertility), alpha = 0.5) +
  # One hex colour per African region (five regions occur in africadata).
  # Every entry needs the leading "#"; "79b7c5" alone is not a valid colour.
  scale_color_manual(
    values = c("#d1495b", "#edae49", "#66a182", "#2e4057", "#79b7c5")
  ) +
  scale_size(range = c(0.5, 6)) +
  xlab("GDP") +
  # Infant mortality is a rate (infant deaths per 1,000), not a number of years
  ylab("Infant Mortality (deaths per 1,000)") +
  ggtitle("Infant Mortality by GDP, with Region as Color and Size as Fertility")
Warning: Removed 704 rows containing missing values (`geom_point()`).

#Fit a model

# Fit a simple linear regression of life expectancy on fertility, using the
# four-region African subset (africadata_limitregion, all years)
newmodel <- lm(life_expectancy ~ fertility, data = africadata_limitregion)
# Print the residual summary, coefficient table and R-squared for the new model
summary(newmodel)

Call:
lm(formula = life_expectancy ~ fertility, data = africadata_limitregion)

Residuals:
    Min      1Q  Median      3Q     Max 
-21.013  -3.950   1.235   5.186  21.800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  76.8629     0.7396  103.93   <2e-16 ***
fertility    -3.9165     0.1244  -31.47   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.376 on 1958 degrees of freedom
  (35 observations deleted due to missingness)
Multiple R-squared:  0.3359,    Adjusted R-squared:  0.3356 
F-statistic: 990.5 on 1 and 1958 DF,  p-value: < 2.2e-16
# Use broom to make table for new model
tidy(newmodel)
# A tibble: 2 × 5
  term        estimate std.error statistic   p.value
  <chr>          <dbl>     <dbl>     <dbl>     <dbl>
1 (Intercept)    76.9      0.740     104.  0        
2 fertility      -3.92     0.124     -31.5 2.78e-176
# Plot a fitted simple linear regression together with its key statistics.
#
# Draws a scatter plot of the outcome (first column of the model frame) against
# the predictor (second column), overlays the least-squares line, and reports
# the adjusted R-squared, intercept, slope and slope p-value in the subtitle.
#
# Arguments:
#   fit      A fitted `lm` object with an intercept and at least one predictor;
#            only the first predictor is plotted.
#   x_label  x-axis label.
#   y_label  y-axis label.
#   title    Plot title.
# The label defaults describe the life expectancy ~ fertility model this
# function was written for.
#
# Returns: a ggplot object (it is drawn when printed).
customoutput <- function(
    fit,
    x_label = "Fertility",
    y_label = "Life Expectancy (yrs)",
    title = "Life Expectancy and Fertility in 4 Regions of Africa") {
  stopifnot(inherits(fit, "lm"))

  # model.frame() returns the data actually used in the fit (rows with
  # missing values already dropped); column 1 is the outcome.
  model_data <- stats::model.frame(fit)
  stopifnot(ncol(model_data) >= 2L)
  outcome_var <- names(model_data)[1]
  predictor_var <- names(model_data)[2]

  # Summarise once and read every statistic from the same object.
  fit_summary <- summary(fit)
  coef_table <- stats::coef(fit_summary)
  stats_label <- paste0(
    "Adj R2 = ", signif(fit_summary$adj.r.squared, 5),
    "  Intercept = ", signif(coef_table[1, 1], 5),
    "  Slope = ", signif(coef_table[2, 1], 5),
    "  P = ", signif(coef_table[2, 4], 5)
  )

  # .data[[name]] looks columns up by their exact name, so it also works for
  # non-syntactic model-frame names such as "log(x)" (aes_string() is
  # deprecated and would try to parse them as code).
  ggplot2::ggplot(
    model_data,
    ggplot2::aes(x = .data[[predictor_var]], y = .data[[outcome_var]])
  ) +
    ggplot2::geom_point() +
    ggplot2::stat_smooth(method = "lm", formula = y ~ x, colour = "red") +
    # The descriptive title and the statistics get separate slots so that
    # neither overwrites the other.
    ggplot2::labs(
      title = title,
      subtitle = stats_label,
      x = x_label,
      y = y_label
    )
}
customoutput(newmodel)
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation ideoms with `aes()`
`geom_smooth()` using formula = 'y ~ x'