Open RStudio, create a new project, and save it. Go to the root directory of the project and create four folders named “Code”, “Data”, “Figures”, and “Tables”. Download the life_expectancy.csv dataset and store it in the “Data” folder. Then create an R script (or .Rmd file) in the “Code” folder.
# Call one package
library(pacman)
## Warning: package 'pacman' was built under R version 4.1.3
# Call multiple packages at once
p_load(tidyverse, here, modelsummary)
# Read the dataset from the project's "Data" folder. The path components are
# passed to here() separately and match the folder's capitalisation, so the
# script also works on case-sensitive file systems (e.g. Linux).
# Other readers, such as readr::read_csv() or data.table::fread(), are faster
# and more flexible than base read.csv().
df <- read.csv(here("Data", "life_expectancy.csv"))
# Keep only the 2015 observations, retain the four analysis columns,
# and rename `country` to `region`.
df <- df %>%
  filter(year == 2015) %>%
  select(country, status, schooling, life_expectancy) %>%
  rename(region = country)
# Overwrite an existing column: round life expectancy to whole years
df <- mutate(df, life_expectancy = round(life_expectancy, digits = 0))
# Add a new column: the square of life expectancy
df <- mutate(df, life2 = life_expectancy^2)
# Preview the first six rows of the data
head(df, n = 6)
## region status schooling life_expectancy life2
## 1 Afghanistan 0 10.1 65 4225
## 2 Albania 0 14.2 78 6084
## 3 Algeria 0 14.4 76 5776
## 4 Angola 0 11.4 52 2704
## 5 Antigua and Barbuda 0 13.9 76 5776
## 6 Argentina 0 17.3 76 5776
# Inspect the structure of df: number of rows, column names and column types
str(df)
## 'data.frame': 183 obs. of 5 variables:
## $ region : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ status : num 0 0 0 0 0 0 0 1 1 0 ...
## $ schooling : num 10.1 14.2 14.4 11.4 13.9 17.3 12.7 20.4 15.9 12.7 ...
## $ life_expectancy: num 65 78 76 52 76 76 75 83 82 73 ...
## $ life2 : num 4225 6084 5776 2704 5776 ...
# Recode the 0/1 status indicator as a labelled factor
# (0 = "Developing", 1 = "Developed").
df <- df %>%
  mutate(
    status = factor(
      status,
      levels = c(0, 1),
      labels = c("Developing", "Developed")
    )
  )
# One column at a time: count the missing values in `schooling`
sum(is.na(df[["schooling"]]))
## [1] 10
# All columns at once: count the missing values in every column of df.
# vapply() declares that each count is a single integer, so the result is
# always a named integer vector (sapply() can silently change its return type).
vapply(df, function(x) sum(is.na(x)), integer(1))
## region status schooling life_expectancy life2
## 0 0 10 0 0
# In this case we use listwise deletion: drop every row whose `schooling`
# value is missing.
df <- df %>%
  filter(!is.na(schooling))
# Mean of life expectancy
mean(df[["life_expectancy"]])
## [1] 71.73988
# Median of life expectancy
median(df[["life_expectancy"]])
## [1] 74
# Standard deviation of life expectancy
sd(df[["life_expectancy"]])
## [1] 7.965512
# Summary of life expectancy (min, quartiles, median, mean, max)
summary(df[["life_expectancy"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 51.00 66.00 74.00 71.74 77.00 88.00
# Frequency table of status; exclude = NULL keeps missing values in the counts
table(df[["status"]], exclude = NULL)
##
## Developing Developed
## 144 29
# Bar chart of the number of countries in each status category
counts <- table(df[["status"]])
barplot(height = counts)

# Histogram of life expectancy (base graphics)
hist(df$life_expectancy)

# Kernel density estimate of schooling
plot(density(df$schooling))

# Build a presentation copy of the data: drop the squared term and give the
# remaining columns reader-friendly names (renamed by name, not by position).
desc_df <- df %>%
  select(-life2) %>%
  rename(
    Region = region,
    Status = status,
    `Schooling (Yrs)` = schooling,
    `Life Expectancy (Yrs)` = life_expectancy
  )

# Descriptive statistics table with a custom set of numeric summaries
datasummary_skim(
  desc_df,
  fun_numeric = list(
    Mean = Mean,
    SD = SD,
    Min = Min,
    Median = Median,
    Max = Max
  )
)
## Warning: These variables were omitted because they include more than 50 levels:
## Region.
| Mean | SD | Min | Median | Max | |
|---|---|---|---|---|---|
| Schooling (Yrs) | 12.9 | 2.9 | 4.9 | 13.1 | 20.4 |
| Life Expectancy (Yrs) | 71.7 | 8.0 | 51.0 | 74.0 | 88.0 |
| Status | N | % | |||
| Developing | 144 | 83.2 | |||
| Developed | 29 | 16.8 |
# Summary table restricted to the categorical variables
datasummary_skim(desc_df, type = "categorical")
## Warning: These variables were omitted because they include more than 50 levels:
## Region.
| Status | N | % |
|---|---|---|
| Developing | 144 | 83.2 |
| Developed | 29 | 16.8 |