Setting up project and directory structure
Open RStudio, create a new project, and save it. In the project's root
directory, create four folders named “Code”, “Data”, “Figures”, and
“Tables”. Download the life_expectancy.csv dataset and store it in the
“Data” folder. Then create an R script (.R) or R Markdown (.Rmd) file in
the “Code” folder.
Load relevant packages
# Attach a single package: pacman, which supplies p_load() used just below
library(pacman)
## Warning: package 'pacman' was built under R version 4.1.3
# Load several packages in one call: tidyverse (data wrangling verbs and the
# %>% pipe used below), here (project-relative paths), and modelsummary
# (datasummary_skim for the descriptive tables)
p_load(tidyverse, here, modelsummary)
Read the data in
# Read the CSV from the project's "Data" folder. here() builds the path from
# the project root; the folder name is capitalised to match the directory
# created during setup, because a lowercase "data" does not resolve on
# case-sensitive file systems.
df <- read.csv(here("Data", "life_expectancy.csv"))
# Faster, more flexible readers exist as well, e.g. readr::read_csv() or
# data.table::fread()
Rename variable
# Rename the `country` column to `region` (new name on the left, old on the right)
df <- df %>%
  rename(region = country)
Create a new variable based on existing one(s)
# Round life expectancy to whole years (overwriting the existing column), then
# add its square as a new column. mutate() evaluates its arguments in order, so
# life2 is computed from the already-rounded values.
df <- df %>%
  mutate(
    life_expectancy = round(life_expectancy),
    life2 = life_expectancy^2
  )
Inspect your data
# Print the first rows to confirm the renamed and newly created columns
head(df)
## region status schooling life_expectancy life2
## 1 Afghanistan 0 10.1 65 4225
## 2 Albania 0 14.2 78 6084
## 3 Algeria 0 14.4 76 5776
## 4 Angola 0 11.4 52 2704
## 5 Antigua and Barbuda 0 13.9 76 5776
## 6 Argentina 0 17.3 76 5776
# Show the dimensions and the storage type of every column
str(df)
## 'data.frame': 183 obs. of 5 variables:
## $ region : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ status : num 0 0 0 0 0 0 0 1 1 0 ...
## $ schooling : num 10.1 14.2 14.4 11.4 13.9 17.3 12.7 20.4 15.9 12.7 ...
## $ life_expectancy: num 65 78 76 52 76 76 75 83 82 73 ...
## $ life2 : num 4225 6084 5776 2704 5776 ...
Examine missingness
# One variable at a time: is.na() flags missing entries and sum() counts them
sum(is.na(df$schooling))
## [1] 10
# All at once: count the missing values in every column.
# vapply() declares that each count is a single integer, so the result is
# always a named integer vector; sapply() can silently change its return type
# depending on the input.
vapply(df, function(x) sum(is.na(x)), integer(1))
## region status schooling life_expectancy life2
## 0 0 10 0 0
Decide on listwise/pairwise deletion
# Listwise deletion: keep only rows with an observed schooling value. The
# missingness check above found NAs in schooling alone, so this removes every
# incomplete row.
df <- df %>%
  filter(!is.na(schooling))
Calculate summary statistics
# Central tendency and spread of life expectancy, one statistic per call
mean(df$life_expectancy)
## [1] 71.73988
median(df$life_expectancy)
## [1] 74
sd(df$life_expectancy)
## [1] 7.965512
# Quartiles, extremes and the mean in a single call
summary(df$life_expectancy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 51.00 66.00 74.00 71.74 77.00 88.00
# Frequency table of status; exclude = NULL means no value (including NA) is
# dropped from the tabulation.
# NOTE(review): str(df) above reports status as numeric 0/1, yet the output
# below shows Developing/Developed labels -- presumably status is recoded to a
# labelled factor in a step not shown here; confirm and add that step.
table(df$status, exclude=NULL)
##
## Developing Developed
## 144 29
Visualize variables (categorical)
# Count the countries in each status group and draw the counts as a bar chart
counts <- table(df[["status"]])
barplot(height = counts)
![](data_management_files/figure-html/unnamed-chunk-13-1.png)
Visualize variables (continuous)
# Histogram of life expectancy using hist()'s default binning
hist(df$life_expectancy)
![](data_management_files/figure-html/unnamed-chunk-14-1.png)
# Kernel density estimate of schooling; density() does not accept NA values by
# default, so this relies on the missing rows having been removed earlier
plot(density(df$schooling))
![](data_management_files/figure-html/unnamed-chunk-14-2.png)
Create descriptive table
# Build a presentation copy of the data: drop the squared term and give the
# remaining columns display labels. Renaming by name (rather than assigning
# names() by position) keeps each label attached to the right column even if
# the column order changes.
desc_df <- df %>%
  select(-life2) %>%
  rename(
    Region = region,
    Status = status,
    `Schooling (Yrs)` = schooling,
    `Life Expectancy (Yrs)` = life_expectancy
  )
# Summarise every variable; numeric columns report the five statistics named
# in fun_numeric
datasummary_skim(
  desc_df,
  fun_numeric = list(
    Mean = Mean, SD = SD, Min = Min,
    Median = Median, Max = Max
  )
)
## Warning: These variables were omitted because they include more than 50 levels:
## Region.
|                       | Mean | SD  | Min  | Median | Max  |
|-----------------------|------|-----|------|--------|------|
| Schooling (Yrs)       | 12.9 | 2.9 | 4.9  | 13.1   | 20.4 |
| Life Expectancy (Yrs) | 71.7 | 8.0 | 51.0 | 74.0   | 88.0 |
| Status                | N    | %   |      |        |      |
| Developing            | 144  | 83.2 |     |        |      |
| Developed             | 29   | 16.8 |     |        |      |
Create descriptive table (categorical only)
# Restrict the summary table to the categorical variables
datasummary_skim(data = desc_df, type = "categorical")
## Warning: These variables were omitted because they include more than 50 levels:
## Region.
| Status     | N   | %    |
|------------|-----|------|
| Developing | 144 | 83.2 |
| Developed  | 29  | 16.8 |