These exercises cover the sections of LMS_Statistics

Set working directory

# Set the working directory to the exercise solutions folder.
# NOTE(review): this is a hard-coded absolute path, so it will likely need
# editing on another machine. The relative path used later to read the data
# ("../data/life_expectancy.csv") is resolved from this directory.
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions")
# Print the current working directory to confirm the change
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions"

Basic data manipulation in R

Basic statistics in R

Exercise 1

# Read the life expectancy table from a .csv file, relative to the working
# directory; the first row of the file holds the column names.
# Spell out TRUE instead of T: T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved constant.
life_exp <- read.csv("../data/life_expectancy.csv", header = TRUE)
# Explore data - print the first six rows
head(life_exp)
##        time    area life.expectancy gender
## 1 1991-1993 England           73.69  Males
## 2 1992-1994 England           74.02  Males
## 3 1993-1995 England           74.18  Males
## 4 1994-1996 England           74.44  Males
## 5 1995-1997 England           74.61  Males
## 6 1996-1998 England           74.84  Males
# Check dimensions - number of rows, then number of columns
dim(life_exp)
## [1] 160   4
# Check data types - one line per column with its type and first values
str(life_exp)
## 'data.frame':    160 obs. of  4 variables:
##  $ time           : chr  "1991-1993" "1992-1994" "1993-1995" "1994-1996" ...
##  $ area           : chr  "England" "England" "England" "England" ...
##  $ life.expectancy: num  73.7 74 74.2 74.4 74.6 ...
##  $ gender         : chr  "Males" "Males" "Males" "Males" ...
# Check summary information - a five-number summary plus the mean for the
# numeric column; length, class and mode for the character columns
summary(life_exp)
##      time               area           life.expectancy    gender         
##  Length:160         Length:160         Min.   :71.47   Length:160        
##  Class :character   Class :character   1st Qu.:75.08   Class :character  
##  Mode  :character   Mode  :character   Median :78.00   Mode  :character  
##                                        Mean   :77.63                     
##                                        3rd Qu.:79.91                     
##                                        Max.   :83.01
# Keep only the rows whose area is England
in_england <- life_exp[["area"]] == "England"
life_exp_eng <- life_exp[in_england, ]
# Compare the dimensions before and after subsetting
dim(life_exp)
## [1] 160   4
dim(life_exp_eng)
## [1] 40  4
# Count the England rows for each gender
ftable(life_exp_eng[["gender"]])
##  Females Males
##               
##       20    20
# Option a - call min() and max() directly on the life expectancy column
min_life_exp_eng <- min(life_exp_eng$life.expectancy)
max_life_exp_eng <- max(life_exp_eng$life.expectancy)
c(min_life_exp_eng, max_life_exp_eng)
## [1] 73.69 83.01
cat("Min: ", min_life_exp_eng, "\nMax: ", max_life_exp_eng)
## Min:  73.69 
## Max:  83.01
# Option b - range() returns c(minimum, maximum); store it once and pick
# out each end by position
range_life_exp_eng <- range(life_exp_eng$life.expectancy)
min_life_exp_eng <- range_life_exp_eng[1]
max_life_exp_eng <- range_life_exp_eng[2]
cat("Min: ", min_life_exp_eng, "\nMax: ", max_life_exp_eng)
## Min:  73.69 
## Max:  83.01
# Median life expectancy in England, computed separately for each gender
median(life_exp_eng$life.expectancy[life_exp_eng$gender == "Males"])
## [1] 76.11
median(life_exp_eng$life.expectancy[life_exp_eng$gender == "Females"])
## [1] 80.69
# Get quantile information for Wales, N.Ireland and Scotland
# Quantiles are cut points that split a distribution into contiguous
# intervals of equal probability; called with no extra arguments,
# quantile() reports the 0%, 25%, 50%, 75% and 100% points
quantile(life_exp$life.expectancy[life_exp$area == "Wales"])
##      0%     25%     50%     75%    100% 
## 73.2500 75.4525 78.5250 80.0575 82.2200
quantile(life_exp$life.expectancy[life_exp$area == "N.Ireland"])
##      0%     25%     50%     75%    100% 
## 72.7300 75.5050 78.2550 80.2425 82.2800
quantile(life_exp$life.expectancy[life_exp$area == "Scotland"])
##    0%   25%   50%   75%  100% 
## 71.47 73.46 76.89 78.81 80.83
# Get the mean, variance and standard deviation of life expectancy in
# England, filtering the full table on the fly
mean(life_exp$life.expectancy[life_exp$area == "England"])
## [1] 78.55425
var(life_exp$life.expectancy[life_exp$area == "England"])
## [1] 7.610123
sd(life_exp$life.expectancy[life_exp$area == "England"])
## [1] 2.758645
# Equivalent - reuse the England subset created earlier
mean(life_exp_eng$life.expectancy)
## [1] 78.55425
var(life_exp_eng$life.expectancy)
## [1] 7.610123
sd(life_exp_eng$life.expectancy)
## [1] 2.758645