These exercises cover the sections of LMS_Statistics
# Switch to the solutions folder; the data file is read via a relative path
# NOTE(review): absolute, machine-specific path - confirm it exists locally
solutions_dir <- "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions"
setwd(solutions_dir)
# Print the current working directory to confirm the change
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions"
Exercise 1
Read in the file data/life_expectancy_at_birth.csv and store it as a dataframe.
# Read .csv file
# Load the life-expectancy table; header = TRUE takes the column names from
# the first line of the file. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
# NOTE(review): the exercise text names "life_expectancy_at_birth.csv" while
# this call reads "life_expectancy.csv" - confirm which file name is correct.
life_exp <- read.csv("../data/life_expectancy.csv", header = TRUE)
Use the head() and dim() functions to show the first 6 rows of the data. How many variables are there?
# Explore data
# Preview the first six rows of the data frame
head(life_exp, n = 6L)
## time area life.expectancy gender
## 1 1991-1993 England 73.69 Males
## 2 1992-1994 England 74.02 Males
## 3 1993-1995 England 74.18 Males
## 4 1994-1996 England 74.44 Males
## 5 1995-1997 England 74.61 Males
## 6 1996-1998 England 74.84 Males
# Report the number of rows and columns (columns = variables)
dim(life_exp)
## [1] 160 4
Use the str() and summary() functions to see the data type and summary information of the dataset.
# Check data types
# Compact structure listing: one line per column with its type
str(object = life_exp)
## 'data.frame': 160 obs. of 4 variables:
## $ time : chr "1991-1993" "1992-1994" "1993-1995" "1994-1996" ...
## $ area : chr "England" "England" "England" "England" ...
## $ life.expectancy: num 73.7 74 74.2 74.4 74.6 ...
## $ gender : chr "Males" "Males" "Males" "Males" ...
# Per-column summary of the data frame
summary(object = life_exp)
## time area life.expectancy gender
## Length:160 Length:160 Min. :71.47 Length:160
## Class :character Class :character 1st Qu.:75.08 Class :character
## Mode :character Mode :character Median :78.00 Mode :character
## Mean :77.63
## 3rd Qu.:79.91
## Max. :83.01
# Keep only the rows recorded for England
is_england <- life_exp$area == "England"
life_exp_eng <- life_exp[is_england, ]
# Compare the size of the full data set with the England subset
dim(life_exp)
## [1] 160 4
dim(life_exp_eng)
## [1] 40 4
# Count how many England rows fall under each gender
ftable(life_exp_eng$gender)
## Females Males
##
## 20 20
# Extract the England life-expectancy column once and reuse it for both options
eng_life_exp <- life_exp_eng[, "life.expectancy"]
# Option a - use min() and max() functions
min_life_exp_eng <- min(eng_life_exp)
max_life_exp_eng <- max(eng_life_exp)
c(min_life_exp_eng, max_life_exp_eng)
## [1] 73.69 83.01
cat("Min: ", min_life_exp_eng, "\nMax: ", max_life_exp_eng)
## Min: 73.69
## Max: 83.01
# Option b - use the range() function
# range() returns c(minimum, maximum), so call it once and index the result
# instead of recomputing it for each bound
range_life_exp_eng <- range(eng_life_exp)
min_life_exp_eng <- range_life_exp_eng[1]
max_life_exp_eng <- range_life_exp_eng[2]
cat("Min: ", min_life_exp_eng, "\nMax: ", max_life_exp_eng)
## Min: 73.69
## Max: 83.01
# Median life expectancy in England, computed separately for each gender
male_rows <- life_exp_eng$gender == "Males"
median(life_exp_eng[male_rows, "life.expectancy"])
## [1] 76.11
female_rows <- life_exp_eng$gender == "Females"
median(life_exp_eng[female_rows, "life.expectancy"])
## [1] 80.69
# Quantile summary of life expectancy for each remaining country
# Quantiles are cut points that split a distribution into intervals of equal
# probability; with no extra arguments quantile() reports the 0%, 25%, 50%,
# 75% and 100% points
wales_life_exp <- life_exp[life_exp$area == "Wales", "life.expectancy"]
quantile(wales_life_exp)
## 0% 25% 50% 75% 100%
## 73.2500 75.4525 78.5250 80.0575 82.2200
n_ireland_life_exp <- life_exp[life_exp$area == "N.Ireland", "life.expectancy"]
quantile(n_ireland_life_exp)
## 0% 25% 50% 75% 100%
## 72.7300 75.5050 78.2550 80.2425 82.2800
scotland_life_exp <- life_exp[life_exp$area == "Scotland", "life.expectancy"]
quantile(scotland_life_exp)
## 0% 25% 50% 75% 100%
## 71.47 73.46 76.89 78.81 80.83
# Mean, variance and standard deviation of life expectancy for England,
# filtering the full data frame directly
england_life_exp <- life_exp[life_exp$area == "England", "life.expectancy"]
mean(england_life_exp)
## [1] 78.55425
var(england_life_exp)
## [1] 7.610123
sd(england_life_exp)
## [1] 2.758645
# Same three statistics, this time from the England subset built earlier
england_subset_life_exp <- life_exp_eng[["life.expectancy"]]
mean(england_subset_life_exp)
## [1] 78.55425
var(england_subset_life_exp)
## [1] 7.610123
sd(england_subset_life_exp)
## [1] 2.758645