Exercise 1 - Introduction to Probability theory

Set working directory

# Set working directory
# NOTE: this is a hard-coded, machine-specific absolute path (presumably a
# mounted network volume); change it to the location of the course
# "solutions" folder on your own system. The data file is read further down
# through a path relative to this directory.
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions")
# Print the current working directory to confirm the change took effect
getwd()

## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions"

Basic data manipulation in R

read.csv() - load input file in .csv format
head() - check first rows of a dataframe
dim() - check dimensions of a dataframe
str() - check data types of an R object

Basic statistics in R

mean(): compute the arithmetic mean of a numeric vector
sd(): compute the sample standard deviation of a numeric vector
var(): compute the sample variance of a numeric vector

Exercise 1

Read in the file data/life_expectancy.csv and store it as a dataframe

# Read the .csv file into a data frame.
# The path is relative to the current working directory. header = TRUE takes
# the column names from the first line of the file; TRUE is spelled out
# because the shorthand T is an ordinary variable that can be reassigned.
life_exp <- read.csv("../data/life_expectancy.csv", header = TRUE)

Use the head() function to show the first 6 rows of the data and the dim() function to check its dimensions. How many variables are there?

# Explore data
# head() prints the first rows of the data frame (six by default), which is
# enough to see the column names and a few example values.
head(life_exp)

##        time    area life.expectancy gender
## 1 1991-1993 England           73.69  Males
## 2 1992-1994 England           74.02  Males
## 3 1993-1995 England           74.18  Males
## 4 1994-1996 England           74.44  Males
## 5 1995-1997 England           74.61  Males
## 6 1996-1998 England           74.84  Males

# Check dimensions
# dim() returns c(number of rows, number of columns); the second value is
# the number of variables asked for in the exercise.
dim(life_exp)

## [1] 160   4

Use the str() and summary() functions to see the data type and summary information of the dataset

# Check data types
# str() lists every column with its type and its first few values.
str(life_exp)

## 'data.frame':    160 obs. of  4 variables:
##  $ time           : chr  "1991-1993" "1992-1994" "1993-1995" "1994-1996" ...
##  $ area           : chr  "England" "England" "England" "England" ...
##  $ life.expectancy: num  73.7 74 74.2 74.4 74.6 ...
##  $ gender         : chr  "Males" "Males" "Males" "Males" ...

# Check summary information
# summary() reports min, quartiles, median, mean and max for numeric columns;
# for character columns it only reports length, class and mode.
summary(life_exp)

##      time               area           life.expectancy    gender         
##  Length:160         Length:160         Min.   :71.47   Length:160        
##  Class :character   Class :character   1st Qu.:75.08   Class :character  
##  Mode  :character   Mode  :character   Median :78.00   Mode  :character  
##                                        Mean   :77.63                     
##                                        3rd Qu.:79.91                     
##                                        Max.   :83.01

How many records are there for males and for females in the area of England?

# Keep only the rows whose area is England (all columns are retained)
life_exp_eng <- life_exp[life_exp[["area"]] == "England", ]
# Dimensions of the full data frame, for comparison ...
dim(life_exp)

## [1] 160   4

# ... and of the England-only subset
dim(life_exp_eng)

## [1] 40  4

# Count how many rows of the England subset belong to each gender
ftable(life_exp_eng$gender)

##  Females Males
##               
##       20    20

Check maximum and minimum life expectancy in England

# Option a - use min() and max() functions
# Extract the life expectancy column as a plain numeric vector and take
# its smallest and largest values.
min_life_exp_eng <- min(life_exp_eng[["life.expectancy"]])
max_life_exp_eng <- max(life_exp_eng[["life.expectancy"]])
# Show both values together as a two-element vector
c(min_life_exp_eng, max_life_exp_eng)

## [1] 73.69 83.01

# Same information, printed with labels on two lines
cat("Min: ", min_life_exp_eng, "\nMax: ", max_life_exp_eng)

## Min:  73.69 
## Max:  83.01

# Option b - use the range() function
# range() returns c(minimum, maximum), so it is called once and each element
# is picked from the result instead of recomputing the range for every value.
life_exp_range_eng <- range(life_exp_eng[, "life.expectancy"])
min_life_exp_eng <- life_exp_range_eng[1]
max_life_exp_eng <- life_exp_range_eng[2]
# Print both values with labels on two lines
cat("Min: ", min_life_exp_eng, "\nMax: ", max_life_exp_eng)

## Min:  73.69 
## Max:  83.01

Find the median life expectancy of Males and Females in England

# Median life expectancy per gender in the England subset:
# take the life expectancy column and keep only the entries of one gender
median(life_exp_eng$life.expectancy[life_exp_eng$gender == "Males"])

## [1] 76.11

median(life_exp_eng$life.expectancy[life_exp_eng$gender == "Females"])

## [1] 80.69

Get the quantiles of life.expectancy in Wales, N.Ireland, and Scotland, respectively

# Quantile information per area
# Quantiles are cut points that split a distribution into intervals of equal
# probability; with its default settings quantile() reports the 0%, 25%, 50%,
# 75% and 100% points of the sample.
quantile(life_exp$life.expectancy[life_exp$area == "Wales"])

##      0%     25%     50%     75%    100% 
## 73.2500 75.4525 78.5250 80.0575 82.2200

quantile(life_exp$life.expectancy[life_exp$area == "N.Ireland"])

##      0%     25%     50%     75%    100% 
## 72.7300 75.5050 78.2550 80.2425 82.2800

quantile(life_exp$life.expectancy[life_exp$area == "Scotland"])

##    0%   25%   50%   75%  100% 
## 71.47 73.46 76.89 78.81 80.83

Get the mean, variance and standard deviation of the life.expectancy in England

# Mean, variance and standard deviation of life expectancy in England,
# selecting the England rows from the full data frame on the fly
mean(life_exp$life.expectancy[life_exp$area == "England"])

## [1] 78.55425

var(life_exp$life.expectancy[life_exp$area == "England"])

## [1] 7.610123

sd(life_exp$life.expectancy[life_exp$area == "England"])

## [1] 2.758645

# Equivalent - reuse the England subset created earlier
mean(life_exp_eng$life.expectancy)

## [1] 78.55425

var(life_exp_eng$life.expectancy)

## [1] 7.610123

sd(life_exp_eng$life.expectancy)

## [1] 2.758645

Exercise 1 - Introduction to Probability theory

MRC London Institute of Medical Sciences (http://bioinformatics.lms.mrc.ac.uk)

April 2022

Set working directory

Basic data manipulation in R

Basic statistics in R