These exercises cover the sections of LMS_Statistics

Set working directory

# Set working directory
# The path below is an absolute path on a mounted volume and may not exist on
# other machines. setwd() raises an error for a missing directory, so only
# switch when the directory is present; otherwise keep the current directory.
exercise_dir <- "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises"
if (dir.exists(exercise_dir)) {
  setwd(exercise_dir)
} else {
  message("Exercise directory not found, keeping working directory: ", getwd())
}
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises"

Hypothesis testing

Exercise 1

# Load tidyverse library
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Open the help page of the airquality dataset and preview its first six rows
help("airquality")
head(airquality, n = 6L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
# H0 (null hypothesis): the mean wind speed equals 9 mph

# Draw a histogram of the wind speed measurements to inspect their distribution
qplot(x = airquality$Wind, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Sample mean of the wind speed column
mean(airquality[["Wind"]])
## [1] 9.957516
# From the histogram the mean looks slightly higher than 9 mph -> the null hypothesis is at least plausible
# Question: if the null hypothesis is true, is this difference really significant or just a random fluctuation?

# One-sample, two-sided t-test of the wind speed against mu = 9 (default options written out)
t.test(x = airquality$Wind, alternative = "two.sided", mu = 9, conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  airquality$Wind
## t = 3.3619, df = 152, p-value = 0.0009794
## alternative hypothesis: true mean is not equal to 9
## 95 percent confidence interval:
##   9.394804 10.520229
## sample estimates:
## mean of x 
##  9.957516
# t statistic 3.3619 -> very small p-value: if the null hypothesis were true, it is VERY UNLIKELY
# that random fluctuation alone would produce a sample mean this far from 9
# We can reject the null hypothesis -> the mean wind speed in New York is not 9 mph

Exercise 2

# Load tidyverse library
library(tidyverse)

# Open the help page of the airquality dataset and preview its first six rows
help("airquality")
head(airquality, n = 6L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
# H0 (null hypothesis): the mean solar radiation equals 175 Langleys

# Draw a histogram of the solar radiation measurements to inspect their distribution
qplot(x = airquality$Solar.R, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing non-finite values (stat_bin).

# mean() propagates missing values: Solar.R contains NAs, so the plain call returns NA
mean(airquality$Solar.R)
## [1] NA
# Let mean() drop the missing values itself with na.rm = TRUE
mean(airquality$Solar.R, na.rm = TRUE)
## [1] 185.9315
# From the histogram the mean looks slightly higher than 175 -> the null hypothesis is at least plausible
# Question: if the null hypothesis is true, is this difference really significant or just a random fluctuation?

# One-sample t-test against mu = 175 with the one-sided alternative "greater" (confidence level written out)
t.test(x = airquality$Solar.R, alternative = "greater", mu = 175, conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  airquality$Solar.R
## t = 1.4667, df = 145, p-value = 0.07232
## alternative hypothesis: true mean is greater than 175
## 95 percent confidence interval:
##  173.5931      Inf
## sample estimates:
## mean of x 
##  185.9315
# t statistic 1.4667 -> fairly large p-value = 0.07: if the null hypothesis is true, a sample mean
# at least this large would occur just by chance about 7% of the time
# Not strong evidence that the mean solar radiation is greater than 175
# Not strong evidence to reject the null hypothesis