These exercises cover the sections of LMS_Statistics

Set working directory

# Set working directory
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions")
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions"

Generating events in R

Four functionn for simulating data following each distribution

Exercise 1

# Simulate 1 flip of a coin
rbinom(1, 1, 0.5)
## [1] 1
# Simulate 10 flips of a single coin
rbinom(10, 1, 0.5)
##  [1] 1 1 1 1 1 0 0 1 0 0
# Compute probability of getting at least 5 heads in 10 flips
pbinom(5, 10, 0.5)
## [1] 0.6230469
# Compute probability of getting at least 50 heads in 100 flips
pbinom(5000, 10000, 0.5)
## [1] 0.5039893
# Probability of 6 successes in 8 people: dbinom()
dbinom(6, 8, 0.72)
## [1] 0.3058222
# Probability of less than than 6: cumulative probability, pbinom()
pbinom(5, 8, 0.72)
## [1] 0.3972716
# Probability of at least 6: cumulative probability, pbinom()
1 - pbinom(5, 8, 0.72)
## [1] 0.6027284
# Compute the probabilities for the 5 tosses
x <- 0:5
p <- dbinom(x, 5, 0.42)

# Check sum of probabilities gives 1
sum(p)
## [1] 1
# Expected value (weighted mean)
mean <- weighted.mean(x, p)

# Variance (how spread away from the mean)
var <- weighted.mean((x - mean)^2, p)

# std (square root of variance)
std <- sqrt(var)

Exercise 2

# Probability of exactly 4 calls with lambda = 6
dpois(4, 6)
## [1] 0.1338526
# Probability of at least 4 calls with lambda = 6 (complementary of at most 3)
1 - ppois(3, 6)
## [1] 0.8487961
# Simulate 15 events
rpois(15, lambda = 12)
##  [1]  9 11 10 11 17  8 10  9  6  6  8 24  8  6 12
# Compute probability of having 15 or less patients
ppois(15, lambda = 12)
## [1] 0.8444157
# Compute probability of having at least 15 patients - option 1
ppois(15, lambda = 12, lower = FALSE)
## [1] 0.1555843
# Compute probability of having at least 15 patients - option 2
1 - ppois(15, lambda = 12)
## [1] 0.1555843

Exercise 3

# Probability of less/equal than 2 for a gaussian distributed variable
pnorm(2, mean = 0, sd = 1)
## [1] 0.9772499
# Probability of greater than 2 for a gaussian distributed variable - option 1
pnorm(2, mean = 0, sd = 1, lower.tail = F)
## [1] 0.02275013
# Probability of greater than 2 for a gaussian distributed variable - option 2
1 - pnorm(2, mean = 0, sd = 1)
## [1] 0.02275013
# Create a sequence of numbers between -10 and 10 increment by 0.1
x <- seq(-10, 10, by = 0.1)

# Choose the mean as 2.5 and standard deviation as 0.5
y <- dnorm(x, mean = 2.5, sd = 0.5)

# Choose the mean as 2.5 and standard deviation as 0.5
# y <- dnorm(x, mean = 0, sd = 1)

# Plot figure
plot(x,y)

bonus question 1 (optional)

# Load input data
load("../data/gene_exp_matrix.RData")
head(gene_exp_matrix)
##         Control1 Control2 Treatment1 Treatment2
## Pam     5.484927 5.523276   5.894180   5.981405
## Slco6d1 7.390315 7.259712   7.963158   7.751687
## Rnf152  7.018605 6.964135   7.364457   7.298354
## Cpa6    8.289773 8.116202   8.705973   8.677886
## Dsel    4.561322 4.548146   4.950947   4.850020
## Tsn     8.779546 8.814279   9.412200   9.488729
# Install basic package for data visualization
# install.packages("gplots")
library("gplots")
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Generate heatmap
heatmap.2(gene_exp_matrix, dendrogram = "row",
          cexCol = 0.75, 
          cexRow = 0.35,
          density.info = "none", trace = "none")

# Option 1 - scale manually

# Scale the gene matrix
# scale() function requires transposed input
gene_exp_matrix_scaled <- t(scale(t(gene_exp_matrix)))

# Generate heatmap
heatmap.2(gene_exp_matrix_scaled, dendrogram = "row",
          cexCol = 0.75, 
          cexRow = 0.35,
          density.info = "none", trace = "none")

However, the heatmap.2() function has a argument called scale, which does the same thing for you

# Option 2 - scale function of heatmap.2()

# Generate heatmap
heatmap.2(gene_exp_matrix, scale = "row", dendrogram = "row",
          cexCol = 0.75, 
          cexRow = 0.35,
          density.info = "none", trace = "none")

bonus question 2 (optional)

# Read input file
cat_expr <- read.delim("../data/categories_expression.txt", header = T)

# Explore data
head(cat_expr)
##   geneName ofInterest    pathway Expression
## 1    Gene1   Selected Glycolysis   20.09519
## 2    Gene2   Selected Glycolysis   23.00306
## 3    Gene3   Selected Glycolysis   20.99712
## 4    Gene4   Selected Glycolysis   43.01145
## 5    Gene5   Selected Glycolysis   22.00567
## 6    Gene6   Selected Glycolysis   20.99162
# Get quantile information
summary(cat_expr[cat_expr$pathway == "Glycolysis", ])
##    geneName          ofInterest          pathway            Expression   
##  Length:55          Length:55          Length:55          Min.   :19.94  
##  Class :character   Class :character   Class :character   1st Qu.:21.01  
##  Mode  :character   Mode  :character   Mode  :character   Median :23.00  
##                                                           Mean   :29.78  
##                                                           3rd Qu.:27.06  
##                                                           Max.   :74.08
summary(cat_expr[cat_expr$pathway == "TGFb", ])
##    geneName          ofInterest          pathway            Expression   
##  Length:45          Length:45          Length:45          Min.   :42.91  
##  Class :character   Class :character   Class :character   1st Qu.:50.09  
##  Mode  :character   Mode  :character   Mode  :character   Median :54.04  
##                                                           Mean   :54.24  
##                                                           3rd Qu.:58.06  
##                                                           Max.   :66.10
# Find selected genes in the `Glycolysis` pathway

# Subset the columns of interest
cat_expr_subset <- cat_expr[, c("ofInterest", "pathway")]
contingency <- ftable(cat_expr_subset)
contingency
##             pathway Glycolysis TGFb
## ofInterest                         
## NotSelected                 40   40
## Selected                    15    5
# Store variables as factors
cat_expr_subset$ofInterest <- as.factor(cat_expr_subset$ofInterest)
cat_expr_subset$pathway <- as.factor(cat_expr_subset$pathway)

# Get only the selected genes
cat_expr_subset$ofInterest <- relevel(cat_expr_subset$ofInterest, ref = "Selected")
contingency <- ftable(cat_expr_subset[, c("ofInterest", "pathway")])
contingency
##             pathway Glycolysis TGFb
## ofInterest                         
## Selected                    15    5
## NotSelected                 40   40
fisher.test(contingency)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency
## p-value = 0.04923
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   0.9139679 11.4586313
## sample estimates:
## odds ratio 
##   2.968638
# Get the mean value of the expression counts
mean_expression <- mean(cat_expr$Expression)
sd_expression <- sd(cat_expr$Expression)
gene13_expression <- cat_expr[cat_expr$geneName == "Gene13", ]$Expression
1 - pnorm(gene13_expression, mean_expression, sd_expression)
## [1] 0.0254759
# Subset the genes involved in desired pathways
glycolysis_expression <- cat_expr[cat_expr$pathway == "Glycolysis", ]$Expression
tgfb_expression <- cat_expr[cat_expr$pathway == "TGFb", ]$Expression

# Perform statistic tests
var.test(glycolysis_expression, tgfb_expression)
## 
##  F test to compare two variances
## 
## data:  glycolysis_expression and tgfb_expression
## F = 7.5607, num df = 54, denom df = 44, p-value = 2.375e-10
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##   4.241664 13.253432
## sample estimates:
## ratio of variances 
##           7.560727
t.test(glycolysis_expression, tgfb_expression, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  glycolysis_expression and tgfb_expression
## t = -10.997, df = 70.605, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -28.89282 -20.02306
## sample estimates:
## mean of x mean of y 
##  29.77779  54.23573