These exercises cover the sections of LMS_Statistics
# Set working directory
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions")
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_Statistics/course/exercises/solutions"
Exercise 1
rbinom()
function# Simulate 1 flip of a coin
rbinom(1, 1, 0.5)
## [1] 1
# Simulate 10 flips of a single coin
rbinom(10, 1, 0.5)
## [1] 1 1 1 1 1 0 0 1 0 0
# Compute probability of getting at least 5 heads in 10 flips
pbinom(5, 10, 0.5)
## [1] 0.6230469
# Compute probability of getting at least 50 heads in 100 flips
pbinom(5000, 10000, 0.5)
## [1] 0.5039893
dbinom()
and pbinom()
function# Probability of 6 successes in 8 people: dbinom()
dbinom(6, 8, 0.72)
## [1] 0.3058222
# Probability of less than than 6: cumulative probability, pbinom()
pbinom(5, 8, 0.72)
## [1] 0.3972716
# Probability of at least 6: cumulative probability, pbinom()
1 - pbinom(5, 8, 0.72)
## [1] 0.6027284
# Compute the probabilities for the 5 tosses
x <- 0:5
p <- dbinom(x, 5, 0.42)
# Check sum of probabilities gives 1
sum(p)
## [1] 1
# Expected value (weighted mean)
mean <- weighted.mean(x, p)
# Variance (how spread away from the mean)
var <- weighted.mean((x - mean)^2, p)
# std (square root of variance)
std <- sqrt(var)
Exercise 2
dpois()
and `ppois()``function# Probability of exactly 4 calls with lambda = 6
dpois(4, 6)
## [1] 0.1338526
# Probability of at least 4 calls with lambda = 6 (complementary of at most 3)
1 - ppois(3, 6)
## [1] 0.8487961
rpois()
function# Simulate 15 events
rpois(15, lambda = 12)
## [1] 9 11 10 11 17 8 10 9 6 6 8 24 8 6 12
# Compute probability of having 15 or less patients
ppois(15, lambda = 12)
## [1] 0.8444157
# Compute probability of having at least 15 patients - option 1
ppois(15, lambda = 12, lower = FALSE)
## [1] 0.1555843
# Compute probability of having at least 15 patients - option 2
1 - ppois(15, lambda = 12)
## [1] 0.1555843
Exercise 3
pnorm()
function# Probability of less/equal than 2 for a gaussian distributed variable
pnorm(2, mean = 0, sd = 1)
## [1] 0.9772499
# Probability of greater than 2 for a gaussian distributed variable - option 1
pnorm(2, mean = 0, sd = 1, lower.tail = F)
## [1] 0.02275013
# Probability of greater than 2 for a gaussian distributed variable - option 2
1 - pnorm(2, mean = 0, sd = 1)
## [1] 0.02275013
# Create a sequence of numbers between -10 and 10 increment by 0.1
x <- seq(-10, 10, by = 0.1)
# Choose the mean as 2.5 and standard deviation as 0.5
y <- dnorm(x, mean = 2.5, sd = 0.5)
# Choose the mean as 2.5 and standard deviation as 0.5
# y <- dnorm(x, mean = 0, sd = 1)
# Plot figure
plot(x,y)
bonus question 1 (optional)
"data/gene_exp_matrix.RData"
# Load input data
load("../data/gene_exp_matrix.RData")
head(gene_exp_matrix)
## Control1 Control2 Treatment1 Treatment2
## Pam 5.484927 5.523276 5.894180 5.981405
## Slco6d1 7.390315 7.259712 7.963158 7.751687
## Rnf152 7.018605 6.964135 7.364457 7.298354
## Cpa6 8.289773 8.116202 8.705973 8.677886
## Dsel 4.561322 4.548146 4.950947 4.850020
## Tsn 8.779546 8.814279 9.412200 9.488729
# Install basic package for data visualization
# install.packages("gplots")
library("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Generate heatmap
heatmap.2(gene_exp_matrix, dendrogram = "row",
cexCol = 0.75,
cexRow = 0.35,
density.info = "none", trace = "none")
scale()
function to perform the Z-score transformation, and use the code above to generate the scaled heatmap?scale
, and you might need to use t()
function as well# Option 1 - scale manually
# Scale the gene matrix
# scale() function requires transposed input
gene_exp_matrix_scaled <- t(scale(t(gene_exp_matrix)))
# Generate heatmap
heatmap.2(gene_exp_matrix_scaled, dendrogram = "row",
cexCol = 0.75,
cexRow = 0.35,
density.info = "none", trace = "none")
However, the heatmap.2()
function has a argument called scale
, which does the same thing for you
# Option 2 - scale function of heatmap.2()
# Generate heatmap
heatmap.2(gene_exp_matrix, scale = "row", dendrogram = "row",
cexCol = 0.75,
cexRow = 0.35,
density.info = "none", trace = "none")
bonus question 2 (optional)
"categories_Expression.txt"
ofInterest
and pathway
sections?# Read input file
cat_expr <- read.delim("../data/categories_expression.txt", header = T)
# Explore data
head(cat_expr)
## geneName ofInterest pathway Expression
## 1 Gene1 Selected Glycolysis 20.09519
## 2 Gene2 Selected Glycolysis 23.00306
## 3 Gene3 Selected Glycolysis 20.99712
## 4 Gene4 Selected Glycolysis 43.01145
## 5 Gene5 Selected Glycolysis 22.00567
## 6 Gene6 Selected Glycolysis 20.99162
Expression
, and for the Glycolysis
and TGFb
genesselected
and in the Glycolysis
pathway# Get quantile information
summary(cat_expr[cat_expr$pathway == "Glycolysis", ])
## geneName ofInterest pathway Expression
## Length:55 Length:55 Length:55 Min. :19.94
## Class :character Class :character Class :character 1st Qu.:21.01
## Mode :character Mode :character Mode :character Median :23.00
## Mean :29.78
## 3rd Qu.:27.06
## Max. :74.08
summary(cat_expr[cat_expr$pathway == "TGFb", ])
## geneName ofInterest pathway Expression
## Length:45 Length:45 Length:45 Min. :42.91
## Class :character Class :character Class :character 1st Qu.:50.09
## Mode :character Mode :character Mode :character Median :54.04
## Mean :54.24
## 3rd Qu.:58.06
## Max. :66.10
# Find selected genes in the `Glycolysis` pathway
# Subset the columns of interest
cat_expr_subset <- cat_expr[, c("ofInterest", "pathway")]
contingency <- ftable(cat_expr_subset)
contingency
## pathway Glycolysis TGFb
## ofInterest
## NotSelected 40 40
## Selected 15 5
# Store variables as factors
cat_expr_subset$ofInterest <- as.factor(cat_expr_subset$ofInterest)
cat_expr_subset$pathway <- as.factor(cat_expr_subset$pathway)
# Get only the selected genes
cat_expr_subset$ofInterest <- relevel(cat_expr_subset$ofInterest, ref = "Selected")
contingency <- ftable(cat_expr_subset[, c("ofInterest", "pathway")])
contingency
## pathway Glycolysis TGFb
## ofInterest
## Selected 15 5
## NotSelected 40 40
fisher.test(contingency)
##
## Fisher's Exact Test for Count Data
##
## data: contingency
## p-value = 0.04923
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9139679 11.4586313
## sample estimates:
## odds ratio
## 2.968638
"Gene13"
, assuming normal distributed data# Get the mean value of the expression counts
mean_expression <- mean(cat_expr$Expression)
sd_expression <- sd(cat_expr$Expression)
gene13_expression <- cat_expr[cat_expr$geneName == "Gene13", ]$Expression
1 - pnorm(gene13_expression, mean_expression, sd_expression)
## [1] 0.0254759
Expression
levels between genes in the Glycolysis
pathway and genes in the TGFb
pathway.# Subset the genes involved in desired pathways
glycolysis_expression <- cat_expr[cat_expr$pathway == "Glycolysis", ]$Expression
tgfb_expression <- cat_expr[cat_expr$pathway == "TGFb", ]$Expression
# Perform statistic tests
var.test(glycolysis_expression, tgfb_expression)
##
## F test to compare two variances
##
## data: glycolysis_expression and tgfb_expression
## F = 7.5607, num df = 54, denom df = 44, p-value = 2.375e-10
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 4.241664 13.253432
## sample estimates:
## ratio of variances
## 7.560727
t.test(glycolysis_expression, tgfb_expression, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: glycolysis_expression and tgfb_expression
## t = -10.997, df = 70.605, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -28.89282 -20.02306
## sample estimates:
## mean of x mean of y
## 29.77779 54.23573