Set working directory
# Set working directory
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions")
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions"
- Generate fake data
# Normalize read counts for remove bias related to:
# i) Sequence depth - sequencing runs with higher depths will have more reads mapping to each gene ("per million")
# ii) Gene length - longer genes (measured in kilobases of exons) will have more reads mapping to them
# Load some fake data - 3 replicates and 4 genes, following the example:
data_matrix <- read.table("../data/data_matrix.txt")
colnames(data_matrix) <- paste("Replicate", 1:3, sep = "")
rownames(data_matrix) <- paste("Gene", 1:4, sep = "")
# Gene lengths: Gene1 = 2kb, Gene2 = 4kb, Gene3 = 1kb, Gene4 = 10kb
gene_lengths <- c(2, 4, 1, 10)
# Explore data
data_matrix
## Replicate1 Replicate2 Replicate3
## Gene1 10 12 30
## Gene2 20 25 60
## Gene3 5 8 15
## Gene4 0 0 1
# Replica 3 has more counts than other replicates, regardless of the gene -> larger sequencing depth
# Gene 2 is 2 times larger than Gene1 -> always has twice more reads, regardless of the replica
- Perform RPKM normalization (Reads Per Kilobase of exon and per Million reads)
# 1. Normalize by read depth (total number per replica (column))
# Get sequencing depth for each replica
read_depths <- colSums(data_matrix)
read_depths_tens <- read_depths / 10 # "Tens" of reads; would be "millions" of reads in real data
# Divide each column by its read depth
data_matrix_RPM <- t(data_matrix) / read_depths_tens
data_matrix_RPM <- t(data_matrix_RPM)
data_matrix_RPM
## Replicate1 Replicate2 Replicate3
## Gene1 2.857143 2.666667 2.83018868
## Gene2 5.714286 5.555556 5.66037736
## Gene3 1.428571 1.777778 1.41509434
## Gene4 0.000000 0.000000 0.09433962
# 2. Normalize per gene length (total number per gene (row))
data_matrix_RPKM <- data_matrix_RPM / gene_lengths
data_matrix_RPKM
## Replicate1 Replicate2 Replicate3
## Gene1 1.428571 1.333333 1.415094340
## Gene2 1.428571 1.388889 1.415094340
## Gene3 1.428571 1.777778 1.415094340
## Gene4 0.000000 0.000000 0.009433962
# Store result as data frame
colnames(data_matrix_RPKM) <- paste("RPKM-Rep", 1:3, sep = "")
data_matrix_RPKM <- as.data.frame(data_matrix_RPKM)
data_matrix_RPKM
## RPKM-Rep1 RPKM-Rep2 RPKM-Rep3
## Gene1 1.428571 1.333333 1.415094340
## Gene2 1.428571 1.388889 1.415094340
## Gene3 1.428571 1.777778 1.415094340
## Gene4 0.000000 0.000000 0.009433962
- Perform TPM normalization (Transcripts per million)
# 1. Normalize per gene length (total number per gene (row))
data_matrix_RPK <- data_matrix / gene_lengths
data_matrix_RPK
## Replicate1 Replicate2 Replicate3
## Gene1 5 6.00 15.0
## Gene2 5 6.25 15.0
## Gene3 5 8.00 15.0
## Gene4 0 0.00 0.1
# 2. Normalize by read depth (total number per replica (column))
# Get sequencing depth for each replica
read_depths2 <- colSums(data_matrix_RPK)
read_depths2_tens <- read_depths2 / 10 # "Tens" of reads; would be "millions" of reads in real data
# Divide each column by its read depth
data_matrix_TPM <- t(data_matrix_RPK) / read_depths2_tens
data_matrix_TPM <- t(data_matrix_TPM)
data_matrix_TPM
## Replicate1 Replicate2 Replicate3
## Gene1 3.333333 2.962963 3.32594235
## Gene2 3.333333 3.086420 3.32594235
## Gene3 3.333333 3.950617 3.32594235
## Gene4 0.000000 0.000000 0.02217295
# Store result as data frame
colnames(data_matrix_TPM) <- paste("TPM-Rep", 1:3, sep = "")
rownames(data_matrix_TPM) <- paste("Gene", 1:4, sep = "")
data_matrix_TPM <- as.data.frame(data_matrix_TPM)
data_matrix_TPM
## TPM-Rep1 TPM-Rep2 TPM-Rep3
## Gene1 3.333333 2.962963 3.32594235
## Gene2 3.333333 3.086420 3.32594235
## Gene3 3.333333 3.950617 3.32594235
## Gene4 0.000000 0.000000 0.02217295
- Compare both methods - get sum of normalized reads for each column in the RPKM matrix
# Total number per replica in RPKM
total_RPKM <- colSums(data_matrix_RPKM)
total_RPKM
## RPKM-Rep1 RPKM-Rep2 RPKM-Rep3
## 4.285714 4.500000 4.254717
# Total number per replica in TPM
total_TPM <- colSums(data_matrix_TPM)
total_TPM
## TPM-Rep1 TPM-Rep2 TPM-Rep3
## 10 10 10
# Both methods correct biases in sequencing depth and gene length
# Total sum of normalized reads in each replica are different
# With RPKM is harder to compare the proportion of total reads that got mapped to each gene
# TPM gives more information about relative proportions -> we can immediately see that in R1
# a larger proportion of reads got mapped to Gene1 (3.33 vs 3.32) and only from R3 to Gene4
- Perform RPKM normalization (Reads Per Kilobase of exon and per Million reads) WITHOUT TRANSPOSING
- Hint: use a for loop for element-wise matrix manipulation
# 1. Normalize by read depth (total number per replica (column))
# Get sequencing depth for each replica
read_depths <- colSums(data_matrix)
read_depths_tens <- read_depths / 10 # "Tens" of reads; would be "millions" of reads in real data
# Divide each column by its read depth
data_matrix_RPM <- matrix(, nrow = 4, ncol = 3)
for (i in 1:dim(data_matrix)[2]) {
data_matrix_RPM[, i] <- data_matrix[, i] / read_depths_tens[i]
}
data_matrix_RPM
## [,1] [,2] [,3]
## [1,] 2.857143 2.666667 2.83018868
## [2,] 5.714286 5.555556 5.66037736
## [3,] 1.428571 1.777778 1.41509434
## [4,] 0.000000 0.000000 0.09433962
# 2. Normalize per gene length (total number per gene (row))
data_matrix_RPKM <- matrix(nrow = 4, ncol = 3)
for (i in 1:dim(data_matrix)[1]) {
data_matrix_RPKM[i, ] <- data_matrix_RPM[i, ] / gene_lengths[i]
}
data_matrix_RPKM
## [,1] [,2] [,3]
## [1,] 1.428571 1.333333 1.415094340
## [2,] 1.428571 1.388889 1.415094340
## [3,] 1.428571 1.777778 1.415094340
## [4,] 0.000000 0.000000 0.009433962
# Store result as data frame
colnames(data_matrix_RPKM) <- paste("RPKM-Rep", 1:3, sep = "")
rownames(data_matrix_RPKM) <- paste("Gene", 1:4, sep = "")
data_matrix_RPKM <- as.data.frame(data_matrix_RPKM)
data_matrix_RPKM
## RPKM-Rep1 RPKM-Rep2 RPKM-Rep3
## Gene1 1.428571 1.333333 1.415094340
## Gene2 1.428571 1.388889 1.415094340
## Gene3 1.428571 1.777778 1.415094340
## Gene4 0.000000 0.000000 0.009433962
- Perform TPM normalization (Transcripts per million) WITHOUT TRANSPOSING
- Hint: use a for loop for element-wise matrix manipulation
# 1. Normalize per gene length (total number per gene (row))
data_matrix_RPK <- matrix(nrow = 4, ncol = 3)
for (i in 1:dim(data_matrix)[1]) {
data_matrix_RPK[i, ] <- as.matrix(data_matrix[i, ]) / gene_lengths[i]
}
# 2. Normalize by read depth (total number per replica (column))
# Get sequencing depth for each replica
read_depths2 <- colSums(data_matrix_RPK)
read_depths2_tens <- read_depths2 / 10 # "Tens" of reads; would be "millions" of reads in real data
# Divide each column by its read depth
data_matrix_TPM <- matrix(nrow = 4, ncol = 3)
for (i in 1:dim(data_matrix)[2]) {
data_matrix_TPM[, i] <- data_matrix_RPK[, i] / read_depths2_tens[i]
}
data_matrix_TPM
## [,1] [,2] [,3]
## [1,] 3.333333 2.962963 3.32594235
## [2,] 3.333333 3.086420 3.32594235
## [3,] 3.333333 3.950617 3.32594235
## [4,] 0.000000 0.000000 0.02217295
# Store result as data frame
colnames(data_matrix_TPM) <- paste("TPM-Rep", 1:3, sep = "")
rownames(data_matrix_TPM) <- paste("Gene", 1:4, sep = "")
data_matrix_TPM <- as.data.frame(data_matrix_TPM)
data_matrix_TPM
## TPM-Rep1 TPM-Rep2 TPM-Rep3
## Gene1 3.333333 2.962963 3.32594235
## Gene2 3.333333 3.086420 3.32594235
## Gene3 3.333333 3.950617 3.32594235
## Gene4 0.000000 0.000000 0.02217295