Set working directory

# Set working directory
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions")
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions"
  1. Generate fake data

# Normalize read counts for remove bias related to:
# i) Sequence depth - sequencing runs with higher depths will have more reads mapping to each gene ("per million")
# ii) Gene length - longer genes (measured in kilobases of exons) will have more reads mapping to them

# Load some fake data - 3 replicates and 4 genes, following the example:
data_matrix <- read.table("../data/data_matrix.txt")
colnames(data_matrix) <- paste("Replicate", 1:3, sep = "")
rownames(data_matrix) <- paste("Gene", 1:4, sep = "")

# Gene lengths: Gene1 = 2kb, Gene2 = 4kb, Gene3 = 1kb, Gene4 = 10kb
gene_lengths <- c(2, 4, 1, 10)

# Explore data
data_matrix
##       Replicate1 Replicate2 Replicate3
## Gene1         10         12         30
## Gene2         20         25         60
## Gene3          5          8         15
## Gene4          0          0          1

# Replica 3 has more counts than other replicates, regardless of the gene -> larger sequencing depth
# Gene 2 is 2 times larger than Gene1 -> always has twice more reads, regardless of the replica
  1. Perform RPKM normalization (Reads Per Kilobase of exon and per Million reads)

# 1. Normalize by read depth (total number per replica (column))

# Get sequencing depth for each replica
read_depths <- colSums(data_matrix)
read_depths_tens <- read_depths / 10 # "Tens" of reads; would be "millions" of reads in real data

# Divide each column by its read depth
data_matrix_RPM <- t(data_matrix) / read_depths_tens
data_matrix_RPM <- t(data_matrix_RPM)

data_matrix_RPM
##       Replicate1 Replicate2 Replicate3
## Gene1   2.857143   2.666667 2.83018868
## Gene2   5.714286   5.555556 5.66037736
## Gene3   1.428571   1.777778 1.41509434
## Gene4   0.000000   0.000000 0.09433962

# 2. Normalize per gene length (total number per gene (row))
data_matrix_RPKM <- data_matrix_RPM / gene_lengths

data_matrix_RPKM
##       Replicate1 Replicate2  Replicate3
## Gene1   1.428571   1.333333 1.415094340
## Gene2   1.428571   1.388889 1.415094340
## Gene3   1.428571   1.777778 1.415094340
## Gene4   0.000000   0.000000 0.009433962

# Store result as data frame
colnames(data_matrix_RPKM) <- paste("RPKM-Rep", 1:3, sep = "")
data_matrix_RPKM <- as.data.frame(data_matrix_RPKM)
data_matrix_RPKM
##       RPKM-Rep1 RPKM-Rep2   RPKM-Rep3
## Gene1  1.428571  1.333333 1.415094340
## Gene2  1.428571  1.388889 1.415094340
## Gene3  1.428571  1.777778 1.415094340
## Gene4  0.000000  0.000000 0.009433962
  1. Perform TPM normalization (Transcripts per million)

# 1. Normalize per gene length (total number per gene (row))
data_matrix_RPK <- data_matrix / gene_lengths
data_matrix_RPK
##       Replicate1 Replicate2 Replicate3
## Gene1          5       6.00       15.0
## Gene2          5       6.25       15.0
## Gene3          5       8.00       15.0
## Gene4          0       0.00        0.1

# 2. Normalize by read depth (total number per replica (column))

# Get sequencing depth for each replica
read_depths2 <- colSums(data_matrix_RPK)
read_depths2_tens <- read_depths2 / 10 # "Tens" of reads; would be "millions" of reads in real data

# Divide each column by its read depth
data_matrix_TPM <- t(data_matrix_RPK) / read_depths2_tens
data_matrix_TPM <- t(data_matrix_TPM)
data_matrix_TPM
##       Replicate1 Replicate2 Replicate3
## Gene1   3.333333   2.962963 3.32594235
## Gene2   3.333333   3.086420 3.32594235
## Gene3   3.333333   3.950617 3.32594235
## Gene4   0.000000   0.000000 0.02217295

# Store result as data frame
colnames(data_matrix_TPM) <- paste("TPM-Rep", 1:3, sep = "")
rownames(data_matrix_TPM) <- paste("Gene", 1:4, sep = "")
data_matrix_TPM <- as.data.frame(data_matrix_TPM)
data_matrix_TPM
##       TPM-Rep1 TPM-Rep2   TPM-Rep3
## Gene1 3.333333 2.962963 3.32594235
## Gene2 3.333333 3.086420 3.32594235
## Gene3 3.333333 3.950617 3.32594235
## Gene4 0.000000 0.000000 0.02217295
  1. Compare both methods - get sum of normalized reads for each column in the RPKM matrix
# Total number per replica in RPKM
total_RPKM <- colSums(data_matrix_RPKM)
total_RPKM
## RPKM-Rep1 RPKM-Rep2 RPKM-Rep3 
##  4.285714  4.500000  4.254717
# Total number per replica in TPM
total_TPM <- colSums(data_matrix_TPM)
total_TPM
## TPM-Rep1 TPM-Rep2 TPM-Rep3 
##       10       10       10
# Both methods correct biases in sequencing depth and gene length
# Total sum of normalized reads in each replica are different
# With RPKM is harder to compare the proportion of total reads that got mapped to each gene
# TPM gives more information about relative proportions -> we can immediately see that in R1
# a larger proportion of reads got mapped to Gene1 (3.33 vs 3.32) and only from R3 to Gene4
  1. Perform RPKM normalization (Reads Per Kilobase of exon and per Million reads) WITHOUT TRANSPOSING

# 1. Normalize by read depth (total number per replica (column))

# Get sequencing depth for each replica
read_depths <- colSums(data_matrix)
read_depths_tens <- read_depths / 10 # "Tens" of reads; would be "millions" of reads in real data

# Divide each column by its read depth
data_matrix_RPM <- matrix(, nrow = 4, ncol = 3)
for (i in 1:dim(data_matrix)[2]) {

      data_matrix_RPM[, i] <- data_matrix[, i] / read_depths_tens[i]

}

data_matrix_RPM
##          [,1]     [,2]       [,3]
## [1,] 2.857143 2.666667 2.83018868
## [2,] 5.714286 5.555556 5.66037736
## [3,] 1.428571 1.777778 1.41509434
## [4,] 0.000000 0.000000 0.09433962

# 2. Normalize per gene length (total number per gene (row))
data_matrix_RPKM <- matrix(nrow = 4, ncol = 3)
for (i in 1:dim(data_matrix)[1]) {

      data_matrix_RPKM[i, ] <- data_matrix_RPM[i, ] / gene_lengths[i]

}

data_matrix_RPKM
##          [,1]     [,2]        [,3]
## [1,] 1.428571 1.333333 1.415094340
## [2,] 1.428571 1.388889 1.415094340
## [3,] 1.428571 1.777778 1.415094340
## [4,] 0.000000 0.000000 0.009433962

# Store result as data frame
colnames(data_matrix_RPKM) <- paste("RPKM-Rep", 1:3, sep = "")
rownames(data_matrix_RPKM) <- paste("Gene", 1:4, sep = "")
data_matrix_RPKM <- as.data.frame(data_matrix_RPKM)
data_matrix_RPKM
##       RPKM-Rep1 RPKM-Rep2   RPKM-Rep3
## Gene1  1.428571  1.333333 1.415094340
## Gene2  1.428571  1.388889 1.415094340
## Gene3  1.428571  1.777778 1.415094340
## Gene4  0.000000  0.000000 0.009433962
  1. Perform TPM normalization (Transcripts per million) WITHOUT TRANSPOSING

# 1. Normalize per gene length (total number per gene (row))
data_matrix_RPK <- matrix(nrow = 4, ncol = 3)
for (i in 1:dim(data_matrix)[1]) {

      data_matrix_RPK[i, ] <- as.matrix(data_matrix[i, ]) / gene_lengths[i]

}

# 2. Normalize by read depth (total number per replica (column))

# Get sequencing depth for each replica
read_depths2 <- colSums(data_matrix_RPK)
read_depths2_tens <- read_depths2 / 10 # "Tens" of reads; would be "millions" of reads in real data

# Divide each column by its read depth
data_matrix_TPM <- matrix(nrow = 4, ncol = 3)
for (i in 1:dim(data_matrix)[2]) {

      data_matrix_TPM[, i] <- data_matrix_RPK[, i] / read_depths2_tens[i]

}
data_matrix_TPM
##          [,1]     [,2]       [,3]
## [1,] 3.333333 2.962963 3.32594235
## [2,] 3.333333 3.086420 3.32594235
## [3,] 3.333333 3.950617 3.32594235
## [4,] 0.000000 0.000000 0.02217295

# Store result as data frame
colnames(data_matrix_TPM) <- paste("TPM-Rep", 1:3, sep = "")
rownames(data_matrix_TPM) <- paste("Gene", 1:4, sep = "")
data_matrix_TPM <- as.data.frame(data_matrix_TPM)
data_matrix_TPM
##       TPM-Rep1 TPM-Rep2   TPM-Rep3
## Gene1 3.333333 2.962963 3.32594235
## Gene2 3.333333 3.086420 3.32594235
## Gene3 3.333333 3.950617 3.32594235
## Gene4 0.000000 0.000000 0.02217295