Set working directory

# Set working directory
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions")
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions"
  1. Read in count data and sample description. Create col_data and check dimensions.

# Read the count information
all_counts <- read.csv(file = "../data/exercise1_counts.csv", header = T, row.names = 1)
head(all_counts)
##           control_FFa1.bam control_FFa2.bam control_FFa3.bam mutant_KOa1.bam
## 497097                  16               16                0              21
## 100503874               20                0                0              64
## 100038431                0                0                2               0
## 19888                   11                0               10             113
## 20671                   14               16                0              40
## 27395                  465              193              596             436
##           mutant_KOa2.bam mutant_KOa3.bam mutant_KOb1.bam mutant_KOb2.bam
## 497097                 16              27              20               0
## 100503874               0               4               5               0
## 100038431               0               8               0               0
## 19888                   0              26              14               6
## 20671                   8              33              33              12
## 27395                 686             572            1378            1901
##           mutant_KOb3.bam
## 497097                  2
## 100503874               2
## 100038431               0
## 19888                  16
## 20671                  24
## 27395                1553

# Read the sample description that we generated manually for this exercise
sam_des <- read.table("../data/exercise1_sample_description.info", sep = "\t", header = TRUE)
head(sam_des)
##              filename sample condition batch
## 1    control_FFa1.bam   FFa1       FFa     a
## 2    control_FFa2.bam   FFa2       FFa     b
## 3    control_FFa3.bam   FFa3       FFa     c
## 4 TMC_mutant_KOa1.bam   KOa1       KOa     a
## 5 TMC_mutant_KOa2.bam   KOa2       KOa     b
## 6 TMC_mutant_KOa3.bam   KOa3       KOa     c

# Create col data from sample description
col_data <- data.frame(sample = sam_des$sample, condition = sam_des$condition, batch = sam_des$batch)
head(col_data)
##   sample condition batch
## 1   FFa1       FFa     a
## 2   FFa2       FFa     b
## 3   FFa3       FFa     c
## 4   KOa1       KOa     a
## 5   KOa2       KOa     b
## 6   KOa3       KOa     c

# Check dimensions
all(colnames(all_counts) == col_data$name)
## [1] TRUE
  1. Construct DESeqDataSet object using count data and sample description.

# Load DESeq2 library
suppressPackageStartupMessages(library(DESeq2))

# Create DESeq object
dds <- DESeqDataSetFromMatrix(countData = all_counts, colData = col_data, design =~ condition)

# Run DESeq2 Normalization
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

# Check the dds object
dds
## class: DESeqDataSet 
## dim: 26301 9 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(26301): 497097 100503874 ... 100040384 100040400
## rowData names(26): baseMean baseVar ... deviance maxCooks
## colnames(9): control_FFa1.bam control_FFa2.bam ... mutant_KOb2.bam
##   mutant_KOb3.bam
## colData names(4): sample condition batch sizeFactor
  1. Perform rlog and vs transformation on the
# Perform rlog transformation
rld <- rlog(dds)
# Get matrix form
rld_counts <- assay(rld)
head(rld_counts)
##           control_FFa1.bam control_FFa2.bam control_FFa3.bam mutant_KOa1.bam
## 497097           3.8561447        3.9907973        3.5322792        3.923806
## 100503874        3.2895831        2.8326071        2.8201166        3.834407
## 100038431        0.1543662        0.1551115        0.1903975        0.154316
## 19888            3.9103723        3.6262932        3.8451124        5.046242
## 20671            4.2033204        4.3750856        3.9098102        4.550450
## 27395            9.2090107        8.7918710        9.2758269        9.106490
##           mutant_KOa2.bam mutant_KOa3.bam mutant_KOb1.bam mutant_KOb2.bam
## 497097          3.8489878       3.9980385       3.7603473       3.5253500
## 100503874       2.8226210       2.9321205       2.8973928       2.8131612
## 100038431       0.1543294       0.3042526       0.1537077       0.1536189
## 19888           3.6068923       4.1745338       3.8179714       3.6830152
## 20671           4.0867366       4.4566664       4.2622207       4.0354695
## 27395           9.5797251       9.3512412       9.6722305       9.8827325
##           mutant_KOb3.bam
## 497097          3.5590907
## 100503874       2.8554608
## 100038431       0.1538331
## 19888           3.8816181
## 20671           4.2166975
## 27395           9.9615722
# Perform vsd transformation
vsd <- varianceStabilizingTransformation(dds)
# Get matrix form
vsd_counts <- assay(vsd)
head(vsd_counts)
##           control_FFa1.bam control_FFa2.bam control_FFa3.bam mutant_KOa1.bam
## 497097            6.922956         7.102750         6.193022        7.011409
## 100503874         7.007005         6.193022         6.193022        7.586183
## 100038431         6.193022         6.193022         6.430894        6.193022
## 19888             6.800235         6.193022         6.722545        7.997112
## 20671             6.876703         7.102750         6.193022        7.309628
## 27395             9.440091         8.960798         9.517810        9.321921
##           mutant_KOa2.bam mutant_KOa3.bam mutant_KOb1.bam mutant_KOb2.bam
## 497097           6.913175        7.104558        6.788596        6.193022
## 100503874        6.193022        6.548835        6.492391        6.193022
## 100038431        6.193022        6.694958        6.193022        6.193022
## 19888            6.193022        7.088049        6.692365        6.501350
## 20671            6.704862        7.197214        6.954620        6.628242
## 27395            9.874359        9.605598        9.983524       10.234415
##           mutant_KOb3.bam
## 497097           6.397612
## 100503874        6.397612
## 100038431        6.193022
## 19888            6.768354
## 20671            6.895379
## 27395           10.329292
  1. Draw a heatmap of count matrix based on the top 40 highly expressed genes using rlog and vst data.
# Load pheatmap library
library("pheatmap")

# Select the top 40 highly expressed genes
select <- order(rowMeans(counts(dds, normalized = TRUE)), decreasing = TRUE)[1:40]

# Draw heatmap in rlog transformed data
pheatmap(assay(rld)[select, ])

# Draw heatmp from vst transformed data
pheatmap(assay(vsd)[select, ])

  1. Generate a SDM to see the clustering of count data
# Load libraries
suppressPackageStartupMessages(library("RColorBrewer"))
suppressPackageStartupMessages(library("pheatmap"))

# Compute SDM from rld data
sdm <- as.matrix(dist(t(rld_counts)))

# Set paramrters for heatmap
rownames(sdm) <- rld$condition
colnames(sdm) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)

# Plot heatmap
pheatmap(sdm,
         clustering_distance_rows = dist(t(rld_counts)),
         clustering_distance_cols = dist(t(rld_counts)),
         col = colors)

  1. Perform the Principal Component Analysis using rlog method and find out the % significance values of first two principal components
# Plot PCA
plotPCA(rld, intgroup = "condition")

# Store data from Principal Components
data <- plotPCA(rld, intgroup = c("condition"), returnData = TRUE)
data
##                         PC1        PC2 group condition             name
## control_FFa1.bam -21.987961  2.9589473   FFa       FFa control_FFa1.bam
## control_FFa2.bam -23.431934 -0.7944868   FFa       FFa control_FFa2.bam
## control_FFa3.bam -21.484069 -2.7281740   FFa       FFa control_FFa3.bam
## mutant_KOa1.bam   10.404057 15.5272723   KOa       KOa  mutant_KOa1.bam
## mutant_KOa2.bam    7.534726 -2.3443839   KOa       KOa  mutant_KOa2.bam
## mutant_KOa3.bam   11.856179  5.6900664   KOa       KOa  mutant_KOa3.bam
## mutant_KOb1.bam   13.387134 -3.4550087   KOb       KOb  mutant_KOb1.bam
## mutant_KOb2.bam    9.974359 -7.1423487   KOb       KOb  mutant_KOb2.bam
## mutant_KOb3.bam   13.747510 -7.7118840   KOb       KOb  mutant_KOb3.bam
  1. Repeat the PCA, this time using VST method and compare the plots with the ones obtained using rlog method.
# Plot PCA
plotPCA(vsd, intgroup = "condition")

# Store data from Principal Components
data <- plotPCA(rld, intgroup = c("condition"), returnData = TRUE)
data
##                         PC1        PC2 group condition             name
## control_FFa1.bam -21.987961  2.9589473   FFa       FFa control_FFa1.bam
## control_FFa2.bam -23.431934 -0.7944868   FFa       FFa control_FFa2.bam
## control_FFa3.bam -21.484069 -2.7281740   FFa       FFa control_FFa3.bam
## mutant_KOa1.bam   10.404057 15.5272723   KOa       KOa  mutant_KOa1.bam
## mutant_KOa2.bam    7.534726 -2.3443839   KOa       KOa  mutant_KOa2.bam
## mutant_KOa3.bam   11.856179  5.6900664   KOa       KOa  mutant_KOa3.bam
## mutant_KOb1.bam   13.387134 -3.4550087   KOb       KOb  mutant_KOb1.bam
## mutant_KOb2.bam    9.974359 -7.1423487   KOb       KOb  mutant_KOb2.bam
## mutant_KOb3.bam   13.747510 -7.7118840   KOb       KOb  mutant_KOb3.bam