Set working directory
# Set working directory
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions")
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions"
- Read in count data and sample description. Create col_data and check dimensions.
# Read the count information
all_counts <- read.csv(file = "../data/exercise1_counts.csv", header = T, row.names = 1)
head(all_counts)
## control_FFa1.bam control_FFa2.bam control_FFa3.bam mutant_KOa1.bam
## 497097 16 16 0 21
## 100503874 20 0 0 64
## 100038431 0 0 2 0
## 19888 11 0 10 113
## 20671 14 16 0 40
## 27395 465 193 596 436
## mutant_KOa2.bam mutant_KOa3.bam mutant_KOb1.bam mutant_KOb2.bam
## 497097 16 27 20 0
## 100503874 0 4 5 0
## 100038431 0 8 0 0
## 19888 0 26 14 6
## 20671 8 33 33 12
## 27395 686 572 1378 1901
## mutant_KOb3.bam
## 497097 2
## 100503874 2
## 100038431 0
## 19888 16
## 20671 24
## 27395 1553
# Read the sample description that we generated manually for this exercise
sam_des <- read.table("../data/exercise1_sample_description.info", sep = "\t", header = TRUE)
head(sam_des)
## filename sample condition batch
## 1 control_FFa1.bam FFa1 FFa a
## 2 control_FFa2.bam FFa2 FFa b
## 3 control_FFa3.bam FFa3 FFa c
## 4 TMC_mutant_KOa1.bam KOa1 KOa a
## 5 TMC_mutant_KOa2.bam KOa2 KOa b
## 6 TMC_mutant_KOa3.bam KOa3 KOa c
# Create col data from sample description
col_data <- data.frame(sample = sam_des$sample, condition = sam_des$condition, batch = sam_des$batch)
head(col_data)
## sample condition batch
## 1 FFa1 FFa a
## 2 FFa2 FFa b
## 3 FFa3 FFa c
## 4 KOa1 KOa a
## 5 KOa2 KOa b
## 6 KOa3 KOa c
# Check dimensions
all(colnames(all_counts) == col_data$name)
## [1] TRUE
- Construct DESeqDataSet object using count data and sample description.
# Load DESeq2 library
suppressPackageStartupMessages(library(DESeq2))
# Create DESeq object
dds <- DESeqDataSetFromMatrix(countData = all_counts, colData = col_data, design =~ condition)
# Run DESeq2 Normalization
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Check the dds object
dds
## class: DESeqDataSet
## dim: 26301 9
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(26301): 497097 100503874 ... 100040384 100040400
## rowData names(26): baseMean baseVar ... deviance maxCooks
## colnames(9): control_FFa1.bam control_FFa2.bam ... mutant_KOb2.bam
## mutant_KOb3.bam
## colData names(4): sample condition batch sizeFactor
- Perform rlog and vs transformation on the
# Perform rlog transformation
rld <- rlog(dds)
# Get matrix form
rld_counts <- assay(rld)
head(rld_counts)
## control_FFa1.bam control_FFa2.bam control_FFa3.bam mutant_KOa1.bam
## 497097 3.8561447 3.9907973 3.5322792 3.923806
## 100503874 3.2895831 2.8326071 2.8201166 3.834407
## 100038431 0.1543662 0.1551115 0.1903975 0.154316
## 19888 3.9103723 3.6262932 3.8451124 5.046242
## 20671 4.2033204 4.3750856 3.9098102 4.550450
## 27395 9.2090107 8.7918710 9.2758269 9.106490
## mutant_KOa2.bam mutant_KOa3.bam mutant_KOb1.bam mutant_KOb2.bam
## 497097 3.8489878 3.9980385 3.7603473 3.5253500
## 100503874 2.8226210 2.9321205 2.8973928 2.8131612
## 100038431 0.1543294 0.3042526 0.1537077 0.1536189
## 19888 3.6068923 4.1745338 3.8179714 3.6830152
## 20671 4.0867366 4.4566664 4.2622207 4.0354695
## 27395 9.5797251 9.3512412 9.6722305 9.8827325
## mutant_KOb3.bam
## 497097 3.5590907
## 100503874 2.8554608
## 100038431 0.1538331
## 19888 3.8816181
## 20671 4.2166975
## 27395 9.9615722
# Perform vsd transformation
vsd <- varianceStabilizingTransformation(dds)
# Get matrix form
vsd_counts <- assay(vsd)
head(vsd_counts)
## control_FFa1.bam control_FFa2.bam control_FFa3.bam mutant_KOa1.bam
## 497097 6.922956 7.102750 6.193022 7.011409
## 100503874 7.007005 6.193022 6.193022 7.586183
## 100038431 6.193022 6.193022 6.430894 6.193022
## 19888 6.800235 6.193022 6.722545 7.997112
## 20671 6.876703 7.102750 6.193022 7.309628
## 27395 9.440091 8.960798 9.517810 9.321921
## mutant_KOa2.bam mutant_KOa3.bam mutant_KOb1.bam mutant_KOb2.bam
## 497097 6.913175 7.104558 6.788596 6.193022
## 100503874 6.193022 6.548835 6.492391 6.193022
## 100038431 6.193022 6.694958 6.193022 6.193022
## 19888 6.193022 7.088049 6.692365 6.501350
## 20671 6.704862 7.197214 6.954620 6.628242
## 27395 9.874359 9.605598 9.983524 10.234415
## mutant_KOb3.bam
## 497097 6.397612
## 100503874 6.397612
## 100038431 6.193022
## 19888 6.768354
## 20671 6.895379
## 27395 10.329292
- Draw a heatmap of count matrix based on the top 40 highly expressed genes using rlog and vst data.
# Load pheatmap library
library("pheatmap")
# Select the top 40 highly expressed genes
select <- order(rowMeans(counts(dds, normalized = TRUE)), decreasing = TRUE)[1:40]
# Draw heatmap in rlog transformed data
pheatmap(assay(rld)[select, ])
# Draw heatmp from vst transformed data
pheatmap(assay(vsd)[select, ])
- Generate a SDM to see the clustering of count data
# Load libraries
suppressPackageStartupMessages(library("RColorBrewer"))
suppressPackageStartupMessages(library("pheatmap"))
# Compute SDM from rld data
sdm <- as.matrix(dist(t(rld_counts)))
# Set paramrters for heatmap
rownames(sdm) <- rld$condition
colnames(sdm) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
# Plot heatmap
pheatmap(sdm,
clustering_distance_rows = dist(t(rld_counts)),
clustering_distance_cols = dist(t(rld_counts)),
col = colors)
- Perform the Principal Component Analysis using rlog method and find out the % significance values of first two principal components
# Plot PCA
plotPCA(rld, intgroup = "condition")
# Store data from Principal Components
data <- plotPCA(rld, intgroup = c("condition"), returnData = TRUE)
data
## PC1 PC2 group condition name
## control_FFa1.bam -21.987961 2.9589473 FFa FFa control_FFa1.bam
## control_FFa2.bam -23.431934 -0.7944868 FFa FFa control_FFa2.bam
## control_FFa3.bam -21.484069 -2.7281740 FFa FFa control_FFa3.bam
## mutant_KOa1.bam 10.404057 15.5272723 KOa KOa mutant_KOa1.bam
## mutant_KOa2.bam 7.534726 -2.3443839 KOa KOa mutant_KOa2.bam
## mutant_KOa3.bam 11.856179 5.6900664 KOa KOa mutant_KOa3.bam
## mutant_KOb1.bam 13.387134 -3.4550087 KOb KOb mutant_KOb1.bam
## mutant_KOb2.bam 9.974359 -7.1423487 KOb KOb mutant_KOb2.bam
## mutant_KOb3.bam 13.747510 -7.7118840 KOb KOb mutant_KOb3.bam
- Repeat the PCA, this time using VST method and compare the plots with the ones obtained using rlog method.
# Plot PCA
plotPCA(vsd, intgroup = "condition")
# Store data from Principal Components
data <- plotPCA(rld, intgroup = c("condition"), returnData = TRUE)
data
## PC1 PC2 group condition name
## control_FFa1.bam -21.987961 2.9589473 FFa FFa control_FFa1.bam
## control_FFa2.bam -23.431934 -0.7944868 FFa FFa control_FFa2.bam
## control_FFa3.bam -21.484069 -2.7281740 FFa FFa control_FFa3.bam
## mutant_KOa1.bam 10.404057 15.5272723 KOa KOa mutant_KOa1.bam
## mutant_KOa2.bam 7.534726 -2.3443839 KOa KOa mutant_KOa2.bam
## mutant_KOa3.bam 11.856179 5.6900664 KOa KOa mutant_KOa3.bam
## mutant_KOb1.bam 13.387134 -3.4550087 KOb KOb mutant_KOb1.bam
## mutant_KOb2.bam 9.974359 -7.1423487 KOb KOb mutant_KOb2.bam
## mutant_KOb3.bam 13.747510 -7.7118840 KOb KOb mutant_KOb3.bam