# Set working directory
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions")
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions"
# Read the count information
all_counts <- read.csv(file = "../data/exercise1_counts.csv", header = T, row.names = 1)
head(all_counts)
## control_FFa1.bam control_FFa2.bam control_FFa3.bam mutant_KOa1.bam
## 497097 16 16 0 21
## 100503874 20 0 0 64
## 100038431 0 0 2 0
## 19888 11 0 10 113
## 20671 14 16 0 40
## 27395 465 193 596 436
## mutant_KOa2.bam mutant_KOa3.bam mutant_KOb1.bam mutant_KOb2.bam
## 497097 16 27 20 0
## 100503874 0 4 5 0
## 100038431 0 8 0 0
## 19888 0 26 14 6
## 20671 8 33 33 12
## 27395 686 572 1378 1901
## mutant_KOb3.bam
## 497097 2
## 100503874 2
## 100038431 0
## 19888 16
## 20671 24
## 27395 1553
# Read the sample description that we generated manually for this exercise
sam_des <- read.table("../data/exercise1_sample_description.info", sep = "\t", header = TRUE)
head(sam_des)
## filename sample condition batch
## 1 control_FFa1.bam FFa1 FFa a
## 2 control_FFa2.bam FFa2 FFa b
## 3 control_FFa3.bam FFa3 FFa c
## 4 TMC_mutant_KOa1.bam KOa1 KOa a
## 5 TMC_mutant_KOa2.bam KOa2 KOa b
## 6 TMC_mutant_KOa3.bam KOa3 KOa c
# Create col data from sample description
col_data <- data.frame(sample = sam_des$sample, condition = sam_des$condition, batch = sam_des$batch)
head(col_data)
## sample condition batch
## 1 FFa1 FFa a
## 2 FFa2 FFa b
## 3 FFa3 FFa c
## 4 KOa1 KOa a
## 5 KOa2 KOa b
## 6 KOa3 KOa c
# Check dimensions
all(colnames(all_counts) == col_data$name)
## [1] TRUE
# Load DESeq2 library
suppressPackageStartupMessages(library(DESeq2))
# Create DESeq object
dds <- DESeqDataSetFromMatrix(countData = all_counts, colData = col_data, design =~ condition)
dds
## class: DESeqDataSet
## dim: 26301 9
## metadata(1): version
## assays(1): counts
## rownames(26301): 497097 100503874 ... 100040384 100040400
## rowData names(0):
## colnames(9): control_FFa1.bam control_FFa2.bam ... mutant_KOb2.bam
## mutant_KOb3.bam
## colData names(3): sample condition batch
Apply Normalization with DESeq2 on the dds object just created
Find the number of genes that are changed in knockdown samples versus control (i.e. KOa vs FFa and KOb vs FFa), at FDR 0.05
Find the number of genes that are changed in the above situation with fold change threshold, i.e. fold change ratio > 2
# Apply DESeq normalization to our dds object
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Compute number of genes differentially expressed in KOa vs FFa, with FDR < 0.05
res1 <- results(dds, contrast = c("condition", "KOa", "FFa"))
summary(res1, alpha = 0.05)
##
## out of 24695 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 774, 3.1%
## LFC < 0 (down) : 913, 3.7%
## outliers [1] : 54, 0.22%
## low counts [2] : 4772, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Compute differentially expressed genes in KOa vs FFa, with FDR < 0.05 and fold change ratio > 2
DE_res1 <- res1[complete.cases(res1$padj), ]
DE_res1 <- DE_res1[DE_res1$padj < 0.05 & abs(DE_res1$log2FoldChange) > 1, ]
head(DE_res1)
## log2 fold change (MLE): condition KOa vs FFa
## Wald test p-value: condition KOa vs FFa
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## 21399 4212.8901 -1.02362 0.275575 -3.71448 2.03625e-04 4.90125e-03
## 170755 1259.6792 -1.95291 0.301560 -6.47604 9.41632e-11 9.84700e-09
## 212442 474.7721 2.68685 0.317701 8.45717 2.73948e-17 6.40362e-15
## 381246 45.0979 3.93498 0.880922 4.46689 7.93657e-06 3.12880e-04
## 329101 57.1348 2.34342 0.656002 3.57227 3.53904e-04 7.46454e-03
## 214855 171.5500 -1.59419 0.383665 -4.15516 3.25055e-05 1.03583e-03
# Compute number of genes differentially expressed in KOb vs FFa, with FDR < 0.05
res2 <- results(dds, contrast = c("condition","KOb","FFa"))
summary(res2, alpha = 0.05)
##
## out of 24695 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1178, 4.8%
## LFC < 0 (down) : 1218, 4.9%
## outliers [1] : 54, 0.22%
## low counts [2] : 12356, 50%
## (mean count < 33)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Compute differentially expressed genes in KOa vs FFa, with FDR < 0.05 and fold change ratio > 2
DE_res2 <- res2[complete.cases(res2$padj), ]
DE_res2 <- DE_res2[DE_res2$padj < 0.05 & abs(DE_res2$log2FoldChange) > 1, ]
head(DE_res2)
## log2 fold change (MLE): condition KOb vs FFa
## Wald test p-value: condition KOb vs FFa
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## 27395 725.6529 1.02064 0.259405 3.93456 8.33506e-05 1.03535e-03
## 170755 1259.6792 -1.97123 0.300679 -6.55591 5.53018e-11 2.79582e-09
## 212442 474.7721 2.70411 0.316855 8.53424 1.41085e-17 1.69925e-15
## 381246 45.0979 4.31222 0.876456 4.92007 8.65149e-07 1.93243e-05
## 75799 264.9315 -1.18031 0.423990 -2.78381 5.37246e-03 3.17158e-02
## 69668 457.3043 1.32644 0.229507 5.77952 7.49131e-09 2.61451e-07
# Draw MA plot for the KOa samples
plotMAa <- plotMA(res1, main = "DESeq2", ylim = c(-4,4))
# Draw MA plot for the KOb samples
plotMAb <- plotMA(res2, main = "DESeq2", ylim = c(-4,4))
Use plotCounts() function to plot the counts for gene 497097
Plot the un-normalized counts for the same gene
# Plot normalized counts
plotCounts(dds, gene = "497097", intgroup = "condition")
# Plot not normalized counts
plotCounts(dds, gene = "497097", intgroup = "condition", normalized = FALSE)