Set working directory

# Set working directory
setwd("/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions")
getwd()
## [1] "/Volumes/bioinfomatics$/jurtasun/Courses/CBW2022/LMS_RNASeq/course/exercises/solutions"
  1. Read in count data and sample description. Identify how many factors are involved in this this experiment.

# Read the count information
all_counts <- read.csv(file = "../data/exercise1_counts.csv", header = T, row.names = 1)
head(all_counts)
##           control_FFa1.bam control_FFa2.bam control_FFa3.bam mutant_KOa1.bam
## 497097                  16               16                0              21
## 100503874               20                0                0              64
## 100038431                0                0                2               0
## 19888                   11                0               10             113
## 20671                   14               16                0              40
## 27395                  465              193              596             436
##           mutant_KOa2.bam mutant_KOa3.bam mutant_KOb1.bam mutant_KOb2.bam
## 497097                 16              27              20               0
## 100503874               0               4               5               0
## 100038431               0               8               0               0
## 19888                   0              26              14               6
## 20671                   8              33              33              12
## 27395                 686             572            1378            1901
##           mutant_KOb3.bam
## 497097                  2
## 100503874               2
## 100038431               0
## 19888                  16
## 20671                  24
## 27395                1553

# Read the sample description that we generated manually for this exercise
sam_des <- read.table("../data/exercise1_sample_description.info", sep = "\t", header = TRUE)
head(sam_des)
##              filename sample condition batch
## 1    control_FFa1.bam   FFa1       FFa     a
## 2    control_FFa2.bam   FFa2       FFa     b
## 3    control_FFa3.bam   FFa3       FFa     c
## 4 TMC_mutant_KOa1.bam   KOa1       KOa     a
## 5 TMC_mutant_KOa2.bam   KOa2       KOa     b
## 6 TMC_mutant_KOa3.bam   KOa3       KOa     c
  1. Create the col_data from the sample description and check dimensions with the count matrix

# Create col data from sample description
col_data <- data.frame(sample = sam_des$sample, condition = sam_des$condition, batch = sam_des$batch)
head(col_data)
##   sample condition batch
## 1   FFa1       FFa     a
## 2   FFa2       FFa     b
## 3   FFa3       FFa     c
## 4   KOa1       KOa     a
## 5   KOa2       KOa     b
## 6   KOa3       KOa     c

# Check dimensions
all(colnames(all_counts) == col_data$name)
## [1] TRUE
  1. Construct DESeqDataSet object using count data and sample description.

# Load DESeq2 library
suppressPackageStartupMessages(library(DESeq2))

# Create DESeq object
dds <- DESeqDataSetFromMatrix(countData = all_counts, colData = col_data, design =~ condition)
dds
## class: DESeqDataSet 
## dim: 26301 9 
## metadata(1): version
## assays(1): counts
## rownames(26301): 497097 100503874 ... 100040384 100040400
## rowData names(0):
## colnames(9): control_FFa1.bam control_FFa2.bam ... mutant_KOb2.bam
##   mutant_KOb3.bam
## colData names(3): sample condition batch
    1. Apply Normalization with DESeq2 on the dds object just created

    2. Find the number of genes that are changed in knockdown samples versus control (i.e. KOa vs FFa and KOb vs FFa), at FDR 0.05

    3. Find the number of genes that are changed in the above situation with fold change threshold, i.e. fold change ratio > 2


# Apply DESeq normalization to our dds object
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

# Compute number of genes differentially expressed in KOa vs FFa, with FDR < 0.05
res1 <- results(dds, contrast = c("condition", "KOa", "FFa"))
summary(res1, alpha = 0.05)
## 
## out of 24695 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 774, 3.1%
## LFC < 0 (down)     : 913, 3.7%
## outliers [1]       : 54, 0.22%
## low counts [2]     : 4772, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Compute differentially expressed genes in KOa vs FFa, with FDR < 0.05 and fold change ratio > 2
DE_res1 <- res1[complete.cases(res1$padj), ]
DE_res1 <- DE_res1[DE_res1$padj < 0.05 & abs(DE_res1$log2FoldChange) > 1, ]
head(DE_res1)
## log2 fold change (MLE): condition KOa vs FFa 
## Wald test p-value: condition KOa vs FFa 
## DataFrame with 6 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##        <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## 21399  4212.8901       -1.02362  0.275575  -3.71448 2.03625e-04 4.90125e-03
## 170755 1259.6792       -1.95291  0.301560  -6.47604 9.41632e-11 9.84700e-09
## 212442  474.7721        2.68685  0.317701   8.45717 2.73948e-17 6.40362e-15
## 381246   45.0979        3.93498  0.880922   4.46689 7.93657e-06 3.12880e-04
## 329101   57.1348        2.34342  0.656002   3.57227 3.53904e-04 7.46454e-03
## 214855  171.5500       -1.59419  0.383665  -4.15516 3.25055e-05 1.03583e-03

# Compute number of genes differentially expressed in KOb vs FFa, with FDR < 0.05
res2 <- results(dds, contrast = c("condition","KOb","FFa"))
summary(res2, alpha = 0.05)
## 
## out of 24695 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1178, 4.8%
## LFC < 0 (down)     : 1218, 4.9%
## outliers [1]       : 54, 0.22%
## low counts [2]     : 12356, 50%
## (mean count < 33)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Compute differentially expressed genes in KOa vs FFa, with FDR < 0.05 and fold change ratio > 2
DE_res2 <- res2[complete.cases(res2$padj), ]
DE_res2 <- DE_res2[DE_res2$padj < 0.05 & abs(DE_res2$log2FoldChange) > 1, ]
head(DE_res2)
## log2 fold change (MLE): condition KOb vs FFa 
## Wald test p-value: condition KOb vs FFa 
## DataFrame with 6 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##        <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## 27395   725.6529        1.02064  0.259405   3.93456 8.33506e-05 1.03535e-03
## 170755 1259.6792       -1.97123  0.300679  -6.55591 5.53018e-11 2.79582e-09
## 212442  474.7721        2.70411  0.316855   8.53424 1.41085e-17 1.69925e-15
## 381246   45.0979        4.31222  0.876456   4.92007 8.65149e-07 1.93243e-05
## 75799   264.9315       -1.18031  0.423990  -2.78381 5.37246e-03 3.17158e-02
## 69668   457.3043        1.32644  0.229507   5.77952 7.49131e-09 2.61451e-07
  1. Draw MA plot and highlight all significant genes with adjusted p-value less than 0.05
# Draw MA plot for the KOa samples
plotMAa <- plotMA(res1, main = "DESeq2", ylim = c(-4,4))

# Draw MA plot for the KOb samples
plotMAb <- plotMA(res2, main = "DESeq2", ylim = c(-4,4))

    1. Use plotCounts() function to plot the counts for gene 497097

    2. Plot the un-normalized counts for the same gene

# Plot normalized counts
plotCounts(dds, gene = "497097", intgroup = "condition")

# Plot not normalized counts
plotCounts(dds, gene = "497097", intgroup = "condition", normalized = FALSE)