Plotting Answer 5

These exercises cover the sections on Plotting in R PlottingInR.

Section 1 - barplot

The paired-end RNAseq data for our case study was aligned to mouse genome (mm9) using Tophat2. The alignment summary can be found here "data/tophat_alignment_overallstat.txt" with the following columns.

sample: sample name
num.input.pairs: number of original pairs
num.aligned.pairs: number of aligned pairs
num.uniquely_mapped.pairs: number of properly aligned paired-reads, uniquely mapped
num.multiple_mapped.pairs: number of properly aligned paired-reads, multiple mapped
num.discordant_mapped.pairs: number of discordant aligned paired-reads

num.aligned.pairs = num.uniquely_mapped.pairs + num.multiple_mapped.pairs + num.discordant_mapped.pairs

hint: use read.table() function to load tophat_alignment_overallstat.txt

load tophat_alignment_overallstat.txt to R as a data.frame named aligned_res

aligned_res <- read.table("data/tophat_alignment_overallstat.txt",
                          header=T,sep="\t")

head(aligned_res)

##     sample num.input.pairs num.aligned.pairs num.uniquely_mapped.pairs
## 1 DOX24H_1        41113500          30527725                  29078260
## 2 DOX24H_2        38166396          29575699                  28188066
## 3 DOX24H_3        42506262          31376922                  29572848
## 4  DOX7D_1        46156954          34147421                  32392402
## 5  DOX7D_2        40372425          32876374                  31409139
## 6  DOX7D_3        46160430          34722360                  32896496
##   num.multiple_mapped.pairs num.discordant_mapped.pairs
## 1                   1249793                      199672
## 2                   1173844                      213789
## 3                   1553296                      250778
## 4                   1551960                      203059
## 5                   1201468                      265767
## 6                   1615952                      209912

add a new column 'num.unaligned.pairs' to aligned_res

# before creating the 'num.unaligned.pairs', check whether the following statement is true
# num.aligned.pairs = num.uniquely_mapped.pairs + num.multiple_mapped.pairs + num.discordant_mapped.pairs

allAligned = aligned_res$num.uniquely_mapped.pairs + aligned_res$num.multiple_mapped.pairs + aligned_res$num.discordant_mapped.pairs

# check whether aligned_res$num.aligned.pairs equals allAligned
# please note, we use "==" to see whether two objects are equal
aligned_res$num.aligned.pairs == allAligned

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

# you can also use the following code to check whether it is true for all elements in this vector
all(aligned_res$num.aligned.pairs == allAligned)

## [1] TRUE

# create new column "num.unaligned.pairs"
aligned_res$num.unaligned.pairs <- aligned_res$num.input.pairs - aligned_res$num.aligned.pairs
head(aligned_res)

##     sample num.input.pairs num.aligned.pairs num.uniquely_mapped.pairs
## 1 DOX24H_1        41113500          30527725                  29078260
## 2 DOX24H_2        38166396          29575699                  28188066
## 3 DOX24H_3        42506262          31376922                  29572848
## 4  DOX7D_1        46156954          34147421                  32392402
## 5  DOX7D_2        40372425          32876374                  31409139
## 6  DOX7D_3        46160430          34722360                  32896496
##   num.multiple_mapped.pairs num.discordant_mapped.pairs num.unaligned.pairs
## 1                   1249793                      199672            10585775
## 2                   1173844                      213789             8590697
## 3                   1553296                      250778            11129340
## 4                   1551960                      203059            12009533
## 5                   1201468                      265767             7496051
## 6                   1615952                      209912            11438070

create the bar plot below using geom_bar(stat = "identity",position=position_dodge())

hints:
- you need to convert the ‘wide’ data.frame to a ‘long’ data.frame using melt() function from reshape2 package
- check ?geom_bar and see description for position, and see what happens after you put position=position_dodge() in geom_bar()
- The x-axis labels can be rotated 45 degrees by adding layer theme(axis.text.x = element_text(angle=45, hjust=1))

library("reshape2")
library("ggplot2")

## Warning in register(): Can't find generic `scale_type` in package ggplot2 to
## register S3 method.

# remove column "num.aligned.pairs"
aligned_res_BarNum <- aligned_res[,colnames(aligned_res)!="num.aligned.pairs"]

# convert 'wide' data.frame to 'long' data.frame
aligned_res_BarNum_long <- melt(aligned_res_BarNum,id="sample")

# generate plot
ggplot(aligned_res_BarNum_long,aes(x=sample,y=value,fill=variable)) +
  geom_bar(stat = "identity",position=position_dodge()) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab("Number of pairs")

create the bar plot below: using geom_bar(position = "fill")

hint: you need to understand how factor data type works

library("reshape2")
library("ggplot2")
# remove columns c("num.input.pairs","num.aligned.pairs") from aligned_res
aligned_res_BarFra <- aligned_res[,!(colnames(aligned_res) %in% c("num.input.pairs","num.aligned.pairs"))]

# convert 'wide' data.frame to 'long' data.frame
aligned_res_BarFra_long <- melt(aligned_res_BarFra,id="sample")

# remove "num." and ".pairs" from aligned_res_BarFra_long$variable
aligned_res_BarFra_long$variable <- sub("(num.)(.+)(.pairs)","\\2",
                                        aligned_res_BarFra_long$variable)
# check data type aligned_res_BarFra_long$variable
class(aligned_res_BarFra_long$variable)

## [1] "character"

# change it to factor
aligned_res_BarFra_long$variable <- as.factor(aligned_res_BarFra_long$variable)

# check level
levels(aligned_res_BarFra_long$variable)

## [1] "discordant_mapped" "multiple_mapped"   "unaligned"        
## [4] "uniquely_mapped"

# change the levels
aligned_res_BarFra_long$variable <- factor(aligned_res_BarFra_long$variable,
                                           levels=c("unaligned","discordant_mapped",
                                                    "multiple_mapped","uniquely_mapped"))

# generate plot
ggplot(aligned_res_BarFra_long,aes(x=sample,weight=value,fill=variable)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab("Fraction of input pairs")

create the bar plot below: using facet_grid()

hint: check the Examples from ?facet_grid

aligned_res_BarFra_long$TimePoint <- sub("(.+)([2|7].+)(_)(\\d)","\\2",
                                         aligned_res_BarFra_long$sample)
aligned_res_BarFra_long$Treatment <- sub("(.+)([2|7].+)(_)(\\d)","\\1",
                                         aligned_res_BarFra_long$sample)
aligned_res_BarFra_long$Rep <- sub("(.+)([2|7].+)(_\\d)","Rep\\3",
                                         aligned_res_BarFra_long$sample)

# generate plot
ggplot(aligned_res_BarFra_long,aes(x=Rep,weight=value,fill=variable)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab("Fraction of input pairs") +
  facet_grid(TimePoint~Treatment)

create the bar plot below: using facet_wrap()

hint: check the Examples from ?facet_wrap

# generate plot
ggplot(aligned_res_BarFra_long,aes(x=Rep,weight=value,fill=variable)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab("Fraction of input pairs") +
  facet_wrap(TimePoint~Treatment)

Section 2 - bubble plot [bonus exercise]

Gene Set Enrichment Analysis (GSEA) is one of the popular functional analysis tools for bulkRNAseq data. More details please see [https://www.gsea-msigdb.org/gsea/index.jsp]. File "data/GSEA_hallmark_Dox7D_Minus_Dox1D.csv" summarizes the GSEA results for bulkRNAseq data based on Dox_7D vs Dox_24h using the hallmark gene sets.

GSEA_hallmark_Dox7D_Minus_Dox1D.csv contains the following columns:

NAME: full Gene set name
SIZE: Number of genes in the gene set after filtering out those genes not in the expression datase.
ES: Enrichment score for the gene set; that is, the degree to which this gene set is overrepresented at the top or bottom of the ranked list of genes in the expression dataset.
NES: Normalized enrichment score; that is, the enrichment score for the gene set after it has been normalized across analyzed gene sets.
NOM.p.val: Nominal p value; that is, the statistical significance of the enrichment score. The nominal p value is not adjusted for gene set size or multiple hypothesis testing; therefore, it is of limited use in comparing gene sets.
FDR.q.val: False discovery rate; that is, the estimated probability that the normalized enrichment score represents a false positive finding.
GS: short Gene set name
no.genes.in.Core: number of genes contributes to the leading-edge subset within the gene set and contributes most to the enrichment result.

More details please see [http://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideFrame.html?_Interpreting_GSEA_Results]

Please create the bubble plot below with the following criteria

use 50 hallmark gene sets as y-axis and NES as x-axis.
- if the NES > 0, order the Gene Sets based on NES values largest to smallest
- if the NES < 0, order the Gene Sets based on the NES values smallest to largest
FDR < 0.25 as threshold to show significant gene set with solid circle
- hint: use scale_shape_manual(values=c(1,19)). 1: hollow circle; 19: solid circle
use continuous colours to show FDR.q.val
- hint: use scale_colour_gradient(name = "FDR.q.val",low = "#D55E00", high = "#0072B2")
use the circle size to show no.genes.in.Core

Then save it as a pdf with width=8 and height=10

library(ggplot2)
#Bubble plot for GSEA

gsea_summary_df_full <- read.csv("../data/GSEA_hallmark_Dox7D_Minus_Dox1D.csv")
gsea_summary_df_full <- gsea_summary_df_full[
    order(gsea_summary_df_full$NES, decreasing = T), ]
gsea_summary_df_full$rank <- c(1:nrow(gsea_summary_df_full))
gsea_summary_df_full$GS <- factor(gsea_summary_df_full$GS,
                                    levels = as.character(gsea_summary_df_full$GS))
orilevels <- levels(gsea_summary_df_full$GS)
index1 <- nrow(gsea_summary_df_full[gsea_summary_df_full$NES>0, ])
  if (index1 == nrow(gsea_summary_df_full)) {
    orilevels2 <- orilevels[rev(c(1:index1))]
  }else {
    orilevels2 <- orilevels[c(rev(c(nrow(gsea_summary_df_full):(index1+1))),
                              rev(c(1:index1)))]
  }
gsea_summary_df_full$GS <- factor(gsea_summary_df_full$GS, levels=orilevels2)
gsea_summary_df_full$significant <-  ifelse(gsea_summary_df_full$FDR.q.val<0.25, 
                                              "FDR < 0.25", "FDR > 0.25")
gsea_summary_df_full$significant <- factor(gsea_summary_df_full$significant, 
                                             c("FDR > 0.25", "FDR < 0.25"))
gsea_summary_df_full <- gsea_summary_df_full[order(gsea_summary_df_full$NES),]
 

gseaplot <- ggplot(gsea_summary_df_full, aes(x=NES, y=GS)) +
    geom_point(aes(size=no.genes.in.Core, color=FDR.q.val, 
                   shape=significant))+
    geom_vline(xintercept = 0, colour="grey")+
    scale_colour_gradient(name = "FDR.q.val",
                          low = "#D55E00", high = "#0072B2")+
    scale_shape_manual(values=c(1, 19))+
    theme_light()+
    theme(axis.text=element_text(size=8), 
          axis.title=element_text(size=12, face="bold"),
          legend.title=element_text(size=10), 
          legend.text=element_text(size=10))+
    xlim(-3, 3)+
    ylab("Hallmark GeneSet")


gseaplot

outfilename <- "HallmarkGSEA.pdf"
ggsave(outfilename,gseaplot,width=8,height=10)

Plotting Answer 5

MRC LMS

April 2022

Section 1 - barplot

Section 2 - bubble plot [bonus exercise]