These exercises cover the sections on Plotting in R PlottingInR.

Section 1 - barplot

The paired-end RNAseq data for our case study was aligned to mouse genome (mm9) using Tophat2. The alignment summary can be found here "data/tophat_alignment_overallstat.txt" with the following columns.

num.aligned.pairs = num.uniquely_mapped.pairs + num.multiple_mapped.pairs + num.discordant_mapped.pairs

  1. load tophat_alignment_overallstat.txt to R as a data.frame named aligned_res
aligned_res <- read.table("data/tophat_alignment_overallstat.txt",
                          header=T,sep="\t")
head(aligned_res)
##     sample num.input.pairs num.aligned.pairs num.uniquely_mapped.pairs
## 1 DOX24H_1        41113500          30527725                  29078260
## 2 DOX24H_2        38166396          29575699                  28188066
## 3 DOX24H_3        42506262          31376922                  29572848
## 4  DOX7D_1        46156954          34147421                  32392402
## 5  DOX7D_2        40372425          32876374                  31409139
## 6  DOX7D_3        46160430          34722360                  32896496
##   num.multiple_mapped.pairs num.discordant_mapped.pairs
## 1                   1249793                      199672
## 2                   1173844                      213789
## 3                   1553296                      250778
## 4                   1551960                      203059
## 5                   1201468                      265767
## 6                   1615952                      209912
  1. add a new column 'num.unaligned.pairs' to aligned_res
# before creating the 'num.unaligned.pairs', check whether the following statement is true
# num.aligned.pairs = num.uniquely_mapped.pairs + num.multiple_mapped.pairs + num.discordant_mapped.pairs

allAligned = aligned_res$num.uniquely_mapped.pairs + aligned_res$num.multiple_mapped.pairs + aligned_res$num.discordant_mapped.pairs

# check whether aligned_res$num.aligned.pairs equals allAligned
# please note, we use "==" to see whether two objects are equal
aligned_res$num.aligned.pairs == allAligned
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# you can also use the following code to check whether it is true for all elements in this vector
all(aligned_res$num.aligned.pairs == allAligned)
## [1] TRUE
# create new column "num.unaligned.pairs"
aligned_res$num.unaligned.pairs <- aligned_res$num.input.pairs - aligned_res$num.aligned.pairs
head(aligned_res)
##     sample num.input.pairs num.aligned.pairs num.uniquely_mapped.pairs
## 1 DOX24H_1        41113500          30527725                  29078260
## 2 DOX24H_2        38166396          29575699                  28188066
## 3 DOX24H_3        42506262          31376922                  29572848
## 4  DOX7D_1        46156954          34147421                  32392402
## 5  DOX7D_2        40372425          32876374                  31409139
## 6  DOX7D_3        46160430          34722360                  32896496
##   num.multiple_mapped.pairs num.discordant_mapped.pairs num.unaligned.pairs
## 1                   1249793                      199672            10585775
## 2                   1173844                      213789             8590697
## 3                   1553296                      250778            11129340
## 4                   1551960                      203059            12009533
## 5                   1201468                      265767             7496051
## 6                   1615952                      209912            11438070
  1. create the bar plot below using geom_bar(stat = "identity",position=position_dodge())
library("reshape2")
library("ggplot2")
## Warning in register(): Can't find generic `scale_type` in package ggplot2 to
## register S3 method.
# remove column "num.aligned.pairs"
aligned_res_BarNum <- aligned_res[,colnames(aligned_res)!="num.aligned.pairs"]

# convert 'wide' data.frame to 'long' data.frame
aligned_res_BarNum_long <- melt(aligned_res_BarNum,id="sample")

# generate plot
ggplot(aligned_res_BarNum_long,aes(x=sample,y=value,fill=variable)) +
  geom_bar(stat = "identity",position=position_dodge()) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab("Number of pairs")

  1. create the bar plot below: using geom_bar(position = "fill")
library("reshape2")
library("ggplot2")
# remove columns c("num.input.pairs","num.aligned.pairs") from aligned_res
aligned_res_BarFra <- aligned_res[,!(colnames(aligned_res) %in% c("num.input.pairs","num.aligned.pairs"))]

# convert 'wide' data.frame to 'long' data.frame
aligned_res_BarFra_long <- melt(aligned_res_BarFra,id="sample")

# remove "num." and ".pairs" from aligned_res_BarFra_long$variable
aligned_res_BarFra_long$variable <- sub("(num.)(.+)(.pairs)","\\2",
                                        aligned_res_BarFra_long$variable)
# check data type aligned_res_BarFra_long$variable
class(aligned_res_BarFra_long$variable)
## [1] "character"
# change it to factor
aligned_res_BarFra_long$variable <- as.factor(aligned_res_BarFra_long$variable)

# check level
levels(aligned_res_BarFra_long$variable)
## [1] "discordant_mapped" "multiple_mapped"   "unaligned"        
## [4] "uniquely_mapped"
# change the levels
aligned_res_BarFra_long$variable <- factor(aligned_res_BarFra_long$variable,
                                           levels=c("unaligned","discordant_mapped",
                                                    "multiple_mapped","uniquely_mapped"))

# generate plot
ggplot(aligned_res_BarFra_long,aes(x=sample,weight=value,fill=variable)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab("Fraction of input pairs")

  1. create the bar plot below: using facet_grid()
aligned_res_BarFra_long$TimePoint <- sub("(.+)([2|7].+)(_)(\\d)","\\2",
                                         aligned_res_BarFra_long$sample)
aligned_res_BarFra_long$Treatment <- sub("(.+)([2|7].+)(_)(\\d)","\\1",
                                         aligned_res_BarFra_long$sample)
aligned_res_BarFra_long$Rep <- sub("(.+)([2|7].+)(_\\d)","Rep\\3",
                                         aligned_res_BarFra_long$sample)

# generate plot
ggplot(aligned_res_BarFra_long,aes(x=Rep,weight=value,fill=variable)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab("Fraction of input pairs") +
  facet_grid(TimePoint~Treatment)

  1. create the bar plot below: using facet_wrap()
# generate plot
ggplot(aligned_res_BarFra_long,aes(x=Rep,weight=value,fill=variable)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle=45, hjust=1)) + 
  ylab("Fraction of input pairs") +
  facet_wrap(TimePoint~Treatment)

Section 2 - bubble plot [bonus exercise]

Gene Set Enrichment Analysis (GSEA) is one of the popular functional analysis tools for bulkRNAseq data. More details please see [https://www.gsea-msigdb.org/gsea/index.jsp]. File "data/GSEA_hallmark_Dox7D_Minus_Dox1D.csv" summarizes the GSEA results for bulkRNAseq data based on Dox_7D vs Dox_24h using the hallmark gene sets.

GSEA_hallmark_Dox7D_Minus_Dox1D.csv contains the following columns:

More details please see [http://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideFrame.html?_Interpreting_GSEA_Results]

Please create the bubble plot below with the following criteria

Then save it as a pdf with width=8 and height=10

library(ggplot2)
#Bubble plot for GSEA

gsea_summary_df_full <- read.csv("../data/GSEA_hallmark_Dox7D_Minus_Dox1D.csv")
gsea_summary_df_full <- gsea_summary_df_full[
    order(gsea_summary_df_full$NES, decreasing = T), ]
gsea_summary_df_full$rank <- c(1:nrow(gsea_summary_df_full))
gsea_summary_df_full$GS <- factor(gsea_summary_df_full$GS,
                                    levels = as.character(gsea_summary_df_full$GS))
orilevels <- levels(gsea_summary_df_full$GS)
index1 <- nrow(gsea_summary_df_full[gsea_summary_df_full$NES>0, ])
  if (index1 == nrow(gsea_summary_df_full)) {
    orilevels2 <- orilevels[rev(c(1:index1))]
  }else {
    orilevels2 <- orilevels[c(rev(c(nrow(gsea_summary_df_full):(index1+1))),
                              rev(c(1:index1)))]
  }
gsea_summary_df_full$GS <- factor(gsea_summary_df_full$GS, levels=orilevels2)
gsea_summary_df_full$significant <-  ifelse(gsea_summary_df_full$FDR.q.val<0.25, 
                                              "FDR < 0.25", "FDR > 0.25")
gsea_summary_df_full$significant <- factor(gsea_summary_df_full$significant, 
                                             c("FDR > 0.25", "FDR < 0.25"))
gsea_summary_df_full <- gsea_summary_df_full[order(gsea_summary_df_full$NES),]
 

gseaplot <- ggplot(gsea_summary_df_full, aes(x=NES, y=GS)) +
    geom_point(aes(size=no.genes.in.Core, color=FDR.q.val, 
                   shape=significant))+
    geom_vline(xintercept = 0, colour="grey")+
    scale_colour_gradient(name = "FDR.q.val",
                          low = "#D55E00", high = "#0072B2")+
    scale_shape_manual(values=c(1, 19))+
    theme_light()+
    theme(axis.text=element_text(size=8), 
          axis.title=element_text(size=12, face="bold"),
          legend.title=element_text(size=10), 
          legend.text=element_text(size=10))+
    xlim(-3, 3)+
    ylab("Hallmark GeneSet")


gseaplot

outfilename <- "HallmarkGSEA.pdf"
ggsave(outfilename,gseaplot,width=8,height=10)