These exercises cover the sections on Plotting in R PlottingInR.
The paired-end RNAseq data for our case study was aligned to mouse genome (mm9) using Tophat2. The alignment summary can be found here "data/tophat_alignment_overallstat.txt"
with the following columns.
num.aligned.pairs = num.uniquely_mapped.pairs + num.multiple_mapped.pairs + num.discordant_mapped.pairs
read.table()
function to load tophat_alignment_overallstat.txtaligned_res
aligned_res <- read.table("data/tophat_alignment_overallstat.txt",
header=T,sep="\t")
head(aligned_res)
## sample num.input.pairs num.aligned.pairs num.uniquely_mapped.pairs
## 1 DOX24H_1 41113500 30527725 29078260
## 2 DOX24H_2 38166396 29575699 28188066
## 3 DOX24H_3 42506262 31376922 29572848
## 4 DOX7D_1 46156954 34147421 32392402
## 5 DOX7D_2 40372425 32876374 31409139
## 6 DOX7D_3 46160430 34722360 32896496
## num.multiple_mapped.pairs num.discordant_mapped.pairs
## 1 1249793 199672
## 2 1173844 213789
## 3 1553296 250778
## 4 1551960 203059
## 5 1201468 265767
## 6 1615952 209912
'num.unaligned.pairs'
to aligned_res
# before creating the 'num.unaligned.pairs', check whether the following statement is true
# num.aligned.pairs = num.uniquely_mapped.pairs + num.multiple_mapped.pairs + num.discordant_mapped.pairs
allAligned = aligned_res$num.uniquely_mapped.pairs + aligned_res$num.multiple_mapped.pairs + aligned_res$num.discordant_mapped.pairs
# check whether aligned_res$num.aligned.pairs equals allAligned
# please note, we use "==" to see whether two objects are equal
aligned_res$num.aligned.pairs == allAligned
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# you can also use the following code to check whether it is true for all elements in this vector
all(aligned_res$num.aligned.pairs == allAligned)
## [1] TRUE
# create new column "num.unaligned.pairs"
aligned_res$num.unaligned.pairs <- aligned_res$num.input.pairs - aligned_res$num.aligned.pairs
head(aligned_res)
## sample num.input.pairs num.aligned.pairs num.uniquely_mapped.pairs
## 1 DOX24H_1 41113500 30527725 29078260
## 2 DOX24H_2 38166396 29575699 28188066
## 3 DOX24H_3 42506262 31376922 29572848
## 4 DOX7D_1 46156954 34147421 32392402
## 5 DOX7D_2 40372425 32876374 31409139
## 6 DOX7D_3 46160430 34722360 32896496
## num.multiple_mapped.pairs num.discordant_mapped.pairs num.unaligned.pairs
## 1 1249793 199672 10585775
## 2 1173844 213789 8590697
## 3 1553296 250778 11129340
## 4 1551960 203059 12009533
## 5 1201468 265767 7496051
## 6 1615952 209912 11438070
geom_bar(stat = "identity",position=position_dodge())
hints:
you need to convert the ‘wide’ data.frame to a ‘long’ data.frame using melt() function from reshape2
package
check ?geom_bar
and see description for position
, and see what happens after you put position=position_dodge()
in geom_bar()
The x-axis labels can be rotated 45 degrees by adding layer theme(axis.text.x = element_text(angle=45, hjust=1))
library("reshape2")
library("ggplot2")
## Warning in register(): Can't find generic `scale_type` in package ggplot2 to
## register S3 method.
# remove column "num.aligned.pairs"
aligned_res_BarNum <- aligned_res[,colnames(aligned_res)!="num.aligned.pairs"]
# convert 'wide' data.frame to 'long' data.frame
aligned_res_BarNum_long <- melt(aligned_res_BarNum,id="sample")
# generate plot
ggplot(aligned_res_BarNum_long,aes(x=sample,y=value,fill=variable)) +
geom_bar(stat = "identity",position=position_dodge()) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
ylab("Number of pairs")
geom_bar(position = "fill")
factor
data type workslibrary("reshape2")
library("ggplot2")
# remove columns c("num.input.pairs","num.aligned.pairs") from aligned_res
aligned_res_BarFra <- aligned_res[,!(colnames(aligned_res) %in% c("num.input.pairs","num.aligned.pairs"))]
# convert 'wide' data.frame to 'long' data.frame
aligned_res_BarFra_long <- melt(aligned_res_BarFra,id="sample")
# remove "num." and ".pairs" from aligned_res_BarFra_long$variable
aligned_res_BarFra_long$variable <- sub("(num.)(.+)(.pairs)","\\2",
aligned_res_BarFra_long$variable)
# check data type aligned_res_BarFra_long$variable
class(aligned_res_BarFra_long$variable)
## [1] "character"
# change it to factor
aligned_res_BarFra_long$variable <- as.factor(aligned_res_BarFra_long$variable)
# check level
levels(aligned_res_BarFra_long$variable)
## [1] "discordant_mapped" "multiple_mapped" "unaligned"
## [4] "uniquely_mapped"
# change the levels
aligned_res_BarFra_long$variable <- factor(aligned_res_BarFra_long$variable,
levels=c("unaligned","discordant_mapped",
"multiple_mapped","uniquely_mapped"))
# generate plot
ggplot(aligned_res_BarFra_long,aes(x=sample,weight=value,fill=variable)) +
geom_bar(position = "fill") +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
ylab("Fraction of input pairs")
facet_grid()
?facet_grid
aligned_res_BarFra_long$TimePoint <- sub("(.+)([2|7].+)(_)(\\d)","\\2",
aligned_res_BarFra_long$sample)
aligned_res_BarFra_long$Treatment <- sub("(.+)([2|7].+)(_)(\\d)","\\1",
aligned_res_BarFra_long$sample)
aligned_res_BarFra_long$Rep <- sub("(.+)([2|7].+)(_\\d)","Rep\\3",
aligned_res_BarFra_long$sample)
# generate plot
ggplot(aligned_res_BarFra_long,aes(x=Rep,weight=value,fill=variable)) +
geom_bar(position = "fill") +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
ylab("Fraction of input pairs") +
facet_grid(TimePoint~Treatment)
facet_wrap()
?facet_wrap
# generate plot
ggplot(aligned_res_BarFra_long,aes(x=Rep,weight=value,fill=variable)) +
geom_bar(position = "fill") +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
ylab("Fraction of input pairs") +
facet_wrap(TimePoint~Treatment)
Gene Set Enrichment Analysis (GSEA) is one of the popular functional analysis tools for bulkRNAseq data. More details please see [https://www.gsea-msigdb.org/gsea/index.jsp]. File "data/GSEA_hallmark_Dox7D_Minus_Dox1D.csv"
summarizes the GSEA results for bulkRNAseq data based on Dox_7D vs Dox_24h using the hallmark gene sets.
GSEA_hallmark_Dox7D_Minus_Dox1D.csv
contains the following columns:
More details please see [http://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideFrame.html?_Interpreting_GSEA_Results]
Please create the bubble plot below with the following criteria
use 50 hallmark gene sets as y-axis and NES as x-axis.
if the NES > 0, order the Gene Sets based on NES values largest to smallest
if the NES < 0, order the Gene Sets based on the NES values smallest to largest
FDR < 0.25 as threshold to show significant gene set with solid circle
scale_shape_manual(values=c(1,19))
. 1: hollow circle; 19: solid circleuse continuous colours to show FDR.q.val
scale_colour_gradient(name = "FDR.q.val",low = "#D55E00", high = "#0072B2")
use the circle size to show no.genes.in.Core
Then save it as a pdf with width=8 and height=10
library(ggplot2)
#Bubble plot for GSEA
gsea_summary_df_full <- read.csv("../data/GSEA_hallmark_Dox7D_Minus_Dox1D.csv")
gsea_summary_df_full <- gsea_summary_df_full[
order(gsea_summary_df_full$NES, decreasing = T), ]
gsea_summary_df_full$rank <- c(1:nrow(gsea_summary_df_full))
gsea_summary_df_full$GS <- factor(gsea_summary_df_full$GS,
levels = as.character(gsea_summary_df_full$GS))
orilevels <- levels(gsea_summary_df_full$GS)
index1 <- nrow(gsea_summary_df_full[gsea_summary_df_full$NES>0, ])
if (index1 == nrow(gsea_summary_df_full)) {
orilevels2 <- orilevels[rev(c(1:index1))]
}else {
orilevels2 <- orilevels[c(rev(c(nrow(gsea_summary_df_full):(index1+1))),
rev(c(1:index1)))]
}
gsea_summary_df_full$GS <- factor(gsea_summary_df_full$GS, levels=orilevels2)
gsea_summary_df_full$significant <- ifelse(gsea_summary_df_full$FDR.q.val<0.25,
"FDR < 0.25", "FDR > 0.25")
gsea_summary_df_full$significant <- factor(gsea_summary_df_full$significant,
c("FDR > 0.25", "FDR < 0.25"))
gsea_summary_df_full <- gsea_summary_df_full[order(gsea_summary_df_full$NES),]
gseaplot <- ggplot(gsea_summary_df_full, aes(x=NES, y=GS)) +
geom_point(aes(size=no.genes.in.Core, color=FDR.q.val,
shape=significant))+
geom_vline(xintercept = 0, colour="grey")+
scale_colour_gradient(name = "FDR.q.val",
low = "#D55E00", high = "#0072B2")+
scale_shape_manual(values=c(1, 19))+
theme_light()+
theme(axis.text=element_text(size=8),
axis.title=element_text(size=12, face="bold"),
legend.title=element_text(size=10),
legend.text=element_text(size=10))+
xlim(-3, 3)+
ylab("Hallmark GeneSet")
gseaplot
outfilename <- "HallmarkGSEA.pdf"
ggsave(outfilename,gseaplot,width=8,height=10)