ChIP-seq Functional Annotation Of Peaks

In this practical we will read in two replicate sets of peaks from the Myc Encode dataset and annotate these to gene using the ChIPseeker packge. We will visualise the results using ChIPseeker plotting functions and see how to combine samples into one plot.

In the second half of the practicals we will perform an enrichment analysis on the genes overlapping peak sets using the GOseq package.

First lets read in the two Myc Mel peakset replicates as we did for our previous exercise.

library(GenomicRanges)
library(ChIPseeker)
library(org.Mm.eg.db)
library(TxDb.Mmusculus.UCSC.mm9.knownGene)

melPeak_Rep1 <- read.delim("data/MacsPeaks/mycmelrep1_peaks.xls",sep="\t",comment.char = "#")
melPeak_Rep2 <- read.delim("data/MacsPeaks/mycmelrep2_peaks.xls",sep="\t",comment.char = "#")

melRep1_GR <- GRanges(
                  seqnames=melPeak_Rep1[,"chr"],
                  IRanges(melPeak_Rep1[,"start"],
                  melPeak_Rep1[,"end"]
                  )
                )

mcols(melRep1_GR) <- melPeak_Rep1[,c("abs_summit", "fold_enrichment")]


melRep2_GR <- GRanges(
                  seqnames=melPeak_Rep2[,"chr"],
                  IRanges(melPeak_Rep2[,"start"],
                  melPeak_Rep2[,"end"]
                  )
                )

mcols(melRep2_GR) <- melPeak_Rep2[,c("abs_summit", "fold_enrichment")]

Now annotate the two peaksets to mm9 UCSC genes and draw the distribution of peaks in genomic features and distance of peaks to TSS regions.

library(GenomeInfoDb)

seqlevelsStyle(melRep1_GR) <- "UCSC"
seqlevelsStyle(melRep2_GR) <- "UCSC"

peakAnno_MelRep1 <- annotatePeak(melRep1_GR, tssRegion=c(-1000, 1000), 
                         TxDb=TxDb.Mmusculus.UCSC.mm9.knownGene, annoDb="org.Mm.eg.db")
## >> preparing features information...      2016-03-09 20:43:46 
## >> identifying nearest features...        2016-03-09 20:43:47 
## >> calculating distance from peak to TSS...   2016-03-09 20:43:48 
## >> assigning genomic annotation...        2016-03-09 20:43:48 
## >> adding gene annotation...          2016-03-09 20:43:58
## 'select()' returned many:many mapping between keys and columns
## >> assigning chromosome lengths           2016-03-09 20:44:21 
## >> done...                    2016-03-09 20:44:21
peakAnno_MelRep2 <- annotatePeak(melRep2_GR, tssRegion=c(-1000, 1000), 
                         TxDb=TxDb.Mmusculus.UCSC.mm9.knownGene, annoDb="org.Mm.eg.db")
## >> preparing features information...      2016-03-09 20:44:22 
## >> identifying nearest features...        2016-03-09 20:44:22 
## >> calculating distance from peak to TSS...   2016-03-09 20:44:22 
## >> assigning genomic annotation...        2016-03-09 20:44:22 
## >> adding gene annotation...          2016-03-09 20:44:24
## 'select()' returned many:many mapping between keys and columns
## >> assigning chromosome lengths           2016-03-09 20:44:47 
## >> done...                    2016-03-09 20:44:47

plotAnnoBar(peakAnno_MelRep1)

plotAnnoBar(peakAnno_MelRep2)



plotDistToTSS(peakAnno_MelRep1)
## Warning: Stacking not well defined when ymin != 0

plotDistToTSS(peakAnno_MelRep2)
## Warning: Stacking not well defined when ymin != 0

ChIPseeker allows the user to plot multiple samples in one plot. Combine the csAnno objects for each sample into a list and name elements with their respective sample names. Use the list of csAnno objects with plotAnnoBar function.

Mel_Rep1_and_Rep2 <- list(peakAnno_MelRep1,peakAnno_MelRep2)
names(Mel_Rep1_and_Rep2) <- c("Rep1","Rep2")
plotAnnoBar(Mel_Rep1_and_Rep2)

Retrieve all unique names (as SYMBOLS) of genes with peaks in their TSS/promoter for replicate 1 and 2.

peakAnno_MelRep1_GR <- as.GRanges(peakAnno_MelRep1)
peakAnno_MelRep2_GR <- as.GRanges(peakAnno_MelRep2)

MelRep1_genesWithPeakInTSS <- unique(peakAnno_MelRep1_GR[peakAnno_MelRep1_GR$annotation == "Promoter",]$SYMBOL)

MelRep1_genesWithPeakInTSS <- unique(peakAnno_MelRep1_GR[peakAnno_MelRep1_GR$annotation == "Promoter",]$SYMBOL)

Create an object containing of Myc peaks in Mel replicate 1 overlapping Myc peaks in Mel replicate 2.

melRep1_GR_InRep1AndRep2 <- melRep1_GR[melRep1_GR %over% melRep2_GR]
melRep1_GR_InRep1AndRep2
## GRanges object with 30233 ranges and 2 metadata columns:
##           seqnames                 ranges strand   | abs_summit
##              <Rle>              <IRanges>  <Rle>   |  <integer>
##       [1]     chr1     [4775337, 4775959]      *   |    4775616
##       [2]     chr1     [4847544, 4847931]      *   |    4847795
##       [3]     chr1     [5073028, 5073344]      *   |    5073202
##       [4]     chr1     [7078801, 7079170]      *   |    7078892
##       [5]     chr1     [7387587, 7388483]      *   |    7387940
##       ...      ...                    ...    ... ...        ...
##   [30229]     chrX [165737530, 165737735]      *   |  165737687
##   [30230]     chrX [165757688, 165758643]      *   |  165758296
##   [30231]     chrY [   581657,    582367]      *   |     582221
##   [30232]     chrY [   622356,    622742]      *   |     622484
##   [30233]     chrY [   622882,    623459]      *   |     623091
##           fold_enrichment
##                 <numeric>
##       [1]          9.5239
##       [2]         7.45675
##       [3]        10.20078
##       [4]         4.00702
##       [5]        19.01092
##       ...             ...
##   [30229]         6.23072
##   [30230]        17.31472
##   [30231]        18.62359
##   [30232]        10.23618
##   [30233]        14.59007
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

ChIP-seq Functional Annotation Of Peaks - practical

MRC LMS Bioinformatics Team

26 September 2019