ChIP-seq Identifying Motifs

In this practical we will read in two replicate sets of peaks from the Myc Encode dataset and extract sequences underneath subsets of peaks. We will write these sequences out to a FASTA file for us with Meme-ChIP.

First lets read in the two Myc Mel peakset replicates and create the common peakset as we did for our previous exercise

  library(GenomicRanges)
  library(BSgenome)
  library(BSgenome.Mmusculus.UCSC.mm9)


melPeak_Rep1 <- read.delim("data/MacsPeaks/mycmelrep1_peaks.xls",sep="\t",comment.char = "#")
melPeak_Rep2 <- read.delim("data/MacsPeaks/mycmelrep2_peaks.xls",sep="\t",comment.char = "#")

melRep1_GR <- GRanges(
                  seqnames=melPeak_Rep1[,"chr"],
                  IRanges(melPeak_Rep1[,"start"],
                  melPeak_Rep1[,"end"]
                  )
                )

mcols(melRep1_GR) <- melPeak_Rep1[,c("abs_summit", "fold_enrichment")]


melRep2_GR <- GRanges(
                  seqnames=melPeak_Rep2[,"chr"],
                  IRanges(melPeak_Rep2[,"start"],
                  melPeak_Rep2[,"end"]
                  )
                )

mcols(melRep2_GR) <- melPeak_Rep2[,c("abs_summit", "fold_enrichment")]

melRep1_GR_InRep1AndRep2 <- melRep1_GR[melRep1_GR %over% melRep2_GR]
melRep1_GR_InRep1AndRep2
## GRanges object with 30233 ranges and 2 metadata columns:
##           seqnames                 ranges strand   | abs_summit
##              <Rle>              <IRanges>  <Rle>   |  <integer>
##       [1]        1     [4775337, 4775959]      *   |    4775616
##       [2]        1     [4847544, 4847931]      *   |    4847795
##       [3]        1     [5073028, 5073344]      *   |    5073202
##       [4]        1     [7078801, 7079170]      *   |    7078892
##       [5]        1     [7387587, 7388483]      *   |    7387940
##       ...      ...                    ...    ... ...        ...
##   [30229]        X [165737530, 165737735]      *   |  165737687
##   [30230]        X [165757688, 165758643]      *   |  165758296
##   [30231]        Y [   581657,    582367]      *   |     582221
##   [30232]        Y [   622356,    622742]      *   |     622484
##   [30233]        Y [   622882,    623459]      *   |     623091
##           fold_enrichment
##                 <numeric>
##       [1]          9.5239
##       [2]         7.45675
##       [3]        10.20078
##       [4]         4.00702
##       [5]        19.01092
##       ...             ...
##   [30229]         6.23072
##   [30230]        17.31472
##   [30231]        18.62359
##   [30232]        10.23618
##   [30233]        14.59007
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

Now we can rank them by their fold enrichment, select the top 500 peaks and resize these peaks to 200bp around centre.

melRep1_GR_InRep1AndRep2 <- melRep1_GR_InRep1AndRep2[ order(melRep1_GR_InRep1AndRep2$fold_enrichment,decreasing=T)
                         ]
top500_melRep1_Rep1AndRep2 <- melRep1_GR_InRep1AndRep2[1:500,]
top500_melRep1_1And2_Resized <- resize(top500_melRep1_Rep1AndRep2,200,fix = "center")
top500_melRep1_1And2_Resized[1:3,]
## GRanges object with 3 ranges and 2 metadata columns:
##       seqnames                 ranges strand | abs_summit fold_enrichment
##          <Rle>              <IRanges>  <Rle> |  <integer>       <numeric>
##   [1]        4 [ 45966312,  45966511]      * |   45966486       123.11587
##   [2]        9 [ 21156572,  21156771]      * |   21157016       111.53541
##   [3]       12 [114345800, 114345999]      * |  114345880       104.74549
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

Extract the sequences underneath the file and write them to FASTA file in you working directory. Inspect the file in notepad.

genome <- BSgenome.Mmusculus.UCSC.mm9

seqlevelsStyle(top500_melRep1_1And2_Resized) <- "UCSC"

top500_melRep1_1And2_Resized_Seq <- getSeq(genome,top500_melRep1_1And2_Resized)
names(top500_melRep1_1And2_Resized_Seq) <- paste0("peak_",seqnames(top500_melRep1_1And2_Resized),"_",
                                         start(top500_melRep1_1And2_Resized),
                                         "-",
                                         end(top500_melRep1_1And2_Resized))

top500_melRep1_1And2_Resized_Seq[1:4,]
##   A DNAStringSet instance of length 4
##     width seq                                          names               
## [1]   200 AGACTCAGAATGAGAGGAGTC...GTGCTCAGAATCAGACCATG peak_chr4_4596631...
## [2]   200 CTTGCAGAGGACCTGGGTTCA...TTCTAAAGATGAGGCAGGAG peak_chr9_2115657...
## [3]   200 GCCTTTCTGAGCATGGGTCTG...TGCCTGACACAGCAGGTCCT peak_chr12_114345...
## [4]   200 GGTGGCTCTGGACATCCCTGG...AATAAAGCTTGCCTCCGCCT peak_chr3_8784724...

writeXStringSet(top500_melRep1_1And2_Resized_Seq,file="top500_melRep1_1And2.fa")

Upload the sequences to Meme-ChIP and review the results when complete.

Once you have uploaded the data you can preview the results here

** Bonus question - Recentre top 500 peaks around their absolute summit, resize to 200, extract sequences and submit to Meme-ChIP. Compare results between peaksets.

Once you have uploaded the data you can preview the results here


start(top500_melRep1_Rep1AndRep2) <- end(top500_melRep1_Rep1AndRep2) <- top500_melRep1_Rep1AndRep2$abs_summit
top500_melRep1_1And2_Resized <- resize(top500_melRep1_Rep1AndRep2,200,fix = "center")


genome <- BSgenome.Mmusculus.UCSC.mm9

seqlevelsStyle(top500_melRep1_1And2_Resized) <- "UCSC"

top500_melRep1_1And2_Resized_Seq <- getSeq(genome,top500_melRep1_1And2_Resized)
names(top500_melRep1_1And2_Resized_Seq) <- paste0("peak_",seqnames(top500_melRep1_1And2_Resized),"_",
                                         start(top500_melRep1_1And2_Resized),
                                         "-",
                                         end(top500_melRep1_1And2_Resized))

writeXStringSet(top500_melRep1_1And2_Resized_Seq,file="abssummit_top500_melRep1_1And2.fa")

ChIP-seq Identifying Motifs - practical

MRC LMS Bioinformatics