In this practical we will read in two replicate sets of peaks from the Myc Encode dataset and extract sequences underneath subsets of peaks. We will write these sequences out to a FASTA file for us with Meme-ChIP.

  library(GenomicRanges)
  library(BSgenome)
  library(BSgenome.Mmusculus.UCSC.mm9)

melPeak_Rep1 <- read.delim("data/MacsPeaks/mycmelrep1_peaks.xls",sep="\t",comment.char = "#")
melPeak_Rep2 <- read.delim("data/MacsPeaks/mycmelrep2_peaks.xls",sep="\t",comment.char = "#")

melRep1_GR <- GRanges(
                  seqnames=melPeak_Rep1[,"chr"],
                  IRanges(melPeak_Rep1[,"start"],
                  melPeak_Rep1[,"end"]
                  )
                )

mcols(melRep1_GR) <- melPeak_Rep1[,c("abs_summit", "fold_enrichment")]


melRep2_GR <- GRanges(
                  seqnames=melPeak_Rep2[,"chr"],
                  IRanges(melPeak_Rep2[,"start"],
                  melPeak_Rep2[,"end"]
                  )
                )

mcols(melRep2_GR) <- melPeak_Rep2[,c("abs_summit", "fold_enrichment")]

melRep1_GR_InRep1AndRep2 <- melRep1_GR[melRep1_GR %over% melRep2_GR]
melRep1_GR_InRep1AndRep2
## GRanges object with 30233 ranges and 2 metadata columns:
##           seqnames                 ranges strand   | abs_summit
##              <Rle>              <IRanges>  <Rle>   |  <integer>
##       [1]        1     [4775337, 4775959]      *   |    4775616
##       [2]        1     [4847544, 4847931]      *   |    4847795
##       [3]        1     [5073028, 5073344]      *   |    5073202
##       [4]        1     [7078801, 7079170]      *   |    7078892
##       [5]        1     [7387587, 7388483]      *   |    7387940
##       ...      ...                    ...    ... ...        ...
##   [30229]        X [165737530, 165737735]      *   |  165737687
##   [30230]        X [165757688, 165758643]      *   |  165758296
##   [30231]        Y [   581657,    582367]      *   |     582221
##   [30232]        Y [   622356,    622742]      *   |     622484
##   [30233]        Y [   622882,    623459]      *   |     623091
##           fold_enrichment
##                 <numeric>
##       [1]          9.5239
##       [2]         7.45675
##       [3]        10.20078
##       [4]         4.00702
##       [5]        19.01092
##       ...             ...
##   [30229]         6.23072
##   [30230]        17.31472
##   [30231]        18.62359
##   [30232]        10.23618
##   [30233]        14.59007
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths
melRep1_GR_InRep1AndRep2 <- melRep1_GR_InRep1AndRep2[ order(melRep1_GR_InRep1AndRep2$fold_enrichment,decreasing=T)
                         ]
top500_melRep1_Rep1AndRep2 <- melRep1_GR_InRep1AndRep2[1:500,]
top500_melRep1_1And2_Resized <- resize(top500_melRep1_Rep1AndRep2,200,fix = "center")
top500_melRep1_1And2_Resized[1:3,]
## GRanges object with 3 ranges and 2 metadata columns:
##       seqnames                 ranges strand | abs_summit fold_enrichment
##          <Rle>              <IRanges>  <Rle> |  <integer>       <numeric>
##   [1]        4 [ 45966312,  45966511]      * |   45966486       123.11587
##   [2]        9 [ 21156572,  21156771]      * |   21157016       111.53541
##   [3]       12 [114345800, 114345999]      * |  114345880       104.74549
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths
genome <- BSgenome.Mmusculus.UCSC.mm9

seqlevelsStyle(top500_melRep1_1And2_Resized) <- "UCSC"

top500_melRep1_1And2_Resized_Seq <- getSeq(genome,top500_melRep1_1And2_Resized)
names(top500_melRep1_1And2_Resized_Seq) <- paste0("peak_",seqnames(top500_melRep1_1And2_Resized),"_",
                                         start(top500_melRep1_1And2_Resized),
                                         "-",
                                         end(top500_melRep1_1And2_Resized))

top500_melRep1_1And2_Resized_Seq[1:4,]
##   A DNAStringSet instance of length 4
##     width seq                                          names               
## [1]   200 AGACTCAGAATGAGAGGAGTC...GTGCTCAGAATCAGACCATG peak_chr4_4596631...
## [2]   200 CTTGCAGAGGACCTGGGTTCA...TTCTAAAGATGAGGCAGGAG peak_chr9_2115657...
## [3]   200 GCCTTTCTGAGCATGGGTCTG...TGCCTGACACAGCAGGTCCT peak_chr12_114345...
## [4]   200 GGTGGCTCTGGACATCCCTGG...AATAAAGCTTGCCTCCGCCT peak_chr3_8784724...

writeXStringSet(top500_melRep1_1And2_Resized_Seq,file="top500_melRep1_1And2.fa")

Once you have uploaded the data you can preview the results here

** Bonus question - Recentre top 500 peaks around their absolute summit, resize to 200, extract sequences and submit to Meme-ChIP. Compare results between peaksets.

Once you have uploaded the data you can preview the results here


start(top500_melRep1_Rep1AndRep2) <- end(top500_melRep1_Rep1AndRep2) <- top500_melRep1_Rep1AndRep2$abs_summit
top500_melRep1_1And2_Resized <- resize(top500_melRep1_Rep1AndRep2,200,fix = "center")


genome <- BSgenome.Mmusculus.UCSC.mm9

seqlevelsStyle(top500_melRep1_1And2_Resized) <- "UCSC"

top500_melRep1_1And2_Resized_Seq <- getSeq(genome,top500_melRep1_1And2_Resized)
names(top500_melRep1_1And2_Resized_Seq) <- paste0("peak_",seqnames(top500_melRep1_1And2_Resized),"_",
                                         start(top500_melRep1_1And2_Resized),
                                         "-",
                                         end(top500_melRep1_1And2_Resized))

writeXStringSet(top500_melRep1_1And2_Resized_Seq,file="abssummit_top500_melRep1_1And2.fa")