In this practical we will read in two replicate sets of peaks from the Myc Encode dataset and extract sequences underneath subsets of peaks. We will write these sequences out to a FASTA file for us with Meme-ChIP.
library(GenomicRanges)
library(BSgenome)
library(BSgenome.Mmusculus.UCSC.mm9)
melPeak_Rep1 <- read.delim("data/MacsPeaks/mycmelrep1_peaks.xls",sep="\t",comment.char = "#")
melPeak_Rep2 <- read.delim("data/MacsPeaks/mycmelrep2_peaks.xls",sep="\t",comment.char = "#")
melRep1_GR <- GRanges(
seqnames=melPeak_Rep1[,"chr"],
IRanges(melPeak_Rep1[,"start"],
melPeak_Rep1[,"end"]
)
)
mcols(melRep1_GR) <- melPeak_Rep1[,c("abs_summit", "fold_enrichment")]
melRep2_GR <- GRanges(
seqnames=melPeak_Rep2[,"chr"],
IRanges(melPeak_Rep2[,"start"],
melPeak_Rep2[,"end"]
)
)
mcols(melRep2_GR) <- melPeak_Rep2[,c("abs_summit", "fold_enrichment")]
melRep1_GR_InRep1AndRep2 <- melRep1_GR[melRep1_GR %over% melRep2_GR]
melRep1_GR_InRep1AndRep2
## GRanges object with 30233 ranges and 2 metadata columns:
## seqnames ranges strand | abs_summit
## <Rle> <IRanges> <Rle> | <integer>
## [1] 1 [4775337, 4775959] * | 4775616
## [2] 1 [4847544, 4847931] * | 4847795
## [3] 1 [5073028, 5073344] * | 5073202
## [4] 1 [7078801, 7079170] * | 7078892
## [5] 1 [7387587, 7388483] * | 7387940
## ... ... ... ... ... ...
## [30229] X [165737530, 165737735] * | 165737687
## [30230] X [165757688, 165758643] * | 165758296
## [30231] Y [ 581657, 582367] * | 582221
## [30232] Y [ 622356, 622742] * | 622484
## [30233] Y [ 622882, 623459] * | 623091
## fold_enrichment
## <numeric>
## [1] 9.5239
## [2] 7.45675
## [3] 10.20078
## [4] 4.00702
## [5] 19.01092
## ... ...
## [30229] 6.23072
## [30230] 17.31472
## [30231] 18.62359
## [30232] 10.23618
## [30233] 14.59007
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
melRep1_GR_InRep1AndRep2 <- melRep1_GR_InRep1AndRep2[ order(melRep1_GR_InRep1AndRep2$fold_enrichment,decreasing=T)
]
top500_melRep1_Rep1AndRep2 <- melRep1_GR_InRep1AndRep2[1:500,]
top500_melRep1_1And2_Resized <- resize(top500_melRep1_Rep1AndRep2,200,fix = "center")
top500_melRep1_1And2_Resized[1:3,]
## GRanges object with 3 ranges and 2 metadata columns:
## seqnames ranges strand | abs_summit fold_enrichment
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## [1] 4 [ 45966312, 45966511] * | 45966486 123.11587
## [2] 9 [ 21156572, 21156771] * | 21157016 111.53541
## [3] 12 [114345800, 114345999] * | 114345880 104.74549
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
genome <- BSgenome.Mmusculus.UCSC.mm9
seqlevelsStyle(top500_melRep1_1And2_Resized) <- "UCSC"
top500_melRep1_1And2_Resized_Seq <- getSeq(genome,top500_melRep1_1And2_Resized)
names(top500_melRep1_1And2_Resized_Seq) <- paste0("peak_",seqnames(top500_melRep1_1And2_Resized),"_",
start(top500_melRep1_1And2_Resized),
"-",
end(top500_melRep1_1And2_Resized))
top500_melRep1_1And2_Resized_Seq[1:4,]
## A DNAStringSet instance of length 4
## width seq names
## [1] 200 AGACTCAGAATGAGAGGAGTC...GTGCTCAGAATCAGACCATG peak_chr4_4596631...
## [2] 200 CTTGCAGAGGACCTGGGTTCA...TTCTAAAGATGAGGCAGGAG peak_chr9_2115657...
## [3] 200 GCCTTTCTGAGCATGGGTCTG...TGCCTGACACAGCAGGTCCT peak_chr12_114345...
## [4] 200 GGTGGCTCTGGACATCCCTGG...AATAAAGCTTGCCTCCGCCT peak_chr3_8784724...
writeXStringSet(top500_melRep1_1And2_Resized_Seq,file="top500_melRep1_1And2.fa")
Once you have uploaded the data you can preview the results here
** Bonus question - Recentre top 500 peaks around their absolute summit, resize to 200, extract sequences and submit to Meme-ChIP. Compare results between peaksets.
Once you have uploaded the data you can preview the results here
start(top500_melRep1_Rep1AndRep2) <- end(top500_melRep1_Rep1AndRep2) <- top500_melRep1_Rep1AndRep2$abs_summit
top500_melRep1_1And2_Resized <- resize(top500_melRep1_Rep1AndRep2,200,fix = "center")
genome <- BSgenome.Mmusculus.UCSC.mm9
seqlevelsStyle(top500_melRep1_1And2_Resized) <- "UCSC"
top500_melRep1_1And2_Resized_Seq <- getSeq(genome,top500_melRep1_1And2_Resized)
names(top500_melRep1_1And2_Resized_Seq) <- paste0("peak_",seqnames(top500_melRep1_1And2_Resized),"_",
start(top500_melRep1_1And2_Resized),
"-",
end(top500_melRep1_1And2_Resized))
writeXStringSet(top500_melRep1_1And2_Resized_Seq,file="abssummit_top500_melRep1_1And2.fa")