In this practical we will read in two replicate sets of peaks from the Myc Encode dataset,filter by fold enrichment over input and find the number of peaks common to both replicates and those unique to either.
# Load the GenomicRanges Library .. here is suppress messages for a cleaner document
suppressPackageStartupMessages(
library(GenomicRanges)
)
melPeak_Rep1 <- read.delim("data/MacsPeaks/mycmelrep1_peaks.xls",sep="\t",comment.char = "#")
melPeak_Rep2 <- read.delim("data/MacsPeaks/mycmelrep2_peaks.xls",sep="\t",comment.char = "#")
melRep1_GR <- GRanges(
seqnames=melPeak_Rep1[,"chr"],
IRanges(melPeak_Rep1[,"start"],
melPeak_Rep1[,"end"]
)
)
mcols(melRep1_GR) <- melPeak_Rep1[,c("abs_summit", "fold_enrichment")]
melRep1_GR
## GRanges object with 52933 ranges and 2 metadata columns:
## seqnames ranges strand | abs_summit
## <Rle> <IRanges> <Rle> | <integer>
## [1] 1 [3049670, 3049833] * | 3049799
## [2] 1 [3435991, 3436154] * | 3436060
## [3] 1 [4774935, 4775285] * | 4775250
## [4] 1 [4775337, 4775959] * | 4775616
## [5] 1 [4847544, 4847931] * | 4847795
## ... ... ... ... ... ...
## [52929] Y [1913013, 1913289] * | 1913171
## [52930] Y [1934470, 1934640] * | 1934521
## [52931] Y [1964602, 1964765] * | 1964756
## [52932] Y [2555745, 2555908] * | 2555815
## [52933] Y [2890951, 2891338] * | 2891207
## fold_enrichment
## <numeric>
## [1] 4.61726
## [2] 5.56316
## [3] 4.1806
## [4] 9.5239
## [5] 7.45675
## ... ...
## [52929] 6.92589
## [52930] 4.61726
## [52931] 3.652
## [52932] 5.72846
## [52933] 2.83564
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
melRep2_GR <- GRanges(
seqnames=melPeak_Rep2[,"chr"],
IRanges(melPeak_Rep2[,"start"],
melPeak_Rep2[,"end"]
)
)
mcols(melRep2_GR) <- melPeak_Rep2[,c("abs_summit", "fold_enrichment")]
melRep2_GR
## GRanges object with 41443 ranges and 2 metadata columns:
## seqnames ranges strand | abs_summit
## <Rle> <IRanges> <Rle> | <integer>
## [1] 1 [4506720, 4506910] * | 4506852
## [2] 1 [4775356, 4776102] * | 4775578
## [3] 1 [4797708, 4797962] * | 4797822
## [4] 1 [4847097, 4848049] * | 4847802
## [5] 1 [4848406, 4848603] * | 4848479
## ... ... ... ... ... ...
## [41439] Y [ 581955, 582358] * | 582206
## [41440] Y [ 622263, 622682] * | 622529
## [41441] Y [ 622802, 623337] * | 623093
## [41442] Y [1903412, 1903639] * | 1903501
## [41443] Y [1905483, 1905649] * | 1905522
## fold_enrichment
## <numeric>
## [1] 5.79027
## [2] 11.70799
## [3] 3.7535
## [4] 5.81891
## [5] 4.18488
## ... ...
## [41439] 8.34254
## [41440] 4.57042
## [41441] 5.8556
## [41442] 6.03422
## [41443] 5.12232
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths
Now you have your peak sets as GRanges, you can start to work with the peaks in R.
Find the number of peaks in each replicate.
Find the number of peaks in each replicate on chromosome 4.
Find the number of peaks on chromosome 4 which have a 5 fold enrichment above input.
# Number of peaks
length(melRep1_GR)
## [1] 52933
length(melRep2_GR)
## [1] 41443
# Number of peaks on chromosome 4
# Using table on logical vector
table(seqnames(melRep1_GR) %in% "4")
##
## FALSE TRUE
## 49755 3178
table(seqnames(melRep2_GR) %in% "4")
##
## FALSE TRUE
## 38944 2499
# Indexing and recounting
length(melRep1_GR[seqnames(melRep1_GR) %in% "4"])
## [1] 3178
length(melRep2_GR[seqnames(melRep2_GR) %in% "4"])
## [1] 2499
# Number of peaks on chromosome 4 and with 5 fold enrichment above input
# Using table on logical vector
table(seqnames(melRep1_GR) %in% "4" & melRep1_GR$fold_enrichment > 5)
##
## FALSE TRUE
## 50761 2172
table(seqnames(melRep2_GR) %in% "4" & melRep2_GR$fold_enrichment > 5)
##
## FALSE TRUE
## 40020 1423
# Indexing and recounting
length(melRep1_GR[seqnames(melRep1_GR) %in% "4" & melRep1_GR$fold_enrichment > 5])
## [1] 2172
length(melRep2_GR[seqnames(melRep2_GR) %in% "4" & melRep2_GR$fold_enrichment > 5])
## [1] 1423
Now we can use our GRanges to identify the peak in common or unique to replicates.
# Using table
table(melRep1_GR %over% melRep2_GR)
##
## FALSE TRUE
## 22700 30233
# Using index and recounting
length(melRep1_GR[melRep1_GR %over% melRep2_GR])
## [1] 30233
# Using table
table(!melRep1_GR %over% melRep2_GR)
##
## FALSE TRUE
## 30233 22700
# Using index and recounting
length(melRep1_GR[!melRep1_GR %over% melRep2_GR])
## [1] 22700
commonMelPeaks <- melRep1_GR[melRep1_GR %over% melRep2_GR]
For a bonus question.
# Using table
melRep1_GRSummits <- melRep1_GR
start(melRep1_GRSummits) <- end(melRep1_GRSummits) <- melRep1_GR$abs_summit
melRep1_GRSummits
## GRanges object with 52933 ranges and 2 metadata columns:
## seqnames ranges strand | abs_summit
## <Rle> <IRanges> <Rle> | <integer>
## [1] 1 [3049799, 3049799] * | 3049799
## [2] 1 [3436060, 3436060] * | 3436060
## [3] 1 [4775250, 4775250] * | 4775250
## [4] 1 [4775616, 4775616] * | 4775616
## [5] 1 [4847795, 4847795] * | 4847795
## ... ... ... ... ... ...
## [52929] Y [1913171, 1913171] * | 1913171
## [52930] Y [1934521, 1934521] * | 1934521
## [52931] Y [1964756, 1964756] * | 1964756
## [52932] Y [2555815, 2555815] * | 2555815
## [52933] Y [2891207, 2891207] * | 2891207
## fold_enrichment
## <numeric>
## [1] 4.61726
## [2] 5.56316
## [3] 4.1806
## [4] 9.5239
## [5] 7.45675
## ... ...
## [52929] 6.92589
## [52930] 4.61726
## [52931] 3.652
## [52932] 5.72846
## [52933] 2.83564
## -------
## seqinfo: 22 sequences from an unspecified genome; no seqlengths