In this practical we will read in two replicate sets of peaks from the Myc Encode dataset,filter by fold enrichment over input and find the number of peaks common to both replicates and those unique to either.

# Load the GenomicRanges Library .. here is suppress messages for a cleaner document
suppressPackageStartupMessages(
  library(GenomicRanges)
  )

melPeak_Rep1 <- read.delim("data/MacsPeaks/mycmelrep1_peaks.xls",sep="\t",comment.char = "#")
melPeak_Rep2 <- read.delim("data/MacsPeaks/mycmelrep2_peaks.xls",sep="\t",comment.char = "#")

melRep1_GR <- GRanges(
                  seqnames=melPeak_Rep1[,"chr"],
                  IRanges(melPeak_Rep1[,"start"],
                  melPeak_Rep1[,"end"]
                  )
                )

mcols(melRep1_GR) <- melPeak_Rep1[,c("abs_summit", "fold_enrichment")]

melRep1_GR
## GRanges object with 52933 ranges and 2 metadata columns:
##           seqnames             ranges strand   | abs_summit
##              <Rle>          <IRanges>  <Rle>   |  <integer>
##       [1]        1 [3049670, 3049833]      *   |    3049799
##       [2]        1 [3435991, 3436154]      *   |    3436060
##       [3]        1 [4774935, 4775285]      *   |    4775250
##       [4]        1 [4775337, 4775959]      *   |    4775616
##       [5]        1 [4847544, 4847931]      *   |    4847795
##       ...      ...                ...    ... ...        ...
##   [52929]        Y [1913013, 1913289]      *   |    1913171
##   [52930]        Y [1934470, 1934640]      *   |    1934521
##   [52931]        Y [1964602, 1964765]      *   |    1964756
##   [52932]        Y [2555745, 2555908]      *   |    2555815
##   [52933]        Y [2890951, 2891338]      *   |    2891207
##           fold_enrichment
##                 <numeric>
##       [1]         4.61726
##       [2]         5.56316
##       [3]          4.1806
##       [4]          9.5239
##       [5]         7.45675
##       ...             ...
##   [52929]         6.92589
##   [52930]         4.61726
##   [52931]           3.652
##   [52932]         5.72846
##   [52933]         2.83564
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

melRep2_GR <- GRanges(
                  seqnames=melPeak_Rep2[,"chr"],
                  IRanges(melPeak_Rep2[,"start"],
                  melPeak_Rep2[,"end"]
                  )
                )

mcols(melRep2_GR) <- melPeak_Rep2[,c("abs_summit", "fold_enrichment")]

melRep2_GR
## GRanges object with 41443 ranges and 2 metadata columns:
##           seqnames             ranges strand   | abs_summit
##              <Rle>          <IRanges>  <Rle>   |  <integer>
##       [1]        1 [4506720, 4506910]      *   |    4506852
##       [2]        1 [4775356, 4776102]      *   |    4775578
##       [3]        1 [4797708, 4797962]      *   |    4797822
##       [4]        1 [4847097, 4848049]      *   |    4847802
##       [5]        1 [4848406, 4848603]      *   |    4848479
##       ...      ...                ...    ... ...        ...
##   [41439]        Y [ 581955,  582358]      *   |     582206
##   [41440]        Y [ 622263,  622682]      *   |     622529
##   [41441]        Y [ 622802,  623337]      *   |     623093
##   [41442]        Y [1903412, 1903639]      *   |    1903501
##   [41443]        Y [1905483, 1905649]      *   |    1905522
##           fold_enrichment
##                 <numeric>
##       [1]         5.79027
##       [2]        11.70799
##       [3]          3.7535
##       [4]         5.81891
##       [5]         4.18488
##       ...             ...
##   [41439]         8.34254
##   [41440]         4.57042
##   [41441]          5.8556
##   [41442]         6.03422
##   [41443]         5.12232
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

Now you have your peak sets as GRanges, you can start to work with the peaks in R.

# Number of peaks

length(melRep1_GR)
## [1] 52933

length(melRep2_GR)
## [1] 41443

# Number of peaks on chromosome 4

# Using table on logical vector

table(seqnames(melRep1_GR) %in% "4")
## 
## FALSE  TRUE 
## 49755  3178

table(seqnames(melRep2_GR) %in% "4")
## 
## FALSE  TRUE 
## 38944  2499

# Indexing and recounting

length(melRep1_GR[seqnames(melRep1_GR) %in% "4"])
## [1] 3178

length(melRep2_GR[seqnames(melRep2_GR) %in% "4"])
## [1] 2499


# Number of peaks on chromosome 4 and with 5 fold enrichment above input

# Using table on logical vector

table(seqnames(melRep1_GR) %in% "4" & melRep1_GR$fold_enrichment > 5)
## 
## FALSE  TRUE 
## 50761  2172

table(seqnames(melRep2_GR) %in% "4" & melRep2_GR$fold_enrichment > 5)
## 
## FALSE  TRUE 
## 40020  1423

# Indexing and recounting

length(melRep1_GR[seqnames(melRep1_GR) %in% "4" & melRep1_GR$fold_enrichment > 5])
## [1] 2172

length(melRep2_GR[seqnames(melRep2_GR) %in% "4" & melRep2_GR$fold_enrichment > 5])
## [1] 1423

Now we can use our GRanges to identify the peak in common or unique to replicates.


# Using table

table(melRep1_GR %over% melRep2_GR)
## 
## FALSE  TRUE 
## 22700 30233

# Using index and recounting
length(melRep1_GR[melRep1_GR %over% melRep2_GR])
## [1] 30233


# Using table

table(!melRep1_GR %over% melRep2_GR)
## 
## FALSE  TRUE 
## 30233 22700

# Using index and recounting
length(melRep1_GR[!melRep1_GR %over% melRep2_GR])
## [1] 22700

commonMelPeaks <- melRep1_GR[melRep1_GR %over% melRep2_GR]

For a bonus question.


# Using table

melRep1_GRSummits <- melRep1_GR

start(melRep1_GRSummits) <- end(melRep1_GRSummits) <- melRep1_GR$abs_summit

melRep1_GRSummits
## GRanges object with 52933 ranges and 2 metadata columns:
##           seqnames             ranges strand   | abs_summit
##              <Rle>          <IRanges>  <Rle>   |  <integer>
##       [1]        1 [3049799, 3049799]      *   |    3049799
##       [2]        1 [3436060, 3436060]      *   |    3436060
##       [3]        1 [4775250, 4775250]      *   |    4775250
##       [4]        1 [4775616, 4775616]      *   |    4775616
##       [5]        1 [4847795, 4847795]      *   |    4847795
##       ...      ...                ...    ... ...        ...
##   [52929]        Y [1913171, 1913171]      *   |    1913171
##   [52930]        Y [1934521, 1934521]      *   |    1934521
##   [52931]        Y [1964756, 1964756]      *   |    1964756
##   [52932]        Y [2555815, 2555815]      *   |    2555815
##   [52933]        Y [2891207, 2891207]      *   |    2891207
##           fold_enrichment
##                 <numeric>
##       [1]         4.61726
##       [2]         5.56316
##       [3]          4.1806
##       [4]          9.5239
##       [5]         7.45675
##       ...             ...
##   [52929]         6.92589
##   [52930]         4.61726
##   [52931]           3.652
##   [52932]         5.72846
##   [52933]         2.83564
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths