ChIP-seq Working With Peaks

In this practical we will read in two replicate sets of peaks from the Myc Encode dataset,filter by fold enrichment over input and find the number of peaks common to both replicates and those unique to either.

First read in each of the Mel replicate peak sets found in MacsPeaks directory and create a GRanges object containing peak locations for each.

# Load the GenomicRanges Library .. here is suppress messages for a cleaner document
suppressPackageStartupMessages(
  library(GenomicRanges)
  )

melPeak_Rep1 <- read.delim("data/MacsPeaks/mycmelrep1_peaks.xls",sep="\t",comment.char = "#")
melPeak_Rep2 <- read.delim("data/MacsPeaks/mycmelrep2_peaks.xls",sep="\t",comment.char = "#")

melRep1_GR <- GRanges(
                  seqnames=melPeak_Rep1[,"chr"],
                  IRanges(melPeak_Rep1[,"start"],
                  melPeak_Rep1[,"end"]
                  )
                )

mcols(melRep1_GR) <- melPeak_Rep1[,c("abs_summit", "fold_enrichment")]

melRep1_GR
## GRanges object with 52933 ranges and 2 metadata columns:
##           seqnames             ranges strand   | abs_summit
##              <Rle>          <IRanges>  <Rle>   |  <integer>
##       [1]        1 [3049670, 3049833]      *   |    3049799
##       [2]        1 [3435991, 3436154]      *   |    3436060
##       [3]        1 [4774935, 4775285]      *   |    4775250
##       [4]        1 [4775337, 4775959]      *   |    4775616
##       [5]        1 [4847544, 4847931]      *   |    4847795
##       ...      ...                ...    ... ...        ...
##   [52929]        Y [1913013, 1913289]      *   |    1913171
##   [52930]        Y [1934470, 1934640]      *   |    1934521
##   [52931]        Y [1964602, 1964765]      *   |    1964756
##   [52932]        Y [2555745, 2555908]      *   |    2555815
##   [52933]        Y [2890951, 2891338]      *   |    2891207
##           fold_enrichment
##                 <numeric>
##       [1]         4.61726
##       [2]         5.56316
##       [3]          4.1806
##       [4]          9.5239
##       [5]         7.45675
##       ...             ...
##   [52929]         6.92589
##   [52930]         4.61726
##   [52931]           3.652
##   [52932]         5.72846
##   [52933]         2.83564
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

melRep2_GR <- GRanges(
                  seqnames=melPeak_Rep2[,"chr"],
                  IRanges(melPeak_Rep2[,"start"],
                  melPeak_Rep2[,"end"]
                  )
                )

mcols(melRep2_GR) <- melPeak_Rep2[,c("abs_summit", "fold_enrichment")]

melRep2_GR
## GRanges object with 41443 ranges and 2 metadata columns:
##           seqnames             ranges strand   | abs_summit
##              <Rle>          <IRanges>  <Rle>   |  <integer>
##       [1]        1 [4506720, 4506910]      *   |    4506852
##       [2]        1 [4775356, 4776102]      *   |    4775578
##       [3]        1 [4797708, 4797962]      *   |    4797822
##       [4]        1 [4847097, 4848049]      *   |    4847802
##       [5]        1 [4848406, 4848603]      *   |    4848479
##       ...      ...                ...    ... ...        ...
##   [41439]        Y [ 581955,  582358]      *   |     582206
##   [41440]        Y [ 622263,  622682]      *   |     622529
##   [41441]        Y [ 622802,  623337]      *   |     623093
##   [41442]        Y [1903412, 1903639]      *   |    1903501
##   [41443]        Y [1905483, 1905649]      *   |    1905522
##           fold_enrichment
##                 <numeric>
##       [1]         5.79027
##       [2]        11.70799
##       [3]          3.7535
##       [4]         5.81891
##       [5]         4.18488
##       ...             ...
##   [41439]         8.34254
##   [41440]         4.57042
##   [41441]          5.8556
##   [41442]         6.03422
##   [41443]         5.12232
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

Now you have your peak sets as GRanges, you can start to work with the peaks in R.

Find the number of peaks in each replicate.
Find the number of peaks in each replicate on chromosome 4.
Find the number of peaks on chromosome 4 which have a 5 fold enrichment above input.

# Number of peaks

length(melRep1_GR)
## [1] 52933

length(melRep2_GR)
## [1] 41443

# Number of peaks on chromosome 4

# Using table on logical vector

table(seqnames(melRep1_GR) %in% "4")
## 
## FALSE  TRUE 
## 49755  3178

table(seqnames(melRep2_GR) %in% "4")
## 
## FALSE  TRUE 
## 38944  2499

# Indexing and recounting

length(melRep1_GR[seqnames(melRep1_GR) %in% "4"])
## [1] 3178

length(melRep2_GR[seqnames(melRep2_GR) %in% "4"])
## [1] 2499


# Number of peaks on chromosome 4 and with 5 fold enrichment above input

# Using table on logical vector

table(seqnames(melRep1_GR) %in% "4" & melRep1_GR$fold_enrichment > 5)
## 
## FALSE  TRUE 
## 50761  2172

table(seqnames(melRep2_GR) %in% "4" & melRep2_GR$fold_enrichment > 5)
## 
## FALSE  TRUE 
## 40020  1423

# Indexing and recounting

length(melRep1_GR[seqnames(melRep1_GR) %in% "4" & melRep1_GR$fold_enrichment > 5])
## [1] 2172

length(melRep2_GR[seqnames(melRep2_GR) %in% "4" & melRep2_GR$fold_enrichment > 5])
## [1] 1423

Now we can use our GRanges to identify the peak in common or unique to replicates.

Find the number of replicate 1 peaks common between replicates.
Find the number of unique peaks in replicate 1


# Using table

table(melRep1_GR %over% melRep2_GR)
## 
## FALSE  TRUE 
## 22700 30233

# Using index and recounting
length(melRep1_GR[melRep1_GR %over% melRep2_GR])
## [1] 30233


# Using table

table(!melRep1_GR %over% melRep2_GR)
## 
## FALSE  TRUE 
## 30233 22700

# Using index and recounting
length(melRep1_GR[!melRep1_GR %over% melRep2_GR])
## [1] 22700

commonMelPeaks <- melRep1_GR[melRep1_GR %over% melRep2_GR]

For a bonus question.

Create a new GRanges of the summit locations from Mel Myc Peaks replicate 1.


# Using table

melRep1_GRSummits <- melRep1_GR

start(melRep1_GRSummits) <- end(melRep1_GRSummits) <- melRep1_GR$abs_summit

melRep1_GRSummits
## GRanges object with 52933 ranges and 2 metadata columns:
##           seqnames             ranges strand   | abs_summit
##              <Rle>          <IRanges>  <Rle>   |  <integer>
##       [1]        1 [3049799, 3049799]      *   |    3049799
##       [2]        1 [3436060, 3436060]      *   |    3436060
##       [3]        1 [4775250, 4775250]      *   |    4775250
##       [4]        1 [4775616, 4775616]      *   |    4775616
##       [5]        1 [4847795, 4847795]      *   |    4847795
##       ...      ...                ...    ... ...        ...
##   [52929]        Y [1913171, 1913171]      *   |    1913171
##   [52930]        Y [1934521, 1934521]      *   |    1934521
##   [52931]        Y [1964756, 1964756]      *   |    1964756
##   [52932]        Y [2555815, 2555815]      *   |    2555815
##   [52933]        Y [2891207, 2891207]      *   |    2891207
##           fold_enrichment
##                 <numeric>
##       [1]         4.61726
##       [2]         5.56316
##       [3]          4.1806
##       [4]          9.5239
##       [5]         7.45675
##       ...             ...
##   [52929]         6.92589
##   [52930]         4.61726
##   [52931]           3.652
##   [52932]         5.72846
##   [52933]         2.83564
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

ChIP-seq Working With Peaks - practical

MRC LMS Bioinformatics Core

26 September 2019