+ Tip: How to identify TSS for genes in forward/reverse strand?
library(GenomicRanges)
hg19Gene <- read.table("./data/hg19Genes.txt",sep="\t",header=T)
hg19Gene$TSS <- ifelse(hg19Gene$strand=="+",hg19Gene$start,hg19Gene$end)
hg19TSS <- GRanges(seqnames=hg19Gene$chr,
ranges=IRanges(start=hg19Gene$TSS,end=hg19Gene$TSS),
strand=hg19Gene$strand,
EnsemblID=hg19Gene$ensID,
Symbol=hg19Gene$GeneSym)
hg19TSS
## GRanges object with 57736 ranges and 2 metadata columns:
## seqnames ranges strand | EnsemblID Symbol
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr1 50902978 - | ENSG00000271782 RP5-850O15.4
## [2] chr1 103817769 + | ENSG00000232753 RP11-347K2.1
## [3] chr1 50927141 + | ENSG00000225767 RP5-850O15.3
## [4] chr1 50965529 - | ENSG00000202140 Y_RNA
## [5] chr1 51048076 + | ENSG00000207194 RNU6-1026P
## ... ... ... ... . ... ...
## [57732] chrY 9175073 + | ENSG00000233803 TSPY4
## [57733] chrY 9195406 + | ENSG00000229549 TSPY8
## [57734] chrY 9236030 + | ENSG00000228927 TSPY3
## [57735] chrY 9236076 + | ENSG00000258992 TSPY1
## [57736] chrY 25365594 + | ENSG00000205944 DAZ2
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
+ Tip: Read the documentation for `promoters` function.
# First create GRanges object with gene coordinates
hg19Gene.GR <- GRanges(seqnames=hg19Gene$chr,
ranges=IRanges(start=hg19Gene$start,end=hg19Gene$end),
strand=hg19Gene$strand,
EnsemblID=hg19Gene$ensID,
Symbol=hg19Gene$GeneSym)
# Identifying promoters
hg19Promoters <- promoters(hg19Gene.GR,upstream=2000,downstream=2000)
hg19Promoters
## GRanges object with 57736 ranges and 2 metadata columns:
## seqnames ranges strand | EnsemblID Symbol
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr1 50900979-50904978 - | ENSG00000271782 RP5-850O15.4
## [2] chr1 103815769-103819768 + | ENSG00000232753 RP11-347K2.1
## [3] chr1 50925141-50929140 + | ENSG00000225767 RP5-850O15.3
## [4] chr1 50963530-50967529 - | ENSG00000202140 Y_RNA
## [5] chr1 51046076-51050075 + | ENSG00000207194 RNU6-1026P
## ... ... ... ... . ... ...
## [57732] chrY 9173073-9177072 + | ENSG00000233803 TSPY4
## [57733] chrY 9193406-9197405 + | ENSG00000229549 TSPY8
## [57734] chrY 9234030-9238029 + | ENSG00000228927 TSPY3
## [57735] chrY 9234076-9238075 + | ENSG00000258992 TSPY1
## [57736] chrY 25363594-25367593 + | ENSG00000205944 DAZ2
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
+ Tips: Import the ELF1 binding sites using `import.bed()` function from `rtracklayer` package
+ Find ELF1 binding sites overlap with promoters (TSS ± 1kb) using `findOverlaps` and `subsetByOverlaps` (Remember BED format uses 0-based coordinates)
library("rtracklayer")
ELF1 <- read.table("./data/ELF1_K562.bed",sep="\t",header=F)
ELF1GR <- GRanges(seqnames=ELF1$V1, IRanges(start=ELF1$V2+1,end=ELF1$V3))
ELF1GR_A <- import.bed("./data/ELF1_K562.bed")
# ELF1 binding sites overlap with promoters using `findOverlaps`
hg19Promoters <- promoters(hg19Gene.GR,upstream=1000,downstream=1000)
ELF1GR <- reduce(ELF1GR) # merging overlapping peaks
ELF1overlap <- findOverlaps(ELF1GR,hg19Promoters,ignore.strand=T)
ELF1overlap.m <- as.matrix(ELF1overlap)
ELF1_promoters <- ELF1GR[ELF1overlap.m[,"queryHits"],]
ELF1_promoters
## GRanges object with 15104 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr6 692969-693264 *
## [2] chr6 1515690-1515853 *
## [3] chr6 2245299-2245574 *
## [4] chr6 2245299-2245574 *
## [5] chr6 2245913-2246076 *
## ... ... ... ...
## [15100] chrX 153991664-153991939 *
## [15101] chrX 154255340-154255515 *
## [15102] chrX 154299054-154299187 *
## [15103] chrX 154299054-154299187 *
## [15104] chrX 155110770-155111144 *
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
# ELF1 binding sites overlap with promoters using `subsetByOverlaps`
ELF1_promoters1 <- subsetByOverlaps(ELF1GR,hg19Promoters)
ELF1_promoters1
## GRanges object with 11370 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr6 692969-693264 *
## [2] chr6 1515690-1515853 *
## [3] chr6 2245299-2245574 *
## [4] chr6 2245913-2246076 *
## [5] chr6 2634773-2635048 *
## ... ... ... ...
## [11366] chrX 153991139-153991345 *
## [11367] chrX 153991664-153991939 *
## [11368] chrX 154255340-154255515 *
## [11369] chrX 154299054-154299187 *
## [11370] chrX 155110770-155111144 *
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
Note the differences in the outputs!