Solutions

1. Import human hg19 gene coordiantes from “hg19Genes.txt and create a GRanges of transcription start sites (1 bp range).

+ Tip: How to identify TSS for genes in forward/reverse strand?
library(GenomicRanges)
hg19Gene <- read.table("./data/hg19Genes.txt",sep="\t",header=T)

hg19Gene$TSS <- ifelse(hg19Gene$strand=="+",hg19Gene$start,hg19Gene$end)

hg19TSS <- GRanges(seqnames=hg19Gene$chr,
                       ranges=IRanges(start=hg19Gene$TSS,end=hg19Gene$TSS),
                       strand=hg19Gene$strand,
                       EnsemblID=hg19Gene$ensID,
                       Symbol=hg19Gene$GeneSym)
hg19TSS
## GRanges object with 57736 ranges and 2 metadata columns:
##           seqnames    ranges strand |       EnsemblID       Symbol
##              <Rle> <IRanges>  <Rle> |     <character>  <character>
##       [1]     chr1  50902978      - | ENSG00000271782 RP5-850O15.4
##       [2]     chr1 103817769      + | ENSG00000232753 RP11-347K2.1
##       [3]     chr1  50927141      + | ENSG00000225767 RP5-850O15.3
##       [4]     chr1  50965529      - | ENSG00000202140        Y_RNA
##       [5]     chr1  51048076      + | ENSG00000207194   RNU6-1026P
##       ...      ...       ...    ... .             ...          ...
##   [57732]     chrY   9175073      + | ENSG00000233803        TSPY4
##   [57733]     chrY   9195406      + | ENSG00000229549        TSPY8
##   [57734]     chrY   9236030      + | ENSG00000228927        TSPY3
##   [57735]     chrY   9236076      + | ENSG00000258992        TSPY1
##   [57736]     chrY  25365594      + | ENSG00000205944         DAZ2
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

2. Create a GRanges object of human promoters with TSS ± 2000bp.

+ Tip: Read the documentation for `promoters` function.
# First create GRanges object with gene coordinates
hg19Gene.GR <- GRanges(seqnames=hg19Gene$chr,
                       ranges=IRanges(start=hg19Gene$start,end=hg19Gene$end),
                       strand=hg19Gene$strand,
                       EnsemblID=hg19Gene$ensID,
                       Symbol=hg19Gene$GeneSym)

# Identifying promoters
hg19Promoters <- promoters(hg19Gene.GR,upstream=2000,downstream=2000)
hg19Promoters
## GRanges object with 57736 ranges and 2 metadata columns:
##           seqnames              ranges strand |       EnsemblID       Symbol
##              <Rle>           <IRanges>  <Rle> |     <character>  <character>
##       [1]     chr1   50900979-50904978      - | ENSG00000271782 RP5-850O15.4
##       [2]     chr1 103815769-103819768      + | ENSG00000232753 RP11-347K2.1
##       [3]     chr1   50925141-50929140      + | ENSG00000225767 RP5-850O15.3
##       [4]     chr1   50963530-50967529      - | ENSG00000202140        Y_RNA
##       [5]     chr1   51046076-51050075      + | ENSG00000207194   RNU6-1026P
##       ...      ...                 ...    ... .             ...          ...
##   [57732]     chrY     9173073-9177072      + | ENSG00000233803        TSPY4
##   [57733]     chrY     9193406-9197405      + | ENSG00000229549        TSPY8
##   [57734]     chrY     9234030-9238029      + | ENSG00000228927        TSPY3
##   [57735]     chrY     9234076-9238075      + | ENSG00000258992        TSPY1
##   [57736]     chrY   25363594-25367593      + | ENSG00000205944         DAZ2
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

3. Import ELF1 binding sites in K562 cell from Encode (ELF1_K562.bed) and create GRanges object.

+ Tips: Import the ELF1 binding sites using `import.bed()` function from `rtracklayer` package
+ Find ELF1 binding sites overlap with promoters (TSS ± 1kb) using `findOverlaps` and `subsetByOverlaps` (Remember BED format uses 0-based coordinates)
library("rtracklayer")
ELF1 <- read.table("./data/ELF1_K562.bed",sep="\t",header=F)
ELF1GR <- GRanges(seqnames=ELF1$V1, IRanges(start=ELF1$V2+1,end=ELF1$V3))
ELF1GR_A <- import.bed("./data/ELF1_K562.bed")


# ELF1 binding sites overlap with promoters using `findOverlaps`
hg19Promoters <- promoters(hg19Gene.GR,upstream=1000,downstream=1000)
ELF1GR <- reduce(ELF1GR) # merging overlapping peaks
ELF1overlap <- findOverlaps(ELF1GR,hg19Promoters,ignore.strand=T)
ELF1overlap.m <- as.matrix(ELF1overlap)
ELF1_promoters <- ELF1GR[ELF1overlap.m[,"queryHits"],]
ELF1_promoters
## GRanges object with 15104 ranges and 0 metadata columns:
##           seqnames              ranges strand
##              <Rle>           <IRanges>  <Rle>
##       [1]     chr6       692969-693264      *
##       [2]     chr6     1515690-1515853      *
##       [3]     chr6     2245299-2245574      *
##       [4]     chr6     2245299-2245574      *
##       [5]     chr6     2245913-2246076      *
##       ...      ...                 ...    ...
##   [15100]     chrX 153991664-153991939      *
##   [15101]     chrX 154255340-154255515      *
##   [15102]     chrX 154299054-154299187      *
##   [15103]     chrX 154299054-154299187      *
##   [15104]     chrX 155110770-155111144      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

# ELF1 binding sites overlap with promoters using `subsetByOverlaps`
ELF1_promoters1 <- subsetByOverlaps(ELF1GR,hg19Promoters)
ELF1_promoters1
## GRanges object with 11370 ranges and 0 metadata columns:
##           seqnames              ranges strand
##              <Rle>           <IRanges>  <Rle>
##       [1]     chr6       692969-693264      *
##       [2]     chr6     1515690-1515853      *
##       [3]     chr6     2245299-2245574      *
##       [4]     chr6     2245913-2246076      *
##       [5]     chr6     2634773-2635048      *
##       ...      ...                 ...    ...
##   [11366]     chrX 153991139-153991345      *
##   [11367]     chrX 153991664-153991939      *
##   [11368]     chrX 154255340-154255515      *
##   [11369]     chrX 154299054-154299187      *
##   [11370]     chrX 155110770-155111144      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

Note the differences in the outputs!