Solutions

1. Check what kind of annotation data can be extracted using org.Hs.eg.db package

library("org.Hs.eg.db")
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "MAP"         
## [16] "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"        
## [21] "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [26] "UNIPROT"

3. Retrieve gene name and Ensembl gene identifiers for “HEBP2” and “PRND” from org.Hs.eg.db

select(org.Hs.eg.db,keys=c("HEBP2","PRND"),keytype="SYMBOL",
       columns=c("GENENAME","ENSEMBL"))
##   SYMBOL                  GENENAME         ENSEMBL
## 1  HEBP2    heme binding protein 2 ENSG00000051620
## 2   PRND prion like protein doppel ENSG00000171864

4. How many annotation datasets available in Ensembl Biomart?

library("biomaRt")

ensembl <- useEnsembl(biomart = 'genes',dataset = 'hsapiens_gene_ensembl',version = 80)

ens_datasets <- listDatasets(ensembl) # list datasets 
head(ens_datasets)
##                       dataset                             description
## 1  acarolinensis_gene_ensembl   Anolis carolinensis genes (AnoCar2.0)
## 2   amelanoleuca_gene_ensembl  Ailuropoda melanoleuca genes (ailMel1)
## 3     amexicanus_gene_ensembl    Astyanax mexicanus genes (AstMex102)
## 4 aplatyrhynchos_gene_ensembl Anas platyrhynchos genes (BGI_duck_1.0)
## 5        btaurus_gene_ensembl               Bos taurus genes (UMD3.1)
## 6       celegans_gene_ensembl Caenorhabditis elegans genes (WBcel235)
##        version
## 1    AnoCar2.0
## 2      ailMel1
## 3    AstMex102
## 4 BGI_duck_1.0
## 5       UMD3.1
## 6     WBcel235
dim(ens_datasets)
## [1] 69  3

5. Retrieve genomic coordinates for human genes from Ensembl biomart and build a GRanges object.

  • Subset the above GRanges object to include only protein coding genes

  • Subset the GRanges object again with genes in main chromsomes (1-22,X,Y)

  • Create another GRanges object with genes in chr1:1544000-2371000

    • Tips:
    • You can select main chromosomes and “protein coding” genes by using appropriate filter and value.
    • Search for “biotype” in available filters using grep()
    • Run filterOptions("biotype",selectedmart) to see the accepted values for “biotype” filter
    • When multiple filters specified, “values” argument should be a list of vectors; each vector corresponds to each specified filter.
    • Annotation fields to retrieve: “chromosome_name”, “start_position”, “end_position”,“ensembl_gene_id”, “strand”, “external_gene_name”
    • Before creating GRanges object, add “chr” prefix to chromosome using paste function, Ex: change 1 to chr1 (required for next task)
ens_human <- useDataset("hsapiens_gene_ensembl",mart=ensembl) # select human dataset
ens_human_Attr <- listAttributes(ens_human) # list available annotation
ens_human_filters <- listFilters(ens_human) # list available filters
availFilters <- filterOptions("biotype",ens_human) # Displays accepted values for "biotype"

hg19Gene <- getBM(
          attributes = c("chromosome_name","start_position","end_position",
                         "ensembl_gene_id","strand","external_gene_name"), 
          filter=c("chromosome_name","biotype"),
          values=list(c(1:22,"X","Y"),"protein_coding"), mart=ens_human)
head(hg19Gene)
##   chromosome_name start_position end_position ensembl_gene_id strand
## 1              15       40358235     40378639 ENSG00000140323      1
## 2              13       49628299     49633872 ENSG00000152213      1
## 3              22       18527802     18530573 ENSG00000278558      1
## 4              20        1309909      1329239 ENSG00000125775     -1
## 5              13       57160632     57163653 ENSG00000227151      1
## 6              16        1790413      1794971 ENSG00000099769     -1
##   external_gene_name
## 1              DISP2
## 2              ARL11
## 3           TMEM191B
## 4             SDCBP2
## 5             PRR20D
## 6             IGFALS

Now create GRanges object using the above data frame.

library(GenomicRanges)

# add 'chr' prefix to chromosome name
hg19Gene$chromosome_name <- paste("chr",hg19Gene$chromosome_name,sep="")

hg19Gene.GR <- GRanges(seqnames=hg19Gene$chromosome_name,
                       ranges=IRanges(start=hg19Gene$start_position,end=hg19Gene$end_position),
                       strand=ifelse(hg19Gene$strand==1,"+","-"),
                       EnsemblID=hg19Gene$ensembl_gene_id,
                       Symbol=hg19Gene$external_gene_name)
hg19Gene.GR
## GRanges object with 19783 ranges and 2 metadata columns:
##           seqnames            ranges strand |       EnsemblID      Symbol
##              <Rle>         <IRanges>  <Rle> |     <character> <character>
##       [1]    chr15 40358235-40378639      + | ENSG00000140323       DISP2
##       [2]    chr13 49628299-49633872      + | ENSG00000152213       ARL11
##       [3]    chr22 18527802-18530573      + | ENSG00000278558    TMEM191B
##       [4]    chr20   1309909-1329239      - | ENSG00000125775      SDCBP2
##       [5]    chr13 57160632-57163653      + | ENSG00000227151      PRR20D
##       ...      ...               ...    ... .             ...         ...
##   [19779]     chr8 13083361-13515658      - | ENSG00000164741        DLC1
##   [19780]     chr9 34689567-34691277      - | ENSG00000172724       CCL19
##   [19781]    chr17   7646627-7657768      + | ENSG00000129244      ATP1B2
##   [19782]     chr4   9215405-9217356      + | ENSG00000233136    USP17L11
##   [19783]    chr14 21016763-21070872      - | ENSG00000165795       NDRG2
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

Filter the above GRanges object for genes in chr1:1544000-2371000

chr1genes <- hg19Gene.GR[seqnames(hg19Gene.GR)=="chr1" & 
                         start(hg19Gene.GR) > 1544000 & 
                           end(hg19Gene.GR) < 2371000]
head(chr1genes)
## GRanges object with 6 ranges and 2 metadata columns:
##       seqnames          ranges strand |       EnsemblID      Symbol
##          <Rle>       <IRanges>  <Rle> |     <character> <character>
##   [1]     chr1 1632095-1701810      + | ENSG00000189409      MMP23B
##   [2]     chr1 1579756-1580046      + | ENSG00000279244  AL645728.1
##   [3]     chr1 1615415-1630610      + | ENSG00000197530        MIB2
##   [4]     chr1 1659529-1692728      - | ENSG00000189339    SLC35E2B
##   [5]     chr1 1702730-1724324      - | ENSG00000008128      CDK11A
##   [6]     chr1 1921951-2003837      - | ENSG00000142609      CFAP74
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

# alternate solution
chr1genes <- subset(hg19Gene.GR,start>1544000 & end<2371000 & seqnames=="chr1")

6. Retrieve 200 bp upstream promoter sequences for the given gene symbols AQP1, ASNSP2, KPNA2, FRMD4A, NSUN5, VAC14 from Ensembl human biomart.

+ Tips: 
+ Read documentation for `getSequence`
+ Use `type="hgnc_symbol"` and `seqType="coding_gene_flank"`
symbols <- c("AQP1", "ASNSP2", "KPNA2", "FRMD4A", "NSUN5", "VAC14")
ensembl <- useEnsembl(biomart = 'genes', dataset = 'hsapiens_gene_ensembl',version = 80)

seq <- getSequence(id=symbols, type="hgnc_symbol",
                  seqType="coding_gene_flank", upstream=200, mart = ensembl)

7. Retrieve the transcript coordinates for genes as GRangesList from TxDb.Hsapiens.UCSC.hg19.knownGene (install it from Bioconductor if required)

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
hg19txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
TranscrtipsByGene <- transcriptsBy(hg19txdb,by="gene") # inspect the output


Alternate solution

library(GenomicFeatures)
hg19txdb <- makeTxDbFromUCSC(genome = "hg19", tablename = "knownGene")
TranscrtipsByGene <- transcriptsBy(hg19txdb,by="gene") # inspect the output

8. Retrive exon coordiantes for genes from TxDb.Hsapiens.UCSC.hg19.knownGene

ExonsByGene <- exonsBy(hg19txdb,by="gene") # inspect the output