Solutions

1. Check what kind of annotation data can be extracted using `org.Hs.eg.db` package

library("org.Hs.eg.db")
columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "MAP"         
## [16] "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"        
## [21] "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [26] "UNIPROT"

2. Print few gene names from `org.Hs.eg.db`

head(keys(org.Hs.eg.db,keytype="GENENAME"))
## [1] "alpha-1-B glycoprotein"             "alpha-2-macroglobulin"             
## [3] "alpha-2-macroglobulin pseudogene 1" "N-acetyltransferase 1"             
## [5] "N-acetyltransferase 2"              "N-acetyltransferase pseudogene"

3. Retrieve gene name and Ensembl gene identifiers for “HEBP2” and “PRND” from `org.Hs.eg.db`

select(org.Hs.eg.db,keys=c("HEBP2","PRND"),keytype="SYMBOL",
       columns=c("GENENAME","ENSEMBL"))
##   SYMBOL                  GENENAME         ENSEMBL
## 1  HEBP2    heme binding protein 2 ENSG00000051620
## 2   PRND prion like protein doppel ENSG00000171864

4. How many annotation datasets available in Ensembl Biomart?

library("biomaRt")

ensembl <- useEnsembl(biomart = 'genes',dataset = 'hsapiens_gene_ensembl',version = 80)

ens_datasets <- listDatasets(ensembl) # list datasets 
head(ens_datasets)
##                       dataset                             description
## 1  acarolinensis_gene_ensembl   Anolis carolinensis genes (AnoCar2.0)
## 2   amelanoleuca_gene_ensembl  Ailuropoda melanoleuca genes (ailMel1)
## 3     amexicanus_gene_ensembl    Astyanax mexicanus genes (AstMex102)
## 4 aplatyrhynchos_gene_ensembl Anas platyrhynchos genes (BGI_duck_1.0)
## 5        btaurus_gene_ensembl               Bos taurus genes (UMD3.1)
## 6       celegans_gene_ensembl Caenorhabditis elegans genes (WBcel235)
##        version
## 1    AnoCar2.0
## 2      ailMel1
## 3    AstMex102
## 4 BGI_duck_1.0
## 5       UMD3.1
## 6     WBcel235
dim(ens_datasets)
## [1] 69  3

5. Retrieve genomic coordinates for human genes from Ensembl biomart and build a GRanges object.

Subset the above GRanges object to include only protein coding genes
Subset the GRanges object again with genes in main chromsomes (1-22,X,Y)
Create another GRanges object with genes in chr1:1544000-2371000
- Tips:
- You can select main chromosomes and “protein coding” genes by using appropriate filter and value.
- Search for “biotype” in available filters using grep()
- Run filterOptions("biotype",selectedmart) to see the accepted values for “biotype” filter
- When multiple filters specified, “values” argument should be a list of vectors; each vector corresponds to each specified filter.
- Annotation fields to retrieve: “chromosome_name”, “start_position”, “end_position”,“ensembl_gene_id”, “strand”, “external_gene_name”
- Before creating GRanges object, add “chr” prefix to chromosome using paste function, Ex: change 1 to chr1 (required for next task)

ens_human <- useDataset("hsapiens_gene_ensembl",mart=ensembl) # select human dataset
ens_human_Attr <- listAttributes(ens_human) # list available annotation
ens_human_filters <- listFilters(ens_human) # list available filters
availFilters <- filterOptions("biotype",ens_human) # Displays accepted values for "biotype"

hg19Gene <- getBM(
          attributes = c("chromosome_name","start_position","end_position",
                         "ensembl_gene_id","strand","external_gene_name"), 
          filter=c("chromosome_name","biotype"),
          values=list(c(1:22,"X","Y"),"protein_coding"), mart=ens_human)
head(hg19Gene)
##   chromosome_name start_position end_position ensembl_gene_id strand
## 1              15       40358235     40378639 ENSG00000140323      1
## 2              13       49628299     49633872 ENSG00000152213      1
## 3              22       18527802     18530573 ENSG00000278558      1
## 4              20        1309909      1329239 ENSG00000125775     -1
## 5              13       57160632     57163653 ENSG00000227151      1
## 6              16        1790413      1794971 ENSG00000099769     -1
##   external_gene_name
## 1              DISP2
## 2              ARL11
## 3           TMEM191B
## 4             SDCBP2
## 5             PRR20D
## 6             IGFALS

Now create GRanges object using the above data frame.

library(GenomicRanges)

# add 'chr' prefix to chromosome name
hg19Gene$chromosome_name <- paste("chr",hg19Gene$chromosome_name,sep="")

hg19Gene.GR <- GRanges(seqnames=hg19Gene$chromosome_name,
                       ranges=IRanges(start=hg19Gene$start_position,end=hg19Gene$end_position),
                       strand=ifelse(hg19Gene$strand==1,"+","-"),
                       EnsemblID=hg19Gene$ensembl_gene_id,
                       Symbol=hg19Gene$external_gene_name)
hg19Gene.GR
## GRanges object with 19783 ranges and 2 metadata columns:
##           seqnames            ranges strand |       EnsemblID      Symbol
##              <Rle>         <IRanges>  <Rle> |     <character> <character>
##       [1]    chr15 40358235-40378639      + | ENSG00000140323       DISP2
##       [2]    chr13 49628299-49633872      + | ENSG00000152213       ARL11
##       [3]    chr22 18527802-18530573      + | ENSG00000278558    TMEM191B
##       [4]    chr20   1309909-1329239      - | ENSG00000125775      SDCBP2
##       [5]    chr13 57160632-57163653      + | ENSG00000227151      PRR20D
##       ...      ...               ...    ... .             ...         ...
##   [19779]     chr8 13083361-13515658      - | ENSG00000164741        DLC1
##   [19780]     chr9 34689567-34691277      - | ENSG00000172724       CCL19
##   [19781]    chr17   7646627-7657768      + | ENSG00000129244      ATP1B2
##   [19782]     chr4   9215405-9217356      + | ENSG00000233136    USP17L11
##   [19783]    chr14 21016763-21070872      - | ENSG00000165795       NDRG2
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

Filter the above GRanges object for genes in chr1:1544000-2371000

chr1genes <- hg19Gene.GR[seqnames(hg19Gene.GR)=="chr1" & 
                         start(hg19Gene.GR) > 1544000 & 
                           end(hg19Gene.GR) < 2371000]
head(chr1genes)
## GRanges object with 6 ranges and 2 metadata columns:
##       seqnames          ranges strand |       EnsemblID      Symbol
##          <Rle>       <IRanges>  <Rle> |     <character> <character>
##   [1]     chr1 1632095-1701810      + | ENSG00000189409      MMP23B
##   [2]     chr1 1579756-1580046      + | ENSG00000279244  AL645728.1
##   [3]     chr1 1615415-1630610      + | ENSG00000197530        MIB2
##   [4]     chr1 1659529-1692728      - | ENSG00000189339    SLC35E2B
##   [5]     chr1 1702730-1724324      - | ENSG00000008128      CDK11A
##   [6]     chr1 1921951-2003837      - | ENSG00000142609      CFAP74
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

# alternate solution
chr1genes <- subset(hg19Gene.GR,start>1544000 & end<2371000 & seqnames=="chr1")

6. Retrieve 200 bp upstream promoter sequences for the given gene symbols `AQP1, ASNSP2, KPNA2, FRMD4A, NSUN5, VAC14` from Ensembl human biomart.

+ Tips: 
+ Read documentation for `getSequence`
+ Use `type="hgnc_symbol"` and `seqType="coding_gene_flank"`

symbols <- c("AQP1", "ASNSP2", "KPNA2", "FRMD4A", "NSUN5", "VAC14")
ensembl <- useEnsembl(biomart = 'genes', dataset = 'hsapiens_gene_ensembl',version = 80)

seq <- getSequence(id=symbols, type="hgnc_symbol",
                  seqType="coding_gene_flank", upstream=200, mart = ensembl)

7. Retrieve the transcript coordinates for genes as GRangesList from `TxDb.Hsapiens.UCSC.hg19.knownGene` (install it from Bioconductor if required)

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
hg19txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
TranscrtipsByGene <- transcriptsBy(hg19txdb,by="gene") # inspect the output

Alternate solution

library(GenomicFeatures)
hg19txdb <- makeTxDbFromUCSC(genome = "hg19", tablename = "knownGene")
TranscrtipsByGene <- transcriptsBy(hg19txdb,by="gene") # inspect the output

8. Retrive exon coordiantes for genes from `TxDb.Hsapiens.UCSC.hg19.knownGene`

ExonsByGene <- exonsBy(hg19txdb,by="gene") # inspect the output

Bioconductor Exercises Part 3

MRC LMS Bioinformatics Core

21 April 2022

Solutions

1. Check what kind of annotation data can be extracted using `org.Hs.eg.db` package

2. Print few gene names from `org.Hs.eg.db`

3. Retrieve gene name and Ensembl gene identifiers for “HEBP2” and “PRND” from `org.Hs.eg.db`

4. How many annotation datasets available in Ensembl Biomart?

5. Retrieve genomic coordinates for human genes from Ensembl biomart and build a GRanges object.

6. Retrieve 200 bp upstream promoter sequences for the given gene symbols `AQP1, ASNSP2, KPNA2, FRMD4A, NSUN5, VAC14` from Ensembl human biomart.

7. Retrieve the transcript coordinates for genes as GRangesList from `TxDb.Hsapiens.UCSC.hg19.knownGene` (install it from Bioconductor if required)

8. Retrive exon coordiantes for genes from `TxDb.Hsapiens.UCSC.hg19.knownGene`

Bioconductor Exercises Part 3

MRC LMS Bioinformatics Core

21 April 2022

Solutions

1. Check what kind of annotation data can be extracted using org.Hs.eg.db package

2. Print few gene names from org.Hs.eg.db

3. Retrieve gene name and Ensembl gene identifiers for “HEBP2” and “PRND” from org.Hs.eg.db

4. How many annotation datasets available in Ensembl Biomart?

5. Retrieve genomic coordinates for human genes from Ensembl biomart and build a GRanges object.

6. Retrieve 200 bp upstream promoter sequences for the given gene symbols AQP1, ASNSP2, KPNA2, FRMD4A, NSUN5, VAC14 from Ensembl human biomart.

7. Retrieve the transcript coordinates for genes as GRangesList from TxDb.Hsapiens.UCSC.hg19.knownGene (install it from Bioconductor if required)

8. Retrive exon coordiantes for genes from TxDb.Hsapiens.UCSC.hg19.knownGene

1. Check what kind of annotation data can be extracted using `org.Hs.eg.db` package

2. Print few gene names from `org.Hs.eg.db`

3. Retrieve gene name and Ensembl gene identifiers for “HEBP2” and “PRND” from `org.Hs.eg.db`

6. Retrieve 200 bp upstream promoter sequences for the given gene symbols `AQP1, ASNSP2, KPNA2, FRMD4A, NSUN5, VAC14` from Ensembl human biomart.

7. Retrieve the transcript coordinates for genes as GRangesList from `TxDb.Hsapiens.UCSC.hg19.knownGene` (install it from Bioconductor if required)

8. Retrive exon coordiantes for genes from `TxDb.Hsapiens.UCSC.hg19.knownGene`