org.Hs.eg.db package
library("org.Hs.eg.db")
# List the column types that can be retrieved from org.Hs.eg.db.
columns(x = org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
## [16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
## [21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [26] "UNIPROT"
org.Hs.eg.db
# Preview the first six keys of type "GENENAME" stored in org.Hs.eg.db.
head(
  x = keys(x = org.Hs.eg.db, keytype = "GENENAME"),
  n = 6L
)
## [1] "alpha-1-B glycoprotein" "alpha-2-macroglobulin"
## [3] "alpha-2-macroglobulin pseudogene 1" "N-acetyltransferase 1"
## [5] "N-acetyltransferase 2" "N-acetyltransferase pseudogene"
org.Hs.eg.db
# Look up the gene name and Ensembl gene ID for two gene symbols.
select(
  x = org.Hs.eg.db,
  keys = c("HEBP2", "PRND"),
  columns = c("GENENAME", "ENSEMBL"),
  keytype = "SYMBOL"
)
## SYMBOL GENENAME ENSEMBL
## 1 HEBP2 heme binding protein 2 ENSG00000051620
## 2 PRND prion like protein doppel ENSG00000171864
library("biomaRt")

# Connect to the Ensembl "genes" mart, release 80, with the human gene
# dataset pre-selected.
ensembl <- useEnsembl(
  biomart = "genes",
  dataset = "hsapiens_gene_ensembl",
  version = 80
)

# Table of the datasets this mart offers; preview its first rows.
ens_datasets <- listDatasets(mart = ensembl)
head(ens_datasets, n = 6L)
## dataset description
## 1 acarolinensis_gene_ensembl Anolis carolinensis genes (AnoCar2.0)
## 2 amelanoleuca_gene_ensembl Ailuropoda melanoleuca genes (ailMel1)
## 3 amexicanus_gene_ensembl Astyanax mexicanus genes (AstMex102)
## 4 aplatyrhynchos_gene_ensembl Anas platyrhynchos genes (BGI_duck_1.0)
## 5 btaurus_gene_ensembl Bos taurus genes (UMD3.1)
## 6 celegans_gene_ensembl Caenorhabditis elegans genes (WBcel235)
## version
## 1 AnoCar2.0
## 2 ailMel1
## 3 AstMex102
## 4 BGI_duck_1.0
## 5 UMD3.1
## 6 WBcel235
# Row count (one row per dataset) followed by column count of the table.
c(nrow(ens_datasets), ncol(ens_datasets))
## [1] 69 3
Subset the above GRanges object to include only protein coding genes
Subset the GRanges object again with genes in main chromosomes (1-22,X,Y)
Create another GRanges object with genes in chr1:1544000-2371000
grep()
Use filterOptions("biotype",selectedmart) to see the accepted values for the “biotype” filter
Use the paste function, Ex: change 1 to chr1 (required for next task)
ens_human <- useDataset("hsapiens_gene_ensembl",mart=ensembl) # select human dataset
# Catalogue what the human mart offers: retrievable attributes and the
# filters that can restrict a query.
ens_human_Attr <- listAttributes(ens_human)
ens_human_filters <- listFilters(ens_human)

# Accepted values for the "biotype" filter.
availFilters <- filterOptions("biotype", ens_human)

# Fetch coordinates, Ensembl ID, strand and symbol for protein-coding genes
# on chromosomes 1-22, X and Y.
# NOTE(review): the mart is opened with Ensembl release 80, which is
# presumably a GRCh38 annotation despite the `hg19` prefix in the object
# name -- confirm the assembly before mixing with hg19 resources.
hg19Gene <- getBM(
  attributes = c(
    "chromosome_name", "start_position", "end_position",
    "ensembl_gene_id", "strand", "external_gene_name"
  ),
  # Argument spelled out in full: `filter =` only worked through R's partial
  # argument matching of `filters`.
  filters = c("chromosome_name", "biotype"),
  # One list element per filter, in the same order as `filters`.
  values = list(c(1:22, "X", "Y"), "protein_coding"),
  mart = ens_human
)
head(hg19Gene)
## chromosome_name start_position end_position ensembl_gene_id strand
## 1 15 40358235 40378639 ENSG00000140323 1
## 2 13 49628299 49633872 ENSG00000152213 1
## 3 22 18527802 18530573 ENSG00000278558 1
## 4 20 1309909 1329239 ENSG00000125775 -1
## 5 13 57160632 57163653 ENSG00000227151 1
## 6 16 1790413 1794971 ENSG00000099769 -1
## external_gene_name
## 1 DISP2
## 2 ARL11
## 3 TMEM191B
## 4 SDCBP2
## 5 PRR20D
## 6 IGFALS
Now create a GRanges object using the above data frame.
library(GenomicRanges)

# Add the "chr" prefix to the chromosome names (1 -> chr1, X -> chrX).
# The optional "(chr)?" group makes the step idempotent: re-running this
# chunk leaves already-prefixed names unchanged instead of producing
# "chrchr1".
hg19Gene$chromosome_name <- sub("^(chr)?", "chr", hg19Gene$chromosome_name)

# Build the GRanges object: one range per gene, with the Ensembl ID and the
# gene symbol carried along as metadata columns.
hg19Gene.GR <- GRanges(
  seqnames = hg19Gene$chromosome_name,
  ranges = IRanges(
    start = hg19Gene$start_position,
    end = hg19Gene$end_position
  ),
  # The data frame codes the strand as 1 / -1; GRanges expects "+" / "-".
  strand = ifelse(hg19Gene$strand == 1, "+", "-"),
  EnsemblID = hg19Gene$ensembl_gene_id,
  Symbol = hg19Gene$external_gene_name
)
hg19Gene.GR
## GRanges object with 19783 ranges and 2 metadata columns:
## seqnames ranges strand | EnsemblID Symbol
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr15 40358235-40378639 + | ENSG00000140323 DISP2
## [2] chr13 49628299-49633872 + | ENSG00000152213 ARL11
## [3] chr22 18527802-18530573 + | ENSG00000278558 TMEM191B
## [4] chr20 1309909-1329239 - | ENSG00000125775 SDCBP2
## [5] chr13 57160632-57163653 + | ENSG00000227151 PRR20D
## ... ... ... ... . ... ...
## [19779] chr8 13083361-13515658 - | ENSG00000164741 DLC1
## [19780] chr9 34689567-34691277 - | ENSG00000172724 CCL19
## [19781] chr17 7646627-7657768 + | ENSG00000129244 ATP1B2
## [19782] chr4 9215405-9217356 + | ENSG00000233136 USP17L11
## [19783] chr14 21016763-21070872 - | ENSG00000165795 NDRG2
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
Filter the above GRanges object for genes in chr1:1544000-2371000
# Keep the genes on chr1 that start after 1544000 and end before 2371000
# (both bounds exclusive), then preview the first six.
chr1genes <- hg19Gene.GR[
  seqnames(hg19Gene.GR) == "chr1" &
    start(hg19Gene.GR) > 1544000 &
    end(hg19Gene.GR) < 2371000
]
head(chr1genes, n = 6L)
## GRanges object with 6 ranges and 2 metadata columns:
## seqnames ranges strand | EnsemblID Symbol
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr1 1632095-1701810 + | ENSG00000189409 MMP23B
## [2] chr1 1579756-1580046 + | ENSG00000279244 AL645728.1
## [3] chr1 1615415-1630610 + | ENSG00000197530 MIB2
## [4] chr1 1659529-1692728 - | ENSG00000189339 SLC35E2B
## [5] chr1 1702730-1724324 - | ENSG00000008128 CDK11A
## [6] chr1 1921951-2003837 - | ENSG00000142609 CFAP74
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
# Alternate solution: the same three conditions expressed through subset(),
# which evaluates start, end and seqnames against the GRanges object.
chr1genes <- subset(
  hg19Gene.GR,
  start > 1544000 & end < 2371000 & seqnames == "chr1"
)
Retrieve the flanking sequences of the genes AQP1, ASNSP2, KPNA2, FRMD4A, NSUN5, VAC14
from Ensembl human biomart.

+ Tips:
    + Read documentation for `getSequence`
    + Use `type="hgnc_symbol"` and `seqType="coding_gene_flank"`
# Gene symbols whose flanking sequences are requested.
symbols <- c("AQP1", "ASNSP2", "KPNA2", "FRMD4A", "NSUN5", "VAC14")

# Ensembl "genes" mart, release 80, human gene dataset.
ensembl <- useEnsembl(
  biomart = "genes",
  dataset = "hsapiens_gene_ensembl",
  version = 80
)

# Request the "coding_gene_flank" sequence type with upstream = 200,
# identifying the genes by HGNC symbol.
seq <- getSequence(
  id = symbols,
  type = "hgnc_symbol",
  seqType = "coding_gene_flank",
  upstream = 200,
  mart = ensembl
)
TxDb.Hsapiens.UCSC.hg19.knownGene (install it from Bioconductor if required)
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")

# Shorter alias for the TxDb object provided by the package.
hg19txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

# Transcripts grouped by gene; inspect the output.
TranscrtipsByGene <- transcriptsBy(x = hg19txdb, by = "gene")
Alternate solution
library(GenomicFeatures)

# Build the TxDb from the UCSC "knownGene" table for genome "hg19" rather
# than taking it from the annotation package.
hg19txdb <- makeTxDbFromUCSC(
  genome = "hg19",
  tablename = "knownGene"
)

# Transcripts grouped by gene; inspect the output.
TranscrtipsByGene <- transcriptsBy(x = hg19txdb, by = "gene")
TxDb.Hsapiens.UCSC.hg19.knownGene
# Exons grouped by gene; inspect the output.
ExonsByGene <- exonsBy(x = hg19txdb, by = "gene")