## ----style,results='asis', echo=FALSE, eval=TRUE-------------------------
BiocStyle::latex()

## ----warning=FALSE, message=FALSE----------------------------------------
library(EnsDb.Hsapiens.v75)

## Print some information about this annotation package.
EnsDb.Hsapiens.v75

## For which organism was the database generated?
organism(EnsDb.Hsapiens.v75)

## ------------------------------------------------------------------------
## Retrieve the transcripts matching the gene name "BCL2L11".
Tx <- transcripts(
  EnsDb.Hsapiens.v75,
  filter = list(GenenameFilter("BCL2L11"))
)
Tx

## As this is a GRanges object, we can access e.g. the start coordinates with
head(start(Tx))

## or extract the biotype with
head(Tx$tx_biotype)

## ------------------------------------------------------------------------
## List all database tables along with their columns.
listTables(EnsDb.Hsapiens.v75)

## List the columns of one specific table.
listColumns(EnsDb.Hsapiens.v75, "tx")

## ------------------------------------------------------------------------
## Request every column of the "tx" table plus the gene name, restrict the
## query to the "nonsense_mediated_decay" transcript biotype and ask for a
## DataFrame as return type.
Tx <- transcripts(
  EnsDb.Hsapiens.v75,
  columns = c(listColumns(EnsDb.Hsapiens.v75, "tx"), "gene_name"),
  filter = list(TxbiotypeFilter("nonsense_mediated_decay")),
  return.type = "DataFrame"
)
nrow(Tx)
Tx

## ------------------------------------------------------------------------
## Get all gene biotypes from the database. The GenebiotypeFilter
## allows filtering on these values.
listGenebiotypes(EnsDb.Hsapiens.v75)

## Get all transcript biotypes from the database.
listTxbiotypes(EnsDb.Hsapiens.v75)

## ------------------------------------------------------------------------
## We're going to fetch all genes whose names start with BCL. To this end
## we define a GenenameFilter with partial matching, i.e. condition "like"
## and a % standing for any character/string.
BCLs <- genes(
  EnsDb.Hsapiens.v75,
  columns = c("gene_name", "entrezid", "gene_biotype"),
  filter = list(GenenameFilter("BCL%", condition = "like")),
  return.type = "DataFrame"
)
nrow(BCLs)
BCLs

## ------------------------------------------------------------------------
## Determine the average length for snRNA, snoRNA and rRNA genes encoded on
## chromosomes X and Y.
## NOTE(review): the call passes of = "tx", so the lengths being averaged
## are presumably per transcript rather than per gene -- confirm that this
## is the intended quantity.
mean(
  lengthOf(
    EnsDb.Hsapiens.v75,
    of = "tx",
    filter = list(
      GenebiotypeFilter(c("snRNA", "snoRNA", "rRNA")),
      SeqnameFilter(c("X", "Y"))
    )
  )
)

## Determine the same average for protein coding genes encoded on the same
## chromosomes (again with of = "tx").
mean(
  lengthOf(
    EnsDb.Hsapiens.v75,
    of = "tx",
    filter = list(
      GenebiotypeFilter("protein_coding"),
      SeqnameFilter(c("X", "Y"))
    )
  )
)

## ------------------------------------------------------------------------
## Transcripts grouped by gene, restricted to chromosomes X and Y.
TxByGns <- transcriptsBy(
  EnsDb.Hsapiens.v75,
  by = "gene",
  filter = list(SeqnameFilter(c("X", "Y")))
)
TxByGns

## ----eval=FALSE----------------------------------------------------------
## ## will just get exons for all genes on chromosomes 1 to 22, X and Y.
## ## Note: want to get rid of the "LRG" genes!!!
## EnsGenes <- exonsBy(EnsDb.Hsapiens.v75, by="gene",
##                     filter=list(SeqnameFilter(c(1:22, "X", "Y")),
##                                 GeneidFilter("ENSG%", "like")))

## ----eval=FALSE----------------------------------------------------------
## ## Transforming the GRangesList into a data.frame in SAF format
## EnsGenes.SAF <- toSAF(EnsGenes)

## ----eval=FALSE----------------------------------------------------------
## ## Create a GRanges of non-overlapping exon parts.
## DJE <- disjointExons(EnsDb.Hsapiens.v75,
##                      filter=list(SeqnameFilter(c(1:22, "X", "Y")),
##                                  GeneidFilter("ENSG%", "like")))

## ----eval=FALSE----------------------------------------------------------
## ## load the AnnotationHub data
## library(AnnotationHub)
## library(EnsDb.Hsapiens.v75)
## library(Rsamtools)
## ah <- AnnotationHub()
##
## edb <- EnsDb.Hsapiens.v75
##
## ## get the Ensembl version
## eVersion <- metadata(edb)[metadata(edb)$name=="ensembl_version", "value"]
## ## query all available files for the Ensembl version
## eData <- query(ah, c(organism(edb), paste0("release-", eVersion)))
## eData
##
## ## retrieve the *dna.toplevel.fa file; this might take some time.
## Dna <- ah[["AH20439"]]
## ## generate an index if none is available
## if(is.na(index(Dna))){
##     indexFa(Dna)
##     Dna <- FaFile(path(Dna))
## }
##
## ## get start/end coordinates of all genes
## genes <- genes(edb)
## ## subset to all genes that are encoded on chromosomes for which
## ## we do have DNA sequence available.
## genes <- genes[seqnames(genes) %in% seqnames(seqinfo(Dna))]
## ## get the gene sequences, i.e. the sequence including the sequence of
## ## all of the gene's exons and introns
## geneSeqs <- getSeq(Dna, genes)
##
## ## to get the sequence of all transcripts (i.e. only their exonic sequence) we
## ## fetch the exons grouped by transcripts.
## ## get all exons by transcript for all genes defined by Ensembl. This excludes
## ## possible "LRG" genes, which might be encoded on a sequence for which we
## ## don't have a DNA sequence.
## txExons <- exonsBy(edb, "tx", filter=GeneidFilter("ENS%", condition="like"))
## ## extract the sequence of each transcript's exons and join them into a single
## ## sequence; this takes quite some time, so we just run it on the first 100.
## txSeqs <- lapply(txExons[1:100], function(x){unlist(getSeq(Dna, x))})

## ----eval=FALSE----------------------------------------------------------
## library(ensembldb)
##
## ## get all human gene/transcript/exon annotations from Ensembl (75)
## ## the resulting tables will be stored by default to the current working
## ## directory
## fetchTablesFromEnsembl(75, species="human")
##
## ## These tables can then be processed to generate a SQLite database
## ## containing the annotations (again, the function assumes the required
## ## txt files to be present in the current working directory)
## DBFile <- makeEnsemblSQLiteFromTables()
##
## ## and finally we can generate the package
## ## NOTE(review): the maintainer string carries no e-mail address; it
## ## presumably should read "Name <address>" -- confirm before running.
## makeEnsembldbPackage(ensdb=DBFile, version="0.99.12",
##                      maintainer="Johannes Rainer ",
##                      author="J Rainer")

## ----eval=FALSE----------------------------------------------------------
## ## ensDbFromGtf, EnsDb and makeEnsembldbPackage are used below, so the
## ## package to attach is ensembldb (as in the previous chunk).
## library(ensembldb)
##
## ## the GTF file can be downloaded from
## ## ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/
## gtffile <- "Homo_sapiens.GRCh37.75.gtf.gz"
## ## generate the SQLite database file
## DB <- ensDbFromGtf(gtf=gtffile, verbose=TRUE)
##
## ## load the DB file directly
## EDB <- EnsDb(DB)
##
## ## alternatively, build the annotation package
## ## and finally we can generate the package
## ## NOTE(review): the maintainer string carries no e-mail address; it
## ## presumably should read "Name <address>" -- confirm before running.
## makeEnsembldbPackage(ensdb=DB, version="0.99.12",
##                      maintainer="Johannes Rainer ",
##                      author="J Rainer")

## ------------------------------------------------------------------------
## Generate a sqlite database from a GRanges object specifying
## genes encoded on chromosome Y.
## mustWork = TRUE makes a missing data file fail right here with a clear
## error instead of handing an empty path to load().
## The loaded file is expected to provide the object `Y` used below.
load(system.file("YGRanges.RData", package = "ensembldb", mustWork = TRUE))
Y

## Write the database file into the session's temporary directory and
## open it as an EnsDb object.
DB <- ensDbFromGRanges(
  Y,
  path = tempdir(),
  version = 75,
  organism = "Homo_sapiens"
)
edb <- EnsDb(DB)
edb

## ----eval=FALSE----------------------------------------------------------
## ## load the AnnotationHub data
## library(AnnotationHub)
## ah <- AnnotationHub()
##
## ## query all available files from Ensembl release 77 for
## ## Mus musculus
## query(ah, c("Mus musculus", "release-77"))
##
## ## get the gtf file
## Gtf <- ah[["AH28822"]]
## ## create an EnsDb database file from the Gtf
## DbFile <- ensDbFromGRanges(Gtf, organism="Mus_musculus", version=77)
## ## we can either generate a database package, or directly load the data
## Edb <- EnsDb(DbFile)
##
## ## retrieve the toplevel DNA
## Dna <- ah[["AH22042"]]
##
## ## we next retrieve the sequence of all exons
## library(Rsamtools)
## exons <- exons(Edb)
## exonSeq <- getSeq(Dna, exons)