## ----style,results='asis', echo=FALSE, eval=TRUE-------------------------
## Set up the Bioconductor LaTeX style for this vignette; the chunk is
## evaluated with results='asis' so the generated markup is inserted verbatim.
BiocStyle::latex()


## ----warning=FALSE, message=FALSE----------------------------------------
library("EnsDb.Hsapiens.v75")

## Print some information about the annotation object provided by this
## package.
EnsDb.Hsapiens.v75

## Which organism was the database generated for?
organism(EnsDb.Hsapiens.v75)


## ------------------------------------------------------------------------
## Fetch all transcripts of the gene named BCL2L11.
Tx <- transcripts(
    EnsDb.Hsapiens.v75,
    filter = list(GenenameFilter("BCL2L11"))
)

Tx

## The result is a GRanges object, so the usual accessors work, e.g. for the
## start coordinates ...
head(start(Tx))

## ... while the transcript biotype can be extracted with `$`.
head(Tx$tx_biotype)


## ------------------------------------------------------------------------
## List all database tables along with their columns. These column names
## can be passed via the `columns` argument of the retrieval methods.
listTables(EnsDb.Hsapiens.v75)

## List the columns of one specific table, here the transcript ("tx") table.
listColumns(EnsDb.Hsapiens.v75, "tx")


## ------------------------------------------------------------------------
## Retrieve all transcripts with biotype "nonsense_mediated_decay". The result
## is requested as a DataFrame holding every column of the "tx" table plus the
## gene name.
Tx <- transcripts(
    EnsDb.Hsapiens.v75,
    columns = c(listColumns(EnsDb.Hsapiens.v75, "tx"), "gene_name"),
    filter = list(TxbiotypeFilter("nonsense_mediated_decay")),
    return.type = "DataFrame"
)
nrow(Tx)
Tx


## ------------------------------------------------------------------------
## Get all gene biotypes from the database. These are the values that a
## GenebiotypeFilter can be used to filter on.
listGenebiotypes(EnsDb.Hsapiens.v75)

## Get all transcript biotypes from the database.
listTxbiotypes(EnsDb.Hsapiens.v75)


## ------------------------------------------------------------------------
## Fetch all genes whose names start with "BCL". To this end we define a
## GenenameFilter with partial matching: the condition is "like" and the
## trailing % matches any character/string.
BCLs <- genes(
    EnsDb.Hsapiens.v75,
    columns = c("gene_name", "entrezid", "gene_biotype"),
    filter = list(GenenameFilter("BCL%", condition = "like")),
    return.type = "DataFrame"
)
nrow(BCLs)
BCLs


## ------------------------------------------------------------------------
## Determine the average length of the transcripts (of="tx") of snRNA, snoRNA
## and rRNA genes encoded on chromosomes X and Y. Note that the lengths are
## calculated per transcript, not per gene.
mean(lengthOf(EnsDb.Hsapiens.v75, of="tx",
              filter=list(GenebiotypeFilter(c("snRNA", "snoRNA", "rRNA")),
                          SeqnameFilter(c("X", "Y")))))

## Determine the average length of the transcripts of protein coding genes
## encoded on the same chromosomes.
mean(lengthOf(EnsDb.Hsapiens.v75, of="tx",
              filter=list(GenebiotypeFilter("protein_coding"),
                          SeqnameFilter(c("X", "Y")))))


## ------------------------------------------------------------------------
## Get the transcripts encoded on chromosomes X and Y, grouped by gene.
TxByGns <- transcriptsBy(
    EnsDb.Hsapiens.v75,
    by = "gene",
    filter = list(SeqnameFilter(c("X", "Y")))
)
TxByGns


## ----eval=FALSE----------------------------------------------------------
## ## Get the exons of all genes on chromosomes 1 to 22, X and Y.
## ## Note: the GeneidFilter below keeps only gene IDs matching "ENSG%",
## ## which gets rid of the "LRG" genes.
## EnsGenes <- exonsBy(EnsDb.Hsapiens.v75, by="gene",
## 		    filter=list(SeqnameFilter(c(1:22, "X", "Y")),
## 			GeneidFilter("ENSG%", "like")))


## ----eval=FALSE----------------------------------------------------------
## ## Transforming the GRangesList into a data.frame in SAF format
## EnsGenes.SAF <- toSAF(EnsGenes)


## ----eval=FALSE----------------------------------------------------------
## ## Create a GRanges of non-overlapping exon parts.
## DJE <- disjointExons(EnsDb.Hsapiens.v75,
## 		     filter=list(SeqnameFilter(c(1:22, "X", "Y")),
## 			  GeneidFilter("ENSG%", "like")))


## ----eval=FALSE----------------------------------------------------------
## ## load the AnnotationHub data
## library(AnnotationHub)
## library(EnsDb.Hsapiens.v75)
## library(Rsamtools)
## ah <- AnnotationHub()
## 
## edb <- EnsDb.Hsapiens.v75
## 
## ## get the Ensembl version
## eVersion <- metadata(edb)[metadata(edb)$name=="ensembl_version", "value"]
## ## query all available files for the Ensembl version
## eData <- query(ah, c(organism(edb), paste0("release-", eVersion)))
## eData
## 
## ## retrieve the *dna.toplevel.fa file; this might take some time.
## Dna <- ah[["AH20439"]]
## ## generate an index if none is available
## if(is.na(index(Dna))){
##     indexFa(Dna)
##     Dna <- FaFile(path(Dna))
## }
## 
## ## get start/end coordinates of all genes
## genes <- genes(edb)
## ## subset to all genes that are encoded on chromosomes for which
## ## we do have DNA sequence available.
## genes <- genes[seqnames(genes) %in% seqnames(seqinfo(Dna))]
## ## get the gene sequences, i.e. the sequence including the sequence of
## ## all of the gene's exons and introns
## geneSeqs <- getSeq(Dna, genes)
## 
## ## to get the sequence of all transcripts (i.e. only their exonic sequence) we
## ## fetch the exons grouped by transcripts.
## ## get all exons by transcript for all genes defined by Ensembl. This excludes
## ## any "LRG" genes, which might be encoded on a sequence for which we don't
## ## have a DNA sequence.
## txExons <- exonsBy(edb, "tx", filter=GeneidFilter("ENS%", condition="like"))
## ## extract the sequences of each transcript's exons and join them into a single
## ## sequence; this takes quite some time, so we just run it on the first 100.
## txSeqs <- lapply(txExons[1:100], function(x){unlist(getSeq(Dna, x))})


## ----eval=FALSE----------------------------------------------------------
## library(ensembldb)
## 
## ## get all human gene/transcript/exon annotations from Ensembl (75)
## ## the resulting tables will be stored by default to the current working
## ## directory
## fetchTablesFromEnsembl(75, species="human")
## 
## ## These tables can then be processed to generate a SQLite database
## ## containing the annotations (again, the function assumes the required
## ## txt files to be present in the current working directory)
## DBFile <- makeEnsemblSQLiteFromTables()
## 
## ## and finally we can generate the package
## makeEnsembldbPackage(ensdb=DBFile, version="0.99.12",
## 		     maintainer="Johannes Rainer <johannes.rainer@eurac.edu>",
## 		     author="J Rainer")


## ----eval=FALSE----------------------------------------------------------
## library(ensembldb)
## 
## ## the GTF file can be downloaded from
## ## ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/
## gtffile <- "Homo_sapiens.GRCh37.75.gtf.gz"
## ## generate the SQLite database file
## DB <- ensDbFromGtf(gtf=gtffile, verbose=TRUE)
## 
## ## load the DB file directly
## EDB <- EnsDb(DB)
## 
## ## alternatively, we can build an annotation package from the database file
## makeEnsembldbPackage(ensdb=DB, version="0.99.12",
## 		     maintainer="Johannes Rainer <johannes.rainer@eurac.edu>",
## 		     author="J Rainer")


## ------------------------------------------------------------------------
## Build an SQLite database from a GRanges object that specifies the genes
## encoded on chromosome Y. The object `Y` comes from an RData file shipped
## with the ensembldb package.
load(system.file("YGRanges.RData", package = "ensembldb"))
Y

## The database file is written to the session's temporary directory.
DB <- ensDbFromGRanges(
    Y,
    path = tempdir(),
    version = 75,
    organism = "Homo_sapiens"
)

## Open the generated database file as an EnsDb object.
edb <- EnsDb(DB)
edb


## ----eval=FALSE----------------------------------------------------------
## ## load the AnnotationHub data
## library(AnnotationHub)
## ah <- AnnotationHub()
## 
## ## query all available files from Ensembl release 77 for
## ## Mus musculus
## query(ah, c("Mus musculus", "release-77"))
## 
## ## get the gtf file
## Gtf <- ah[["AH28822"]]
## ## create an EnsDb database file from the Gtf
## DbFile <- ensDbFromGRanges(Gtf, organism="Mus_musculus", version=77)
## ## we can either generate a database package, or directly load the data
## Edb <- EnsDb(DbFile)
## 
## ## retrieve the toplevel DNA
## Dna <- ah[["AH22042"]]
## 
## ## we next retrieve the sequence of all exons
## library(Rsamtools)
## exons <- exons(Edb)
## exonSeq <- getSeq(Dna, exons)