## ----style, echo = FALSE, results = 'asis'--------------------------------- BiocStyle::markdown() ## ----setup, echo=FALSE, warning=FALSE-------------------------------------- knitr::opts_chunk$set( eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE")) ) options(max.print=1000) suppressPackageStartupMessages({ library(org.Hs.eg.db) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(EnsDb.Hsapiens.v75) library(BSgenome.Hsapiens.UCSC.hg38) library(GenomicRanges) library(biomaRt) library(rtracklayer) library(Gviz) library(AnnotationHub) }) ## -------------------------------------------------------------------------- library(org.Hs.eg.db) ## -------------------------------------------------------------------------- org.Hs.eg.db ## -------------------------------------------------------------------------- keytypes(org.Hs.eg.db) columns(org.Hs.eg.db) head(keys(org.Hs.eg.db, "SYMBOL")) ## -------------------------------------------------------------------------- set.seed(123) egid <- sample(keys(org.Hs.eg.db), 6) mapIds(org.Hs.eg.db, egid, "SYMBOL", "ENTREZID") AnnotationDbi::select( org.Hs.eg.db, egid, c("SYMBOL", "ENSEMBL", "GENENAME"), "ENTREZID" ) ## -------------------------------------------------------------------------- egid <- "3812" mapIds(org.Hs.eg.db, egid, "ENSEMBL", "ENTREZID") mapIds( org.Hs.eg.db, egid, "ENSEMBL", "ENTREZID", multiVals = "CharacterList" ) AnnotationDbi::select( org.Hs.eg.db, egid, c("SYMBOL", "ENSEMBL"), multiVals = "CharacterList" ) ## ---- message=FALSE-------------------------------------------------------- library(tidyverse) egid <- keys(org.Hs.eg.db) # all ENTREZIDs mapIds(org.Hs.eg.db, egid, "SYMBOL", "ENTREZID") %>% as_tibble() %>% rownames_to_column("ENTREZID") AnnotationDbi::select( org.Hs.eg.db, egid, c("SYMBOL", "GO", "GENENAME"), "ENTREZID" ) %>% as_tibble() ## -------------------------------------------------------------------------- library(biomaRt) head(listMarts()) mart <- useMart("ENSEMBL_MART_ENSEMBL") ## -------------------------------------------------------------------------- head(listDatasets(mart)) dataset <- useDataset("hsapiens_gene_ensembl", mart) ## -------------------------------------------------------------------------- head(listFilters(dataset)) filters <- "ensembl_gene_id" # see `listFilters()` ## -------------------------------------------------------------------------- head(listAttributes(dataset)) attrs <- c("ensembl_gene_id", "hgnc_symbol") # see `listAttributes()` ## -------------------------------------------------------------------------- ids <- c( "ENSG00000000003", "ENSG00000000005", "ENSG00000000419", "ENSG00000000457", "ENSG00000000460", "ENSG00000000938" ) tbl <- getBM(attrs, filters, ids, dataset) %>% as_tibble() tbl ## -------------------------------------------------------------------------- library(KEGGREST) KEGGREST::listDatabases() ## -------------------------------------------------------------------------- hsa_pathways <- keggList("pathway", "hsa") %>% tibble(pathway = names(.), description = .) hsa_pathways ## -------------------------------------------------------------------------- hsa_path_eg <- keggLink("pathway", "hsa") %>% tibble(pathway = ., egid = sub("hsa:", "", names(.))) hsa_path_eg hsa_path_eg %>% group_by(pathway) %>% summarize(genes = list(egid)) ## -------------------------------------------------------------------------- hsa_kegg_anno <- hsa_path_eg %>% mutate( symbol = mapIds(org.Hs.eg.db, egid, "SYMBOL", "ENTREZID"), ensembl = mapIds(org.Hs.eg.db, egid, "ENSEMBL", "ENTREZID") ) ## -------------------------------------------------------------------------- left_join(hsa_kegg_anno, hsa_pathways) ## -------------------------------------------------------------------------- library(TxDb.Hsapiens.UCSC.hg38.knownGene) TxDb.Hsapiens.UCSC.hg38.knownGene txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene ## -------------------------------------------------------------------------- ex <- exons(txdb) ex library(ggplot2) qplot(log10(width(ex))) ex[ which.max(width(ex)) ] ## -------------------------------------------------------------------------- gn <- genes(txdb) length(gn) std <- paste0("chr", c(1:22, "X", "Y", "M")) seqlevels(gn, pruning.mode = "coarse") <- std length(gn) seqlevels(gn) table( seqnames(gn) ) tibble(chr = as.factor(seqnames(gn))) %>% group_by(chr) %>% summarize(n = n()) ## -------------------------------------------------------------------------- exByGn <- exonsBy(txdb, "gene") ## trans <- lengths(unique(seqnames(exByGn))) table( trans ) seqnames( exByGn[ trans > 1 ] ) ## std <- paste0("chr", c(1:22, "X", "Y", "M")) unames <- unique(seqnames(exByGn[ trans > 1 ])) transstd <- all(unames %in% std) unames[transstd] ## -------------------------------------------------------------------------- egid <- "22947" AnnotationDbi::select( org.Hs.eg.db, egid, c("SYMBOL", "GENENAME"), "ENTREZID" ) ## ---- eval = FALSE--------------------------------------------------------- # url <- paste0("https://www.ncbi.nlm.nih.gov/gene/", egid) # browseURL(url) ## -------------------------------------------------------------------------- library(BSgenome.Hsapiens.UCSC.hg38) BSgenome.Hsapiens.UCSC.hg38 hg38 <- BSgenome.Hsapiens.UCSC.hg38 ## -------------------------------------------------------------------------- dna <- getSeq(hg38, GRanges("chr1:1000000-2000000")) letterFrequency(dna, "GC", as.prob=TRUE) ## -------------------------------------------------------------------------- brca1_egid <- mapIds(org.Hs.eg.db, "BRCA1", "ENTREZID", "SYMBOL") brca1_exons <- exonsBy(txdb, "gene")[[brca1_egid]] getSeq(hg38, brca1_exons) ## ----edb-brca1-cds, message = FALSE---------------------------------------- library(EnsDb.Hsapiens.v75) edb <- EnsDb.Hsapiens.v75 brca1cds <- cdsBy(edb, by = "tx", filter = ~ genename == "BRCA1") class(brca1cds) length(brca1cds) brca1cds[[1]] # exons in cds cdswidth <- width(brca1cds) # width of each exon all((sum(cdswidth) %% 3) == 0) # sum within cds, modulus 3 ## ----edb-brca1-cds-wrongsize----------------------------------------------- tx_cds_fail <- names(brca1cds)[(sum(cdswidth) %% 3) != 0] length(tx_cds_fail) tx_cds_fail[1] ## ----edb-brca1-Gviz, message=FALSE----------------------------------------- library(Gviz) ## Use the function from the ensembldb package to extract the data in the ## format suitable for Gviz grt <- getGeneRegionTrackForGviz(edb, filter = ~genename == "BRCA1") plotTracks(list(GenomeAxisTrack(), GeneRegionTrack(grt))) ## ----edb-cds-to-seq-------------------------------------------------------- library(BSgenome.Hsapiens.UCSC.hg19) genome <- BSgenome.Hsapiens.UCSC.hg19 ## Change the seqlevelsStyle from UCSC to Ensembl seqlevelsStyle(genome) <- "Ensembl" tx_seq <- extractTranscriptSeqs(genome, brca1cds) tx_seq ## ----edb-fail-cds---------------------------------------------------------- tx_seq[tx_cds_fail] ## ----edb-introns----------------------------------------------------------- introns <- psetdiff(unlist(range(brca1cds)), brca1cds) ## ----edb-intron-seqs------------------------------------------------------- unique(genome(genome)) genome(introns) ## Change the genome name on introns to match the one from the ## BSgenome package genome(introns) <- c(`17` = unique(genome(genome))) seq <- getSeq(genome, introns) names(seq) seq[["ENST00000352993"]] # 20 introns ## ----rtracklayer-roi------------------------------------------------------- library(GenomicRanges) roi <- GRanges("chr10", IRanges(92106877, 112106876, names="ENSG00000099194")) ## ----rtracklayer-session, eval=FALSE--------------------------------------- # library(rtracklayer) # session <- browserSession() ## ----rtracklayer-marks, eval=FALSE----------------------------------------- # trackName <- "wgEncodeRegTfbsClusteredV2" # tableName <- "wgEncodeRegTfbsClusteredV2" # trFactor <- "ERalpha_a" # ucscTable <- getTable(ucscTableQuery(session, track=trackName, # range=roi, table=tableName, name=trFactor)) ## ----rtracklayer-plot, fig.height=3, eval=FALSE---------------------------- # plot(score ~ chromStart, ucscTable, pch="+") # abline(v=start(roi) + (end(roi) - start(roi) + 1) / 2, col="blue") ## -------------------------------------------------------------------------- sessionInfo()