## ----readVcF,message=FALSE----------------------------------------------------
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
vcf

## ----readVcf_showheader-------------------------------------------------------
header(vcf)

## ----headeraccessors----------------------------------------------------------
samples(header(vcf))
geno(header(vcf))

## ----readVcf_rowRanges--------------------------------------------------------
head(rowRanges(vcf), 3)

## ----readVcf_fixed------------------------------------------------------------
ref(vcf)[1:5]
qual(vcf)[1:5]

## ----readVcf_ALT--------------------------------------------------------------
alt(vcf)[1:5]

## ----geno_hdr-----------------------------------------------------------------
geno(vcf)
sapply(geno(vcf), class)

## ----explore_geno-------------------------------------------------------------
geno(header(vcf))["DS",]

## ----dim_geno-----------------------------------------------------------------
DS <-geno(vcf)$DS
dim(DS)
DS[1:3,]

## ----fivenum------------------------------------------------------------------
fivenum(DS)

## ----DS_zero------------------------------------------------------------------
length(which(DS==0))/length(DS)

## ----DS_hist, fig=TRUE--------------------------------------------------------
hist(DS[DS != 0], breaks=seq(0, 2, by=0.05),
    main="DS non-zero values", xlab="DS")

## ----info---------------------------------------------------------------------
info(vcf)[1:4, 1:5]

## ----examine_dbSNP, message=FALSE, warning=FALSE------------------------------
library(SNPlocs.Hsapiens.dbSNP144.GRCh37)
vcf_rsids <- names(rowRanges(vcf))
chr22snps <- snpsBySeqname(SNPlocs.Hsapiens.dbSNP144.GRCh37, "22")
chr22_rsids <- mcols(chr22snps)$RefSNP_id
in_dbSNP <- vcf_rsids %in% chr22_rsids
table(in_dbSNP) 

## ----header_info--------------------------------------------------------------
info(header(vcf))[c("VT", "LDAF", "RSQ"),]

## ----examine_quality----------------------------------------------------------
metrics <- data.frame(QUAL=qual(vcf), in_dbSNP=in_dbSNP,
    VT=info(vcf)$VT, LDAF=info(vcf)$LDAF, RSQ=info(vcf)$RSQ)

## ----examine_ggplot2, message=FALSE, warning=FALSE, fig=TRUE------------------
library(ggplot2)
ggplot(metrics, aes(x=RSQ, fill=in_dbSNP)) +
    geom_density(alpha=0.5) +
    scale_x_continuous(name="MaCH / Thunder Imputation Quality") +
    scale_y_continuous(name="Density") +
    theme(legend.position="top")

## ----subset_ranges------------------------------------------------------------
rng <- GRanges(seqnames="22", ranges=IRanges(
           start=c(50301422, 50989541), 
           end=c(50312106, 51001328),
           names=c("gene_79087", "gene_644186")))

## ----subset_TabixFile---------------------------------------------------------
tab <- TabixFile(fl)
vcf_rng <- readVcf(tab, "hg19", param=rng)

## -----------------------------------------------------------------------------
head(rowRanges(vcf_rng), 3)

## ----subset_scanVcfHeader-----------------------------------------------------
hdr <- scanVcfHeader(fl)
## e.g., INFO and GENO fields
head(info(hdr), 3)
head(geno(hdr), 3)

## ----subset_ScanVcfParam------------------------------------------------------
## Return all 'fixed' fields, "LAF" from 'info' and "GT" from 'geno'
svp <- ScanVcfParam(info="LDAF", geno="GT")
vcf1 <- readVcf(fl, "hg19", svp)
names(geno(vcf1))

## ----subset_ScanVcfParam_new--------------------------------------------------
svp_all <- ScanVcfParam(info="LDAF", geno="GT", which=rng)
svp_all

## ----locate_rename_seqlevels, message=FALSE, warning=FALSE--------------------
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevels(vcf) <- "chr22"
rd <- rowRanges(vcf)
loc <- locateVariants(rd, txdb, CodingVariants())
head(loc, 3)

## ----AllVariants, eval=FALSE--------------------------------------------------
#  allvar <- locateVariants(rd, txdb, AllVariants())

## ----locate_gene_centric------------------------------------------------------
## Did any coding variants match more than one gene?
splt <- split(mcols(loc)$GENEID, mcols(loc)$QUERYID) 
table(sapply(splt, function(x) length(unique(x)) > 1))

## Summarize the number of coding variants by gene ID.
splt <- split(mcols(loc)$QUERYID, mcols(loc)$GENEID)
head(sapply(splt, function(x) length(unique(x))), 3)

## ----predictCoding, warning=FALSE---------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg19)
coding <- predictCoding(vcf, txdb, seqSource=Hsapiens)
coding[5:7]

## ----predictCoding_frameshift-------------------------------------------------
## CONSEQUENCE is 'frameshift' where translation is not possible
coding[mcols(coding)$CONSEQUENCE == "frameshift"]

## ----nonsynonymous------------------------------------------------------------
nms <- names(coding) 
idx <- mcols(coding)$CONSEQUENCE == "nonsynonymous"
nonsyn <- coding[idx]
names(nonsyn) <- nms[idx]
rsids <- unique(names(nonsyn)[grep("rs", names(nonsyn), fixed=TRUE)])

## ----polyphen, message=FALSE, warning=FALSE-----------------------------------
library(PolyPhen.Hsapiens.dbSNP131)
pp <- select(PolyPhen.Hsapiens.dbSNP131, keys=rsids,
          cols=c("TRAININGSET", "PREDICTION", "PPH2PROB"))
head(pp[!is.na(pp$PREDICTION), ]) 

## ----snpMatrix, message=FALSE-------------------------------------------------
res <- genotypeToSnpMatrix(vcf) 
res

## ----snpMatrix_ALT------------------------------------------------------------
allele2 <- res$map[["allele.2"]]
## number of alternate alleles per variant
unique(elementNROWS(allele2))

## ----message=FALSE------------------------------------------------------------
fl.gl <- system.file("extdata", "gl_chr1.vcf", package="VariantAnnotation")
vcf.gl <- readVcf(fl.gl, "hg19")
geno(vcf.gl)

## Convert the "GL" FORMAT field to a SnpMatrix
res <- genotypeToSnpMatrix(vcf.gl, uncertain=TRUE)
res
t(as(res$genotype, "character"))[c(1,3,7), 1:5]

## Compare to a SnpMatrix created from the "GT" field
res.gt <- genotypeToSnpMatrix(vcf.gl, uncertain=FALSE)
t(as(res.gt$genotype, "character"))[c(1,3,7), 1:5]

## What are the original likelihoods for rs58108140?
geno(vcf.gl)$GL["rs58108140", 1:5]

## ----writeVcf, message=FALSE, warning=FALSE-----------------------------------
fl <- system.file("extdata", "ex2.vcf", package="VariantAnnotation")
out1.vcf <- tempfile()
out2.vcf <- tempfile()
in1 <- readVcf(fl, "hg19")
writeVcf(in1, out1.vcf)
in2 <- readVcf(out1.vcf, "hg19")
writeVcf(in2, out2.vcf)
in3 <- readVcf(out2.vcf, "hg19")
identical(rowRanges(in1), rowRanges(in3))
identical(geno(in1), geno(in2))

## ----eval=FALSE---------------------------------------------------------------
#  readVcf(TabixFile(fl, yieldSize=10000))

## ----eval=FALSE---------------------------------------------------------------
#  readVcf(TabixFile(fl), param=ScanVcfParam(info='DP', geno='GT'))

## ----eval=FALSE---------------------------------------------------------------
#  readGT(fl)

## ----eval=FALSE---------------------------------------------------------------
#  library(microbenchmark)
#  fl <- "ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
#  ys <- c(100, 1000, 10000, 100000)
#  
#  ## readGT() input only 'GT':
#  fun <- function(fl, yieldSize) readGT(TabixFile(fl, yieldSize))
#  lapply(ys, function(i) microbenchmark(fun(fl, i), times=5)
#  
#  ## readVcf() input only 'GT' and 'ALT':
#  fun <- function(fl, yieldSize, param)
#             readVcf(TabixFile(fl, yieldSize), "hg19", param=param)
#  param <- ScanVcfParam(info=NA, geno="GT", fixed="ALT")
#  lapply(ys, function(i) microbenchmark(fun(fl, i, param), times=5)
#  
#  ## readVcf() input all variables:
#  fun <- function(fl, yieldSize) readVcf(TabixFile(fl, yieldSize), "hg19")
#  lapply(ys, function(i) microbenchmark(fun(fl, i), times=5))

## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()