## ---- include = FALSE-----------------------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  width = 100
)
options(width=100)
# devtools::load_all(".")   # delete later

## -------------------------------------------------------------------------------------------------
library(epialleleR)

capture.bam <- system.file("extdata", "capture.bam", package="epialleleR")
bam.data    <- preprocessBam(capture.bam)

## -------------------------------------------------------------------------------------------------
# data.table::data.table object for
# CpG VEF report
cg.vef.report <- generateCytosineReport(bam.data)
head(cg.vef.report[order(meth+unmeth, decreasing=TRUE)])

# CpG cytosine report
cg.report <- generateCytosineReport(bam.data, threshold.reads=FALSE)
head(cg.report[order(meth+unmeth, decreasing=TRUE)])

# CX cytosine report
cx.report <- generateCytosineReport(bam.data, threshold.reads=FALSE,
                                    report.context="CX")
head(cx.report[order(meth+unmeth, decreasing=TRUE)])

## -------------------------------------------------------------------------------------------------
# report for amplicon-based data
# matching is done by exact start or end positions plus/minus tolerance
amplicon.report <- generateAmpliconReport(
  bam=system.file("extdata", "amplicon010meth.bam", package="epialleleR"),
  bed=system.file("extdata", "amplicon.bed", package="epialleleR")
)
amplicon.report

# report for capture-based data
# matching is done by overlap
capture.report <- generateCaptureReport(
  bam=system.file("extdata", "capture.bam", package="epialleleR"),
  bed=system.file("extdata", "capture.bed", package="epialleleR")
)
head(capture.report)

# generateBedReport is a generic function for BED-guided reports
bed.report <- generateBedReport(
  bam=system.file("extdata", "capture.bam", package="epialleleR"),
  bed=system.file("extdata", "capture.bed", package="epialleleR"),
  bed.type="capture"
)
identical(capture.report, bed.report)

## ---- fig.width=10, fig.height=6, out.width="100%", out.height="100%"-----------------------------
# First, let's visualise eCDFs for within- and out-of-context beta values
# for all four amplicons and unmatched reads. Note that within-the-context eCDF
# of 0.5 is very close to the expected 1-VEF value (0.1) for all amplicons
# produced from this 1:9 mix of methylated and non-methylated control DNA

# let's compute eCDF
amplicon.ecdfs <- generateBedEcdf(
  bam=system.file("extdata", "amplicon010meth.bam", package="epialleleR"),
  bed=system.file("extdata", "amplicon.bed", package="epialleleR"),
  bed.rows=NULL
)

# there are 5 items in amplicon.ecdfs, let's plot all of them
par(mfrow=c(1,length(amplicon.ecdfs)))

# cycle through items
for (x in 1:length(amplicon.ecdfs)) {
  # four of them have names corresponding to genomic regions of amplicon.bed
  # fifth - NA for all the reads that don't match to any of those regions
  main <- if (is.na(names(amplicon.ecdfs[x]))) "unmatched"
          else names(amplicon.ecdfs[x])
  
  # plotting eCDF for within-the-context per-read beta values (in red)
  plot(amplicon.ecdfs[[x]]$context, col="red", verticals=TRUE, do.points=FALSE,
       xlim=c(0,1), xlab="per-read beta value", ylab="cumulative density",
       main=main)
  
  # adding eCDF for out-of-context per-read beta values (in blue)
  plot(amplicon.ecdfs[[x]]$out.of.context, add=TRUE, col="blue",
       verticals=TRUE, do.points=FALSE)
}


# Second, let's compare eCDFs for within-the-context beta values for only one
# amplicon but all three sequenced samples: pure non-methylated DNA, 1:9 mix of
# methylated and non-methylated DNA, and fully methylated DNA

# our files
bam.files <- c("amplicon000meth.bam", "amplicon010meth.bam",
               "amplicon100meth.bam")

# let's plot all of them
par(mfrow=c(1,length(bam.files)))

# cycle through items
for (f in bam.files) {
  # let's compute eCDF
  amplicon.ecdfs <- generateBedEcdf(
    bam=system.file("extdata", f, package="epialleleR"),
    bed=system.file("extdata", "amplicon.bed", package="epialleleR"),
    # only the second amplicon
    bed.rows=2, verbose=FALSE
  )
  
  # plotting eCDF for within-the-context per-read beta values (in red)
  plot(amplicon.ecdfs[[1]]$context, col="red", verticals=TRUE, do.points=FALSE,
       xlim=c(0,1), xlab="per-read beta value", ylab="cumulative density",
       main=f)
  
   # adding eCDF for out-of-context per-read beta values (in blue)
  plot(amplicon.ecdfs[[1]]$out.of.context, add=TRUE, col="blue",
       verticals=TRUE, do.points=FALSE)
}

## -------------------------------------------------------------------------------------------------
# VCF report
vcf.report <- generateVcfReport(
  bam=system.file("extdata", "amplicon010meth.bam", package="epialleleR"),
  bed=system.file("extdata", "amplicon.bed", package="epialleleR"),
  vcf=system.file("extdata", "amplicon.vcf.gz", package="epialleleR"),
  # thresholds on alignment and base quality
  min.mapq=30, min.baseq=13,
  # when VCF seqlevels are different from BED and BAM it is possible
  # to convert them internally
  vcf.style="NCBI"
)

# NA values are shown for the C->T variants on the "+" and G->A on the "-"
# strands, because bisulfite conversion makes their counting impossible
head(vcf.report)

# let's sort the report by increasing Fisher's exact test's p-values.
# the p-values are given separately for reads that map to the "+"
head(vcf.report[order(`FEp-`, na.last=TRUE)])

# and to the "-" strand
head(vcf.report[order(`FEp+`, na.last=TRUE)])

## ----session--------------------------------------------------------------------------------------
sessionInfo()