## ----setup, include=FALSE, echo=F, warning= F, message=F-------------------
knitr::opts_chunk$set(message = FALSE, 
                      warning = FALSE, 
                      error = FALSE, 
                      tidy = FALSE,
                      fig.align="center", 
                      dpi = 150, 
                      cache = FALSE,
                      progress=FALSE, 
                      quite = TRUE)

## ---- echo=FALSE, results="hide", warning=FALSE,message=FALSE--------------
suppressPackageStartupMessages({
  if("TCGAWorkflow" %in% installed.packages()[,1]) {
    library(TCGAWorkflow)
  } else {
    devtools::load_all(".")
  }
})

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  if (!"BiocManager" %in% rownames(installed.packages()))
#       install.packages("BiocManager")
#  BiocManager::install("TCGAWorkflow")

## ---- eval=TRUE, message=FALSE, warning=FALSE, include=TRUE----------------
library(TCGAWorkflowData)
library(DT)

## ---- eval=TRUE, message=FALSE, warning=FALSE, include=FALSE---------------
library(TCGAbiolinks)

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  library(TCGAbiolinks)
#  # Obs: The data in the legacy database has been aligned to hg19
#  query.met.gbm <- GDCquery(project = "TCGA-GBM",
#                            legacy = TRUE,
#                            data.category = "DNA methylation",
#                            platform = "Illumina Human Methylation 450",
#                            barcode = c("TCGA-76-4926-01B-01D-1481-05", "TCGA-28-5211-01C-11D-1844-05"))
#  GDCdownload(query.met.gbm)
#  
#  met.gbm.450 <- GDCprepare(query = query.met.gbm,
#                            save = TRUE,
#                            save.filename = "gbmDNAmet450k.rda",
#                            summarizedExperiment = TRUE)
#  
#  query.met.lgg <- GDCquery(project = "TCGA-LGG",
#                            legacy = TRUE,
#                            data.category = "DNA methylation",
#                            platform = "Illumina Human Methylation 450",
#                            barcode = c("TCGA-HT-7879-01A-11D-2399-05", "TCGA-HT-8113-01A-11D-2399-05"))
#  GDCdownload(query.met.lgg)
#  met.lgg.450 <- GDCprepare(query = query.met.lgg,
#                            save = TRUE,
#                            save.filename = "lggDNAmet450k.rda",
#                            summarizedExperiment = TRUE)
#  met.gbm.lgg <- SummarizedExperiment::cbind(met.lgg.450, met.gbm.450)
#  
#  
#  query.exp.lgg <- GDCquery(project = "TCGA-LGG",
#                            legacy = TRUE,
#                            data.category = "Gene expression",
#                            data.type = "Gene expression quantification",
#                            platform = "Illumina HiSeq",
#                            file.type = "results",
#                            sample.type = "Primary solid Tumor")
#  GDCdownload(query.exp.lgg)
#  exp.lgg <- GDCprepare(query = query.exp.lgg, save = TRUE, save.filename = "lggExp.rda")
#  
#  query.exp.gbm <- GDCquery(project = "TCGA-GBM",
#                            legacy = TRUE,
#                            data.category = "Gene expression",
#                            data.type = "Gene expression quantification",
#                            platform = "Illumina HiSeq",
#                            file.type = "results",
#                            sample.type = "Primary solid Tumor")
#  GDCdownload(query.exp.gbm)
#  exp.gbm <- GDCprepare(query = query.exp.gbm, save = TRUE, save.filename = "gbmExp.rda")
#  exp.gbm.lgg <- SummarizedExperiment::cbind(exp.lgg, exp.gbm)

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  #-----------------------------------------------------------------------------
#  #                   Data.category: Copy number variation aligned to hg38
#  #-----------------------------------------------------------------------------
#  query <- GDCquery(project = "TCGA-ACC",
#                    data.category = "Copy Number Variation",
#                    data.type = "Copy Number Segment",
#                    barcode = c( "TCGA-OR-A5KU-01A-11D-A29H-01", "TCGA-OR-A5JK-01A-11D-A29H-01"))
#  GDCdownload(query)
#  data <- GDCprepare(query)
#  
#  query <- GDCquery("TCGA-ACC",
#                    "Copy Number Variation",
#                    data.type = "Masked Copy Number Segment",
#                    sample.type = c("Primary solid Tumor")) # see the barcodes with getResults(query)$cases
#  GDCdownload(query)
#  data <- GDCprepare(query)

## ---- eval=TRUE, include=TRUE----------------------------------------------
library(SummarizedExperiment)

# Load object from TCGAWorkflowData package
# THis object will be created in the further sections,
data(GBMIllumina_HiSeq) 

# get expression matrix
data <- assay(gbm.exp)
datatable(data[1:10,], 
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          rownames = TRUE)

# get genes information
genes.info <- rowRanges(gbm.exp)
genes.info

# get sample information
sample.info <- colData(gbm.exp)
datatable(as.data.frame(sample.info), 
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          rownames = FALSE)


## ---- eval=TRUE, include=TRUE----------------------------------------------
# get indexed clinical patient data for GBM samples
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")

# get indexed clinical patient data for LGG samples
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")

# Bind the results, as the columns might not be the same,
# we will will plyr rbind.fill, to have all columns from both files
clinical <- plyr::rbind.fill(gbm_clin,lgg_clin)

## ----echo = TRUE, message = FALSE, warning = FALSE-------------------------
datatable(clinical[1:10,], options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE-------------
# Fetch clinical data directly from the clinical XML files.
# if barcode is not set, it will consider all samples.
# We only set it to make the example faster
query <- GDCquery(project = "TCGA-GBM",
                  file.type = "xml",
                  data.category = "Clinical",
                  barcode = c("TCGA-08-0516","TCGA-02-0317")) 
GDCdownload(query)
clinical <- GDCprepare_clinic(query, clinical.info = "patient")

## ----echo = TRUE, message = FALSE, warning = FALSE-------------------------
datatable(clinical, options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE-------------
clinical.drug <- GDCprepare_clinic(query, clinical.info = "drug")

## ----echo = TRUE, message = FALSE, warning = FALSE-------------------------
datatable(clinical.drug, options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE-------------
clinical.radiation <- GDCprepare_clinic(query, clinical.info = "radiation")

## ----echo = TRUE, message = FALSE, warning = FALSE-------------------------
datatable(clinical.radiation, options = list(scrollX = TRUE,  keys = TRUE), rownames = FALSE)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE-------------
clinical.admin <- GDCprepare_clinic(query, clinical.info = "admin")

## ----echo = TRUE, message = FALSE, warning = FALSE-------------------------
datatable(clinical.admin, options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE)

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")

## ----echo = TRUE, message = FALSE, warning = FALSE-------------------------
data(mafMutect2LGGGBM)
datatable(LGGmut[1:10,], options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE)

## ---- eval=TRUE, include=TRUE----------------------------------------------
gbm.subtypes <- TCGAquery_subtype(tumor = "gbm")

## ----echo = TRUE, message = FALSE, warning = FALSE-------------------------
datatable(gbm.subtypes[1:10,], options = list(scrollX = TRUE, keys = TRUE), rownames = FALSE)

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  library(RTCGAToolbox)

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  # Get the last run dates
#  lastRunDate <- getFirehoseRunningDates()[1]
#  
#  # get DNA methylation data, RNAseq2 and clinical data for GBM
#  gbm.data <- getFirehoseData(dataset = "GBM",
#                              runDate = lastRunDate,
#                              gistic2Date = getFirehoseAnalyzeDates(1),
#                              Methylation = FALSE,
#                              clinical = TRUE,
#                              RNASeq2GeneNorm  = FALSE,
#                              Mutation = TRUE,
#                              fileSizeLimit = 10000)
#  
#  gbm.mut <- getData(gbm.data,"Mutation")
#  gbm.clin <- getData(gbm.data,"clinical")

## ---- eval=FALSE, message=FALSE, warning=FALSE, include=TRUE---------------
#  # Download GISTIC results
#  lastanalyzedate <- getFirehoseAnalyzeDates(1)
#  gistic <- getFirehoseData("GBM",gistic2Date = lastanalyzedate)
#  
#  # get GISTIC results
#  gistic.allbygene <- getData(gistic, type = "GISTIC", platform = "AllByGene")
#  gistic.thresholedbygene <- getData(gistic, type = "GISTIC", platform = "ThresholdedByGene")

## ---- eval=TRUE, message=FALSE, warning=FALSE, include=TRUE----------------
data(GBMGistic)
datatable(gistic.allbygene,
          filter = 'top',
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          rownames = FALSE)
datatable(gistic.thresholedbygene,
          filter = 'top',
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          rownames = FALSE)

## ----results='hide', eval=FALSE, message=FALSE, warning=FALSE, include=TRUE----
#  library(TCGAbiolinks)
#  # Select common CN technology available for GBM and LGG
#  #############################
#  ## CNV data pre-processing ##
#  #############################
#  query.gbm.nocnv <- GDCquery(project = "TCGA-GBM",
#                              data.category = "Copy number variation",
#                              legacy = TRUE,
#                              file.type = "nocnv_hg19.seg",
#                              sample.type = c("Primary solid Tumor"))
#  # to reduce time we will select only 20 samples
#  query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,]
#  
#  GDCdownload(query.gbm.nocnv, files.per.chunk = 100)
#  
#  gbm.nocnv <- GDCprepare(query.gbm.nocnv, save = TRUE, save.filename = "GBMnocnvhg19.rda")

## ----results='hide', eval=FALSE, message=FALSE, warning=FALSE, include=TRUE----
#  # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==---=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==--
#  # Retrieve probes meta file from broadinstitute website for hg19
#  # For hg38 analysis please take a look on:
#  # https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
#  # File: SNP6 GRCh38 Liftover Probeset File for Copy Number Variation Analysis
#  # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==---=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--=-==--=--==--
#  gdac.root <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/"
#  file <- paste0(gdac.root, "genome.info.6.0_hg19.na31_minus_frequent_nan_probes_sorted_2.1.txt")
#  if(!file.exists(basename(file))) downloader::download(file, basename(file))
#  markersMatrix <-  readr::read_tsv(basename(file), col_names = FALSE, col_types = "ccn", progress = FALSE)
#  save(markersMatrix, file = "markersMatrix.rda", compress = "xz")

## ---- echo=TRUE, message=FALSE,warning=FALSE, include=TRUE-----------------
cancer <- "GBM"
message(paste0("Starting ", cancer))

# get objects created above
data(GBMnocnvhg19)
data(markersMatrix)

# Add label (0 for loss, 1 for gain)
cnvMatrix <- cbind(cnvMatrix,Label=NA)
cnvMatrix[cnvMatrix[,"Segment_Mean"] < -0.3,"Label"] <- 0
cnvMatrix[cnvMatrix[,"Segment_Mean"] > 0.3,"Label"] <- 1
cnvMatrix <- cnvMatrix[!is.na(cnvMatrix$Label),]

# Remove "Segment_Mean" and change col.names
cnvMatrix <- cnvMatrix[,-6]
colnames(cnvMatrix) <- c("Sample.Name", "Chromosome", "Start", "End", "Num.of.Markers", "Aberration")

# Substitute Chromosomes "X" and "Y" with "23" and "24"
cnvMatrix[cnvMatrix$Chromosome == "X","Chromosome"] <- 23
cnvMatrix[cnvMatrix$Chromosome == "Y","Chromosome"] <- 24
cnvMatrix$Chromosome <- as.integer(cnvMatrix$Chromosome)

# Recurrent CNV identification with GAIA
colnames(markersMatrix) <- c("Probe.Name", "Chromosome", "Start")
unique(markersMatrix$Chromosome)
markersMatrix[markersMatrix$Chromosome == "X","Chromosome"] <- 23
markersMatrix[markersMatrix$Chromosome == "Y","Chromosome"] <- 24
markersMatrix$Chromosome <- as.integer(markersMatrix$Chromosome)
markerID <- paste(markersMatrix$Chromosome,markersMatrix$Start, sep = ":")
# Removed duplicates
markersMatrix <- markersMatrix[!duplicated(markerID),]
# Filter markersMatrix for common CNV
markerID <- paste(markersMatrix$Chromosome,markersMatrix$Start, sep = ":")

file <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/CNV.hg19.bypos.111213.txt"
if(!file.exists(basename(file))) downloader::download(file, basename(file))
commonCNV <- readr::read_tsv(basename(file), progress = FALSE)
#data(CNV.hg19.bypos.111213)
commonID <- paste(commonCNV$Chromosome,commonCNV$Start, sep = ":")
markersMatrix_fil <- markersMatrix[!markerID %in% commonID,]

library(gaia)
set.seed(200)
markers_obj <- load_markers(as.data.frame(markersMatrix_fil))
nbsamples <- length(unique(cnvMatrix$Sample))
cnv_obj <- load_cnv(cnvMatrix, markers_obj, nbsamples)
suppressWarnings({
  results <- runGAIA(cnv_obj,
                     markers_obj,
                     output_file_name = paste0("GAIA_",cancer,"_flt.txt"),
                     aberrations = -1,  # -1 to all aberrations
                     chromosomes = 9, # -1 to all chromosomes
                     approximation = TRUE, # Set to TRUE to speed up the time requirements
                     num_iterations = 5000, # Reduced to speed up the time requirements
                     threshold = 0.25)
})
# Set q-value threshold
# Use a smalled value for your analysis. We set this as high values
# due to the small number of samples which did not reproduced
# results with smaller q-values
threshold <- 0.3

# Plot the results
RecCNV <- t(apply(results,1,as.numeric))
colnames(RecCNV) <- colnames(results)
RecCNV <- cbind(RecCNV, score = 0)
minval <- format(min(RecCNV[RecCNV[,"q-value"] != 0,"q-value"]), scientific = FALSE)
minval <- substring(minval,1, nchar(minval) - 1)
RecCNV[RecCNV[,"q-value"] == 0,"q-value"] <- as.numeric(minval)
RecCNV[,"score"] <- sapply(RecCNV[,"q-value"],function(x) -log10(as.numeric(x)))
RecCNV[RecCNV[,"q-value"] == as.numeric(minval),]

gaiaCNVplot(RecCNV,threshold)
save(results, RecCNV, threshold, file = paste0(cancer,"_CNV_results.rda"))

## ---- eval = TRUE, message=FALSE,warning=FALSE, include=FALSE--------------
library(GenomicRanges)

## ---- eval = FALSE, message=FALSE,warning=FALSE, include=TRUE--------------
#  library(GenomicRanges)
#  # Get gene information from GENCODE using biomart
#  genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19")
#  genes <- genes[genes$external_gene_id != "" & genes$chromosome_name %in% c(1:22,"X","Y"),]
#  genes[genes$chromosome_name == "X", "chromosome_name"] <- 23
#  genes[genes$chromosome_name == "Y", "chromosome_name"] <- 24
#  genes$chromosome_name <- sapply(genes$chromosome_name,as.integer)
#  genes <- genes[order(genes$start_position),]
#  genes <- genes[order(genes$chromosome_name),]
#  genes <- genes[,c("external_gene_id", "chromosome_name", "start_position","end_position")]
#  colnames(genes) <- c("GeneSymbol","Chr","Start","End")
#  genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
#  save(genes_GR,genes,file = "genes_GR.rda", compress = "xz")

## ---- echo=TRUE, message=FALSE,warning=FALSE, include=TRUE-----------------
##############################
## Recurrent CNV annotation ## 
##############################
# Get gene information from GENCODE using biomart
data(genes_GR) # downloaded in the previous step (available in TCGAWorkflowData)

load(paste0(cancer,"_CNV_results.rda"))
sCNV <- RecCNV[RecCNV[,"q-value"] <= threshold,c(1:4,6)]
sCNV <- sCNV[order(sCNV[,3]),]
sCNV <- sCNV[order(sCNV[,1]),]
colnames(sCNV) <- c("Chr","Aberration","Start","End","q-value")
sCNV_GR <- makeGRangesFromDataFrame(sCNV,keep.extra.columns = TRUE)

hits <- findOverlaps(genes_GR, sCNV_GR, type = "within")
sCNV_ann <- cbind(sCNV[subjectHits(hits),],genes[queryHits(hits),])
AberrantRegion <- paste0(sCNV_ann[,1],":",sCNV_ann[,3],"-",sCNV_ann[,4])
GeneRegion <- paste0(sCNV_ann[,7],":",sCNV_ann[,8],"-",sCNV_ann[,9])
AmpDel_genes <- cbind(sCNV_ann[,c(6,2,5)],AberrantRegion,GeneRegion)
AmpDel_genes[AmpDel_genes[,2] == 0,2] <- "Del"
AmpDel_genes[AmpDel_genes[,2] == 1,2] <- "Amp"
rownames(AmpDel_genes) <- NULL

save(RecCNV, AmpDel_genes, file = paste0(cancer,"_CNV_results.rda"))

## ----results='asis', echo=FALSE, message=FALSE, warning=FALSE--------------
knitr::kable(head(AmpDel_genes), caption = "Chromosome 9 recurrent deleted genes in LGG")

## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE, eval = FALSE----
#  library(maftools)
#  # Download Mutation Annotation Format (MAF) files
#  LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
#  GBMmut <- GDCquery_Maf(tumor = "GBM", pipelines = "mutect2")
#  
#  # Merge them
#  mut <- plyr::rbind.fill(LGGmut, GBMmut)
#  save(maf,file ="mafMutect2LGGGBM.rda", compress = "xz")

## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE---------------
library(maftools)
# recovering data from TCGAWorkflowData package.
data(mafMutect2LGGGBM)

# To prepare for maftools we will also include clinical data
# For a mutant vs WT survival analysis 
# get indexed clinical patient data for GBM samples
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")
# get indexed clinical patient data for LGG samples
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")
# Bind the results, as the columns might not be the same,
# we will will plyr rbind.fill, to have all columns from both files
clinical <- plyr::rbind.fill(gbm_clin,lgg_clin)
colnames(clinical)[1] <- "Tumor_Sample_Barcode"
clinical$Overall_Survival_Status <- 1
clinical$Overall_Survival_Status[which(clinical$vital_status != "dead")] <- 0
clinical$time <- clinical$days_to_death
clinical$time[is.na(clinical$days_to_death)] <- clinical$days_to_last_follow_up[is.na(clinical$days_to_death)]

# Create object to use in maftools
maf <- read.maf(maf = mut, clinicalData = clinical, isTCGA = T)

## ----echo=TRUE, message=FALSE, warning=FALSE,fig.width=10------------------
plotmafSummary(maf = maf,
               rmOutlier = TRUE,
               addStat = 'median',
               dashboard = TRUE)

## ----echo=TRUE, message=FALSE, warning=FALSE,fig.height=10,fig.width=15,eval=FALSE----
#  oncoplot(maf = maf,
#           top = 20,
#           legendFontSize = 8,
#           clinicalFeatures = c("tissue_or_organ_of_origin")
#           )

## ----echo=TRUE, message=FALSE, warning=FALSE-------------------------------
plot <- mafSurvival(maf = maf,
                   genes = "IDH1",
                   fn = NULL,
                   time = 'time',
                   Status = 'Overall_Survival_Status',
                   isTCGA = TRUE)

## ----results='hide', eval = FALSE, echo=TRUE, message=FALSE,warning=FALSE----
#  LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE----------------
###############################################
## Genomic aberration overview - Circos plot ## 
###############################################
# Retrieve curated mutations for selected cancer (e.g. "LGG") 
data(mafMutect2LGGGBM)
# Select only potentially damaging mutations
LGGmut <- LGGmut[LGGmut$Variant_Classification %in% c("Missense_Mutation",
                                                      "Nonsense_Mutation",
                                                      "Nonstop_Mutation",
                                                      "Frame_Shift_Del",
                                                      "Frame_Shift_Ins"),]
# Select recurrent mutations (identified in at least two samples)
mut.id <- paste0(LGGmut$Chromosome,":",LGGmut$Start_Position,"-",LGGmut$End_Position,"|",LGGmut$Reference_Allele)
mut <- cbind(mut.id, LGGmut)
# Prepare selected mutations data for circos plot
s.mut <- mut[mut$mut.id %in% unique(mut.id[duplicated(mut.id)]),]
s.mut <- s.mut[,c("Chromosome","Start_Position","End_Position","Variant_Classification","Hugo_Symbol")]
s.mut <- unique(s.mut)
typeNames <- unique(s.mut[,4])
type <- c(4:1)
names(type) <- typeNames[1:4]
Type <- type[s.mut[,4]]
s.mut <- cbind(s.mut,Type)
s.mut <- s.mut[,c(1:3,6,4,5)]

# Load recurrent CNV data for selected cancer (e.g. "LGG")
load("GBM_CNV_results.rda")
# Prepare selected sample CNV data for circos plot
s.cnv <- as.data.frame(RecCNV[RecCNV[,"q-value"] <= threshold,c(1:4,6)])
s.cnv <- s.cnv[,c(1,3,4,2)]
s.cnv[s.cnv$Chromosome == 23,"Chromosome"] <- "X"
s.cnv[s.cnv$Chromosome == 24,"Chromosome"] <- "Y"
Chromosome <- paste0("chr",s.cnv[,1])
s.cnv <- cbind(Chromosome, s.cnv[,-1])
s.cnv[,1] <- as.character(s.cnv[,1])
s.cnv[,4] <- as.character(s.cnv[,4])
s.cnv <- cbind(s.cnv,CNV=1)
colnames(s.cnv) <- c("Chromosome","Start_position","End_position","Aberration_Kind","CNV")

library(circlize)
# Draw genomic circos plot
par(mar=c(1,1,1,1), cex=1)
circos.initializeWithIdeogram()
# Add CNV results
colors <- c("forestgreen","firebrick")
names(colors)  <- c(0,1)
circos.genomicTrackPlotRegion(s.cnv,  ylim = c(0,1.2),
                              panel.fun = function(region, value, ...) {
                                circos.genomicRect(region, value, ytop.column = 2, ybottom = 0,
                                                   col = colors[value[[1]]], 
                                                   border="white")
                                cell.xlim = get.cell.meta.data("cell.xlim")
                                circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
                              })
# Add mutation results
colors <- c("blue","green","red","gold")
names(colors)  <- typeNames[1:4]
circos.genomicTrackPlotRegion(s.mut, ylim = c(1.2,4.2),
                              panel.fun = function(region, value, ...) {
                                circos.genomicPoints(region, value, cex = 0.8, pch = 16, col = colors[value[[2]]], ...)
                              })

circos.clear()

legend(-0.2, 0.2, bty="n", y.intersp=1, c("Amp","Del"), pch=15, 
       col=c("firebrick","forestgreen"), title="CNVs", text.font=1, cex=0.4, title.adj=0)
legend(-0.2, 0, bty="n", y.intersp=1, names(colors), pch=16, 
       col=colors, title="Mutations", text.font=1, cex=0.4, title.adj=0)

## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE----------------
par(mar=c(1,1,1,1),cex=1.5)
circos.par("start.degree" = 90, canvas.xlim = c(0, 1), canvas.ylim = c(0, 1), 
           gap.degree = 270, cell.padding = c(0, 0, 0, 0), track.margin = c(0.005, 0.005))
circos.initializeWithIdeogram(chromosome.index = "chr17")
circos.par(cell.padding = c(0, 0, 0, 0))
# Add CNV results
colors <- c("forestgreen","firebrick")
names(colors)  <- c(0,1)
circos.genomicTrackPlotRegion(s.cnv,  ylim = c(0,1.2),
                              panel.fun = function(region, value, ...) {
                                circos.genomicRect(region, value, ytop.column = 2, ybottom = 0,
                                                   col = colors[value[[1]]], 
                                                   border="white")
                                cell.xlim = get.cell.meta.data("cell.xlim")
                                circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
                              })

# Add mutation results representing single genes
genes.mut <- paste0(s.mut$Hugo_Symbol,"-",s.mut$Type)
s.mutt <- cbind(s.mut,genes.mut)
n.mut <- table(genes.mut)
idx <- !duplicated(s.mutt$genes.mut)
s.mutt <- s.mutt[idx,]
s.mutt <- cbind(s.mutt,num=n.mut[s.mutt$genes.mut])
genes.num <- paste0(s.mutt$Hugo_Symbol," (",s.mutt$num.Freq,")")
s.mutt <- cbind(s.mutt[,-c(6:8)],genes.num)
s.mutt[,6] <- as.character(s.mutt[,6])
s.mutt[,4] <- s.mutt[,4]/2
s.mutt$num.Freq <- NULL
colors <- c("blue","green","red","gold")
names(colors)  <- typeNames[1:4]
circos.genomicTrackPlotRegion(s.mutt, ylim = c(0.3,2.2), track.height = 0.05,
                              panel.fun = function(region, value, ...) {
                                circos.genomicPoints(region, value, cex = 0.4, pch = 16, col = colors[value[[2]]], ...)
                              })

circos.genomicTrackPlotRegion(s.mutt, ylim = c(0, 1), track.height = 0.1, bg.border = NA)
i_track = get.cell.meta.data("track.index")

circos.genomicTrackPlotRegion(s.mutt, ylim = c(0,1),
                              panel.fun = function(region, value, ...) {
                                circos.genomicText(region, value, 
                                                   y = 1, 
                                                   labels.column = 3,
                                                   col = colors[value[[2]]],
                                                   facing = "clockwise", adj = c(1, 0.5),
                                                   posTransform = posTransform.text, cex = 0.4, niceFacing = TRUE)
                              }, track.height = 0.1, bg.border = NA)

circos.genomicPosTransformLines(s.mutt,
                                posTransform = function(region, value)
                                  posTransform.text(region, 
                                                    y = 1, 
                                                    labels = value[[3]],
                                                    cex = 0.4, track.index = i_track+1),
                                direction = "inside", track.index = i_track)

circos.clear()

legend(0.25, 0.2, bty="n", y.intersp=1, c("Amp","Del"), pch=15, 
       col=c("firebrick","forestgreen"), title="CNVs", text.font=1, cex=0.4, title.adj=0)
legend(0, 0.2, bty="n", y.intersp=1, names(colors), pch=16, 
       col=colors, title="Mutations", text.font=1, cex=0.4, title.adj=0)

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  query <- GDCquery(project = "TCGA-GBM",
#                    data.category = "Gene expression",
#                    data.type = "Gene expression quantification",
#                    platform = "Illumina HiSeq",
#                    file.type  = "results",
#                    sample.type = c("Primary solid Tumor"),
#                    legacy = TRUE)
#  # We will use only 20 samples to make the example faster
#  query$results[[1]] <-  query$results[[1]][1:20,]
#  GDCdownload(query)
#  gbm.exp <- GDCprepare(query,
#                        save = TRUE,
#                        summarizedExperiment = TRUE,
#                        save.filename = "GBMIllumina_HiSeq.rda")
#  
#  query <- GDCquery(project = "TCGA-LGG",
#                    data.category = "Gene expression",
#                    data.type = "Gene expression quantification",
#                    platform = "Illumina HiSeq",
#                    file.type  = "results",
#                    sample.type = c("Primary solid Tumor"),
#                    legacy = TRUE)
#  # We will use only 20 samples to make the example faster
#  query$results[[1]] <-  query$results[[1]][1:20,]
#  GDCdownload(query)
#  lgg.exp <- GDCprepare(query,
#                        save = TRUE,
#                        summarizedExperiment = TRUE,
#                        save.filename = "LGGIllumina_HiSeq.rda")

## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE----------------
data("LGGIllumina_HiSeq")
data("GBMIllumina_HiSeq")

dataPrep_LGG <- TCGAanalyze_Preprocessing(object = lgg.exp,
                                          cor.cut = 0.6,    
                                          datatype = "raw_count",
                                          filename = "LGG_IlluminaHiSeq_RNASeqV2.png")

dataPrep_GBM <- TCGAanalyze_Preprocessing(object = gbm.exp,
                                          cor.cut = 0.6, 
                                          datatype = "raw_count",
                                          filename = "GBM_IlluminaHiSeq_RNASeqV2.png")

dataNorm <- TCGAanalyze_Normalization(tabDF = cbind(dataPrep_LGG, dataPrep_GBM),
                                      geneInfo = TCGAbiolinks::geneInfo,
                                      method = "gcContent") #18323   672

dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
                                  method = "quantile",
                                  qnt.cut =  0.25)  # 13742   672

save(dataFilt, file = paste0("LGG_GBM_Norm_IlluminaHiSeq.rda"))

dataFiltLGG <- subset(dataFilt, select = substr(colnames(dataFilt),1,12) %in% lgg_clin$bcr_patient_barcode)
dataFiltGBM <- subset(dataFilt, select = substr(colnames(dataFilt),1,12) %in% gbm_clin$bcr_patient_barcode)

dataDEGs <- TCGAanalyze_DEA(mat1 = dataFiltLGG,
                            mat2 = dataFiltGBM,
                            Cond1type = "LGG",
                            Cond2type = "GBM",
                            fdr.cut = 0.01 ,
                            logFC.cut = 1,
                            method = "glmLRT")


## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE----------------
# Number of differentially expressed genes (DEG)
nrow(dataDEGs)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10----
#-------------------  4.2 EA: enrichment analysis             --------------------
ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes LGG Vs GBM", RegulonList = rownames(dataDEGs))
TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP),
                        filename = NULL,
                        GOBPTab = ansEA$ResBP, 
                        nRGTab = rownames(dataDEGs),
                        nBar = 20)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10----
TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP),
                        filename = NULL,
                        GOCCTab = ansEA$ResCC,
                        nRGTab = rownames(dataDEGs),
                        nBar = 20)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10----
TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP),
                        filename = NULL,
                        GOMFTab = ansEA$ResMF, 
                        nRGTab = rownames(dataDEGs),
                        nBar = 20)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=12, fig.width=15----
TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP),
                        filename = NULL,
                        PathTab = ansEA$ResPat,
                        nRGTab = rownames(dataDEGs),
                        nBar = 20)

## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE----------------
library(SummarizedExperiment)
GenelistComplete <- rownames(assay(lgg.exp,1))

# DEGs TopTable
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(dataDEGs,"LGG","GBM",
                                          dataFilt[,colnames(dataFiltLGG)],
                                          dataFilt[,colnames(dataFiltGBM)])

dataDEGsFiltLevel$GeneID <- 0

library(clusterProfiler)
# Converting Gene symbol to geneID
eg = as.data.frame(bitr(dataDEGsFiltLevel$mRNA,
                        fromType="SYMBOL",
                        toType="ENTREZID",
                        OrgDb="org.Hs.eg.db"))
eg <- eg[!duplicated(eg$SYMBOL),]

dataDEGsFiltLevel <- dataDEGsFiltLevel[dataDEGsFiltLevel$mRNA %in% eg$SYMBOL,]

dataDEGsFiltLevel <- dataDEGsFiltLevel[order(dataDEGsFiltLevel$mRNA,decreasing=FALSE),]
eg <- eg[order(eg$SYMBOL,decreasing=FALSE),]

# table(eg$SYMBOL == dataDEGsFiltLevel$mRNA) should be TRUE
all(eg$SYMBOL == dataDEGsFiltLevel$mRNA)
dataDEGsFiltLevel$GeneID <- eg$ENTREZID

dataDEGsFiltLevel_sub <- subset(dataDEGsFiltLevel, select = c("GeneID", "logFC"))
genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC)
names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID
library(pathview)
# pathway.id: hsa05214 is the glioma pathway
# limit: sets the limit for gene expression legend and color
hsa05214 <- pathview::pathview(gene.data  = genelistDEGs,
                               pathway.id = "hsa05214",
                               species    = "hsa",
                               limit = list(gene=as.integer(max(abs(genelistDEGs)))))


## ---- eval=FALSE, include=TRUE---------------------------------------------
#  ### read biogrid info (available in TCGAWorkflowData as "biogrid")
#  ### Check last version in https://thebiogrid.org/download.php
#  file <- "http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.146/BIOGRID-ALL-3.4.146.tab2.zip"
#  if(!file.exists(gsub("zip","txt",basename(file)))){
#    downloader::download(file,basename(file))
#    unzip(basename(file),junkpaths =TRUE)
#  }
#  
#  tmp.biogrid <- read.csv(gsub("zip","txt",basename(file)), header=TRUE, sep="\t", stringsAsFactors=FALSE)

## ---- eval=TRUE, include=TRUE----------------------------------------------
### plot details (colors & symbols)
mycols <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628')

### load network inference libraries
library(minet)
library(c3net)

### deferentially identified genes using TCGAbiolinks
# we will use only a subset (first 50 genes) of it to make the example faster
names.genes.de <- rownames(dataDEGs)[1:30]

data(biogrid)
net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)

mydata <- dataFiltLGG[names.genes.de, ]

### infer networks
t.mydata <- t(mydata)
net.aracne <- minet(t.mydata, method = "aracne")
net.mrnet <- minet(t.mydata)
net.clr <- minet(t.mydata, method = "clr")
net.c3net <- c3net(mydata)

### validate compared to biogrid network
tmp.val <- list(validate(net.aracne, net.biogrid.de), 
                validate(net.mrnet, net.biogrid.de),
                validate(net.clr, net.biogrid.de), 
                validate(net.c3net, net.biogrid.de))

### plot roc and compute auc for the different networks
dev1 <- show.roc(tmp.val[[1]],cex=0.3,col=mycols[1],type="l")
res.auc <- auc.roc(tmp.val[[1]])
for(count in 2:length(tmp.val)){
  show.roc(tmp.val[[count]],device=dev1,cex=0.3,col=mycols[count],type="l")
  res.auc <- c(res.auc, auc.roc(tmp.val[[count]]))
}

legend("bottomright", legend=paste(c("aracne","mrnet","clr","c3net"), signif(res.auc,4), sep=": "),
       col=mycols[1:length(tmp.val)],lty=1, bty="n" )
# Please, uncomment this line to produce the pdf files.
# dev.copy2pdf(width=8,height=8,device = dev1, file = paste0("roc_biogrid_",cancertype,".pdf"))

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  #----------------------------
#  # Obtaining DNA methylation
#  #----------------------------
#  # Samples
#  lgg.samples <- matchedMetExp("TCGA-LGG", n = 10)
#  gbm.samples <- matchedMetExp("TCGA-GBM", n = 10)
#  samples <- c(lgg.samples,gbm.samples)
#  
#  #-----------------------------------
#  # 1 - Methylation
#  # ----------------------------------
#  # For methylation it is quicker in this case to download the tar.gz file
#  # and get the samples we want instead of downloading files by files
#  query <- GDCquery(project = c("TCGA-LGG","TCGA-GBM"),
#                    data.category = "DNA methylation",
#                    platform = "Illumina Human Methylation 450",
#                    legacy = TRUE,
#                    barcode = samples)
#  GDCdownload(query)
#  met <- GDCprepare(query, save = FALSE)
#  
#  # We will use only chr9 to make the example faster
#  met <- subset(met,subset = as.character(seqnames(met)) %in% c("chr9"))
#  # This data is avaliable in the package (object elmerExample)

## ----echo=TRUE, message=FALSE,warning=FALSE, fig.height=8,fig.width=8------
data(elmerExample)
#----------------------------
# Mean methylation
#----------------------------
# Plot a barplot for the groups in the disease column in the
# summarizedExperiment object

# remove probes with NA (similar to na.omit)
met <- subset(met,subset = (rowSums(is.na(assay(met))) == 0))

TCGAvisualize_meanMethylation(met,
                              groupCol = "project_id",
                              group.legend  = "Groups",
                              filename = NULL,
                              print.pvalue = TRUE)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE----------------
#------- Searching for differentially methylated CpG sites     ----------
met <- TCGAanalyze_DMR(met,
                       groupCol = "project_id", # a column in the colData matrix
                       group1 = "TCGA-GBM", # a type of the disease type column
                       group2 = "TCGA-LGG", # a type of the disease column
                       p.cut = 0.05,
                       diffmean.cut = 0.15,
                       save = FALSE,
                       legend = "State",
                       plot.filename = "LGG_GBM_metvolcano.png",
                       cores = 1 # if set to 1 there will be a progress bar
)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE----------------
#--------------------------
# DNA Methylation heatmap
#-------------------------
library(ComplexHeatmap)
clinical <- plyr::rbind.fill(gbm_clin,lgg_clin)

# get the probes that are Hypermethylated or Hypomethylated
# met is the same object of the section 'DNA methylation analysis'
status.col <- "status.TCGA.GBM.TCGA.LGG"
sig.met <- met[values(met)[,status.col] %in% c("Hypermethylated","Hypomethylated"),]


# top annotation, which sampples are LGG and GBM
# We will add clinical data as annotation of the samples
# we will sort the clinical data to have the same order of the DNA methylation matrix
clinical.order <- clinical[match(substr(colnames(sig.met),1,12),clinical$bcr_patient_barcode),]
ta = HeatmapAnnotation(df = clinical.order[,c("disease","gender","vital_status","race")],
                       col = list(disease = c("LGG" = "grey", "GBM" = "black"),
                                  gender = c("male" = "blue","female" = "pink")))

# row annotation: add the status for LGG in relation to GBM
# For exmaple: status.gbm.lgg Hypomethyated means that the
# mean DNA methylation of probes for lgg are hypomethylated
# compared to GBM ones.
ra = rowAnnotation(df = values(sig.met)[status.col],
                   col = list("status.TCGA.GBM.TCGA.LGG" = 
                                c("Hypomethylated" = "orange",
                                  "Hypermethylated" = "darkgreen")),
                   width = unit(1, "cm"))

heatmap  <- Heatmap(assay(sig.met),
                    name = "DNA methylation",
                    col = matlab::jet.colors(200),
                    show_row_names = FALSE,
                    cluster_rows = TRUE,
                    cluster_columns = FALSE,
                    show_column_names = FALSE,
                    bottom_annotation = ta,
                    column_title = "DNA Methylation") 
# Save to pdf
png("heatmap.png",width = 600, height = 400)
draw(heatmap, annotation_legend_side =  "bottom")
dev.off()

## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE----------------
library(rGADEM)
library(BSgenome.Hsapiens.UCSC.hg19)
library(motifStack)

probes <- rowRanges(met)[values(met)[,status.col] %in% c("Hypermethylated" ,"Hypomethylated") ,]
# Get hypo/hyper methylated probes and make a 200bp window 
# surrounding each probe.
sequence <- RangedData(space = as.character(seqnames(probes)),
                       IRanges(start = start(ranges(probes)) - 100,
                               end = start(ranges(probes)) + 100), strand = "*")
#look for motifs
gadem <- GADEM(sequence, verbose = FALSE, genome = Hsapiens)

# How many motifs were found?
nMotifs(gadem)

# get the number of occurrences
nOccurrences(gadem)

# view all sequences consensus
consensus(gadem)

# Print motif
pwm <- getPWM(gadem)
pfm  <- new("pfm",mat=pwm[[1]],name="Novel Site 1")
plotMotifLogo(pfm)

# Number of instances of motif 1?
length(gadem@motifList[[1]]@alignList)


## ----results='asis', echo=TRUE,message=FALSE,warning=FALSE-----------------
library(MotIV)
# motif matching analysis
analysis.jaspar <- motifMatch(pwm)
summary(analysis.jaspar)
plot(analysis.jaspar, ncol=2, top=5, rev=FALSE, main="", bysim=TRUE, cex=0.4)

#  visualize the quality of the results looking into the alignments
# E-value gives an estimation of the match.
alignment <- viewAlignments(analysis.jaspar)
print(alignment[[1]])

## ----table4, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'-----
tabl <- "
|                  Histone marks                 |                                                   Role                                                  |
|:----------------------------------------------:|:-------------------------------------------------------------------------------------------------------:|
| Histone H3 lysine 4 trimethylation (H3K4me3)   | Promoter regions [@heintzman2007distinct,@bernstein2005genomic]                                      |
| Histone H3 lysine 4 monomethylation (H3K4me1)  | Enhancer regions [@heintzman2007distinct]                                                           |
| Histone H3 lysine 36 trimethylation (H3K36me3) | Transcribed regions                                                                                     |
| Histone H3 lysine 27 trimethylation (H3K27me3) | Polycomb repression [@bonasio2010molecular]                                                         |
| Histone H3 lysine 9 trimethylation (H3K9me3)   | Heterochromatin regions  [@peters2003partitioning]                                                  |
| Histone H3 acetylated at lysine 27 (H3K27ac)   | Increase activation of genomic elements [@heintzman2009histone,@rada2011unique,@creyghton2010histone] |
| Histone H3 lysine 9 acetylation  (H3K9ac)      | Transcriptional activation [@nishida2006histone]                                                    |
"
cat(tabl) 

## ----table5, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'-----
tabl <- "  
|                File                |                               Description                              |
|:----------------------------------:|:----------------------------------------------------------------------:|
| fc.signal.bigwig                   | Bigwig File containing  fold enrichment signal tracks                  |
| pval.signal.bigwig                 | Bigwig File containing -log10(p-value) signal tracks                   |
| hotspot.fdr0.01.broad.bed.gz       | Broad domains on enrichment for  DNase-seq for consolidated epigenomes |
| hotspot.broad.bed.gz               | Broad domains on enrichment for DNase-seq for consolidated epigenomes  |
| broadPeak.gz                       | Broad ChIP-seq peaks for consolidated  epigenomes                      |
| gappedPeak.gz                      | Gapped ChIP-seq peaks for consolidated   epigenomes                    |
| narrowPeak.gz                      | Narrow ChIP-seq peaks for consolidated epigenomes                      |
| hotspot.fdr0.01.peaks.bed.gz       | Narrow DNasePeaks for   consolidated epigenomes                        |
| hotspot.all.peaks.bed.gz           | Narrow DNasePeaks for  consolidated epigenomes                         |
| .macs2.narrowPeak.gz               | Narrow DNasePeaks for consolidated epigenomes                          |
| coreMarks_mnemonics.bed.gz        | 15 state chromatin segmentations                                       |
| mCRF_FractionalMethylation.bigwig | MeDIP/MRE(mCRF) fractional methylation calls                           |
| RRBS_FractionalMethylation.bigwig | RRBS fractional methylation calls                                      |
| WGBS_FractionalMethylation.bigwig | Whole genome bisulphite fractional methylation calls                   |
"
cat(tabl) 

## ----results='hide', eval=TRUE, echo=TRUE, message=FALSE,warning=FALSE-----
library(ChIPseeker)
library(pbapply)
library(ggplot2)

## ----results='hide', eval=FALSE, echo=TRUE, message=FALSE,warning=FALSE----
#  #------------------ Working with ChipSeq data ---------------
#  # Step 1: download histone marks for a brain and non-brain samples.
#  #------------------------------------------------------------
#  # loading annotation hub database
#  library(AnnotationHub)
#  ah = AnnotationHub()
#  
#  # Searching for brain consolidated epigenomes in the roadmap database
#  bpChipEpi_brain <- query(ah , c("EpigenomeRoadMap", "narrowPeak", "chip", "consolidated","brain","E068"))
#  # Get chip-seq data
#  histone.marks <- pblapply(names(bpChipEpi_brain), function(x) {ah[[x]]})
#  names(histone.marks) <- names(bpChipEpi_brain)
#  # OBS: histone.marks is available in TCGAWorkflowData package

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE----------------
data(histoneMarks)
# Create a GR object based on the hypo/hypermethylated probes.
probes <- keepStandardChromosomes(rowRanges(met)[values(met)[,status.col] %in% c("Hypermethylated","Hypomethylated"),])
# Defining a window of 3kbp - 3kbp_probe_3kbp
ranges(probes) <- IRanges(start = c(start(ranges(probes)) - 3000), end = c(start(ranges(probes)) + 3000))

### Profile of ChIP peaks binding to TSS regions
# First of all, to calculate the profile of ChIP peaks binding to TSS regions, we should
# prepare the TSS regions, which are defined as the flanking sequence of the TSS sites.
# Then align the peaks that are mapping to these regions and generate the tagMatrix.
tagMatrixList <- pbapply::pblapply(histone.marks, function(x) {
  getTagMatrix(keepStandardChromosomes(x), windows = probes, weightCol = "score")
})
# change names retrieved with the following command: basename(bpChipEpi_brain$title)
names(tagMatrixList) <- c("H3K4me1","H3K4me3", "H3K9ac", "H3K9me3", "H3K27ac",  "H3K27me3", "H3K36me3")

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  tagHeatmap(tagMatrixList, xlim=c(-3000, 3000),color = NULL)

## ----results='asis', echo=FALSE, fig.width = 7, message=FALSE,warning=FALSE----
tagHeatmap(tagMatrixList, xlim=c(-3000, 3000),color = NULL)

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  p <- plotAvgProf(tagMatrixList, xlim = c(-3000,3000), xlab = "Genomic Region (5'->3', centered on CpG)")
#  # We are centreing in the CpG instead of the TSS. So we'll change the labels manually
#  p <- p + scale_x_continuous(breaks=c(-3000,-1500,0,1500,3000),labels=c(-3000,-1500,"CpG",1500,3000))
#  library(ggthemes)
#  p + theme_few() + scale_colour_few(name="Histone marks") +  guides(colour = guide_legend(override.aes = list(size=4)))

## ----results='asis', echo=FALSE, message=FALSE,warning=FALSE---------------
p <- plotAvgProf(tagMatrixList, xlim = c(-3000,3000), xlab = "Genomic Region (5'->3', centered on CpG)")
# We are centreing in the CpG instead of the TSS. So we'll change the labels manually
if (requireNamespace("ggplot2", quietly = TRUE)) 
  p <- p + ggplot2::scale_x_continuous(breaks=c(-3000,-1500,0,1500,3000),
                                       labels=c(-3000,-1500,"CpG",1500,3000))

if (requireNamespace("ggthemes", quietly = TRUE)) 
  p + ggthemes::theme_few() + 
  ggthemes::scale_colour_few(name="Histone marks") + 
  ggplot2::guides(colour = guide_legend(override.aes = list(size=4)))

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  #----------- 8.3 Identification of Regulatory Enhancers   -------
#  library(TCGAbiolinks)
#  # Samples: primary solid tumor w/ DNA methylation and gene expression
#  lgg.samples <- matchedMetExp("TCGA-LGG", n = 10)
#  gbm.samples <- matchedMetExp("TCGA-GBM", n = 10)
#  samples <- c(lgg.samples,gbm.samples)
#  
#  #-----------------------------------
#  # 1 - Methylation
#  # ----------------------------------
#  query.met <- GDCquery(project = c("TCGA-LGG","TCGA-GBM"),
#                        data.category = "DNA methylation",
#                        platform = "Illumina Human Methylation 450",
#                        legacy = TRUE,
#                        barcode = samples)
#  GDCdownload(query.met)
#  met <- GDCprepare(query.met, save = FALSE)
#  met <- subset(met,subset = as.character(GenomicRanges::seqnames(met)) %in% c("chr9"))
#  
#  #-----------------------------------
#  # 2 - Expression
#  # ----------------------------------
#  query.exp <- GDCquery(project = c("TCGA-LGG","TCGA-GBM"),
#                        data.category = "Gene expression",
#                        data.type = "Gene expression quantification",
#                        platform = "Illumina HiSeq",
#                        file.type  = "results",
#                        legacy = TRUE,
#                        barcode =  samples)
#  GDCdownload(query.exp)
#  exp <- GDCprepare(query.exp, save = FALSE)
#  save(exp, met, gbm.samples, lgg.samples, file = "elmer.example.rda", compress = "xz")

## ---- message=FALSE,warning=FALSE------------------------------------------
library(ELMER)
library(MultiAssayExperiment)
library(GenomicRanges)
distal.probes <- get.feature.probe(genome = "hg19", 
                                   met.platform = "450K")

# Recover the data created in the last step
data(elmerExample)
rownames(exp) <- values(exp)$ensembl_gene_id
mae <- createMAE(exp = exp, 
                 met = met,
                 save = TRUE,
                 linearize.exp = TRUE,
                 save.filename = "mae.rda",
                 filter.probes = distal.probes,
                 met.platform = "450K",
                 genome = "hg19",
                 TCGA = TRUE)
mae

## ---- warning=FALSE, results="hide"----------------------------------------
cores <- 1 # you can use more cores if you want
group.col <- "project_id"
group1 <- "TCGA-GBM"
group2 <- "TCGA-LGG"

# Available directions are hypo and hyper, we will use only hypo
# due to speed constraint
direction <- "hypo"
dir.out <- paste0("elmer/",direction)
dir.create(dir.out, showWarnings = FALSE, recursive = TRUE)

## ---- warning=FALSE, results="hide"----------------------------------------
#--------------------------------------
# STEP 3: Analysis                     |
#--------------------------------------
# Step 3.1: Get diff methylated probes |
#--------------------------------------
message("Get diff methylated probes")
Sig.probes <- get.diff.meth(data = mae, 
                            group.col = group.col,
                            group1 = group1,
                            group2 =  group2,
                            minSubgroupFrac = 1.0, # Use all samples
                            sig.dif = 0.2, # defualt is 0.3
                            diff.dir = direction, # Search for hypomethylated probes in group 1
                            cores = cores, 
                            dir.out = dir.out, 
                            pvalue = 0.1)

## ---- warning=FALSE--------------------------------------------------------
datatable(Sig.probes[1:10,], 
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          rownames = TRUE)

## ---- warning=FALSE, results="hide"----------------------------------------
#-------------------------------------------------------------
# Step 3.2: Identify significant probe-gene pairs            |
#-------------------------------------------------------------
# Collect nearby 20 genes for Sig.probes
message("Get nearby genes")
nearGenes <- GetNearGenes(data = mae,
                          numFlankingGenes = 4, # default is 20 genes
                          probes = Sig.probes$probe)

message("Get anti correlated probes-genes")
pair <- get.pair(data = mae,
                 group.col = group.col,
                 group1 = group1,
                 group2 =  group2,
                 nearGenes = nearGenes,
                 mode = "supervised",
                 minSubgroupFrac = 1, # % of samples to use in to create groups U/M
                 permu.dir = paste0(dir.out,"/permu"),
                 permu.size = 5, # Please set to 100000 to get significant results
                 raw.pvalue = 0.1,   # defualt is 0.001
                 Pe = 0.5, # Please set to 0.001 to get significant results
                 filter.probes = TRUE, # See preAssociationProbeFiltering function
                 filter.percentage = 0.05,
                 filter.portion = 0.3,
                 dir.out = dir.out,
                 diff.dir = direction,
                 cores = cores,
                 label = direction)

## ---- warning=FALSE--------------------------------------------------------
head(nearGenes)
datatable(pair[1:10,], 
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          rownames = TRUE)

## ---- warning=FALSE, results="hide"----------------------------------------
#-------------------------------------------------------------
# Step 3.3: Motif enrichment analysis on the selected probes |
#-------------------------------------------------------------
enriched.motif <- get.enriched.motif(data = mae,
                                     probes = pair$Probe, 
                                     dir.out = dir.out, 
                                     label = direction,
                                     pvalue = 1, # default is FDR < 0.05
                                     min.incidence = 10,
                                     lower.OR = 1.1)
# One of the output from the  previous function is a file with the motif, OR and Number of probes
# It will be used for plotting purposes
motif.enrichment <- read.csv(paste0(dir.out,"/getMotif.",direction, ".motif.enrichment.csv"))

## ---- warning=FALSE--------------------------------------------------------
head(enriched.motif[names(enriched.motif)[1]]) ## probes in the given set that have the first motif.
datatable(motif.enrichment, 
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          filter = 'top',
          rownames = FALSE)

## ---- warning=FALSE, results="hide"----------------------------------------
#-------------------------------------------------------------
# Step 3.4: Identifying regulatory TFs                        |
#-------------------------------------------------------------
TF <- get.TFs(data = mae, 
              group.col = group.col,
              group1 = group1,
              group2 =  group2,
              mode = "supervised",
              enriched.motif = enriched.motif,
              dir.out = dir.out, 
              cores = cores,
              save.plots = FALSE,
              diff.dir = direction,
              label = direction)
    
# One of the output from the previous function is a file with the raking of TF,
# for each motif. It will be used for plotting purposes
TF.meth.cor <- get(load(paste0(dir.out,"/getTF.",direction,".TFs.with.motif.pvalue.rda")))

## ---- warning=FALSE--------------------------------------------------------
datatable(TF, 
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          rownames = FALSE)
datatable(TF.meth.cor[1:10,1:6], 
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
          rownames = TRUE)

## ---- echo=TRUE, message=FALSE,eval = FALSE, warnings=FALSE, results='asis',fig.height=6, fig.width=10----
#  heatmapPairs(data = mae,
#               group.col = group.col,
#               group1 = group1,
#               group2 = group2,
#               annotation.col = c("gender"),
#               pairs = pair,
#               filename =  NULL)

## ---- echo=TRUE, message=FALSE, warnings=FALSE, results='asis',fig.height=4,fig.width=15----
motif.enrichment.plot(motif.enrichment = motif.enrichment, 
                      save = FALSE,
                      significant = list(lowerOR = 1.1)) # Filter motifs in the plot lowerOR > 1.3

## ---- eval=FALSE, include=TRUE---------------------------------------------
#  grid:TF.rank.plot(motif.pvalue=TF.meth.cor, motif=TF$motif[1], save=FALSE)

## ----results='asis', echo=FALSE, message=FALSE,warning=FALSE,  fig.align='center',fig.height=8,fig.width=8----
gg <- TF.rank.plot(motif.pvalue=TF.meth.cor, motif=TF$motif[1], save=FALSE)
grid::grid.draw(gg[[1]])

## ----results='asis', echo=TRUE, message=FALSE, warning=FALSE---------------
png("TF.png",width = 800, height = 400)
scatter.plot(mae, 
             category = group.col, 
             save = FALSE, 
             lm_line = TRUE,
             byTF=list(TF=unlist(stringr::str_split(TF[1,"top_5percent_TFs"],";"))[1:4], 
                       probe=enriched.motif[[TF$motif[1]]]))
dev.off()

## --------------------------------------------------------------------------
pander::pander(sessionInfo(), compact = FALSE)