## ----init, results='hide', echo=FALSE, warning=FALSE, message=FALSE-----------
library(knitr)
opts_chunk$set(warning=FALSE, message=FALSE)
BiocStyle::markdown()

## ----install_bioc, eval=FALSE-------------------------------------------------
# if (!require("BiocManager"))
#     install.packages("BiocManager")
# BiocManager::install('GenomicDataCommons')

## ----libraries, message=FALSE-------------------------------------------------
library(GenomicDataCommons)

## ----statusQS-----------------------------------------------------------------
GenomicDataCommons::status()

## ----statusCheck--------------------------------------------------------------
stopifnot(GenomicDataCommons::status()$status=="OK")

## ----manifest-----------------------------------------------------------------
ge_manifest <- files() |>
    filter( cases.project.project_id == 'TCGA-OV') |> 
    filter( type == 'gene_expression' ) |>
    filter( analysis.workflow_type == 'STAR - Counts')  |>
    manifest()
head(ge_manifest)

## ----downloadQS, eval=FALSE---------------------------------------------------
# fnames <- lapply(ge_manifest$id[1:20], gdcdata)

## ----gdc_clinical-------------------------------------------------------------
case_ids = cases() |> results(size=10) |> ids()
clindat = gdc_clinical(case_ids)
names(clindat)

## ----clinData-----------------------------------------------------------------
head(clindat[["main"]])
head(clindat[["diagnoses"]])

## ----metadataQS---------------------------------------------------------------
expands = c("diagnoses","annotations",
             "demographic","exposures")
clinResults = cases() |>
    GenomicDataCommons::select(NULL) |>
    GenomicDataCommons::expand(expands) |>
    results(size=50)
str(clinResults[[1]],list.len=6)
# or listviewer::jsonedit(clinResults)

## ----projectquery-------------------------------------------------------------
pquery = projects()

## ----pquery-------------------------------------------------------------------
str(pquery)

## ----pquerycount--------------------------------------------------------------
pcount = count(pquery)
# or
pcount = pquery |> count()
pcount

## ----pqueryresults------------------------------------------------------------
presults = pquery |> results()

## ----presultsstr--------------------------------------------------------------
str(presults)

## ----presultsall--------------------------------------------------------------
length(ids(presults))
presults = pquery |> results_all()
length(ids(presults))
# includes all records
length(ids(presults)) == count(pquery)

## ----defaultfields------------------------------------------------------------
default_fields('files')
# The number of fields available for files endpoint
length(available_fields('files'))
# The first few fields available for files endpoint
head(available_fields('files'))

## ----selectexample------------------------------------------------------------
# Default fields here
qcases = cases()
qcases$fields
# set up query to use ALL available fields
# Note that checking of fields is done by select()
qcases = cases() |> GenomicDataCommons::select(available_fields('cases'))
head(qcases$fields)

## ----aggexample---------------------------------------------------------------
# total number of files of a specific type
res = files() |> facet(c('type','data_type')) |> aggregations()
res$type

## ----allfilesunfiltered-------------------------------------------------------
qfiles = files()
qfiles |> count() # all files

## ----onlyGeneExpression-------------------------------------------------------
qfiles = files() |> filter( type == 'gene_expression')
# here is what the filter looks like after translation
str(get_filter(qfiles))

## ----filtAvailFields----------------------------------------------------------
grep('pro',available_fields('files'),value=TRUE) |> 
    head()

## ----filtProgramID------------------------------------------------------------
files() |> 
    facet('cases.project.project_id') |> 
    aggregations() |> 
    head()

## ----filtfinal----------------------------------------------------------------
qfiles = files() |>
    filter( cases.project.project_id == 'TCGA-OV' & type == 'gene_expression')
str(get_filter(qfiles))
qfiles |> count()

## ----filtChain----------------------------------------------------------------
qfiles2 = files() |>
    filter( cases.project.project_id == 'TCGA-OV') |> 
    filter( type == 'gene_expression') 
qfiles2 |> count()
(qfiles |> count()) == (qfiles2 |> count()) #TRUE

## ----filtAndManifest----------------------------------------------------------
manifest_df = qfiles |> manifest()
head(manifest_df)

## ----filterForSTARCounts------------------------------------------------------
qfiles = files() |> filter( ~ cases.project.project_id == 'TCGA-OV' &
                            type == 'gene_expression' &
                            access == "open" &
                            analysis.workflow_type == 'STAR - Counts')
manifest_df = qfiles |> manifest()
nrow(manifest_df)

## ----authenNoRun, eval=FALSE--------------------------------------------------
# token = gdc_token()
# transfer(...,token=token)
# # or
# transfer(...,token=get_token())

## ----singlefileDL-------------------------------------------------------------
fnames = gdcdata(manifest_df$id[1:2],progress=FALSE)


## ----bulkDL, eval=FALSE-------------------------------------------------------
# # Requires gcd_client command-line utility to be isntalled
# # separately.
# fnames = gdcdata(manifest_df$id[3:10], access_method = 'client')

## ----casesPerProject----------------------------------------------------------
res = cases() |> facet("project.project_id") |> aggregations()
head(res)
library(ggplot2)
ggplot(res$project.project_id,aes(x = key, y = doc_count)) +
    geom_bar(stat='identity') +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

## ----casesInTCGA--------------------------------------------------------------
cases() |> filter(~ project.program.name=='TARGET') |> count()

## ----casesInTARGET------------------------------------------------------------
cases() |> filter(~ project.program.name=='TCGA') |> count()

## ----casesTCGABRCASampleTypes-------------------------------------------------
# The need to do the "&" here is a requirement of the
# current version of the GDC API. I have filed a feature
# request to remove this requirement.
resp = cases() |> filter(~ project.project_id=='TCGA-BRCA' &
                              project.project_id=='TCGA-BRCA' ) |>
    facet('samples.sample_type') |> aggregations()
resp$samples.sample_type

## ----casesTCGABRCASolidNormal-------------------------------------------------
# The need to do the "&" here is a requirement of the
# current version of the GDC API. I have filed a feature
# request to remove this requirement.
resp = cases() |> filter(~ project.project_id=='TCGA-BRCA' &
                              samples.sample_type=='Solid Tissue Normal') |>
    GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) |>
    response_all()
count(resp)
res = resp |> results()
str(res[1],list.len=6)
head(ids(resp))

## ----casesFemaleTCGA----------------------------------------------------------
cases() |>
  GenomicDataCommons::filter(~ project.program.name == 'TCGA' &
    "cases.demographic.gender" %in% "female") |>
      GenomicDataCommons::results(size = 4) |>
        ids()

## ----notFemaleTCGACOAD--------------------------------------------------------
cases() |>
  GenomicDataCommons::filter(~ project.project_id == 'TCGA-COAD' &
    "cases.demographic.gender" %exclude% "female") |>
      GenomicDataCommons::results(size = 4) |>
        ids()

## ----missingGenderTCGA--------------------------------------------------------
cases() |>
  GenomicDataCommons::filter(~ project.program.name == 'TCGA' &
    missing("cases.demographic.gender")) |>
      GenomicDataCommons::results(size = 4) |>
        ids()

## ----notMissingGenderTCGA-----------------------------------------------------
cases() |>
  GenomicDataCommons::filter(~ project.program.name == 'TCGA' &
    !missing("cases.demographic.gender")) |>
      GenomicDataCommons::results(size = 4) |>
        ids()

## ----filesVCFCount------------------------------------------------------------
res = files() |> facet('type') |> aggregations()
res$type
ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

## ----filesRNAseqGeneGBM-------------------------------------------------------
q = files() |>
    GenomicDataCommons::select(available_fields('files')) |>
    filter(~ cases.project.project_id=='TCGA-GBM' &
               data_type=='Gene Expression Quantification')
q |> facet('analysis.workflow_type') |> aggregations()
# so need to add another filter
file_ids = q |> filter(~ cases.project.project_id=='TCGA-GBM' &
                            data_type=='Gene Expression Quantification' &
                            analysis.workflow_type == 'STAR - Counts') |>
    GenomicDataCommons::select('file_id') |>
    response_all() |>
    ids()

## ----filesRNAseqGeneGBMforBAM-------------------------------------------------
q = files() |>
    GenomicDataCommons::select(available_fields('files')) |>
    filter(~ cases.project.project_id == 'TCGA-GBM' &
               data_type == 'Aligned Reads' &
               experimental_strategy == 'RNA-Seq' &
               data_format == 'BAM')
file_ids = q |> response_all() |> ids()

## ----slicing10, eval=FALSE----------------------------------------------------
# bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token())
# library(GenomicAlignments)
# aligns = readGAlignments(bamfile)

## ----sessionInfo--------------------------------------------------------------
sessionInfo()