## ----style, echo=FALSE, results='hide', warning=FALSE, message=FALSE----------
BiocStyle::markdown()

suppressPackageStartupMessages({
    library(knitr)
    library(RAIDS)
    library(SNPRelate)
    library(gdsfmt)
})

set.seed(121444)

## ----runRefGDS, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE----
#############################################################################
## Load required packages
#############################################################################
library(RAIDS)    
library(SNPRelate)

pathRef <- system.file("extdata/", package="RAIDS")

fileReferenceGDS <- file.path(pathRef, "PopulationReferenceDemo.gds")

gdsRef <- snpgdsOpen(fileReferenceGDS)

## Show the file format
print(gdsRef)

closefn.gds(gdsRef)

## ----createRefGDS, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE----
#############################################################################
## Load required packages
#############################################################################
library(RAIDS)    
library(SNPRelate)
library(gdsfmt)

## Create a temporary GDS Reference file in the temporary directory
fileNewReferenceGDS <- file.path(tempdir(), "reference_DEMO.gds")

gdsRefNew <- createfn.gds(fileNewReferenceGDS)

## The entry 'sample.id' contain the unique identifiers of 10 samples 
## that constitute the reference dataset
sample.id <- c("HG00243", "HG00150", "HG00149", "HG00246", "HG00138", 
                    "HG01334", "HG00268", "HG00275", "HG00290", "HG00364")
add.gdsn(node=gdsRefNew, name="sample.id", val=sample.id, 
            storage="string", check=TRUE)

## A data frame containing the information about the 10 samples 
## (in the same order than in the 'sample.id') is created and added to 
## the 'sample.annot' entry
## The data frame must contain those columns: 
##     'sex': '1'=male, '2'=female
##     'pop.group': acronym for the population (ex: GBR, CDX, MSL, ASW, etc..)
##     'superPop': acronym for the super-population (ex: AFR, EUR, etc...)
##     'batch': number identifying the batch of provenance 
sampleInformation <- data.frame(sex=c("1", "2", "1", "1", "1", 
        "1", "2", "2", "1", "2"), pop.group=c(rep("GBR", 6), rep("FIN", 4)), 
        superPop=c(rep("EUR", 10)), batch=rep(0, 10), stringsAsFactors=FALSE)
add.gdsn(node=gdsRefNew, name="sample.annot", val=sampleInformation, 
            check=TRUE)

## The identifier of each SNV is added in the 'snp.id' entry
snvID <- c("s29603", "s29605", "s29633", "s29634", "s29635", "s29637", 
            "s29638", "s29663", "s29664", "s29666", "s29667", "s29686", 
            "s29687", "s29711", "s29741", "s29742", "s29746", "s29750", 
            "s29751", "s29753")
add.gdsn(node=gdsRefNew, name="snp.id", val=snvID, check=TRUE)

## The chromosome of each SNV is added to the 'snp.chromosome' entry
## The order of the SNVs is the same than in the 'snp.id' entry
snvChrom <- c(rep(1, 20))
add.gdsn(node=gdsRefNew, name="snp.chromosome", val=snvChrom, storage="uint16",
            check=TRUE)

## The position on the chromosome of each SNV is added to 
## the 'snp.position' entry
## The order of the SNVs is the same than in the 'snp.id' entry
snvPos <- c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, 
                    3469737, 3471497, 3471565, 3471618)
add.gdsn(node=gdsRefNew, name="snp.position", val=snvPos, storage="int32",
            check=TRUE)

## The allele information of each SNV is added to the 'snp.allele' entry
## The order of the SNVs is the same than in the 'snp.allele' entry
snvAllele <- c("A/G", "C/G", "C/T", "C/T", "T/G", "C/T", 
                    "G/A", "A/G", "G/A", "G/A")
add.gdsn(node=gdsRefNew, name="snp.allele", val=snvAllele, storage="string",
            check=TRUE)

## The allele frequency in the general population (between 0 and 1) of each 
## SNV is added to the 'snp.AF' entry
## The order of the SNVs is the same than in the 'snp.id' entry
snvAF <- c(0.86, 0.01, 0.00, 0.00, 0.01, 0.00, 0.00, 0.00, 0.00, 0.01)
add.gdsn(node=gdsRefNew, name="snp.AF", val=snvAF, storage="packedreal24",
            check=TRUE)

## The allele frequency in the East Asian population (between 0 and 1) of each 
## SNV is added to the 'snp.EAS_AF' entry
## The order of the SNVs is the same than in the 'snp.id' entry
snvAF <- c(0.80, 0.00, 0.00, 0.01, 0.00, 0.00, 0.01, 0.00, 0.02, 0.00)
add.gdsn(node=gdsRefNew, name="snp.EAS_AF", val=snvAF, storage="packedreal24",
            check=TRUE)

## The allele frequency in the European population (between 0 and 1) of each 
## SNV is added to the 'snp.EUR_AF' entry
## The order of the SNVs is the same than in the 'snp.id' entry
snvAF <- c(0.91, 0.00, 0.01, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.03)
add.gdsn(node=gdsRefNew, name="snp.EUR_AF", val=snvAF, storage="packedreal24",
            check=TRUE)

## The allele frequency in the African population (between 0 and 1) of each 
## SNV is added to the 'snp.AFR_AF' entry
## The order of the SNVs is the same than in the 'snp.id' entry
snvAF <- c(0.85, 0.04, 0.00, 0.00, 0.00, 0.01, 0.00, 0.00, 0.00, 0.00)
add.gdsn(node=gdsRefNew, name="snp.AFR_AF", val=snvAF, storage="packedreal24",
            check=TRUE)

## The allele frequency in the American population (between 0 and 1) of each 
## SNV is added to the 'snp.AMR_AF' entry
## The order of the SNVs is the same than in the 'snp.id' entry
snvAF <- c(0.83, 0.01, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.02)
add.gdsn(node=gdsRefNew, name="snp.AMR_AF", val=snvAF, storage="packedreal24",
            check=TRUE)

## The allele frequency in the South Asian population (between 0 and 1) of each 
## SNV is added to the 'snp.SAS_AF' entry
## The order of the SNVs is the same than in the 'snp.id' entry
snvAF <- c(0.89, 0.00, 0.00, 0.00, 0.05, 0.00, 0.00, 0.01, 0.00, 0.00)
add.gdsn(node=gdsRefNew, name="snp.SAS_AF", val=snvAF, storage="packedreal24",
            check=TRUE)

## The genotype of each SNV for each sample is added to the 'genotype' entry
## The genotype correspond to the number of A alleles
## The rows represent the SNVs is the same order than in 'snp.id' entry
## The columns represent the samples is the same order than in 'sample.id' entry
genotypeInfo <- matrix(data=c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0), nrow=10, byrow=TRUE)
add.gdsn(node=gdsRefNew, name="genotype", val=genotypeInfo, 
            storage="bit2", check=TRUE)

## The entry 'sample.ref' is filled with 1 indicating that all 10 
## samples are retained to be used as reference
## The order of the samples is the same than in the 'sample.id' entry
add.gdsn(node=gdsRefNew, name="sample.ref", val=rep(1L, 10), 
            storage="bit1", check=TRUE)

## Show the file format
print(gdsRefNew)

closefn.gds(gdsRefNew)

unlink(fileNewReferenceGDS, force=TRUE)


## ----runRefAnnotGDS, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE----
#############################################################################
## Load required packages
#############################################################################
library(RAIDS)    
library(SNPRelate)

pathReference <- system.file("extdata/tests", package="RAIDS")

fileReferenceAnnotGDS <- file.path(pathReference, "ex1_good_small_1KG.gds")

gdsRefAnnot <- openfn.gds(fileReferenceAnnotGDS)

## Show the file format
print(gdsRefAnnot)

closefn.gds(gdsRefAnnot)

## ----createRefAnnotGDS, echo=TRUE, eval=TRUE, collapse=TRUE, warning=FALSE, message=FALSE----
#############################################################################
## Load required packages
#############################################################################
library(RAIDS)    
library(gdsfmt)

## Create a temporary GDS Reference file in the temporary directory
fileNewReferenceAnnotGDS <- 
        file.path(tempdir(), "reference_SNV_Annotation_DEMO.gds")

gdsRefAnnotNew <- createfn.gds(fileNewReferenceAnnotGDS)

## The entry 'phase' contain the phase of the SNVs in the
## Population Annotation GDS file
## 0 means the first allele is a reference; 1 means the first allele is
## the alternative and 3 means unknown
## The SNVs (rows) and samples (columns) in phase are in the same order as
## in the Population Annotation GDS file.
phase <- matrix(data=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
                        0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
                        0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
                        1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
                        0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
                        0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
                        0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 1, 0, 1, 1, 0, 1, 1, 1, 1), ncol=10, byrow=TRUE)
add.gdsn(node=gdsRefAnnotNew, name="phase", val=phase, storage="bit2", 
                check=TRUE)

## The entry 'blockAnnot' contain the information for each group of blocks
## that are present in the 'block' entry.
blockAnnot <- data.frame(block.id=c("EAS.0.05.500k", "EUR.0.05.500k",
                    "AFR.0.05.500k", "AMR.0.05.500k", "SAS.0.05.500k"),
                block.desc=c(
                    "EAS populationblock base on SNP 0.05 and windows 500k",
                    "EUR populationblock base on SNP 0.05 and windows 500k",
                    "AFR populationblock base on SNP 0.05 and windows 500k",
                    "AMR populationblock base on SNP 0.05 and windows 500k",
                    "SAS populationblock base on SNP 0.05 and windows 500k"),
                stringsAsFactors=FALSE)
add.gdsn(node=gdsRefAnnotNew, name="block.annot", val=blockAnnot, check=TRUE)

## The entry 'block' contain the block information for the SNVs in the
## Population Annotation GDS file.
## The SNVs (rows) are in the same order as in 
## the Population Annotation GDS file.
## The block groups (columns) are in the same order as in 
## the 'block.annot' entry.
block <- matrix(data=c(-1, -1, -1, -1, -1,
                        -2, -2,  1, -2, -2,
                        -2,  1,  1,  1, -2,
                        -2,  1,  1,  1, -2,
                        -2, -3, -2, -3, -2,
                         1,  2,  2,  2,  1,
                         1,  2,  2,  2,  1,
                        -3, -4, -3, -4, -3,
                         2, -4,  3, -4, -3,
                         2, -4,  3, -4, -3), ncol=5, byrow=TRUE)
add.gdsn(node=gdsRefAnnotNew, name="block", val=block, storage="int32", 
            check=TRUE)

## Show the file format
print(gdsRefAnnotNew)

closefn.gds(gdsRefAnnotNew)

unlink(fileNewReferenceAnnotGDS, force=TRUE)


## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()