## ----include=FALSE------------------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
options(scipen = 1, digits = 2)
set.seed(1234)
library(projectR)

## ----prcomp, warning=FALSE----------------------------------------------------
# data to define PCs
library(ggplot2)
data(p.RNAseq6l3c3t)

# do PCA on RNAseq6l3c3t expression data
pc.RNAseq6l3c3t<-prcomp(t(p.RNAseq6l3c3t))
pcVAR <- round(((pc.RNAseq6l3c3t$sdev)^2/sum(pc.RNAseq6l3c3t$sdev^2))*100,2)
dPCA <- data.frame(cbind(pc.RNAseq6l3c3t$x,pd.RNAseq6l3c3t))

#plot pca

setCOL <- scale_colour_manual(values = c("blue","black","red"), name="Condition:")
setFILL <- scale_fill_manual(values = c("blue","black","red"),guide = FALSE)
setPCH <- scale_shape_manual(values=c(23,22,25,25,21,24),name="Cell Line:")

pPCA <- ggplot(dPCA, aes(x=PC1, y=PC2, colour=ID.cond, shape=ID.line,
        fill=ID.cond)) +
        geom_point(aes(size=days),alpha=.6)+
        setCOL + setPCH  + setFILL +
        scale_size_area(breaks = c(2,4,6), name="Day") +
        theme(legend.position=c(0,0), legend.justification=c(0,0),
              legend.direction = "horizontal",
              panel.background = element_rect(fill = "white",colour=NA),
              legend.background = element_rect(fill = "transparent",colour=NA),
              plot.title = element_text(vjust = 0,hjust=0,face="bold")) +
        labs(title = "PCA of hPSC PolyA RNAseq",
            x=paste("PC1 (",pcVAR[1],"% of varience)",sep=""),
            y=paste("PC2 (",pcVAR[2],"% of varience)",sep=""))

## ----projectR.prcomp, warning=FALSE-------------------------------------------
# data to project into PCs from RNAseq6l3c3t expression data
data(p.ESepiGen4c1l)
library(ggplot2)

PCA2ESepi <- projectR(data = p.ESepiGen4c1l$mRNA.Seq,loadings=pc.RNAseq6l3c3t,
full=TRUE, dataNames=map.ESepiGen4c1l[["GeneSymbols"]])

pd.ESepiGen4c1l<-data.frame(Condition=sapply(colnames(p.ESepiGen4c1l$mRNA.Seq),
  function(x) unlist(strsplit(x,'_'))[1]),stringsAsFactors=FALSE)
pd.ESepiGen4c1l$color<-c(rep("red",2),rep("green",3),rep("blue",2),rep("black",2))
names(pd.ESepiGen4c1l$color)<-pd.ESepiGen4c1l$Cond

dPCA2ESepi<- data.frame(cbind(t(PCA2ESepi[[1]]),pd.ESepiGen4c1l))

#plot pca
library(ggplot2)
setEpiCOL <- scale_colour_manual(values = c("red","green","blue","black"),
  guide = guide_legend(title="Lineage"))

pPC2ESepiGen4c1l <- ggplot(dPCA2ESepi, aes(x=PC1, y=PC2, colour=Condition)) +
  geom_point(size=5) + setEpiCOL +
  theme(legend.position=c(0,0), legend.justification=c(0,0),
  panel.background = element_rect(fill = "white"),
  legend.direction = "horizontal",
  plot.title = element_text(vjust = 0,hjust=0,face="bold")) +
  labs(title = "Encode RNAseq in target PC1 & PC2",
  x=paste("Projected PC1 (",round(PCA2ESepi[[2]][1],2),"% of varience)",sep=""),
  y=paste("Projected PC2 (",round(PCA2ESepi[[2]][2],2),"% of varience)",sep=""))


## ----fig.show='hold', fig.width=10, fig.height=5, echo=FALSE, message= FALSE----
library(gridExtra)
#grid.arrange(pPCA,pPC2ESepiGen4c1l,nrow=1)

## -----------------------------------------------------------------------------
# get data

AP <- get(data("AP.RNAseq6l3c3t")) #CoGAPS run data
AP <- AP$Amean
# heatmap of gene weights for CoGAPs patterns
library(gplots)
par(mar=c(1,1,1,1))
pNMF<-heatmap.2(as.matrix(AP),col=bluered, trace='none',
          distfun=function(c) as.dist(1-cor(t(c))) ,
          cexCol=1,cexRow=.5,scale = "row",
          hclustfun=function(x) hclust(x, method="average")
      )

## -----------------------------------------------------------------------------
# data to project into PCs from RNAseq6l3c3t expression data

data('p.ESepiGen4c1l4')
data('p.RNAseq6l3c3t')

NMF2ESepi <- projectR(p.ESepiGen4c1l$mRNA.Seq,loadings=AP,full=TRUE,
    dataNames=map.ESepiGen4c1l[["GeneSymbols"]])

dNMF2ESepi<- data.frame(cbind(t(NMF2ESepi),pd.ESepiGen4c1l))

#plot pca
library(ggplot2)
setEpiCOL <- scale_colour_manual(values = c("red","green","blue","black"),
guide = guide_legend(title="Lineage"))

pNMF2ESepiGen4c1l <- ggplot(dNMF2ESepi, aes(x=X1, y=X2, colour=Condition)) +
  geom_point(size=5) + setEpiCOL +
  theme(legend.position=c(0,0), legend.justification=c(0,0),
  panel.background = element_rect(fill = "white"),
  legend.direction = "horizontal",
  plot.title = element_text(vjust = 0,hjust=0,face="bold"))
  labs(title = "Encode RNAseq in target PC1 & PC2",
  x=paste("Projected PC1 (",round(PCA2ESepi[[2]][1],2),"% of varience)",sep=""),
  y=paste("Projected PC2 (",round(PCA2ESepi[[2]][2],2),"% of varience)",sep=""))

## ----correlateR-exp-----------------------------------------------------------
# data to

data("p.RNAseq6l3c3t")

# get genes correlated to T
cor2T<-correlateR(genes="T", dat=p.RNAseq6l3c3t, threshtype="N", threshold=10, absR=TRUE)
cor2T <- cor2T@corM
### heatmap of genes more correlated to T
indx<-unlist(sapply(cor2T,rownames))
indx <- as.vector(indx)
colnames(p.RNAseq6l3c3t)<-pd.RNAseq6l3c3t$sampleX
library(reshape2)
pm.RNAseq6l3c3t<-melt(cbind(p.RNAseq6l3c3t[indx,],indx))

library(gplots)
library(ggplot2)
library(viridis)
pCorT<-ggplot(pm.RNAseq6l3c3t, aes(variable, indx, fill = value)) +
  geom_tile(colour="gray20", size=1.5, stat="identity") +
  scale_fill_viridis(option="B") +
  xlab("") +  ylab("") +
  scale_y_discrete(limits=indx) +
  ggtitle("Ten genes most highly pos & neg correlated with T") +
  theme(
    panel.background = element_rect(fill="gray20"),
    panel.border = element_rect(fill=NA,color="gray20", size=0.5, linetype="solid"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.text = element_text(size=rel(1),hjust=1),
    axis.text.x = element_text(angle = 90,vjust=.5),
    legend.text = element_text(color="white", size=rel(1)),
    legend.background = element_rect(fill="gray20"),
    legend.position = "bottom",
    legend.title=element_blank()
)


## ----fig.show='hold', fig.width=10, fig.height=5, echo=FALSE------------------
pCorT

## -----------------------------------------------------------------------------
# data to project into from RNAseq6l3c3t expression data
data(p.ESepiGen4c1l)


cor2ESepi <- projectR(p.ESepiGen4c1l$mRNA.Seq,loadings=cor2T[[1]],full=FALSE,
    dataNames=map.ESepiGen4c1l$GeneSymbols)


## ----projectionDriver, message = F, out.width="100%"------
options(width = 60)
library(dplyr, warn.conflicts = F)

#size-normed, log expression
data("microglial_counts")

#size-normed, log expression
data("glial_counts")

#5 pattern cogaps object generated on microglial_counts
data("cr_microglial")
microglial_fl <- cr_microglial@featureLoadings

#the features by which to weight the difference in expression 
pattern_to_weight <- "Pattern_1"
drivers_ci <- projectionDriveR(microglial_counts, #expression matrix
                                       glial_counts, #expression matrix
                                       loadings = microglial_fl, #feature x pattern dataframe
                                       loadingsNames = NULL,
                                       pattern_name = pattern_to_weight, #column name
                                       pvalue = 1e-5, #pvalue before bonferroni correction
                                       display = T,
                                       normalize_pattern = T, #normalize feature weights
                                       mode = "CI") #confidence interval mode


conf_intervals <- drivers_ci$mean_ci[drivers_ci$sig_genes$significant_shared_genes,]


str(conf_intervals)

drivers_pv <- projectionDriveR(microglial_counts, #expression matrix
                                       glial_counts, #expression matrix
                                       loadings = microglial_fl, #feature x pattern dataframe
                                       loadingsNames = NULL,
                                       pattern_name = pattern_to_weight, #column name
                                       pvalue = 1e-5, #pvalue before bonferroni correction
                                       display = T,
                                       normalize_pattern = T, #normalize feature weights
                                       mode = "PV") #confidence interval mode

difexp <- drivers_pv$difexpgenes
str(difexp)



## ---------------------------------------------------------
suppressWarnings(library(cowplot))
#order in ascending order of estimates
conf_intervals <- conf_intervals %>% mutate(mid = (high+low)/2) %>% arrange(mid)
gene_order <- rownames(conf_intervals)

#add text labels for top and bottom n genes
conf_intervals$label_name <- NA_character_
n <- 2
idx <- c(1:n, (dim(conf_intervals)[1]-(n-1)):dim(conf_intervals)[1])
gene_ids <- gene_order[idx]
conf_intervals$label_name[idx] <- gene_ids

#the labels above can now be used as ggplot aesthetics
plots_list <- plotConfidenceIntervals(conf_intervals, #mean difference in expression confidence intervals
                                      sort = F, #should genes be sorted by estimates
                                      weights = drivers_ci$normalized_weights[rownames(conf_intervals)],
                                      pattern_name = pattern_to_weight,
                                      weights_clip = 0.99,
                                      weights_vis_norm = "none")

pl1 <- plots_list[["ci_estimates_plot"]] +
  ggrepel::geom_label_repel(aes(label = label_name), max.overlaps = 20, force = 50)

pl2 <- plots_list[["weights_heatmap"]]

#now plot the weighted differences
weighted_conf_intervals <- drivers_ci$weighted_mean_ci[gene_order,]
plots_list_weighted <- plotConfidenceIntervals(weighted_conf_intervals,
                                               sort = F,
                                               weighted = T)

pl3 <- plots_list_weighted[["ci_estimates_plot"]] +
  xlab("Difference in weighted group means") +
  theme(axis.title.y = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank())

cowplot::plot_grid(pl1, pl2, pl3, align = "h", rel_widths = c(1,.4, 1), ncol = 3)




## ----fig.width=9, fig.height=10---------------------------
suppressWarnings(library(cowplot))
library(fgsea)
library(msigdbr)
#manually change FC and pvalue threshold from defaults 0.2, 1e-5
drivers_pv_mod <- pdVolcano(drivers_pv, FC = 0.5, pvalue = 1e-7)

difexp_mod <- drivers_pv_mod$difexpgenes
str(difexp_mod)

#fgsea application 

#extract unweighted fgsea vector
fgseavec <- drivers_pv$fgseavecs$unweightedvec
#split into enrichment groups, negative estimates are enriched in the reference matrix (glial), positive are enriched in the test matrix (microglial), take abs value 
glial_fgsea_vec <- abs(fgseavec[which(fgseavec < 0)])
microglial_fgsea_vec <- abs(fgseavec[which(fgseavec > 0)])

#get FGSEA pathways - selecting subcategory C8 for cell types
msigdbr_list =  msigdbr::msigdbr(species = "Mus musculus", category = "C8")
pathways = split(x = msigdbr_list$ensembl_gene, f = msigdbr_list$gs_name)

#run fgsea scoreType positive, all values will be positive
glial_fgsea <- fgsea::fgsea(pathways, glial_fgsea_vec, scoreType = "pos")
#tidy 
glial_fgseaResTidy <- glial_fgsea %>% 
  subset(padj <= 0.05 & size >= 10 & size <= 500) %>%
    as_tibble() %>%
    dplyr::arrange(desc(size))
#plot 
glial_EnrichmentPlot <- ggplot2::ggplot(glial_fgseaResTidy, 
                                        ggplot2::aes(reorder(pathway, NES), NES)) + 
  ggplot2::geom_point(aes(size=size, color = padj)) + 
  coord_flip() + 
  labs(x="Pathway", y="Normalized Enrichment Score", title="Pathway NES from GSEA") + 
  theme_minimal()
glial_EnrichmentPlot
  
microglial_fgsea <- fgsea::fgsea(pathways, microglial_fgsea_vec, scoreType = "pos")
#tidy 
microglial_fgseaResTidy <- microglial_fgsea %>% 
  subset(padj <= 0.05 & size >= 10 & size <= 500) %>%
    as_tibble() %>%
    dplyr::arrange(desc(size))

microglial_EnrichmentPlot <- ggplot2::ggplot(microglial_fgseaResTidy, 
                                             ggplot2::aes(reorder(pathway, NES), NES)) +
  ggplot2::geom_point(aes(size=size, color = padj)) + 
  coord_flip() + 
  labs(x="Pathway", y="Normalized Enrichment Score", title="Pathway NES from GSEA") + 
  theme_minimal()
microglial_EnrichmentPlot