## ----setup, include = FALSE, warning = FALSE---------------------------------- knitr::opts_chunk$set(comment = FALSE, warning = FALSE, message = FALSE) ## ----------------------------------------------------------------------------- require(IgGeneUsage) require(knitr) require(ggplot2) require(ggforce) require(gridExtra) require(ggrepel) require(rstan) require(reshape2) rstan_options(auto_write = TRUE) ## ---- eval=F------------------------------------------------------------------ # data("IGHV_HCV", package = "IgGeneUsage") ## ---- eval=F------------------------------------------------------------------ # kable(x = head(IGHV_HCV), row.names = FALSE) ## ---- fig.height = 5, fig.width = 8, eval=F----------------------------------- # # we can compute the total number of rearrangements per sample # total.usage <- aggregate(gene_usage_count~sample_id+condition, # FUN = sum, data = IGHV_HCV) # total.usage$total <- total.usage$gene_usage_count # total.usage$gene_usage_count <- NULL # # # merge it with the original data # viz <- merge(x = IGHV_HCV, y = total.usage, # by = c("sample_id", "condition"), # all.x = TRUE) # # # compute % # viz$gene_usage_pct <- viz$gene_usage_count/viz$total*100 # # # For this example lets consider the 30 most used (mean prevalence) genes # # Hint: In real analyses you MUST use the complete set of genes # top <- aggregate(gene_usage_pct~gene_name, data = viz, FUN = mean) # top <- top[order(top$gene_usage_pct, decreasing = TRUE), ] # IGHV_HCV <- IGHV_HCV[IGHV_HCV$gene_name %in% top$gene_name[seq_len(30)], ] # viz <- viz[viz$gene_name %in% top$gene_name[seq_len(30)], ] # # # visualize # ggplot(data = viz)+ # geom_point(aes(x = gene_name, y = gene_usage_pct, # fill = condition, shape = condition), # position = position_dodge(width = .7), stroke = 0)+ # theme_bw(base_size = 11)+ # ylab(label = "Usage [%]")+ # xlab(label = '')+ # theme(legend.position = "top")+ # theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))+ # scale_fill_manual(name = "Condition", values = c("orange", "#6666ff"))+ # scale_shape_manual(name = "Condition", values = c(21, 22)) ## ---- eval=F------------------------------------------------------------------ # M <- DGU(usage.data = IGHV_HCV, # input data # mcmc.warmup = 250, # how many MCMC warm-ups per chain (default: 500) # mcmc.steps = 1000, # how many MCMC steps per chain (default: 1,500) # mcmc.chains = 2, # how many MCMC chain to run (default: 4) # mcmc.cores = 1, # how many PC cores to use? (e.g. parallel chains) # hdi.level = 0.95, # highest density interval level (de fault: 0.95) # adapt.delta = 0.95, # MCMC target acceptance rate (default: 0.95) # max.treedepth = 10) # tree depth evaluated at each step (default: 12) ## ---- eval=F------------------------------------------------------------------ # summary(M, eval=F) ## ---- eval=F------------------------------------------------------------------ # rstan::check_hmc_diagnostics(M$glm) ## ---- fig.height = 3, fig.width = 6, eval=F----------------------------------- # gridExtra::grid.arrange(rstan::stan_rhat(object = M$glm), # rstan::stan_ess(object = M$glm), # nrow = 1) ## ---- fig.height = 10, fig.width = 8, eval=F---------------------------------- # ggplot(data = M$ppc.data$ppc.repertoire)+ # facet_wrap(facets = ~sample_name, ncol = 5)+ # geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+ # geom_errorbar(aes(x = observed_count, y = ppc_mean_count, # ymin = ppc_L_count, ymax = ppc_H_count), col = "darkgray")+ # geom_point(aes(x = observed_count, y = ppc_mean_count, # fill = condition), shape = 21, size = 1)+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # scale_x_log10(breaks = c(1, 10, 100, 1000), # labels = expression(10^0, 10^1, 10^2, 10^3))+ # scale_y_log10(breaks = c(1, 10, 100, 1000), # labels = expression(10^0, 10^1, 10^2, 10^3))+ # xlab(label = "Observed usage [counts]")+ # ylab(label = "Predicted usage [counts]")+ # annotation_logticks(base = 10, sides = "lb") ## ---- fig.height = 4, fig.width = 6, eval=F----------------------------------- # ggplot(data = M$ppc.data$ppc.gene)+ # geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+ # geom_errorbar(aes(x = observed_prop*100, ymin = ppc_L_prop*100, # ymax = ppc_H_prop*100), col = "darkgray")+ # geom_point(aes(x = observed_prop*100, y = ppc_mean_prop*100, # fill = condition), shape = 21, size = 1)+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # xlab(label = "Observed usage [%]")+ # ylab(label = "Predicted usage [%]")+ # scale_x_log10()+ # scale_y_log10()+ # annotation_logticks(base = 10, sides = "lb") ## ---- eval=F------------------------------------------------------------------ # kable(x = head(M$glm.summary), row.names = FALSE, digits = 3) ## ---- fig.height = 4, fig.width = 6, eval=F----------------------------------- # # format data # stats <- M$glm.summary # stats <- stats[order(abs(stats$es_mean), decreasing = FALSE), ] # stats$gene_fac <- factor(x = stats$gene_name, levels = stats$gene_name) # # stats <- merge(x = stats, y = M$test.summary, by = "gene_name") # # ggplot(data = stats)+ # geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+ # geom_errorbar(aes(x = pmax, y = es_mean, ymin = es_L, ymax = es_H), # col = "darkgray")+ # geom_point(aes(x = pmax, y = es_mean))+ # geom_text_repel(data = stats[stats$pmax >= 0.8, ], # aes(x = pmax, y = es_mean, label = gene_fac), # min.segment.length = 0)+ # theme_bw(base_size = 11)+ # xlab(label = expression(pi))+ # xlim(c(0, 1)) ## ---- fig.height = 3, fig.width = 6, eval=F----------------------------------- # promising.genes <- stats$gene_name[stats$pmax >= 0.8] # # ppc.gene <- M$ppc.data$ppc.gene # ppc.gene <- ppc.gene[ppc.gene$gene_name %in% promising.genes, ] # # ggplot()+ # geom_errorbar(data = ppc.gene, # aes(x = gene_name, ymin = ppc_L_prop*100, # ymax = ppc_H_prop*100, col = condition), # position = position_dodge(width = .8), width = 0.75)+ # geom_point(data = viz[viz$gene_name %in% promising.genes, ], # aes(x = gene_name, y = gene_usage_pct, col = condition), # shape = 21, size = 1.5, fill = "black", # position = position_jitterdodge(jitter.width = 0.15, # jitter.height = 0, # dodge.width = 0.8))+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # ylab(label = "Usage [%]")+ # xlab(label = '') ## ---- fig.height = 5, fig.width = 6, eval=F----------------------------------- # promising.genes <- stats$gene_name[stats$pmax >= 0.8] # # ggplot(data = viz[viz$gene_name %in% promising.genes, ])+ # facet_wrap(facets = ~gene_name, ncol = 1, scales = "free_y")+ # geom_point(aes(x = sample_id, y = gene_usage_count, fill = condition, # size = total/10^3), shape = 21)+ # theme_bw(base_size = 11)+ # ylab(label = "Usage [count]")+ # xlab(label = '')+ # theme(legend.position = "top")+ # theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))+ # scale_fill_manual(name = "Condition", values = c("orange", "#6666ff"))+ # scale_size_continuous(name = expression("N ("*10^3*")"), range = c(1, 5))+ # ylab(label = "Usage [count]")+ # xlab(label = '')+ # scale_y_log10()+ # annotation_logticks(base = 10, sides = "l") ## ---- fig.height = 5, fig.width = 8, eval=F----------------------------------- # ggplot()+ # geom_hline(yintercept = c(-log10(0.05), -log10(0.01)), # linetype = "dashed", col = "darkgray")+ # geom_point(data = stats, col = "red", size = 2, # aes(x = pmax, y = -log10(t.test.fdr.pvalue)))+ # geom_text_repel(data = stats[stats$pmax >= 0.5, ], # aes(x = pmax, y = -log10(t.test.fdr.pvalue), # label = gene_name), size = 4, # min.segment.length = 0.1)+ # xlim(0, 1)+ # ylab(label = "-log10 (P-value) from t-test [FDR corrected]")+ # xlab(label = expression(pi))+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # scale_color_discrete(name = '') ## ---- fig.height = 4, fig.width = 5, eval=F----------------------------------- # promising.genes <- c("IGHV3-21", "IGHV3-72", "IGHV3-9") # # ppc.gene <- M$ppc.data$ppc.gene # ppc.gene <- ppc.gene[ppc.gene$gene_name %in% promising.genes, ] # # ggplot()+ # geom_errorbar(data = ppc.gene, # aes(x = gene_name, ymin = ppc_L_prop*100, # ymax = ppc_H_prop*100, col = condition), # position = position_dodge(width = .8), width = 0.75)+ # geom_point(data = viz[viz$gene_name %in% promising.genes, ], # aes(x = gene_name, y = gene_usage_pct, col = condition), # shape = 21, size = 1.5, fill = "black", # position = position_jitterdodge(jitter.width = 0.15, # jitter.height = 0, # dodge.width = 0.8))+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # ylab(label = "Gene usage [%]")+ # xlab(label = '') ## ---- fig.height = 6, fig.width = 6, eval=F----------------------------------- # promising.genes <- c("IGHV3-21", "IGHV3-72", "IGHV3-9") # # ggplot(data = viz[viz$gene_name %in% promising.genes, ])+ # facet_wrap(facets = ~gene_name, ncol = 1, scales = "free_y")+ # geom_point(aes(x = sample_id, y = gene_usage_count, fill = condition, # size = total/10^3), shape = 21)+ # theme_bw(base_size = 11)+ # ylab(label = "Usage [count]")+ # xlab(label = '')+ # theme(legend.position = "top")+ # theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))+ # scale_fill_manual(name = "Condition", values = c("orange", "#6666ff"))+ # scale_size_continuous(name = expression("N ("*10^3*")"), range = c(1, 5))+ # ylab(label = "Usage [count]")+ # xlab(label = '')+ # scale_y_log10()+ # annotation_logticks(base = 10, sides = "l") ## ---- fig.height = 6, fig.width = 8, eval=F----------------------------------- # ggplot()+ # geom_hline(yintercept = c(-log10(0.05), -log10(0.01)), # linetype = "dashed", col = "darkgray")+ # geom_point(data = stats, col = "red", size = 2, # aes(x = pmax, y = -log10(u.test.fdr.pvalue)))+ # geom_text_repel(data = stats[stats$pmax >= 0.5, ], # aes(x = pmax, y = -log10(u.test.fdr.pvalue), # label = gene_name), size = 4, # min.segment.length = 0.1)+ # xlim(0, 1)+ # ylab(label = "-log10 (P-value) from U-test [FDR corrected]")+ # xlab(label = expression(pi))+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # scale_color_discrete(name = '') ## ---- eval=F------------------------------------------------------------------ # data(Ig, package = "IgGeneUsage") ## ---- eval=F------------------------------------------------------------------ # kable(x = head(Ig), row.names = FALSE) ## ---- fig.height = 4, fig.width = 6, eval=F----------------------------------- # # we can compute the total number of rearrangements per sample # total.usage <- aggregate(gene_usage_count~sample_id+condition, # FUN = sum, data = Ig) # total.usage$total <- total.usage$gene_usage_count # total.usage$gene_usage_count <- NULL # # # merge it with the original data # viz <- merge(x = Ig, y = total.usage, # by = c("sample_id", "condition"), # all.x = TRUE) # # # compute % # viz$gene_usage_pct <- viz$gene_usage_count/viz$total*100 # # # visualize # ggplot(data = viz)+ # geom_point(aes(x = gene_name, y = gene_usage_pct, fill = condition, # shape = condition), stroke = 0, size = 3, # position = position_jitterdodge(jitter.width = 0.25, # dodge.width = 0.75))+ # theme_bw(base_size = 11)+ # ylab(label = "Usage [%]")+ # xlab(label = '')+ # theme(legend.position = "top")+ # scale_fill_manual(name = "Condition", values = c("orange", "#6666ff"))+ # scale_shape_manual(name = "Condition", values = c(21, 22)) ## ---- eval=F------------------------------------------------------------------ # M2 <- DGU(usage.data = Ig, # mcmc.warmup = 250, # mcmc.steps = 1000, # mcmc.chains = 2, # mcmc.cores = 1, # hdi.level = 0.95, # adapt.delta = 0.95, # max.treedepth = 12) ## ---- eval=F------------------------------------------------------------------ # rstan::check_hmc_diagnostics(M2$glm) ## ---- fig.height = 3, fig.width = 6, eval=F----------------------------------- # gridExtra::grid.arrange(rstan::stan_rhat(object = M2$glm), # rstan::stan_ess(object = M2$glm), # nrow = 1) ## ---- fig.height = 5, fig.width = 8, eval=F----------------------------------- # ggplot(data = M2$ppc$ppc.repertoire)+ # facet_wrap(facets = ~sample_name, ncol = 4)+ # geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+ # geom_errorbar(aes(x = observed_count, y = ppc_mean_count, # ymin = ppc_L_count, ymax = ppc_H_count), col = "darkgray")+ # geom_point(aes(x = observed_count, y = ppc_mean_count, # fill = condition), shape = 21, size = 1)+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # scale_x_log10(breaks = c(1, 10, 100, 1000), # labels = expression(10^0, 10^1, 10^2, 10^3))+ # scale_y_log10(breaks = c(1, 10, 100, 1000), # labels = expression(10^0, 10^1, 10^2, 10^3))+ # xlab(label = "Observed usage [counts]")+ # ylab(label = "Predicted usage [counts]")+ # annotation_logticks(base = 10, sides = "bl") ## ---- fig.height = 4, fig.width = 6, eval=F----------------------------------- # ggplot(data = M2$ppc.data$ppc.gene)+ # geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+ # geom_errorbar(aes(x = observed_prop*100, ymin = ppc_L_prop*100, # ymax = ppc_H_prop*100), col = "darkgray")+ # geom_point(aes(x = observed_prop*100, y = ppc_mean_prop*100, # fill = condition), shape = 21, size = 1)+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # xlab(label = "Observed usage [%]")+ # ylab(label = "Predicted usage [%]")+ # scale_x_log10()+ # scale_y_log10()+ # annotation_logticks(base = 10, sides = "bl") ## ---- eval=F------------------------------------------------------------------ # kable(x = M2$glm.summary, row.names = FALSE, digits = 3) ## ---- fig.height = 4, fig.width = 6, eval=F----------------------------------- # # format data # stats <- M2$glm.summary # stats <- stats[order(abs(stats$es_mean), decreasing = FALSE), ] # stats$gene_fac <- factor(x = stats$gene_name, levels = stats$gene_name) # # stats <- merge(x= stats, y = M2$test.summary, by = "gene_name") # # ggplot(data = stats)+ # geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+ # geom_errorbar(aes(x = pmax, y = es_mean, ymin = es_L, ymax = es_H), # col = "darkgray")+ # geom_point(aes(x = pmax, y = es_mean))+ # geom_text_repel(data = stats, aes(x = pmax, y = es_mean, label = gene_fac))+ # theme_bw(base_size = 11)+ # xlab(label = expression(pi))+ # xlim(0, 1) ## ---- fig.height = 6, fig.width = 8, eval=F----------------------------------- # ggplot()+ # geom_hline(yintercept = c(-log10(0.05), -log10(0.01)), # linetype = "dashed", col = "darkgray")+ # geom_point(data = stats, col = "red", size = 2, # aes(x = pmax, y = -log10(t.test.fdr.pvalue)))+ # geom_text_repel(data = stats, # aes(x = pmax, y = -log10(t.test.fdr.pvalue), # label = gene_name), size = 4, # min.segment.length = 0.1)+ # xlim(0, 1)+ # ylab(label = "-log10 (P-value) from t-test [FDR corrected]")+ # xlab(label = expression(pi))+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # scale_color_discrete(name = '') ## ---- fig.height = 5, fig.width = 8, eval=F----------------------------------- # promising.genes <- unique(M2$usage.data$gene_names) # # ppc.gene <- M2$ppc.data$ppc.gene # ppc.gene <- ppc.gene[ppc.gene$gene_name %in% promising.genes, ] # # ggplot()+ # geom_errorbar(data = ppc.gene, # aes(x = gene_name, ymin = ppc_L_prop*100, # ymax = ppc_H_prop*100, col = condition), # position = position_dodge(width = .8), width = 0.75)+ # geom_point(data = viz[viz$gene_name %in% promising.genes, ], # aes(x = gene_name, y = gene_usage_pct, col = condition), # shape = 21, size = 1.5, fill = "black", # position = position_jitterdodge(jitter.width = 0.15, # jitter.height = 0, # dodge.width = 0.8))+ # theme_bw(base_size = 11)+ # theme(legend.position = "top") ## ---- fig.height = 4, fig.width = 7, eval=F----------------------------------- # promising.genes <- "IGHV5" # # ggplot(data = viz[viz$gene_name %in% promising.genes, ])+ # facet_wrap(facets = ~gene_name, ncol = 1, scales = "free_y")+ # geom_point(aes(x = sample_id, y = gene_usage_count, fill = condition, # size = total/10^3), shape = 21)+ # theme_bw(base_size = 11)+ # ylab(label = "Usage [count]")+ # xlab(label = '')+ # theme(legend.position = "top")+ # theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))+ # scale_fill_manual(name = "Condition", values = c("orange", "#6666ff"))+ # scale_size_continuous(name = expression("N ("*10^3*")"), range = c(1, 5))+ # ylab(label = "Usage [count]")+ # xlab(label = '')+ # scale_y_log10()+ # annotation_logticks(base = 10, sides = "l") ## ---- fig.height = 4.5, fig.width = 8, eval=F--------------------------------- # ggplot()+ # geom_hline(yintercept = c(-log10(0.05), -log10(0.01)), # linetype = "dashed", col = "darkgray")+ # geom_point(data = stats, col = "red", size = 2, # aes(x = pmax, y = -log10(u.test.fdr.pvalue)))+ # geom_text_repel(data = stats, # aes(x = pmax, y = -log10(u.test.fdr.pvalue), # label = gene_name), size = 4, # min.segment.length = 0.1)+ # xlim(0, 1)+ # ylab(label = "-log10 (P-value) from U-test [FDR corrected]")+ # xlab(label = expression(pi))+ # theme_bw(base_size = 11)+ # theme(legend.position = "top")+ # scale_color_discrete(name = '')