code update

mcvickerlab · Aug 22, 2024 · e26c0dc · e26c0dc
1 parent fa198db
commit e26c0dc
Show file tree

Hide file tree

Showing 11 changed files with 2,515 additions and 457 deletions.
diff --git a/notebooks/experimental/Fig2D.ipynb b/notebooks/experimental/Fig2D.ipynb
diff --git a/notebooks/simulations/plot_power_curves.ipynb b/notebooks/simulations/plot_power_curves.ipynb
diff --git a/notebooks/simulations/plot_power_curves_example.ipynb b/notebooks/simulations/plot_power_curves_example.ipynb
diff --git a/notebooks/simulations/visualize_sim_interactions_data.ipynb b/notebooks/simulations/visualize_sim_interactions_data.ipynb
diff --git a/src/simulations/data/run_sim_counts_power_analysis.sh b/src/simulations/data/run_sim_counts_power_analysis.sh
@@ -22,7 +22,7 @@ fi
 # --png --tiff --pdf
 
 # simulate data resembling at scale experiment
-Rscript $PWD/sim_counts_interactions_power_analysis_updated.R --out $outdir \
+Rscript $PWD/sim_counts_interactions_power_analysis.R --out $outdir \
 --lambda 15 20 30 50  \
 --cells 50000 \
 --genes 1000 \

diff --git a/src/simulations/models/filt_power_analysis.R b/src/simulations/models/filt_power_analysis.R
@@ -0,0 +1,274 @@
+### this code runs a power analysis on simulated dataset for interactions between regulatory elements
+
+library(argparse)
+# library(Matrix)
+# library(ggplot2)
+library(MASS)
+library(rhdf5)
+library(dplyr)
+library(broom)
+
+# define parser to handle input arguments from command line
+parser <- ArgumentParser(description = "process input arguments")
+parser$add_argument('--h5', action = "store", type = "character", 
+                    help = "path to h5 file with simulated data")
+parser$add_argument("--cells", action = "store", type = 'integer',
+	                default = 50000,
+	                help = "number of cells in simualted dataset")
+parser$add_argument('--genes', action = "store", type = "integer", 
+	                default = 13000,
+                    help = "number of genes in simulated dataset")
+parser$add_argument("--out", action = "store", type = "character",
+	                help = "where to save outputs")
+parser$add_argument("--pseudocount", action = "store", type = "double",
+					default = NULL,
+					help = "pseudocount to add when modeling")
+parser$add_argument("--lambda", action = "store", type = "integer",
+					# default = NULL,
+					help = "specify which MOI to do power analysis for; if NULL test all")
+# parser$add_argument("--pos", action = "store_true",
+# 					help = "if true, test positive pairs, e.g. enhancers with interaction")
+parser$add_argument("--neg", action = "store_true",
+					help = "if true, test negative pairs, e.g. enhancers without interactions (DEFAULT=test positive pairs")
+parser$add_argument("--neg_effect", action = "store_true",
+					help = "if true, interaction effect sizes are negative")
+parser$add_argument('--test', action = 'store_true',
+					help = "if true, just test five genes")
+parser$add_argument("--min.cells", action = "store", type = "integer",
+					default = 20,
+					help = "minimum number of cells containing perturbations for both enhancers")
+args <- parser$parse_args()
+
+print(args)
+
+# show contents of h5 file
+h5ls(args$h5)
+
+### define helper functions 
+load_counts <- function(lambda, effect.sizes, neg = FALSE) {
+	# lambda should be fixed value
+	# effect.sizes is a list of effect sizes
+	sim.counts.list <- list()
+
+	for (j in 1:length(effect.sizes)) {
+		set.name <- paste0("lambda", lambda, "_size", effect.sizes.list[j])
+		if (neg) {
+			set.name <- paste0("lambda", lambda, "_size-NEG", effect.sizes[[j]])
+		}
+	    counts <- h5read(args$h5, paste0("counts/", set.name))
+	    sim.counts.list[[j]] <- counts
+	}
+
+	return(sim.counts.list)
+}
+
+### load important info
+all.ts.pairs <- h5read(args$h5, "pairs")
+lambda.list <- h5read(args$h5, "lambda")
+effect.sizes.list <- h5read(args$h5, "effect.sizes")
+guide.target.map <- h5read(args$h5, "guides/guide_target_map")
+coeffs <- h5read(args$h5, "coeffs")
+cell.cycle.scores <- h5read(args$h5, "x/cell_cycle_scores")
+percent.mito <- h5read(args$h5, "x/percent_mito")
+scaling.factors <- h5read(args$h5, "scaling_factors")
+
+# get list of positive/negative control cases of interactions 
+genes.to.test <- c()
+if (args$neg) {
+	genes.to.test <- all.ts.pairs %>% filter(set == "negative") %>% pull(target.genes)
+} else {
+	genes.to.test <- all.ts.pairs %>% filter(set == "positive") %>% pull(target.genes)
+}
+
+# if test flag is set, iterate through just a subset of genes
+if (args$test) {
+	genes.to.test <- genes.to.test[1:200]
+}
+
+# list for collecting results of model fitting
+broom.results <- list()
+
+# if running power analysis for just one value of lambda
+if (!is.null(args$lambda)) {
+
+	# load one hot mtx + x_a, x_b, x_ab matrices for lambda
+	one.hot.mtx <- h5read(args$h5, paste0("guides/one_hot/", args$lambda))
+	xa <- h5read(args$h5, paste0("x/x_a/", args$lambda))
+	xb <- h5read(args$h5, paste0("x/x_b/", args$lambda))
+	xab <- h5read(args$h5, paste0("x/x_ab/", args$lambda))
+
+	# load list of simulated counts for this value of lambda and all effect sizes
+	sim.counts.list <- load_counts(args$lambda, effect.sizes.list, args$neg_effect) 
+
+	### POWER ANALYSIS 
+	pct.correc.vec <- c()
+	coeffs.compare.list <- list()
+
+	# iterate through interaction effect sizes
+	for (j in 1:length(effect.sizes.list)) {
+	    print(paste('effect size =', effect.sizes.list[j]))
+
+	    n.correct <- 0
+
+	    coeffs.compare.list.sub <- list()
+
+	    broom.sublist <- list()
+
+	    k <- 1
+
+
+	    # iterate through genes to test
+	    for (tg in genes.to.test) {
+	        print(paste('gene', tg))
+
+	        # get observed counts for gene 
+	        obs.counts <- sim.counts.list[[j]][tg,]
+
+	        # if using pseudocount, add pseudocount
+	        if (!is.null(args$pseudocount)) {
+	            obs.counts <- obs.counts + args$pseudocount
+	        }
+
+	        # get X_A, X_B, X_AB for gene
+	        xa.gene <- xa[tg,]
+	        xb.gene <- xb[tg,]
+	        xab.gene <- xab[tg,]
+
+	        # check if min number of cells contains guides for both sites 
+	        if (sum(xab.gene !=0)>args$min.cells) {
+
+	        	# collect data in df for model fitting
+		        gene.data <- data.frame(tsA = xa.gene, 
+		        						tsB = xb.gene,
+				                       tsAB = xab.gene,
+				                       s.score = cell.cycle.scores$s.scores,
+				                       g2m.score = cell.cycle.scores$g2m.scores,
+				                       percent.mito = percent.mito,
+				                       counts = obs.counts, 
+				                       scaling.factor = scaling.factors)
+
+	        	# fit model
+		        alt <- glm.nb(counts ~ tsA * tsB + s.score + g2m.score + percent.mito + offset(log(scaling.factor)),
+		                      data = gene.data)
+
+				# tidy fitted model results		        
+		        alt.brm <- broom::tidy(alt)
+
+		        alt.brm$gene <- tg
+		        size <- effect.sizes.list[j]
+		        if (args$neg_effect) {
+		        	size <- -1*size
+		        }
+		        # alt.brm$effect <- effect.sizes.list[j]
+		        alt.brm$effect <- size
+
+		        # store list of fitted model results 
+		        broom.sublist[[k]] <- alt.brm 
+
+		        # store coefficient data compared to actual data
+		        # size <- effect.sizes.list[j]
+		        if (args$neg) {
+		        	size <- 0
+		        }
+
+		        coeffs.compare.df <- bind_cols(alt.brm %>% dplyr::select(term, estimate), 
+		                                       true = t(coeffs %>% filter(gene == tg) %>%
+		                                                rename("tsA" = "betaA",
+		                                                       "tsB" = "betaB",
+		                                                       "(Intercept)" = "beta0",
+		                                                       "s.score" = "beta.s",
+		                                                       "g2m.score" = "beta.g2m") %>% 
+		                                                dplyr::select("(Intercept)",
+		                                                              "tsA",
+		                                                              "tsB",
+		                                                              "s.score",
+		                                                              "g2m.score",
+		                                                              "percent.mito") %>%
+		                                                mutate(`tsA:tsB`= size)), 
+		                                       gene = tg)
+
+
+		        coeffs.compare.list.sub[[k]] <- coeffs.compare.df
+		        k <- k + 1
+
+		        # count number correctly called 
+		        if (args$neg) {
+		        	# check for cases called as non-significant interaction
+		        	if (broom::tidy(alt) %>% filter(term=="tsA:tsB") %>% pull(p.value)>=0.05) {
+			            n.correct <- n.correct + 1
+			        }
+		        } else {
+		        	# check for cases called as significant interaction 
+		        	if (broom::tidy(alt) %>% filter(term=="tsA:tsB") %>% pull(p.value)<0.05) {
+			            n.correct <- n.correct + 1
+			        }
+		        }
+		    }
+        }	
+
+        # calculate percent of genes tested that were correctly called 
+	    pct.correc.vec[j] <- n.correct/(k-1)
+
+	    # rbind df lists
+	    coeffs.compare.list[[j]] <- do.call(rbind,coeffs.compare.list.sub)
+	    broom.results[[j]] <- do.call(rbind, broom.sublist)
+	}
+
+	### plot power curves based on percentage of correct hits 
+	print('calculating pct detected and making plot df for power curves')
+	# pct.correct <- sapply(n.correct.list, function(x) {x/length(genes.to.test)})
+	power.plotdf <- data.frame(effect.size = effect.sizes.list, 
+								pct.detected = pct.correc.vec,
+								lambda = args$lambda)
+	if (args$neg_effect) {
+		power.plotdf$effect.size <- -1*effect.sizes.list
+	}
+
+	# power.curve <- ggplot(power.plotdf, aes(x = effect.size, y = pct.detected)) + 
+	# 					geom_point() + geom_line() +
+	# 					theme_clasic() + 
+	# 					theme(text = element_text(size = 16))
+
+
+	# # save plot as pdf
+	# pdf(file.path(args$out, paste0("lambda", args$lambda, "_power_curves.pdf")))
+	# print(power.curve)
+	# dev.off()
+
+	# # save plot as png
+	# png(file.path(args$out, paste0("lambda", args$lambda,"_power_curves.png")), 
+	# 	res = 300, units = "in", width = 10, height = 6)
+	# print(power.curve)
+	# dev.off()
+
+	if (args$neg) {
+		print('saving results (NEG)')
+		# save power.plotdf (in case we want to work with it later...)
+		write.csv(power.plotdf, file.path(args$out, 
+					paste0("lambda", args$lambda, "_power_plotdf_NEG_filt", args$min.cells, ".csv")), 
+			quote=F, row.names = F)
+
+		# save df of broom results
+		saveRDS(broom.results, file.path(args$out, 
+					paste0("lambda", args$lambda, "_broom_list_NEG_filt", args$min.cells, ".rds")))
+
+		### export table of est vs. true coeffs for downstream analysis
+		saveRDS(coeffs.compare.list, file.path(args$out, 
+					paste0("lambda", args$lambda,"_coeffs_list_NEG_filt", args$min.cells, ".rds")))
+	} else {
+		print('saving results (POS)')
+		# save power.plotdf (in case we want to work with it later...)
+		write.csv(power.plotdf, file.path(args$out, 
+					paste0("lambda", args$lambda, "_power_plotdf_filt", args$min.cells, ".csv")), 
+			quote=F, row.names = F)
+
+		# save df of broom results
+		saveRDS(broom.results, file.path(args$out, 
+					paste0("lambda", args$lambda, "_broom_list_filt", args$min.cells, ".rds")))
+
+		### export table of est vs. true coeffs for downstream analysis
+		saveRDS(coeffs.compare.list, file.path(args$out, 
+					paste0("lambda", args$lambda,"_coeffs_list_filt", args$min.cells, ".rds")))
+	}
+
+}