Merge pull request #105 from mcvickerlab/experimental

final push
mcvickerlab · Aug 24, 2024 · dd5cada · dd5cada
2 parents e26c0dc + 0d9364b
commit dd5cada
Show file tree

Hide file tree

Showing 3 changed files with 116 additions and 0 deletions.
diff --git a/src/gasperini/visualization/plot_pairs_per_gene_330_pairs.R b/src/gasperini/visualization/plot_pairs_per_gene_330_pairs.R
@@ -0,0 +1,41 @@
+# This script plots the distribution of number of enhancer pairs per gene for
+# the set of 264 tested enhancer pairs.
+#
+# Author: Karthik Guruvayurappan
+
+library(stats)
+library(dplyr)
+library(ggplot2)
+
+# read in results from 330 enhancer-enhancer-gene sets
+results <- read.csv('data/gasperini/processed/enhancer_pairs_330_models.csv')
+
+# filter out NA values
+results <- results[complete.cases(results), ]
+
+# plot distribution of number of enhancer pairs per gene
+plot_df <- results %>% count(gene.list)
+
+plot <- ggplot(plot_df, aes(x = n)) +
+  geom_histogram(fill = 'darkgray', color = 'darkgray') +
+  theme_classic() +
+  scale_x_continuous(expand = c(0.02, 0)) +
+  scale_y_continuous(expand = c(0, 0)) +
+  xlab("Number of Enhancer Pairs") + 
+  ylab("Count") +
+  theme(
+    axis.line = element_line(linewidth = 1),
+    axis.title.x = element_text(size = 24, color = 'black'),
+    axis.title.y = element_text(size = 24, color = 'black'),
+    axis.text = element_text(size = 24, color = 'black'),
+    axis.ticks = element_line(color = 'black', linewidth = 1),
+    axis.ticks.length = unit(2, 'mm'),
+    plot.margin = rep(unit(10, 'mm'), 4),
+  )
+
+ggsave(
+  plot = plot,
+  filename = 'out/pairs_per_gene_330_pairs.pdf',
+  device = 'pdf'
+)
+
diff --git a/src/gasperini/visualization/plot_pairs_per_gene_at_scale.R b/src/gasperini/visualization/plot_pairs_per_gene_at_scale.R
@@ -0,0 +1,70 @@
+# This script plots the number of enhancer pairs per gene for the at-scale
+# analysis.
+#
+# Author: Karthik Guruvayurappan
+
+library(stats)
+library(dplyr)
+library(ggplot2)
+
+# read in at-scale enhancer pair model results across all 32 batches
+models <- data.frame()
+
+for (i in 1:32) {
+    batch.file.name <- paste0(
+        'data/gasperini/processed/enhancer_pairs_at_scale_',
+        i,
+        '.csv'
+    )
+    batch.models <- read.csv(batch.file.name)
+    models <- rbind(models, batch.models)
+}
+
+# filter for cases where all coefficients exist
+models <- models[complete.cases(models), ]
+
+# read in the "true" double perturbation counts
+perturbation_counts <- read.csv(
+  paste0(
+    'data/gasperini/processed/',
+    'enhancer_pair_efficiency_adjusted_double_perturb_counts.csv'
+  )
+)
+
+# merge model outputs with perturbation counts
+models <- merge(
+  models,
+  perturbation_counts,
+  by.x = c('enhancer.1.list', 'enhancer.2.list', 'gene.list'),
+  by.y = c('enhancer_1_list', 'enhancer_2_list', 'gene_list')
+)
+
+# filter for true 10 cell threshold
+models <- models[models$double_perturbation_counts >= 10, ]
+
+# count number of pairs per gene
+plot_df <- models %>% count(gene.list)
+
+plot <- ggplot(plot_df, aes(x = n)) +
+  geom_histogram(fill = 'darkgray', color = 'darkgray') +
+  theme_classic() +
+  scale_x_continuous(expand = c(0.02, 0)) +
+  scale_y_continuous(expand = c(0, 0)) +
+  xlab("Number of Enhancer Pairs") + 
+  ylab("Count") +
+  theme(
+    axis.line = element_line(linewidth = 1),
+    axis.title.x = element_text(size = 24, color = 'black'),
+    axis.title.y = element_text(size = 24, color = 'black'),
+    axis.text = element_text(size = 24, color = 'black'),
+    axis.ticks = element_line(color = 'black', linewidth = 1),
+    axis.ticks.length = unit(2, 'mm'),
+    plot.margin = rep(unit(10, 'mm'), 4),
+  )
+
+ggsave(
+  plot = plot,
+  filename = 'out/pairs_per_gene_at_scale.pdf',
+  device = 'pdf'
+)
+
diff --git a/src/gasperini/visualization/plot_qqplot_interaction_term_pvalues_at_scale.R b/src/gasperini/visualization/plot_qqplot_interaction_term_pvalues_at_scale.R
@@ -44,6 +44,11 @@ models <- models[models$double_perturbation_counts >= 10, ]
 
 # print out how many models remain after filtering
 print(dim(models))
+write.csv(
+  models, 
+  'data/gasperini/processed/enhancer_pairs_at_scale.csv',
+  row.names = FALSE
+)
 
 # sort results by p-value
 models <- models[order(models$interaction.pvalues), ]