allow specific pseudocount for lfc calculation

pinellolab · Jun 25, 2024 · cac5d9e
1 parent 4d37e08
commit cac5d9e
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 21 deletions.
diff --git a/perturb_tools/_framework/_ScreenModule.py b/perturb_tools/_framework/_ScreenModule.py
@@ -117,13 +117,13 @@ def annotate_guides(
             ref_seq_path,
         )
 
-    def log_norm(self, output_layer="lognorm_counts", read_count_layer=None):
+    def log_norm(self, output_layer="lognorm_counts", read_count_layer=None, pseudocount = 1):
         if read_count_layer is None:
-            self.layers[output_layer] = _log_normalize_read_count(self.X)
+            self.layers[output_layer] = _log_normalize_read_count(self.X, pseudocount)
         else:
             output_layer = f"lognorm_{read_count_layer}"
             self.layers[output_layer] = _log_normalize_read_count(
-                self.layers[read_count_layer]
+                self.layers[read_count_layer], pseudocount
             )
 
     # TBD: mask ones with too low raw counts.
@@ -133,22 +133,22 @@ def log_fold_change(
         sample2,
         lognorm_counts_key="lognorm_counts",
         name=False,
+        pseudocount: int = 1,
         out_guides_suffix="lfc",
         return_result=False,
     ):
         """
         General module to calculate LFC across experimental conditions.
         """
-        if "lognorm" not in lognorm_counts_key:
-            warnings.warn(
-                "The layer specified must be log-normalized values using screen.log_norm()."
-            )
-
-        if lognorm_counts_key not in self.layers.keys():
-            raise ValueError(
-                "Specified normalized count isn't in your layer. First run screen.log_norm()."
-            )
-
+        if lognorm_counts_key == "lognorm_counts":
+            self.log_norm(pseudocount=pseudocount)
+        else:
+            if "lognorm_" not in lognorm_counts_key:
+                raise ValueError(f"{lognorm_counts_key} is not a lognorm layer- feed in 'lognorm_`layer_key`' as lognorm_counts_key.")
+            read_count_layer_key = lognorm_counts_key.split("lognorm_")[-1]
+            if read_count_layer_key not in self.layers:
+                raise ValueError(f"{read_count_layer_key} not in .layers - feed in 'lognorm_`layer_key`' as lognorm_counts_key.")
+            self.log_norm(output_layer=lognorm_counts_key, read_count_layer=read_count_layer_key, pseudocount=pseudocount)
         sample1_idx = np.where(sample1 == self.samples.index)[0]
         sample2_idx = np.where(sample2 == self.samples.index)[0]
         if len(sample1_idx) != 1 or len(sample2_idx) != 1:
@@ -190,6 +190,7 @@ def log_fold_change_reps(
         rep_col: Union[str, List[str]] = "replicate",
         compare_col="sort",
         out_guides_suffix="lfc",
+        pseudocount=1,
         keep_result=False,
         ignore_missing=False,
     ):
@@ -258,6 +259,7 @@ def log_fold_change_reps(
                     self.samples.index[cond1_idx].tolist()[0],
                     self.samples.index[cond2_idx].tolist()[0],
                     lognorm_counts_key=lognorm_counts_key,
+                    pseudocount=pseudocount,
                     return_result=True,
                 )
             )
@@ -281,9 +283,10 @@ def log_fold_change_agg(
         cond2,
         lognorm_counts_key="lognorm_counts",
         agg_col="replicate",
-        compare_col="sort",
+        compare_col="condition",
         out_guides_suffix="lfc",
         agg_fn="median",
+        pseudocount=1,
         name=None,
         return_result=False,
         keep_per_replicate=False,
@@ -295,6 +298,7 @@ def log_fold_change_agg(
             rep_col=agg_col,
             compare_col=compare_col,
             out_guides_suffix=out_guides_suffix,
+            pseudocount=pseudocount,
             keep_result=keep_per_replicate,
         )
 

diff --git a/perturb_tools/_normalization/_funcs/_read_count_norm.py b/perturb_tools/_normalization/_funcs/_read_count_norm.py
@@ -1,24 +1,24 @@
 import numpy as np
 
 
-def _read_count_normalize(X):
+def _read_count_normalize(X, pseudocount: int = 1):
     """Read depth normalization by sample. Assumes samples are columns and guides are rows."""
 
-    return (X / np.nansum(X, axis=0)) * 1e6
+    return (X / np.nansum(X+pseudocount, axis=0)) * 1e6
 
 
-def _log_transform_read_count(X):
+def _log_transform_read_count(X, pseudocount: int = 1):
     """"""
-    return np.log2(X + 1)
+    return np.log2(X + pseudocount)
 
 
-def _log_normalize_read_count(X):
+def _log_normalize_read_count(X, pseudocount:int = 1):
     """Following the protocol written clearly, here:
     https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0170445#sec002
     (see Methods).
     """
 
-    X_read_norm = _read_count_normalize(X)
+    X_read_norm = _read_count_normalize(X, pseudocount)
     X_log_read_norm = _log_transform_read_count(X_read_norm)
 
     return X_log_read_norm
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name="perturb-tools",
-    version="0.3.4",
+    version="0.3.5",
     python_requires=">3.7.0",
     author=[
         "Michael E. Vinyard - Harvard University - Massachussetts General Hospital - Broad Institute of MIT and Harvard",