Merge pull request #121 from gwaygenomics/update-subsample

Subsampling at the point of single cell merging
cytomining · Jan 7, 2021 · 90fda6b · 90fda6b
2 parents 91afadc + b5ba767
commit 90fda6b
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 9 deletions.
diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
@@ -206,7 +206,7 @@ def count_cells(self, compartment="cells", count_subset=False):
 
         return count_df
 
-    def subsample_profiles(self, df):
+    def subsample_profiles(self, df, rename_col=True):
         """Sample a Pandas DataFrame given subsampling information
 
         :param df: A single cell profile dataframe
@@ -231,10 +231,12 @@ def subsample_profiles(self, df):
                 df, frac=self.subsample_frac, random_state=self.subsampling_random_state
             )
 
-        output_df = output_df.rename(self.linking_col_rename, axis="columns")
+        if rename_col:
+            output_df = output_df.rename(self.linking_col_rename, axis="columns")
+
         return output_df
 
-    def get_subsample(self, compartment="cells"):
+    def get_subsample(self, df=None, compartment="cells", rename_col=True):
         """Apply the subsampling procedure
 
         :param compartment: string indicating the compartment to process, defaults to "cells"
@@ -246,13 +248,14 @@ def get_subsample(self, compartment="cells"):
         query = "select {} from {}".format(query_cols, compartment)
 
         # Load query and merge with image_df
-        query_df = self.image_df.merge(
-            pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
-        )
+        if df is None:
+            df = pd.read_sql(sql=query, con=self.conn)
+
+        query_df = self.image_df.merge(df, how="inner", on=self.merge_cols)
 
         self.subset_data_df = (
             query_df.groupby(self.strata)
-            .apply(lambda x: self.subsample_profiles(x))
+            .apply(lambda x: self.subsample_profiles(x, rename_col=rename_col))
             .reset_index(drop=True)
         )
 
@@ -318,6 +321,7 @@ def aggregate_compartment(
 
     def merge_single_cells(
         self,
+        compute_subsample=False,
         sc_output_file="none",
         compression_options=None,
         float_format=None,
@@ -365,7 +369,21 @@ def merge_single_cells(
                 ]
 
                 if isinstance(sc_df, str):
-                    sc_df = self.load_compartment(compartment=left_compartment).merge(
+                    initial_df = self.load_compartment(compartment=left_compartment)
+
+                    if compute_subsample:
+                        # Sample cells proportionally by self.strata
+                        self.get_subsample(df=initial_df, rename_col=False)
+
+                        subset_logic_df = self.subset_data_df.drop(
+                            self.image_df.columns, axis="columns"
+                        )
+
+                        initial_df = subset_logic_df.merge(
+                            initial_df, how="left", on=subset_logic_df.columns.tolist()
+                        ).reindex(initial_df.columns, axis="columns")
+
+                    sc_df = initial_df.merge(
                         self.load_compartment(compartment=right_compartment),
                         left_on=self.merge_cols + [left_link_col],
                         right_on=self.merge_cols + [right_link_col],
@@ -447,7 +465,7 @@ def aggregate_profiles(
 
         :param compute_subsample: Determine if subsample should be computed, defaults to False
         :type compute_subsample: bool
-        :param output_file: the name of a file to output, defaults to "none":
+        :param output_file: the name of a file to output, defaults to "none"
         :type output_file: str, optional
         :param compression: the mechanism to compress, defaults to None
         :type compression: str, optional

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -295,6 +295,52 @@ def test_merge_single_cells():
     )
 
 
+def test_merge_single_cells_subsample():
+
+    for subsample_frac in [0.1, 0.5, 0.9]:
+        ap_subsample = SingleCells(file_or_conn=file, subsample_frac=subsample_frac)
+
+        sc_merged_df = ap_subsample.merge_single_cells(
+            sc_output_file="none",
+            compute_subsample=True,
+            compression_options=None,
+            float_format=None,
+            single_cell_normalize=True,
+            normalize_args=None,
+        )
+
+        # Assert that the image data was merged
+        assert all(
+            x in sc_merged_df.columns for x in ["Metadata_Plate", "Metadata_Well"]
+        )
+
+        # Assert that metadata columns were renamed appropriately
+        for x in ap_subsample.full_merge_suffix_rename:
+            assert ap_subsample.full_merge_suffix_rename[x] == "Metadata_{x}".format(
+                x=x
+            )
+
+        # Assert that the subsample fraction worked
+        assert sc_merged_df.shape[0] == cells_df.shape[0] * subsample_frac
+
+    for subsample_n in [2, 5, 10]:
+        ap_subsample = SingleCells(file_or_conn=file, subsample_n=subsample_n)
+
+        sc_merged_df = ap_subsample.merge_single_cells(
+            sc_output_file="none",
+            compute_subsample=True,
+            compression_options=None,
+            float_format=None,
+            single_cell_normalize=True,
+            normalize_args=None,
+        )
+
+        # Assert that the mean number of cells per stratum equals subsample_n
+        assert subsample_n == int(
+            sc_merged_df.loc[:, ap_subsample.strata].value_counts().values.mean()
+        )
+
+
 def test_aggregate_comparment():
     df = image_df.merge(cells_df, how="inner", on=["TableNumber", "ImageNumber"])
     result = aggregate(df)