From 22d305ef6ce40787b2c96e4b3e60ce005cca9940 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Fri, 25 Sep 2020 15:18:38 -0400
Subject: [PATCH 01/15] move AggregateProfiles class to SingleCells class

---
 pycytominer/aggregate.py                      | 290 +-----------------
 .../test_cells.py}                            |  24 +-
 2 files changed, 26 insertions(+), 288 deletions(-)
 rename pycytominer/tests/{test_aggregate_profiles.py => test_cyto_utils/test_cells.py} (95%)

diff --git a/pycytominer/aggregate.py b/pycytominer/aggregate.py
index 1d691a5c..8cb327c2 100644
--- a/pycytominer/aggregate.py
+++ b/pycytominer/aggregate.py
@@ -4,294 +4,19 @@
 
 import numpy as np
 import pandas as pd
-from sqlalchemy import create_engine
 from pycytominer.cyto_utils import (
     output,
-    check_compartments,
     check_aggregate_operation,
     infer_cp_features,
 )
 
 
-class AggregateProfiles:
-    """
-    Class to aggregate single cell morphological profiles
-    """
-
-    def __init__(
-        self,
-        sql_file,
-        strata=["Metadata_Plate", "Metadata_Well"],
-        features="infer",
-        operation="median",
-        output_file="none",
-        compartments=["cells", "cytoplasm", "nuclei"],
-        merge_cols=["TableNumber", "ImageNumber"],
-        load_image_data=True,
-        subsample_frac=1,
-        subsample_n="all",
-        subsampling_random_state="none",
-    ):
-        """
-        Arguments:
-        sql_file - string or sqlalchemy connection
-        strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and aggregate
-        features - [default: "all"] or list indicating features that should be aggregated
-        operation - [default: "median"] a string indicating how the data is aggregated
-                    currently only supports one of ['mean', 'median']
-        output_file - [default: "none"] string if specified, write to location
-        compartments - list of compartments to process
-        merge_cols - column indicating which columns to merge images and compartments
-        subsample_frac - [default: 1] float (0 < subsample <= 1) indicating percentage of
-                         single cells to select
-        subsample_n - [default: "all"] int indicating how many samples to include
-        subsampling_random_state - [default: "none"] the random state to init subsample
-        """
-        # Check compartments specified
-        check_compartments(compartments)
-
-        # Check if correct operation is specified
-        operation = check_aggregate_operation(operation)
-
-        # Check that the subsample_frac is between 0 and 1
-        assert (
-            0 < subsample_frac and 1 >= subsample_frac
-        ), "subsample_frac must be between 0 and 1"
-
-        self.sql_file = sql_file
-        self.strata = strata
-        self.features = features
-        self.operation = operation.lower()
-        self.output_file = output_file
-        self.compartments = compartments
-        self.merge_cols = merge_cols
-        self.subsample_frac = subsample_frac
-        self.subsample_n = subsample_n
-        self.subset_data_df = "none"
-        self.subsampling_random_state = subsampling_random_state
-        self.is_aggregated = False
-        self.is_subset_computed = False
-
-        if self.subsample_n != "all":
-            self.set_subsample_n(self.subsample_n)
-
-        # Connect to sqlite engine
-        self.engine = create_engine(self.sql_file)
-        self.conn = self.engine.connect()
-
-        # Throw an error if both subsample_frac and subsample_n is set
-        self._check_subsampling()
-
-        if load_image_data:
-            self.load_image()
-
-    def _check_subsampling(self):
-        # Check that the user didn't specify both subset frac and subsample all
-        assert (
-            self.subsample_frac == 1 or self.subsample_n == "all"
-        ), "Do not set both subsample_frac and subsample_n"
-
-    def set_output_file(self, output_file):
-        self.output_file = output_file
-
-    def set_subsample_frac(self, subsample_frac):
-        self.subsample_frac = subsample_frac
-        self._check_subsampling()
-
-    def set_subsample_n(self, subsample_n):
-        try:
-            self.subsample_n = int(subsample_n)
-        except ValueError:
-            raise ValueError("subsample n must be an integer or coercable")
-        self._check_subsampling()
-
-    def set_subsample_random_state(self, random_state):
-        self.subsampling_random_state = random_state
-
-    def load_image(self):
-        """
-        Load image table from sqlite file
-        """
-        # Extract image metadata
-        image_cols = "TableNumber, ImageNumber, {}".format(", ".join(self.strata))
-        image_query = "select {} from image".format(image_cols)
-        self.image_df = pd.read_sql(sql=image_query, con=self.conn)
-
-    def count_cells(self, compartment="cells", count_subset=False):
-        """
-        Determine how many cells are measured per well.
-
-        Arguments:
-        compartment - string indicating the compartment to subset
-        count_subset - [default: False] count the number of cells in subset partition
-        """
-        check_compartments(compartment)
-
-        if count_subset:
-            assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
-            assert self.is_subset_computed, "Make sure to get_subsample() first!"
-            count_df = (
-                self.subset_data_df.groupby(self.strata)["ObjectNumber"]
-                .count()
-                .reset_index()
-                .rename({"ObjectNumber": "cell_count"}, axis="columns")
-            )
-        else:
-            query_cols = "TableNumber, ImageNumber, ObjectNumber"
-            query = "select {} from {}".format(query_cols, compartment)
-            count_df = self.image_df.merge(
-                pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
-            )
-            count_df = (
-                count_df.groupby(self.strata)["ObjectNumber"]
-                .count()
-                .reset_index()
-                .rename({"ObjectNumber": "cell_count"}, axis="columns")
-            )
-
-        return count_df
-
-    def subsample_profiles(self, x):
-        """
-        Sample a Pandas DataFrame given the subsampling fraction
-        """
-        if self.subsampling_random_state == "none":
-            random_state = np.random.randint(0, 10000, size=1)[0]
-            self.set_subsample_random_state(random_state)
-
-        if self.subsample_frac == 1:
-            return pd.DataFrame.sample(
-                x,
-                n=self.subsample_n,
-                replace=True,
-                random_state=self.subsampling_random_state,
-            )
-        else:
-            return pd.DataFrame.sample(
-                x, frac=self.subsample_frac, random_state=self.subsampling_random_state
-            )
-
-    def get_subsample(self, compartment="cells"):
-        """
-        Extract subsample from sqlite file
-
-        Arguments:
-        compartment - [default: "cells"] string indicating the compartment to subset
-        """
-        check_compartments(compartment)
-
-        query_cols = "TableNumber, ImageNumber, ObjectNumber"
-        query = "select {} from {}".format(query_cols, compartment)
-
-        # Load query and merge with image_df
-        query_df = self.image_df.merge(
-            pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
-        )
-
-        self.subset_data_df = (
-            query_df.groupby(self.strata)
-            .apply(lambda x: self.subsample_profiles(x))
-            .reset_index(drop=True)
-        )
-
-        self.is_subset_computed = True
-
-    def aggregate_compartment(self, compartment, compute_subsample=False):
-        """
-        Aggregate morphological profiles
-
-        Arguments:
-        compartment - str indicating specific compartment to extract
-
-        Return:
-        Either the merged object file or write object to disk
-        """
-        check_compartments(compartment)
-
-        compartment_query = "select * from {}".format(compartment)
-
-        if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
-            self.get_subsample(compartment=compartment)
-
-        population_df = self.image_df.merge(
-            pd.read_sql(sql=compartment_query, con=self.conn),
-            how="inner",
-            on=self.merge_cols,
-        )
-
-        object_df = aggregate(
-            population_df=population_df,
-            strata=self.strata,
-            features=self.features,
-            operation=self.operation,
-            subset_data_df=self.subset_data_df,
-        )
-
-        return object_df
-
-    def aggregate_profiles(
-        self,
-        compute_subsample="False",
-        output_file="none",
-        compression=None,
-        float_format=None,
-    ):
-        """
-        Aggregate and merge compartments. This is the primary entry to this class.
-
-        Arguments:
-        compute_subsample - [default: False] boolean if subsample should be computed.
-                            NOTE: Must be specified to perform subsampling. Will not
-                            apply subsetting if set to False even if subsample is
-                            initialized
-        output_file - [default: "none"] if provided, will write annotated profiles to file
-                  if not specified, will return the annotated profiles. We recommend
-                  that this output file be suffixed with "_augmented.csv".
-        compression - the mechanism to compress [default: None]
-        float_format - decimal precision to use in writing output file [default: None]
-                           For example, use "%.3g" for 3 decimal precision.
-
-        Return:
-        if output_file is set, then write to file. If not then return
-        """
-
-        if output_file != "none":
-            self.set_output_file(output_file)
-
-        aggregated = (
-            self.aggregate_compartment(
-                compartment="cells", compute_subsample=compute_subsample
-            )
-            .merge(
-                self.aggregate_compartment(compartment="cytoplasm"),
-                on=self.strata,
-                how="inner",
-            )
-            .merge(
-                self.aggregate_compartment(compartment="nuclei"),
-                on=self.strata,
-                how="inner",
-            )
-        )
-
-        self.is_aggregated = True
-
-        if self.output_file != "none":
-            output(
-                df=aggregated,
-                output_filename=self.output_file,
-                compression=compression,
-                float_format=float_format,
-            )
-        else:
-            return aggregated
-
-
 def aggregate(
     population_df,
     strata=["Metadata_Plate", "Metadata_Well"],
     features="infer",
     operation="median",
+    output_file="none",
     subset_data_df="none",
 ):
     """
@@ -303,6 +28,9 @@ def aggregate(
     features - [default: "all"] or list indicating features that should be aggregated
     operation - [default: "median"] a string indicating how the data is aggregated
                 currently only supports one of ['mean', 'median']
+    output_file - [default: "none"] if provided, will write aggregated profiles to file
+                  if not specified, will return the aggregated profiles. We recommend
+                  naming the file based on the plate name.
     subset_data_df - [default: "none"] a pandas dataframe indicating how to subset the input
 
     Return:
@@ -345,4 +73,14 @@ def aggregate(
         if col in population_df.columns:
             population_df = population_df.drop([col], axis="columns")
 
+    if output_file != "none":
+        output(
+            df=population_df,
+            output_filename=output_file,
+            compression=compression,
+            float_format=float_format,
+        )
+    else:
+        return population_df
+
     return population_df
diff --git a/pycytominer/tests/test_aggregate_profiles.py b/pycytominer/tests/test_cyto_utils/test_cells.py
similarity index 95%
rename from pycytominer/tests/test_aggregate_profiles.py
rename to pycytominer/tests/test_cyto_utils/test_cells.py
index c18f73c1..97ec40c7 100644
--- a/pycytominer/tests/test_aggregate_profiles.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from sqlalchemy import create_engine
 from pycytominer import aggregate
-from pycytominer.aggregate import AggregateProfiles
+from pycytominer.cyto_utils.cells import SingleCells
 
 random.seed(123)
 
@@ -64,16 +64,16 @@ def build_random_data(
 cytoplasm_df.to_sql("cytoplasm", con=test_engine, index=False, if_exists="replace")
 nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace")
 
-# Setup AggregateProfiles Class
-ap = AggregateProfiles(sql_file=file)
-ap_subsample = AggregateProfiles(
+# Setup SingleCells Class
+ap = SingleCells(sql_file=file)
+ap_subsample = SingleCells(
     sql_file=file, subsample_n=2, subsampling_random_state=123
 )
 
 
-def test_AggregateProfiles_init():
+def test_SingleCells_init():
     """
-    Testing initialization of AggregateProfiles
+    Testing initialization of SingleCells
     """
     assert ap.sql_file == file
     assert ap.strata == ["Metadata_Plate", "Metadata_Well"]
@@ -92,11 +92,11 @@ def test_AggregateProfiles_init():
     assert ap_subsample.subsampling_random_state == 123
 
 
-def test_AggregateProfiles_reset_variables():
+def test_SingleCells_reset_variables():
     """
-    Testing initialization of AggregateProfiles
+    Testing initialization of SingleCells
     """
-    ap_switch = AggregateProfiles(sql_file=file)
+    ap_switch = SingleCells(sql_file=file)
     assert ap_switch.subsample_frac == 1
     assert ap_switch.subsample_n == "all"
     assert ap_switch.subsampling_random_state == "none"
@@ -121,7 +121,7 @@ def test_AggregateProfiles_reset_variables():
     assert "subsample n must be an integer or coercable" in str(errorinfo.value.args[0])
 
 
-def test_AggregateProfiles_count():
+def test_SingleCells_count():
     count_df = ap.count_cells()
     expected_count = pd.DataFrame(
         {
@@ -307,8 +307,8 @@ def test_aggregate_count_cells_multiple_strata():
     cytoplasm_df.to_sql("cytoplasm", con=test_engine, index=False, if_exists="replace")
     nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace")
 
-    # Setup AggregateProfiles Class
-    ap_strata = AggregateProfiles(
+    # Setup SingleCells Class
+    ap_strata = SingleCells(
         sql_file=file,
         subsample_n="4",
         strata=["Metadata_Plate", "Metadata_Well", "Metadata_Site"],

From 93759dad597636ea097864b97189cde70ec25d8a Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Fri, 9 Oct 2020 11:53:56 -0400
Subject: [PATCH 02/15] black on test

---
 pycytominer/tests/test_cyto_utils/test_cells.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 97ec40c7..4b293082 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -66,9 +66,7 @@ def build_random_data(
 
 # Setup SingleCells Class
 ap = SingleCells(sql_file=file)
-ap_subsample = SingleCells(
-    sql_file=file, subsample_n=2, subsampling_random_state=123
-)
+ap_subsample = SingleCells(sql_file=file, subsample_n=2, subsampling_random_state=123)
 
 
 def test_SingleCells_init():

From 3b86693f6eb68facc638efebd329830a1146e910 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Thu, 10 Dec 2020 09:50:34 -0500
Subject: [PATCH 03/15] add cells module

---
 pycytominer/cyto_utils/cells.py | 284 ++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 pycytominer/cyto_utils/cells.py

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
new file mode 100644
index 00000000..dbf8de83
--- /dev/null
+++ b/pycytominer/cyto_utils/cells.py
@@ -0,0 +1,284 @@
+import numpy as np
+import pandas as pd
+from sqlalchemy import create_engine
+from pycytominer import aggregate
+from pycytominer.cyto_utils import (
+    output,
+    check_compartments,
+    check_aggregate_operation,
+    infer_cp_features,
+)
+
+
+class SingleCells:
+    """
+    Class to interact with single cell morphological profiles
+    """
+
+    def __init__(
+        self,
+        sql_file,
+        strata=["Metadata_Plate", "Metadata_Well"],
+        features="infer",
+        operation="median",
+        output_file="none",
+        compartments=["cells", "cytoplasm", "nuclei"],
+        merge_cols=["TableNumber", "ImageNumber"],
+        load_image_data=True,
+        subsample_frac=1,
+        subsample_n="all",
+        subsampling_random_state="none",
+    ):
+        """
+        Arguments:
+        sql_file - string or sqlalchemy connection
+        strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and aggregate
+        features - [default: "all"] or list indicating features that should be aggregated
+        operation - [default: "median"] a string indicating how the data is aggregated
+                    currently only supports one of ['mean', 'median']
+        output_file - [default: "none"] string if specified, write to location
+        compartments - list of compartments to process
+        merge_cols - column indicating which columns to merge images and compartments
+        subsample_frac - [default: 1] float (0 < subsample <= 1) indicating percentage of
+                         single cells to select
+        subsample_n - [default: "all"] int indicating how many samples to include
+        subsampling_random_state - [default: "none"] the random state to init subsample
+        """
+        # Check compartments specified
+        check_compartments(compartments)
+
+        # Check if correct operation is specified
+        operation = check_aggregate_operation(operation)
+
+        # Check that the subsample_frac is between 0 and 1
+        assert (
+            0 < subsample_frac and 1 >= subsample_frac
+        ), "subsample_frac must be between 0 and 1"
+
+        self.sql_file = sql_file
+        self.strata = strata
+        self.features = features
+        self.operation = operation.lower()
+        self.output_file = output_file
+        self.compartments = compartments
+        self.merge_cols = merge_cols
+        self.subsample_frac = subsample_frac
+        self.subsample_n = subsample_n
+        self.subset_data_df = "none"
+        self.subsampling_random_state = subsampling_random_state
+        self.is_aggregated = False
+        self.is_subset_computed = False
+
+        if self.subsample_n != "all":
+            self.set_subsample_n(self.subsample_n)
+
+        # Connect to sqlite engine
+        self.engine = create_engine(self.sql_file)
+        self.conn = self.engine.connect()
+
+        # Throw an error if both subsample_frac and subsample_n is set
+        self._check_subsampling()
+
+        if load_image_data:
+            self.load_image()
+
+    def _check_subsampling(self):
+        # Check that the user didn't specify both subset frac and subsample all
+        assert (
+            self.subsample_frac == 1 or self.subsample_n == "all"
+        ), "Do not set both subsample_frac and subsample_n"
+
+    def set_output_file(self, output_file):
+        self.output_file = output_file
+
+    def set_subsample_frac(self, subsample_frac):
+        self.subsample_frac = subsample_frac
+        self._check_subsampling()
+
+    def set_subsample_n(self, subsample_n):
+        try:
+            self.subsample_n = int(subsample_n)
+        except ValueError:
+            raise ValueError("subsample n must be an integer or coercable")
+        self._check_subsampling()
+
+    def set_subsample_random_state(self, random_state):
+        self.subsampling_random_state = random_state
+
+    def load_image(self):
+        """
+        Load image table from sqlite file
+        """
+        # Extract image metadata
+        image_cols = "TableNumber, ImageNumber, {}".format(", ".join(self.strata))
+        image_query = "select {} from image".format(image_cols)
+        self.image_df = pd.read_sql(sql=image_query, con=self.conn)
+
+    def count_cells(self, compartment="cells", count_subset=False):
+        """
+        Determine how many cells are measured per well.
+
+        Arguments:
+        compartment - string indicating the compartment to subset
+        count_subset - [default: False] count the number of cells in subset partition
+        """
+        check_compartments(compartment)
+
+        if count_subset:
+            assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
+            assert self.is_subset_computed, "Make sure to get_subsample() first!"
+            count_df = (
+                self.subset_data_df.groupby(self.strata)["ObjectNumber"]
+                .count()
+                .reset_index()
+                .rename({"ObjectNumber": "cell_count"}, axis="columns")
+            )
+        else:
+            query_cols = "TableNumber, ImageNumber, ObjectNumber"
+            query = "select {} from {}".format(query_cols, compartment)
+            count_df = self.image_df.merge(
+                pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
+            )
+            count_df = (
+                count_df.groupby(self.strata)["ObjectNumber"]
+                .count()
+                .reset_index()
+                .rename({"ObjectNumber": "cell_count"}, axis="columns")
+            )
+
+        return count_df
+
+    def subsample_profiles(self, x):
+        """
+        Sample a Pandas DataFrame given the subsampling fraction
+        """
+        if self.subsampling_random_state == "none":
+            random_state = np.random.randint(0, 10000, size=1)[0]
+            self.set_subsample_random_state(random_state)
+
+        if self.subsample_frac == 1:
+            return pd.DataFrame.sample(
+                x,
+                n=self.subsample_n,
+                replace=True,
+                random_state=self.subsampling_random_state,
+            )
+        else:
+            return pd.DataFrame.sample(
+                x, frac=self.subsample_frac, random_state=self.subsampling_random_state
+            )
+
+    def get_subsample(self, compartment="cells"):
+        """
+        Extract subsample from sqlite file
+
+        Arguments:
+        compartment - [default: "cells"] string indicating the compartment to subset
+        """
+        check_compartments(compartment)
+
+        query_cols = "TableNumber, ImageNumber, ObjectNumber"
+        query = "select {} from {}".format(query_cols, compartment)
+
+        # Load query and merge with image_df
+        query_df = self.image_df.merge(
+            pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
+        )
+
+        self.subset_data_df = (
+            query_df.groupby(self.strata)
+            .apply(lambda x: self.subsample_profiles(x))
+            .reset_index(drop=True)
+        )
+
+        self.is_subset_computed = True
+
+    def aggregate_compartment(self, compartment, compute_subsample=False):
+        """
+        Aggregate morphological profiles
+
+        Arguments:
+        compartment - str indicating specific compartment to extract
+
+        Return:
+        Either the merged object file or write object to disk
+        """
+        check_compartments(compartment)
+
+        compartment_query = "select * from {}".format(compartment)
+
+        if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
+            self.get_subsample(compartment=compartment)
+
+        population_df = self.image_df.merge(
+            pd.read_sql(sql=compartment_query, con=self.conn),
+            how="inner",
+            on=self.merge_cols,
+        )
+
+        object_df = aggregate(
+            population_df=population_df,
+            strata=self.strata,
+            features=self.features,
+            operation=self.operation,
+            subset_data_df=self.subset_data_df,
+        )
+
+        return object_df
+
+    def aggregate_profiles(
+        self,
+        compute_subsample="False",
+        output_file="none",
+        compression=None,
+        float_format=None,
+    ):
+        """
+        Aggregate and merge compartments. This is the primary entry to this class.
+
+        Arguments:
+        compute_subsample - [default: False] boolean if subsample should be computed.
+                            NOTE: Must be specified to perform subsampling. Will not
+                            apply subsetting if set to False even if subsample is
+                            initialized
+        output_file - [default: "none"] if provided, will write annotated profiles to file
+                  if not specified, will return the annotated profiles. We recommend
+                  that this output file be suffixed with "_augmented.csv".
+        compression - the mechanism to compress [default: None]
+        float_format - decimal precision to use in writing output file [default: None]
+                           For example, use "%.3g" for 3 decimal precision.
+
+        Return:
+        if output_file is set, then write to file. If not then return
+        """
+
+        if output_file != "none":
+            self.set_output_file(output_file)
+
+        aggregated = (
+            self.aggregate_compartment(
+                compartment="cells", compute_subsample=compute_subsample
+            )
+            .merge(
+                self.aggregate_compartment(compartment="cytoplasm"),
+                on=self.strata,
+                how="inner",
+            )
+            .merge(
+                self.aggregate_compartment(compartment="nuclei"),
+                on=self.strata,
+                how="inner",
+            )
+        )
+
+        self.is_aggregated = True
+
+        if self.output_file != "none":
+            output(
+                df=aggregated,
+                output_filename=self.output_file,
+                compression=compression,
+                float_format=float_format,
+            )
+        else:
+            return aggregated

From bdbf15bedfa2566bb103a9522cf30dac67c73018 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Mon, 14 Dec 2020 16:11:00 -0500
Subject: [PATCH 04/15] add sphinx comments and prep for file input modularity

---
 pycytominer/cyto_utils/cells.py | 169 ++++++++++++++++++++------------
 1 file changed, 104 insertions(+), 65 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index dbf8de83..c96e713a 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -2,6 +2,7 @@
 import pandas as pd
 from sqlalchemy import create_engine
 from pycytominer import aggregate
+from pycytominer import normalize
 from pycytominer.cyto_utils import (
     output,
     check_compartments,
@@ -10,17 +11,40 @@
 )
 
 
-class SingleCells:
-    """
-    Class to interact with single cell morphological profiles
+class SingleCells(object):
+    """This is a class to interact with single cell morphological profiles. Interaction
+    includes aggregation, normalization, and output.
+
+    :param file_or_conn: A file string or database connection storing the location of single cell profiles
+    :type file_or_conn: str
+    :param strata: The columns to groupby and aggregate single cells, defaults to ["Metadata_Plate", "Metadata_Well"]
+    :type strata: list
+    :param features: The features that should be aggregated, defaults to "infer"
+    :type features: str, list
+    :param aggregation_operation: operation to perform single cell aggregation, defaults to "median"
+    :type aggregation_operation: str
+    :param output_file: If specified, the location to write the file, defaults to "none"
+    :type output_file: str
+    :param compartments: list of compartments to process, defaults to ["cells", "cytoplasm", "nuclei"]
+    :type compartments: list
+    :param merge_cols: columns indicating how to merge image and compartment data, defaults to ["TableNumber", "ImageNumber"]
+    :type merge_cols: list
+    :param load_image_data: if image data should be loaded into memory, defaults to True
+    :type load_image_data: bool
+    :param subsample_frac: indicating percentage of single cells to select (0 < subsample_frac <= 1), defaults to 1
+    :type subsample_frac: float
+    :param subsample_n: indicate how many samples to subsample - do not specify both subsample_frac and subsample_n, defaults to "all"
+    :type subsample_n:, str, int
+    :param subsampling_random_state: the random state to init subsample, defaults to "none"
+    :type subsampling_random_state: str, int
     """
 
     def __init__(
         self,
-        sql_file,
+        file_or_conn,
         strata=["Metadata_Plate", "Metadata_Well"],
         features="infer",
-        operation="median",
+        aggregation_operation="median",
         output_file="none",
         compartments=["cells", "cytoplasm", "nuclei"],
         merge_cols=["TableNumber", "ImageNumber"],
@@ -29,36 +53,22 @@ def __init__(
         subsample_n="all",
         subsampling_random_state="none",
     ):
-        """
-        Arguments:
-        sql_file - string or sqlalchemy connection
-        strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and aggregate
-        features - [default: "all"] or list indicating features that should be aggregated
-        operation - [default: "median"] a string indicating how the data is aggregated
-                    currently only supports one of ['mean', 'median']
-        output_file - [default: "none"] string if specified, write to location
-        compartments - list of compartments to process
-        merge_cols - column indicating which columns to merge images and compartments
-        subsample_frac - [default: 1] float (0 < subsample <= 1) indicating percentage of
-                         single cells to select
-        subsample_n - [default: "all"] int indicating how many samples to include
-        subsampling_random_state - [default: "none"] the random state to init subsample
-        """
+        """Constructor method"""
         # Check compartments specified
         check_compartments(compartments)
 
         # Check if correct operation is specified
-        operation = check_aggregate_operation(operation)
+        aggregation_operation = check_aggregate_operation(aggregation_operation)
 
         # Check that the subsample_frac is between 0 and 1
         assert (
             0 < subsample_frac and 1 >= subsample_frac
         ), "subsample_frac must be between 0 and 1"
 
-        self.sql_file = sql_file
+        self.file_or_conn = file_or_conn
         self.strata = strata
         self.features = features
-        self.operation = operation.lower()
+        self.aggregation_operation = aggregation_operation.lower()
         self.output_file = output_file
         self.compartments = compartments
         self.merge_cols = merge_cols
@@ -73,7 +83,7 @@ def __init__(
             self.set_subsample_n(self.subsample_n)
 
         # Connect to sqlite engine
-        self.engine = create_engine(self.sql_file)
+        self.engine = create_engine(self.file_or_conn)
         self.conn = self.engine.connect()
 
         # Throw an error if both subsample_frac and subsample_n is set
@@ -83,19 +93,35 @@ def __init__(
             self.load_image()
 
     def _check_subsampling(self):
+        """Internal method checking if subsampling options were specified correctly"""
         # Check that the user didn't specify both subset frac and subsample all
         assert (
             self.subsample_frac == 1 or self.subsample_n == "all"
         ), "Do not set both subsample_frac and subsample_n"
 
     def set_output_file(self, output_file):
+        """Setting operation to conveniently rename output file
+
+        :param output_file: the new output file name
+        :type output_file: str
+        """
         self.output_file = output_file
 
     def set_subsample_frac(self, subsample_frac):
+        """Setting operation to conveniently update the subsample fraction
+
+        :param subsample_frac: indicating percentage of single cells to select (0 < subsample_frac <= 1), defaults to 1
+        :type subsample_frac: float
+        """
         self.subsample_frac = subsample_frac
         self._check_subsampling()
 
     def set_subsample_n(self, subsample_n):
+        """Setting operation to conveniently update the subsample n
+
+        :param subsample_n: indicate how many samples to subsample - do not specify both subsample_frac and subsample_n, defaults to "all"
+        :type subsample_n: str, int
+        """
         try:
             self.subsample_n = int(subsample_n)
         except ValueError:
@@ -103,24 +129,28 @@ def set_subsample_n(self, subsample_n):
         self._check_subsampling()
 
     def set_subsample_random_state(self, random_state):
+        """Setting operation to conveniently update the subsample random state
+
+        :param random_state: the random state to init subsample, defaults to "none"
+        :type random_state: str, int
+        """
         self.subsampling_random_state = random_state
 
     def load_image(self):
-        """
-        Load image table from sqlite file
-        """
+        """Load image table from sqlite file"""
         # Extract image metadata
         image_cols = "TableNumber, ImageNumber, {}".format(", ".join(self.strata))
         image_query = "select {} from image".format(image_cols)
         self.image_df = pd.read_sql(sql=image_query, con=self.conn)
 
     def count_cells(self, compartment="cells", count_subset=False):
-        """
-        Determine how many cells are measured per well.
+        """Determine how many cells are measured per well.
 
-        Arguments:
-        compartment - string indicating the compartment to subset
-        count_subset - [default: False] count the number of cells in subset partition
+        :param compartment: string indicating the compartment to subset, defaults to "cells"
+        :type compartment: str
+        :param count_subset: whether or not count the number of cells as specified by the strata groups
+        :return: A pandas dataframe of cell counts in the experiment
+        :rtype: pd.DataFrame
         """
         check_compartments(compartment)
 
@@ -148,9 +178,13 @@ def count_cells(self, compartment="cells", count_subset=False):
 
         return count_df
 
-    def subsample_profiles(self, x):
-        """
-        Sample a Pandas DataFrame given the subsampling fraction
+    def subsample_profiles(self, df):
+        """Sample a Pandas DataFrame given subsampling information
+
+        :param df: A single cell profile dataframe
+        :type df: pd.DataFrame
+        :return: A subsampled pandas dataframe of single cell profiles
+        :rtype: pd.DataFrame
         """
         if self.subsampling_random_state == "none":
             random_state = np.random.randint(0, 10000, size=1)[0]
@@ -158,22 +192,21 @@ def subsample_profiles(self, x):
 
         if self.subsample_frac == 1:
             return pd.DataFrame.sample(
-                x,
+                df,
                 n=self.subsample_n,
                 replace=True,
                 random_state=self.subsampling_random_state,
             )
         else:
             return pd.DataFrame.sample(
-                x, frac=self.subsample_frac, random_state=self.subsampling_random_state
+                df, frac=self.subsample_frac, random_state=self.subsampling_random_state
             )
 
     def get_subsample(self, compartment="cells"):
-        """
-        Extract subsample from sqlite file
+        """Apply the subsampling procedure
 
-        Arguments:
-        compartment - [default: "cells"] string indicating the compartment to subset
+        :param compartment: string indicating the compartment to process, defaults to "cells"
+        :type compartment: str
         """
         check_compartments(compartment)
 
@@ -194,14 +227,14 @@ def get_subsample(self, compartment="cells"):
         self.is_subset_computed = True
 
     def aggregate_compartment(self, compartment, compute_subsample=False):
-        """
-        Aggregate morphological profiles
-
-        Arguments:
-        compartment - str indicating specific compartment to extract
-
-        Return:
-        Either the merged object file or write object to disk
+        """Aggregate morphological profiles. Uses pycytominer.aggregate()
+
+        :param compartment: string indicating the specific compartment, defaults to "cells"
+        :type compartment: str
+        :param compute_subsample: determine if subsample should be computed, defaults to False
+        :type compute_subsample: bool
+        :return: Aggregated single-cell profiles
+        :rtype: pd.DataFrame
         """
         check_compartments(compartment)
 
@@ -220,7 +253,7 @@ def aggregate_compartment(self, compartment, compute_subsample=False):
             population_df=population_df,
             strata=self.strata,
             features=self.features,
-            operation=self.operation,
+            operation=self.aggregation_operation,
             subset_data_df=self.subset_data_df,
         )
 
@@ -228,28 +261,34 @@ def aggregate_compartment(self, compartment, compute_subsample=False):
 
     def aggregate_profiles(
         self,
-        compute_subsample="False",
+        compute_subsample=False,
         output_file="none",
         compression=None,
         float_format=None,
     ):
-        """
-        Aggregate and merge compartments. This is the primary entry to this class.
-
-        Arguments:
-        compute_subsample - [default: False] boolean if subsample should be computed.
-                            NOTE: Must be specified to perform subsampling. Will not
-                            apply subsetting if set to False even if subsample is
-                            initialized
-        output_file - [default: "none"] if provided, will write annotated profiles to file
-                  if not specified, will return the annotated profiles. We recommend
-                  that this output file be suffixed with "_augmented.csv".
-        compression - the mechanism to compress [default: None]
-        float_format - decimal precision to use in writing output file [default: None]
-                           For example, use "%.3g" for 3 decimal precision.
+        """Aggregate and merge compartments. This is the primary entry to this class.
+
+        :param compute_subsample: Determine if subsample should be computed, defaults to False
+        :type compute_subsample: bool
+        :param output_file: the name of a file to output, defaults to "none"
+        :type output_file: str, optional
+        :param compression: the mechanism to compress, defaults to None
+        :type compression: str, optional
+        :param float_format: decimal precision to use in writing output file, defaults to None
+        :type float_format: str, optional
 
         Return:
         if output_file is set, then write to file. If not then return
+
+        .. note::
+            compute_subsample must be specified to perform subsampling. The function
+            aggregate_profiles(compute_subsample=False) will not apply subsetting even if
+            subsample is initialized
+
+        .. note::
+            We recommend that, if provided, the output file be suffixed with "_augmented"
+
+        :Example:
         """
 
         if output_file != "none":

From 4cd0896956f205acea1aa2103e3133c2a7f4aff6 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Mon, 14 Dec 2020 16:11:54 -0500
Subject: [PATCH 05/15] update tests for updated cells function args

---
 .../tests/test_cyto_utils/test_cells.py        | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 4b293082..3fb70e98 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -65,15 +65,17 @@ def build_random_data(
 nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace")
 
 # Setup SingleCells Class
-ap = SingleCells(sql_file=file)
-ap_subsample = SingleCells(sql_file=file, subsample_n=2, subsampling_random_state=123)
+ap = SingleCells(file_or_conn=file)
+ap_subsample = SingleCells(
+    file_or_conn=file, subsample_n=2, subsampling_random_state=123
+)
 
 
 def test_SingleCells_init():
     """
     Testing initialization of SingleCells
     """
-    assert ap.sql_file == file
+    assert ap.file_or_conn == file
     assert ap.strata == ["Metadata_Plate", "Metadata_Well"]
     assert ap.merge_cols == ["TableNumber", "ImageNumber"]
     assert ap.features == "infer"
@@ -84,7 +86,7 @@ def test_SingleCells_init():
     assert ap_subsample.subsample_n == 2
     assert ap.subset_data_df == "none"
     assert ap.output_file == "none"
-    assert ap.operation == "median"
+    assert ap.aggregation_operation == "median"
     assert not ap.is_aggregated
     assert ap.subsampling_random_state == "none"
     assert ap_subsample.subsampling_random_state == 123
@@ -94,7 +96,7 @@ def test_SingleCells_reset_variables():
     """
     Testing initialization of SingleCells
     """
-    ap_switch = SingleCells(sql_file=file)
+    ap_switch = SingleCells(file_or_conn=file)
     assert ap_switch.subsample_frac == 1
     assert ap_switch.subsample_n == "all"
     assert ap_switch.subsampling_random_state == "none"
@@ -188,7 +190,7 @@ def test_aggregate_subsampling_count_cells():
     )
     pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)
 
-    profiles = ap_subsample.aggregate_profiles()
+    profiles = ap_subsample.aggregate_profiles(compute_subsample=True)
 
     count_df = ap_subsample.count_cells(count_subset=True)
     expected_count = pd.DataFrame(
@@ -307,7 +309,7 @@ def test_aggregate_count_cells_multiple_strata():
 
     # Setup SingleCells Class
     ap_strata = SingleCells(
-        sql_file=file,
+        file_or_conn=file,
         subsample_n="4",
         strata=["Metadata_Plate", "Metadata_Well", "Metadata_Site"],
     )
@@ -323,7 +325,7 @@ def test_aggregate_count_cells_multiple_strata():
     )
     pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)
 
-    profiles = ap_strata.aggregate_profiles()
+    profiles = ap_strata.aggregate_profiles(compute_subsample=True)
 
     count_df = ap_strata.count_cells(count_subset=True)
     expected_count = pd.DataFrame(

From a55bf74dd2188604ebad7a31118d72c575e7a817 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Mon, 14 Dec 2020 18:29:35 -0500
Subject: [PATCH 06/15] add function to merge single cells given linking
 columns

---
 pycytominer/cyto_utils/cells.py | 128 +++++++++++++++++++++++++++++---
 1 file changed, 119 insertions(+), 9 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index c96e713a..53676418 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -8,8 +8,14 @@
     check_compartments,
     check_aggregate_operation,
     infer_cp_features,
+    get_default_linking_cols,
+    get_default_compartments,
+    assert_linking_cols_complete,
 )
 
+default_compartments = get_default_compartments()
+default_linking_cols = get_default_linking_cols()
+
 
 class SingleCells(object):
     """This is a class to interact with single cell morphological profiles. Interaction
@@ -27,6 +33,8 @@ class SingleCells(object):
     :type output_file: str
     :param compartments: list of compartments to process, defaults to ["cells", "cytoplasm", "nuclei"]
     :type compartments: list
+    :param compartment_linking_cols: dictionary identifying how to merge columns across tables, default noted below:
+    :type compartment_linking_cols: dict
     :param merge_cols: columns indicating how to merge image and compartment data, defaults to ["TableNumber", "ImageNumber"]
     :type merge_cols: list
     :param load_image_data: if image data should be loaded into memory, defaults to True
@@ -37,6 +45,17 @@ class SingleCells(object):
     :type subsample_n:, str, int
     :param subsampling_random_state: the random state to init subsample, defaults to "none"
     :type subsampling_random_state: str, int
+
+    .. note::
+        the argument compartment_linking_cols is designed to work with CellProfiler output,
+        as curated by cytominer-database. The default is: {
+            "cytoplasm": {
+                "cells": "Cytoplasm_Parent_Cells",
+                "nuclei": "Cytoplasm_Parent_Nuclei",
+            },
+            "cells": {"cytoplasm": "ObjectNumber"},
+            "nuclei": {"cytoplasm": "ObjectNumber"},
+        }
     """
 
     def __init__(
@@ -46,7 +65,8 @@ def __init__(
         features="infer",
         aggregation_operation="median",
         output_file="none",
-        compartments=["cells", "cytoplasm", "nuclei"],
+        compartments=default_compartments,
+        compartment_linking_cols=default_linking_cols,
         merge_cols=["TableNumber", "ImageNumber"],
         load_image_data=True,
         subsample_frac=1,
@@ -68,9 +88,9 @@ def __init__(
         self.file_or_conn = file_or_conn
         self.strata = strata
         self.features = features
+        self.load_image_data = load_image_data
         self.aggregation_operation = aggregation_operation.lower()
         self.output_file = output_file
-        self.compartments = compartments
         self.merge_cols = merge_cols
         self.subsample_frac = subsample_frac
         self.subsample_n = subsample_n
@@ -78,6 +98,13 @@ def __init__(
         self.subsampling_random_state = subsampling_random_state
         self.is_aggregated = False
         self.is_subset_computed = False
+        self.compartments = compartments
+        self.compartment_linking_cols = compartment_linking_cols
+
+        # Confirm that the compartments and linking cols are formatted properly
+        assert_linking_cols_complete(
+            compartments=self.compartments, linking_cols=self.compartment_linking_cols
+        )
 
         if self.subsample_n != "all":
             self.set_subsample_n(self.subsample_n)
@@ -89,7 +116,7 @@ def __init__(
         # Throw an error if both subsample_frac and subsample_n is set
         self._check_subsampling()
 
-        if load_image_data:
+        if self.load_image_data:
             self.load_image()
 
     def _check_subsampling(self):
@@ -226,6 +253,11 @@ def get_subsample(self, compartment="cells"):
 
         self.is_subset_computed = True
 
+    def load_compartment(self, compartment):
+        compartment_query = "select * from {}".format(compartment)
+        df = pd.read_sql(sql=compartment_query, con=self.conn)
+        return df
+
     def aggregate_compartment(self, compartment, compute_subsample=False):
         """Aggregate morphological profiles. Uses pycytominer.aggregate()
 
@@ -238,13 +270,11 @@ def aggregate_compartment(self, compartment, compute_subsample=False):
         """
         check_compartments(compartment)
 
-        compartment_query = "select * from {}".format(compartment)
-
         if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
             self.get_subsample(compartment=compartment)
 
         population_df = self.image_df.merge(
-            pd.read_sql(sql=compartment_query, con=self.conn),
+            self.load_compartment(compartment=compartment),
             how="inner",
             on=self.merge_cols,
         )
@@ -259,6 +289,87 @@ def aggregate_compartment(self, compartment, compute_subsample=False):
 
         return object_df
 
+    def merge_single_cells(
+        self,
+        sc_output_file="none",
+        compression=None,
+        float_format=None,
+        single_cell_normalize=False,
+        normalize_args=None,
+    ):
+        """Given the linking columns, merge single cell data. Normalization is also supported
+
+        :param sc_output_file: the name of a file to output, defaults to "none"
+        :type sc_output_file: str, optional
+        :param compression: the mechanism to compress, defaults to None
+        :type compression: str, optional
+        :param float_format: decimal precision to use in writing output file, defaults to None
+        :type float_format: str, optional
+        :param single_cell_normalize: determine if the single cell data should also be normalized
+        :type single_cell_normalize: bool
+        :param normalize_args: additional arguments passed as a dictionary as input to pycytominer.normalize()
+        :type normalize_args: dict, optional
+        :return: Either a dataframe (if sc_output_file="none") or will write to file
+        :rtype: pd.DataFrame, optional
+        """
+        # Honor the linking columns given to the constructor, not the module defaults
+        linking_cols = self.compartment_linking_cols
+
+        # Load the single cell dataframe by merging on the specific linking columns
+        sc_df = ""
+        linking_check_cols = []
+        for left_compartment in linking_cols:
+            for right_compartment in linking_cols[left_compartment]:
+                # Make sure only one merge per combination occurs
+                linking_check = "-".join(sorted([left_compartment, right_compartment]))
+                if linking_check in linking_check_cols:
+                    continue
+
+                # Specify how to indicate merge suffixes
+                merge_suffix = [
+                    "_{comp_l}".format(comp_l=left_compartment),
+                    "_{comp_r}".format(comp_r=right_compartment),
+                ]
+
+                left_link_col = linking_cols[left_compartment][right_compartment]
+                right_link_col = linking_cols[right_compartment][left_compartment]
+
+                if isinstance(sc_df, str):
+                    sc_df = self.load_compartment(compartment=left_compartment).merge(
+                        self.load_compartment(compartment=right_compartment),
+                        left_on=self.merge_cols + [left_link_col],
+                        right_on=self.merge_cols + [right_link_col],
+                        suffixes=merge_suffix,
+                    )
+                else:
+                    sc_df = sc_df.merge(
+                        self.load_compartment(compartment=right_compartment),
+                        left_on=self.merge_cols + [left_link_col],
+                        right_on=self.merge_cols + [right_link_col],
+                        suffixes=merge_suffix,
+                    )
+
+                linking_check_cols.append(linking_check)
+
+        # Add image data to single cell dataframe
+        if not self.load_image_data:
+            self.load_image()
+
+        sc_df = self.image_df.merge(sc_df, on=self.merge_cols, how="right")
+        if single_cell_normalize:
+            # normalize_args defaults to None, which cannot be unpacked with **
+            sc_df = normalize(profiles=sc_df, **(normalize_args or {}))
+
+        if sc_output_file != "none":
+            output(
+                df=sc_df,
+                output_filename=sc_output_file,
+                compression=compression,
+                float_format=float_format,
+            )
+        else:
+            return sc_df
+
     def aggregate_profiles(
         self,
         compute_subsample=False,
@@ -276,9 +387,8 @@ def aggregate_profiles(
         :type compression: str, optional
         :param float_format: decimal precision to use in writing output file, defaults to None
         :type float_format: str, optional
-
-        Return:
-        if output_file is set, then write to file. If not then return
+        :return: Either a dataframe (if output_file="none") or will write to file
+        :rtype: pd.DataFrame, optional
 
         .. note::
             compute_subsample must be specified to perform subsampling. The function

From ae2ac5e72bb63855336e52c6dc188d9f0976b101 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Mon, 14 Dec 2020 18:31:04 -0500
Subject: [PATCH 07/15] add get_default_compartments() function

---
 pycytominer/cyto_utils/__init__.py             | 5 +++++
 pycytominer/cyto_utils/util.py                 | 4 ++++
 pycytominer/tests/test_cyto_utils/test_util.py | 6 ++++++
 3 files changed, 15 insertions(+)

diff --git a/pycytominer/cyto_utils/__init__.py b/pycytominer/cyto_utils/__init__.py
index 4396f93e..b26c4a43 100644
--- a/pycytominer/cyto_utils/__init__.py
+++ b/pycytominer/cyto_utils/__init__.py
@@ -1,12 +1,17 @@
 from .output import output
 from .util import (
     check_compartments,
+    get_default_compartments,
     load_known_metadata_dictionary,
     check_correlation_method,
     check_aggregate_operation,
     check_consensus_operation,
     get_pairwise_correlation,
 )
+from .single_cell_ingest_utils import (
+    get_default_linking_cols,
+    assert_linking_cols_complete,
+)
 from .load import (
     load_profiles,
     load_platemap,
diff --git a/pycytominer/cyto_utils/util.py b/pycytominer/cyto_utils/util.py
index 4c806a34..121303ca 100644
--- a/pycytominer/cyto_utils/util.py
+++ b/pycytominer/cyto_utils/util.py
@@ -12,6 +12,10 @@
 )
 
 
+def get_default_compartments():
+    return ["cells", "cytoplasm", "nuclei"]
+
+
 def check_compartments(compartments):
     valid_compartments = ["cells", "cytoplasm", "nuclei"]
     error_str = "compartment not supported, use one of {}".format(valid_compartments)
diff --git a/pycytominer/tests/test_cyto_utils/test_util.py b/pycytominer/tests/test_cyto_utils/test_util.py
index 8786dc86..5aca0385 100644
--- a/pycytominer/tests/test_cyto_utils/test_util.py
+++ b/pycytominer/tests/test_cyto_utils/test_util.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from pycytominer.cyto_utils.util import (
     check_compartments,
+    get_default_compartments,
     load_known_metadata_dictionary,
     get_pairwise_correlation,
     check_correlation_method,
@@ -52,6 +53,11 @@ def test_check_compartments_not_valid():
     assert "compartment not supported" in str(ae.value)
 
 
+def test_get_default_compartments():
+    default_comparments = get_default_compartments()
+    assert ["cells", "cytoplasm", "nuclei"] == default_comparments
+
+
 def test_load_known_metadata_dictionary():
     meta_cols = ["ObjectNumber", "ImageNumber", "TableNumber"]
     meta_df = pd.DataFrame(

From 45cd6158eb62ae4f38a9b6de42114e81fe8af4f3 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Mon, 14 Dec 2020 18:31:24 -0500
Subject: [PATCH 08/15] add test to load compartment

---
 pycytominer/tests/test_cyto_utils/test_cells.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 3fb70e98..0a068bad 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -133,6 +133,11 @@ def test_SingleCells_count():
     pd.testing.assert_frame_equal(count_df, expected_count, check_names=False)
 
 
+def test_load_compartment():
+    loaded_compartment_df = ap.load_compartment(compartment="cells")
+    pd.testing.assert_frame_equal(loaded_compartment_df, cells_df)
+
+
 def test_aggregate_comparment():
     df = image_df.merge(cells_df, how="inner", on=["TableNumber", "ImageNumber"])
     result = aggregate(df)

From f0720c52ab320c65988d896332565da38842b0fe Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Mon, 14 Dec 2020 18:31:44 -0500
Subject: [PATCH 09/15] add file to test single cell ingestion processing

---
 .../test_single_cell_ingest_utils.py          | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py

diff --git a/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py b/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py
new file mode 100644
index 00000000..f96f9d47
--- /dev/null
+++ b/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py
@@ -0,0 +1,56 @@
+import pytest
+from pycytominer.cyto_utils import (
+    get_default_linking_cols,
+    get_default_compartments,
+    assert_linking_cols_complete,
+)
+
+default_compartments = get_default_compartments()
+default_linking_cols = {
+    "cytoplasm": {
+        "cells": "Cytoplasm_Parent_Cells",
+        "nuclei": "Cytoplasm_Parent_Nuclei",
+    },
+    "cells": {"cytoplasm": "ObjectNumber"},
+    "nuclei": {"cytoplasm": "ObjectNumber"},
+}
+
+
+def test_default_linking_cols():
+    linking_cols = get_default_linking_cols()
+    assert linking_cols == default_linking_cols
+
+
+def test_assert_linking_cols_complete():
+    assert_linking_cols_complete()
+    assert_linking_cols_complete(
+        linking_cols=default_linking_cols, compartments=default_compartments
+    )
+
+    with pytest.raises(AssertionError) as err:
+        assert_linking_cols_complete(
+            linking_cols=default_linking_cols, compartments=["cells", "cytoplasm"]
+        )
+
+    assert "nuclei compartment not found." in str(err.value)
+
+    error_linking_cols = {
+        "cytoplasm": {"cells": "Cytoplasm_Parent_Cells"},
+        "cells": {"cytoplasm": "ObjectNumber"},
+        "nuclei": {"cytoplasm": "ObjectNumber"},
+    }
+    with pytest.raises(AssertionError) as err:
+        assert_linking_cols_complete(
+            linking_cols=error_linking_cols, compartments=default_compartments
+        )
+    assert "Missing column identifier between cytoplasm-nuclei" in str(err.value)
+
+    with pytest.raises(AssertionError) as err:
+        assert_linking_cols_complete(
+            linking_cols=default_linking_cols,
+            compartments=["cells", "cytoplasm", "nuclei", "sandwich"],
+        )
+    assert (
+        "All compartments must be specified in the linking_cols, {'sandwich'} is missing"
+        in str(err.value)
+    )

From 764c629f1b1bde3fe7251b99396c423881f4e727 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Tue, 15 Dec 2020 10:12:12 -0500
Subject: [PATCH 10/15] add single cell ingest util file

---
 .../cyto_utils/single_cell_ingest_utils.py    | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 pycytominer/cyto_utils/single_cell_ingest_utils.py

diff --git a/pycytominer/cyto_utils/single_cell_ingest_utils.py b/pycytominer/cyto_utils/single_cell_ingest_utils.py
new file mode 100644
index 00000000..374fcb2d
--- /dev/null
+++ b/pycytominer/cyto_utils/single_cell_ingest_utils.py
@@ -0,0 +1,67 @@
+from collections import Counter
+from pycytominer.cyto_utils import check_compartments, get_default_compartments
+
+
+def get_default_linking_cols():
+    """Define the standard experiment linking columns between tables
+
+    :return: Dictionary of compartment-specific column names used to link compartments across tables
+    :rtype: dict
+
+    .. note::
+        every dictionary pair has a 1 to 1 correspondence (e.g. cytoplasm-cells and cells-cytoplasm both must exist)
+    """
+    linking_cols = {
+        "cytoplasm": {
+            "cells": "Cytoplasm_Parent_Cells",
+            "nuclei": "Cytoplasm_Parent_Nuclei",
+        },
+        "cells": {"cytoplasm": "ObjectNumber"},
+        "nuclei": {"cytoplasm": "ObjectNumber"},
+    }
+
+    return linking_cols
+
+
+def assert_linking_cols_complete(linking_cols="default", compartments="default"):
+    """Confirm that the linking cols and compartments are compatible
+
+    :return: None; an AssertionError is raised when linking_cols and compartments are incompatible
+    :rtype: None
+
+    .. note::
+        assert_linking_cols_complete() does not check if columns are present
+    """
+    if linking_cols == "default":
+        linking_cols = get_default_linking_cols()
+
+    if compartments == "default":
+        compartments = get_default_compartments()
+
+    comp_err = "compartment not found. Check the specified compartments"
+
+    linking_check = []
+    unique_linking_cols = []
+    for x in linking_cols:
+        unique_linking_cols.append(x)
+        assert x in compartments, "{com} {err}".format(com=x, err=comp_err)
+        for y in linking_cols[x]:
+            unique_linking_cols.append(y)
+            assert y in compartments, "{com} {err}".format(com=y, err=comp_err)
+            linking_check.append("-".join(sorted([x, y])))
+
+    # Make sure that each combination has been specified exactly twice
+    linking_counter = Counter(linking_check)
+    for combo in linking_counter:
+        assert (
+            linking_counter[combo] == 2
+        ), "Missing column identifier between {combo}".format(combo=combo)
+
+    # Confirm that every compartment has been specified in the linking_cols
+    unique_linking_cols = sorted(list(set(unique_linking_cols)))
+    diff_column = set(compartments).difference(unique_linking_cols)
+    assert unique_linking_cols == sorted(
+        compartments
+    ), "All compartments must be specified in the linking_cols, {miss} is missing".format(
+        miss=diff_column
+    )

From 29a702ab27ea7624f422103d83019142de8208f4 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Tue, 15 Dec 2020 11:07:30 -0500
Subject: [PATCH 11/15] one line import

---
 pycytominer/cyto_utils/cells.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 53676418..95146277 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -1,8 +1,7 @@
 import numpy as np
 import pandas as pd
 from sqlalchemy import create_engine
-from pycytominer import aggregate
-from pycytominer import normalize
+from pycytominer import aggregate, normalize
 from pycytominer.cyto_utils import (
     output,
     check_compartments,

From 689ebc19fd13e235f7282c631c20865b98c900bc Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Tue, 15 Dec 2020 11:08:45 -0500
Subject: [PATCH 12/15] closes #110

---
 pycytominer/cyto_utils/util.py                | 22 +++++++++++----
 .../tests/test_cyto_utils/test_util.py        | 28 +++++++++++++------
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/pycytominer/cyto_utils/util.py b/pycytominer/cyto_utils/util.py
index 121303ca..a55c6dbe 100644
--- a/pycytominer/cyto_utils/util.py
+++ b/pycytominer/cyto_utils/util.py
@@ -1,8 +1,9 @@
 """
-Miscellaneous utility function
+Miscellaneous utility functions
 """
 
 import os
+import warnings
 import numpy as np
 import pandas as pd
 from pycytominer.cyto_utils.features import infer_cp_features
@@ -17,14 +18,23 @@ def get_default_compartments():
 
 
 def check_compartments(compartments):
-    valid_compartments = ["cells", "cytoplasm", "nuclei"]
-    error_str = "compartment not supported, use one of {}".format(valid_compartments)
+    default_compartments = get_default_compartments()
+
     if isinstance(compartments, list):
         compartments = [x.lower() for x in compartments]
-        assert all([x in valid_compartments for x in compartments]), error_str
     elif isinstance(compartments, str):
-        compartments = compartments.lower()
-        assert compartments in valid_compartments, error_str
+        compartments = [compartments.lower()]
+
+    non_canonical_compartments = []
+    for compartment in compartments:
+        if compartment not in default_compartments:
+            non_canonical_compartments.append(compartment)
+
+    if len(non_canonical_compartments) > 0:
+        warn_str = "Non-canonical compartment detected: {x}".format(
+            x=", ".join(non_canonical_compartments)
+        )
+        warnings.warn(warn_str)
 
 
 def load_known_metadata_dictionary(metadata_file=default_metadata_file):
diff --git a/pycytominer/tests/test_cyto_utils/test_util.py b/pycytominer/tests/test_cyto_utils/test_util.py
index 5aca0385..3f47ca20 100644
--- a/pycytominer/tests/test_cyto_utils/test_util.py
+++ b/pycytominer/tests/test_cyto_utils/test_util.py
@@ -2,6 +2,7 @@
 import random
 import pytest
 import tempfile
+import warnings
 import pandas as pd
 from pycytominer.cyto_utils.util import (
     check_compartments,
@@ -37,20 +38,31 @@ def test_check_compartments():
 
 
 def test_check_compartments_not_valid():
-    with pytest.raises(AssertionError) as ae:
+    warn_expected_string = "Non-canonical compartment detected: something"
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
         not_valid = ["SOMETHING"]
         output = check_compartments(not_valid)
-    assert "compartment not supported" in str(ae.value)
+    assert issubclass(w[-1].category, UserWarning)
+    assert warn_expected_string in str(w[-1].message)
 
-    with pytest.raises(AssertionError) as ae:
-        not_valid = "SOMETHING"
+    with warnings.catch_warnings(record=True) as w:
+        not_valid = "SOMETHING"  # Also works with strings
         output = check_compartments(not_valid)
-    assert "compartment not supported" in str(ae.value)
+    assert issubclass(w[-1].category, UserWarning)
+    assert warn_expected_string in str(w[-1].message)
 
-    with pytest.raises(AssertionError) as ae:
-        not_valid = ["Cells", "Cytoplasm", "SOMETHING"]
+    with warnings.catch_warnings(record=True) as w:
+        not_valid = ["CelLs", "CytopLasM", "SOMETHING"]
         output = check_compartments(not_valid)
-    assert "compartment not supported" in str(ae.value)
+    assert issubclass(w[-1].category, UserWarning)
+    assert warn_expected_string in str(w[-1].message)
+
+    with warnings.catch_warnings(record=True) as w:
+        not_valid = ["CelLs", "CytopLasM", "SOMETHING", "NOTHING"]
+        output = check_compartments(not_valid)
+    assert issubclass(w[-1].category, UserWarning)
+    assert "{x}, nothing".format(x=warn_expected_string) in str(w[-1].message)
 
 
 def test_get_default_compartments():

From c3e0ce3aa10063480351d6ea1adf590a418ccbe0 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Tue, 15 Dec 2020 15:57:47 -0500
Subject: [PATCH 13/15] fix column name issue when merging - do not consider
 linking columns as morphology features

---
 pycytominer/cyto_utils/__init__.py            |   1 +
 pycytominer/cyto_utils/cells.py               |  68 +++++--
 .../cyto_utils/single_cell_ingest_utils.py    |  24 +++
 .../tests/test_cyto_utils/test_cells.py       | 183 +++++++++++++++---
 .../test_single_cell_ingest_utils.py          |  26 +++
 5 files changed, 269 insertions(+), 33 deletions(-)

diff --git a/pycytominer/cyto_utils/__init__.py b/pycytominer/cyto_utils/__init__.py
index b26c4a43..f5960a04 100644
--- a/pycytominer/cyto_utils/__init__.py
+++ b/pycytominer/cyto_utils/__init__.py
@@ -11,6 +11,7 @@
 from .single_cell_ingest_utils import (
     get_default_linking_cols,
     assert_linking_cols_complete,
+    provide_linking_cols_feature_name_update,
 )
 from .load import (
     load_profiles,
diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 95146277..ebe74bc1 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -10,6 +10,7 @@
     get_default_linking_cols,
     get_default_compartments,
     assert_linking_cols_complete,
+    provide_linking_cols_feature_name_update,
 )
 
 default_compartments = get_default_compartments()
@@ -105,6 +106,11 @@ def __init__(
             compartments=self.compartments, linking_cols=self.compartment_linking_cols
         )
 
+        # Build a dictionary to update linking column feature names
+        self.linking_col_rename = provide_linking_cols_feature_name_update(
+            self.compartment_linking_cols
+        )
+
         if self.subsample_n != "all":
             self.set_subsample_n(self.subsample_n)
 
@@ -184,10 +190,10 @@ def count_cells(self, compartment="cells", count_subset=False):
             assert self.is_aggregated, "Make sure to aggregate_profiles() first!"
             assert self.is_subset_computed, "Make sure to get_subsample() first!"
             count_df = (
-                self.subset_data_df.groupby(self.strata)["ObjectNumber"]
+                self.subset_data_df.groupby(self.strata)["Metadata_ObjectNumber"]
                 .count()
                 .reset_index()
-                .rename({"ObjectNumber": "cell_count"}, axis="columns")
+                .rename({"Metadata_ObjectNumber": "cell_count"}, axis="columns")
             )
         else:
             query_cols = "TableNumber, ImageNumber, ObjectNumber"
@@ -217,17 +223,21 @@ def subsample_profiles(self, df):
             self.set_subsample_random_state(random_state)
 
         if self.subsample_frac == 1:
-            return pd.DataFrame.sample(
+
+            output_df = pd.DataFrame.sample(
                 df,
                 n=self.subsample_n,
                 replace=True,
                 random_state=self.subsampling_random_state,
             )
         else:
-            return pd.DataFrame.sample(
+            output_df = pd.DataFrame.sample(
                 df, frac=self.subsample_frac, random_state=self.subsampling_random_state
             )
 
+        output_df = output_df.rename(self.linking_col_rename, axis="columns")
+        return output_df
+
     def get_subsample(self, compartment="cells"):
         """Apply the subsampling procedure
 
@@ -276,7 +286,7 @@ def aggregate_compartment(self, compartment, compute_subsample=False):
             self.load_compartment(compartment=compartment),
             how="inner",
             on=self.merge_cols,
-        )
+        ).rename(self.linking_col_rename, axis="columns")
 
         object_df = aggregate(
             population_df=population_df,
@@ -314,8 +324,9 @@ def merge_single_cells(
         # Load the single cell dataframe by merging on the specific linking columns
         sc_df = ""
         linking_check_cols = []
-        for left_compartment in default_linking_cols:
-            for right_compartment in default_linking_cols[left_compartment]:
+        merge_suffix_rename = []
+        for left_compartment in self.compartment_linking_cols:
+            for right_compartment in self.compartment_linking_cols[left_compartment]:
                 # Make sure only one merge per combination occurs
                 linking_check = "-".join(sorted([left_compartment, right_compartment]))
                 if linking_check in linking_check_cols:
@@ -326,11 +337,11 @@ def merge_single_cells(
                     "_{comp_l}".format(comp_l=left_compartment),
                     "_{comp_r}".format(comp_r=right_compartment),
                 ]
-
-                left_link_col = default_linking_cols[left_compartment][
+                merge_suffix_rename += merge_suffix
+                left_link_col = self.compartment_linking_cols[left_compartment][
                     right_compartment
                 ]
-                right_link_col = default_linking_cols[right_compartment][
+                right_link_col = self.compartment_linking_cols[right_compartment][
                     left_compartment
                 ]
 
@@ -351,12 +362,47 @@ def merge_single_cells(
 
                 linking_check_cols.append(linking_check)
 
+        # Add metadata prefix to merged suffixes
+        full_merge_suffix_rename = []
+        full_merge_suffix_original = []
+        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
+            full_merge_suffix_original.append(col_name)
+            full_merge_suffix_rename.append("Metadata_{x}".format(x=col_name))
+
+        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
+            for suffix in set(merge_suffix_rename):
+                full_merge_suffix_original.append("{x}{y}".format(x=col_name, y=suffix))
+                full_merge_suffix_rename.append(
+                    "Metadata_{x}{y}".format(x=col_name, y=suffix)
+                )
+
+        self.full_merge_suffix_rename = dict(
+            zip(full_merge_suffix_original, full_merge_suffix_rename)
+        )
+
         # Add image data to single cell dataframe
         if not self.load_image_data:
             self.load_image()
 
-        sc_df = self.image_df.merge(sc_df, on=self.merge_cols, how="right")
+        sc_df = (
+            self.image_df.merge(sc_df, on=self.merge_cols, how="right")
+            .rename(self.linking_col_rename, axis="columns")
+            .rename(self.full_merge_suffix_rename, axis="columns")
+        )
         if single_cell_normalize:
+            # Inferring features is tricky with non-canonical data
+            if normalize_args is None:
+                normalize_args = {}
+                features = infer_cp_features(sc_df, compartments=self.compartments)
+            elif "features" not in normalize_args:
+                features = infer_cp_features(sc_df, compartments=self.compartments)
+            elif normalize_args["features"] == "infer":
+                features = infer_cp_features(sc_df, compartments=self.compartments)
+            else:
+                features = normalize_args["features"]
+
+            normalize_args["features"] = features
+
             sc_df = normalize(profiles=sc_df, **normalize_args)
 
         if sc_output_file != "none":
diff --git a/pycytominer/cyto_utils/single_cell_ingest_utils.py b/pycytominer/cyto_utils/single_cell_ingest_utils.py
index 374fcb2d..555dc7b2 100644
--- a/pycytominer/cyto_utils/single_cell_ingest_utils.py
+++ b/pycytominer/cyto_utils/single_cell_ingest_utils.py
@@ -65,3 +65,27 @@ def assert_linking_cols_complete(linking_cols="default", compartments="default")
     ), "All compartments must be specified in the linking_cols, {miss} is missing".format(
         miss=diff_column
     )
+
+
+def provide_linking_cols_feature_name_update(linking_cols="default"):
+    """Output a dictionary to use to update pandas dataframe column names. The linking
+    cols are each mapped to the same name with a "Metadata_" prefix.
+
+    :return: Dictionary of the linking column names to update after they are used
+    :rtype: dict
+    """
+    if linking_cols == "default":
+        linking_cols = get_default_linking_cols()
+
+    metadata_update_cols = []
+    for col in linking_cols:
+        for right_col in linking_cols[col]:
+            metadata_update_cols.append(linking_cols[col][right_col])
+
+    update_name = dict(
+        zip(
+            metadata_update_cols,
+            ["Metadata_{x}".format(x=y) for y in metadata_update_cols],
+        )
+    )
+    return update_name
diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 0a068bad..b24a1fa0 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -4,8 +4,14 @@
 import tempfile
 import pandas as pd
 from sqlalchemy import create_engine
-from pycytominer import aggregate
+
+from pycytominer import aggregate, normalize
 from pycytominer.cyto_utils.cells import SingleCells
+from pycytominer.cyto_utils import (
+    get_default_linking_cols,
+    get_default_compartments,
+    infer_cp_features,
+)
 
 random.seed(123)
 
@@ -47,7 +53,10 @@ def build_random_data(
 
 # Setup data
 cells_df = build_random_data(compartment="cells")
-cytoplasm_df = build_random_data(compartment="cytoplasm")
+cytoplasm_df = build_random_data(compartment="cytoplasm").assign(
+    Cytoplasm_Parent_Cells=(list(range(1, 51)) * 2)[::-1],
+    Cytoplasm_Parent_Nuclei=(list(range(1, 51)) * 2)[::-1],
+)
 nuclei_df = build_random_data(compartment="nuclei")
 image_df = pd.DataFrame(
     {
@@ -64,11 +73,41 @@ def build_random_data(
 cytoplasm_df.to_sql("cytoplasm", con=test_engine, index=False, if_exists="replace")
 nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace")
 
+# Create a new table with a fourth compartment
+new_file = "sqlite:///{}/test_new.sqlite".format(tmpdir)
+new_compartment_df = build_random_data(compartment="new")
+
+test_new_engine = create_engine(new_file)
+test_new_conn = test_new_engine.connect()
+
+image_df.to_sql("image", con=test_new_engine, index=False, if_exists="replace")
+cells_df.to_sql("cells", con=test_new_engine, index=False, if_exists="replace")
+new_cytoplasm_df = cytoplasm_df.assign(
+    Cytoplasm_Parent_New=(list(range(1, 51)) * 2)[::-1]
+)
+new_cytoplasm_df.to_sql(
+    "cytoplasm", con=test_new_engine, index=False, if_exists="replace"
+)
+nuclei_df.to_sql("nuclei", con=test_new_engine, index=False, if_exists="replace")
+new_compartment_df.to_sql("new", con=test_new_engine, index=False, if_exists="replace")
+
+new_compartments = ["cells", "cytoplasm", "nuclei", "new"]
+
+new_linking_cols = get_default_linking_cols()
+new_linking_cols["cytoplasm"]["new"] = "Cytoplasm_Parent_New"
+new_linking_cols["new"] = {"cytoplasm": "ObjectNumber"}
+
 # Setup SingleCells Class
 ap = SingleCells(file_or_conn=file)
 ap_subsample = SingleCells(
     file_or_conn=file, subsample_n=2, subsampling_random_state=123
 )
+ap_new = SingleCells(
+    file_or_conn=new_file,
+    load_image_data=False,
+    compartments=new_compartments,
+    compartment_linking_cols=new_linking_cols,
+)
 
 
 def test_SingleCells_init():
@@ -90,6 +129,8 @@ def test_SingleCells_init():
     assert not ap.is_aggregated
     assert ap.subsampling_random_state == "none"
     assert ap_subsample.subsampling_random_state == 123
+    assert ap.compartment_linking_cols == get_default_linking_cols()
+    assert ap.compartments == get_default_compartments()
 
 
 def test_SingleCells_reset_variables():
@@ -137,6 +178,123 @@ def test_load_compartment():
     loaded_compartment_df = ap.load_compartment(compartment="cells")
     pd.testing.assert_frame_equal(loaded_compartment_df, cells_df)
 
+    # Test non-canonical compartment loading
+    pd.testing.assert_frame_equal(new_compartment_df, ap_new.load_compartment("new"))
+
+
+def test_merge_single_cells():
+    sc_merged_df = ap.merge_single_cells()
+
+    # Assert that the image data was merged
+    assert all(x in sc_merged_df.columns for x in ["Metadata_Plate", "Metadata_Well"])
+
+    # Assert that metadata columns were renamed appropriately
+    for x in ap.full_merge_suffix_rename:
+        assert ap.full_merge_suffix_rename[x] == "Metadata_{x}".format(x=x)
+
+    # Perform a manual merge
+    manual_merge = cytoplasm_df.merge(
+        cells_df,
+        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
+        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
+        suffixes=["_cytoplasm", "_cells"],
+    ).merge(
+        nuclei_df,
+        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
+        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
+        suffixes=["_cytoplasm", "_nuclei"],
+    )
+
+    manual_merge = image_df.merge(manual_merge, on=ap.merge_cols, how="right").rename(
+        ap.full_merge_suffix_rename, axis="columns"
+    )
+
+    # Confirm that the merge correctly reversed the object number (opposite from Parent)
+    assert (
+        sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
+        == sc_merged_df.Metadata_ObjectNumber.tolist()
+    )
+    assert (
+        manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
+        == sc_merged_df.Metadata_ObjectNumber.tolist()
+    )
+    assert (
+        manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
+        == sc_merged_df.Metadata_ObjectNumber.tolist()
+    )
+    assert (
+        manual_merge.Metadata_ObjectNumber_cells.tolist()
+        == sc_merged_df.Metadata_ObjectNumber.tolist()
+    )
+
+    # Confirm the merge when adding normalization options
+    for method in ["standardize", "robustize"]:
+        for samples in ["all", "Metadata_ImageNumber == 'x'"]:
+            for features in ["infer", ["Cytoplasm_a", "Cells_a"]]:
+
+                norm_method_df = ap.merge_single_cells(
+                    single_cell_normalize=True,
+                    normalize_args={
+                        "method": method,
+                        "samples": samples,
+                        "features": features,
+                    },
+                )
+
+                manual_merge_normalize = normalize(
+                    manual_merge, method=method, samples=samples, features=features
+                )
+
+                pd.testing.assert_frame_equal(norm_method_df, manual_merge_normalize)
+
+    # Test non-canonical compartment merging
+    new_sc_merge_df = ap_new.merge_single_cells()
+
+    assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4
+    assert (
+        new_compartment_df.ObjectNumber.tolist()[::-1]
+        == new_sc_merge_df.Metadata_ObjectNumber_new.tolist()
+    )
+
+    norm_new_method_df = ap_new.merge_single_cells(
+        single_cell_normalize=True,
+        normalize_args={
+            "method": "standardize",
+            "samples": "all",
+            "features": "infer",
+        },
+    )
+
+    norm_new_method_no_feature_infer_df = ap_new.merge_single_cells(
+        single_cell_normalize=True,
+        normalize_args={
+            "method": "standardize",
+            "samples": "all",
+        },
+    )
+
+    default_feature_infer_df = ap_new.merge_single_cells(single_cell_normalize=True)
+
+    pd.testing.assert_frame_equal(norm_new_method_df, default_feature_infer_df)
+    pd.testing.assert_frame_equal(
+        norm_new_method_df, norm_new_method_no_feature_infer_df
+    )
+
+    new_compartment_cols = infer_cp_features(
+        new_compartment_df, compartments=ap_new.compartments
+    )
+    traditional_norm_df = normalize(
+        ap_new.image_df.merge(new_compartment_df, on=ap.merge_cols),
+        features=new_compartment_cols,
+        samples="all",
+        method="standardize",
+    )
+
+    pd.testing.assert_frame_equal(
+        norm_new_method_df.loc[:, new_compartment_cols].abs().describe(),
+        traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
+    )
+
 
 def test_aggregate_comparment():
     df = image_df.merge(cells_df, how="inner", on=["TableNumber", "ImageNumber"])
@@ -217,26 +375,7 @@ def test_aggregate_subsampling_profile():
             "ImageNumber": sorted(["x", "y"] * 2),
             "Metadata_Plate": ["plate"] * 4,
             "Metadata_Well": sorted(["A01", "A02"] * 2),
-            "ObjectNumber": [46, 3] * 2,
-        }
-    )
-
-    expected_result = pd.DataFrame(
-        {
-            "Metadata_Plate": ["plate", "plate"],
-            "Metadata_Well": ["A01", "A02"],
-            "Cells_a": [110.0, 680.5],
-            "Cells_b": [340.5, 201.5],
-            "Cells_c": [285.0, 481.0],
-            "Cells_d": [352.0, 549.0],
-            "Cytoplasm_a": [407.5, 705.5],
-            "Cytoplasm_b": [650.0, 439.5],
-            "Cytoplasm_c": [243.5, 78.5],
-            "Cytoplasm_d": [762.5, 625.0],
-            "Nuclei_a": [683.5, 171.0],
-            "Nuclei_b": [50.5, 625.0],
-            "Nuclei_c": [431.0, 483.0],
-            "Nuclei_d": [519.0, 286.5],
+            "Metadata_ObjectNumber": [46, 3] * 2,
         }
     )
 
diff --git a/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py b/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py
index f96f9d47..196faf02 100644
--- a/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py
+++ b/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py
@@ -3,6 +3,7 @@
     get_default_linking_cols,
     get_default_compartments,
     assert_linking_cols_complete,
+    provide_linking_cols_feature_name_update,
 )
 
 default_compartments = get_default_compartments()
@@ -54,3 +55,28 @@ def test_assert_linking_cols_complete():
         "All compartments must be specified in the linking_cols, {'sandwich'} is missing"
         in str(err.value)
     )
+
+
+def test_provide_linking_cols_feature_name_update():
+    expected_result = {
+        "Cytoplasm_Parent_Cells": "Metadata_Cytoplasm_Parent_Cells",
+        "Cytoplasm_Parent_Nuclei": "Metadata_Cytoplasm_Parent_Nuclei",
+        "ObjectNumber": "Metadata_ObjectNumber",
+    }
+
+    result = provide_linking_cols_feature_name_update()
+    assert result == expected_result
+
+    new_linking_cols = get_default_linking_cols()
+    new_linking_cols["cytoplasm"]["new"] = "Cytoplasm_Parent_New"
+    new_linking_cols["new"] = {"cytoplasm": "ObjectNumber"}
+    result = provide_linking_cols_feature_name_update(new_linking_cols)
+
+    expected_result = {
+        "Cytoplasm_Parent_Cells": "Metadata_Cytoplasm_Parent_Cells",
+        "Cytoplasm_Parent_Nuclei": "Metadata_Cytoplasm_Parent_Nuclei",
+        "Cytoplasm_Parent_New": "Metadata_Cytoplasm_Parent_New",
+        "ObjectNumber": "Metadata_ObjectNumber",
+    }
+
+    assert result == expected_result

From adcb1aa3d5e4601b751129aea057b5675ce146fc Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Tue, 15 Dec 2020 16:55:25 -0500
Subject: [PATCH 14/15] enable infer_cp_features with string

and abstract out function
---
 pycytominer/cyto_utils/__init__.py                  |  1 +
 pycytominer/cyto_utils/features.py                  | 12 ++++++++++++
 pycytominer/cyto_utils/util.py                      | 10 +++++-----
 .../tests/test_cyto_utils/test_features_util.py     | 13 +++++++++++++
 4 files changed, 31 insertions(+), 5 deletions(-)
 create mode 100644 pycytominer/tests/test_cyto_utils/test_features_util.py

diff --git a/pycytominer/cyto_utils/__init__.py b/pycytominer/cyto_utils/__init__.py
index f5960a04..690dd09f 100644
--- a/pycytominer/cyto_utils/__init__.py
+++ b/pycytominer/cyto_utils/__init__.py
@@ -23,6 +23,7 @@
     count_na_features,
     infer_cp_features,
     drop_outlier_features,
+    convert_compartment_format_to_list,
 )
 from .write_gct import write_gct
 from .modz import modz
diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py
index a6b77449..4e871edc 100644
--- a/pycytominer/cyto_utils/features.py
+++ b/pycytominer/cyto_utils/features.py
@@ -72,6 +72,9 @@ def infer_cp_features(
     """
     Given a dataframe, output features that we expect to be cell painting features
     """
+    compartments = convert_compartment_format_to_list(compartments)
+    compartments = [x.title() for x in compartments]
+
     features = []
     for col in population_df.columns.tolist():
         if any([col.startswith(x.title()) for x in compartments]):
@@ -140,3 +143,12 @@ def drop_outlier_features(
     ].index.tolist()
 
     return outlier_features
+
+
+def convert_compartment_format_to_list(compartments):
+    if isinstance(compartments, list):
+        compartments = [x.lower() for x in compartments]
+    elif isinstance(compartments, str):
+        compartments = [compartments.lower()]
+
+    return compartments
diff --git a/pycytominer/cyto_utils/util.py b/pycytominer/cyto_utils/util.py
index a55c6dbe..b8057687 100644
--- a/pycytominer/cyto_utils/util.py
+++ b/pycytominer/cyto_utils/util.py
@@ -6,7 +6,10 @@
 import warnings
 import numpy as np
 import pandas as pd
-from pycytominer.cyto_utils.features import infer_cp_features
+from pycytominer.cyto_utils.features import (
+    infer_cp_features,
+    convert_compartment_format_to_list,
+)
 
 default_metadata_file = os.path.join(
     os.path.dirname(__file__), "..", "data", "metadata_feature_dictionary.txt"
@@ -20,10 +23,7 @@ def get_default_compartments():
 def check_compartments(compartments):
     default_compartments = get_default_compartments()
 
-    if isinstance(compartments, list):
-        compartments = [x.lower() for x in compartments]
-    elif isinstance(compartments, str):
-        compartments = [compartments.lower()]
+    compartments = convert_compartment_format_to_list(compartments)
 
     non_canonical_compartments = []
     for compartment in compartments:
diff --git a/pycytominer/tests/test_cyto_utils/test_features_util.py b/pycytominer/tests/test_cyto_utils/test_features_util.py
new file mode 100644
index 00000000..5191fcf7
--- /dev/null
+++ b/pycytominer/tests/test_cyto_utils/test_features_util.py
@@ -0,0 +1,13 @@
+import os
+import random
+import pytest
+import pandas as pd
+from pycytominer.cyto_utils.features import convert_compartment_format_to_list
+
+
+def test_convert_compartment_format_to_list():
+    compartments = convert_compartment_format_to_list(["cells", "CYTOplasm", "nuclei"])
+    assert compartments == ["cells", "cytoplasm", "nuclei"]
+
+    compartments = convert_compartment_format_to_list("FoO")
+    assert compartments == ["foo"]

From f9adf9407f587519f6479bf875fe722be65799c6 Mon Sep 17 00:00:00 2001
From: gwaygenomics <gregory.way@gmail.com>
Date: Tue, 15 Dec 2020 16:56:30 -0500
Subject: [PATCH 15/15] remove features init argument

enable aggregate_args and no longer hardcode aggregation
---
 pycytominer/cyto_utils/cells.py               | 63 ++++++++++++-------
 .../tests/test_cyto_utils/test_cells.py       |  8 ++-
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index ebe74bc1..ba5c0a9e 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -25,8 +25,6 @@ class SingleCells(object):
     :type file_or_conn: str
     :param strata: The columns to groupby and aggregate single cells, defaults to ["Metadata_Plate", "Metadata_Well"]
     :type strata: list
-    :param features: The features that should be aggregated, defaults to "infer"
-    :type features: str, list
     :param aggregation_operation: operation to perform single cell aggregation, defaults to "median"
     :type aggregation_operation: str
     :param output_file: If specified, the location to write the file, defaults to "none"
@@ -62,7 +60,6 @@ def __init__(
         self,
         file_or_conn,
         strata=["Metadata_Plate", "Metadata_Well"],
-        features="infer",
         aggregation_operation="median",
         output_file="none",
         compartments=default_compartments,
@@ -87,7 +84,6 @@ def __init__(
 
         self.file_or_conn = file_or_conn
         self.strata = strata
-        self.features = features
         self.load_image_data = load_image_data
         self.aggregation_operation = aggregation_operation.lower()
         self.output_file = output_file
@@ -267,13 +263,17 @@ def load_compartment(self, compartment):
         df = pd.read_sql(sql=compartment_query, con=self.conn)
         return df
 
-    def aggregate_compartment(self, compartment, compute_subsample=False):
+    def aggregate_compartment(
+        self, compartment, compute_subsample=False, aggregate_args=None
+    ):
         """Aggregate morphological profiles. Uses pycytominer.aggregate()
 
         :param compartment: string indicating the specific compartment, defaults to "cells"
         :type compartment: str
         :param compute_subsample: determine if subsample should be computed, defaults to False
         :type compute_subsample: bool
+        :param aggregate_args: additional arguments passed as a dictionary as input to pycytominer.aggregate()
+        :type aggregate_args: None, dict
         :return: Aggregated single-cell profiles
         :rtype: pd.DataFrame
         """
@@ -282,18 +282,36 @@ def aggregate_compartment(self, compartment, compute_subsample=False):
         if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
             self.get_subsample(compartment=compartment)
 
+        # Load image data if not already loaded
+        if not self.load_image_data:
+            self.load_image()
+            self.load_image_data = True
+
         population_df = self.image_df.merge(
             self.load_compartment(compartment=compartment),
             how="inner",
             on=self.merge_cols,
         ).rename(self.linking_col_rename, axis="columns")
 
+        # Inferring features is tricky with non-canonical data
+        if aggregate_args is None:
+            aggregate_args = {}
+            features = infer_cp_features(population_df, compartments=compartment)
+        elif "features" not in aggregate_args:
+            features = infer_cp_features(population_df, compartments=compartment)
+        elif aggregate_args["features"] == "infer":
+            features = infer_cp_features(population_df, compartments=compartment)
+        else:
+            features = aggregate_args["features"]
+
+        aggregate_args["features"] = features
+
         object_df = aggregate(
             population_df=population_df,
             strata=self.strata,
-            features=self.features,
             operation=self.aggregation_operation,
             subset_data_df=self.subset_data_df,
+            **aggregate_args
         )
 
         return object_df
@@ -317,6 +335,7 @@ def merge_single_cells(
         :param single_cell_normalize: determine if the single cell data should also be normalized
         :type single_cell_normalize: bool
         :param normalize_args: additional arguments passed as a dictionary as input to pycytominer.normalize()
+        :type normalize_args: None, dict
         :return: Either a dataframe (if output_file="none") or will write to file
         :rtype: pd.DataFrame, optional
         """
@@ -383,6 +402,7 @@ def merge_single_cells(
         # Add image data to single cell dataframe
         if not self.load_image_data:
             self.load_image()
+            self.load_image_data = True
 
         sc_df = (
             self.image_df.merge(sc_df, on=self.merge_cols, how="right")
@@ -421,6 +441,7 @@ def aggregate_profiles(
         output_file="none",
         compression=None,
         float_format=None,
+        aggregate_args=None,
     ):
         """Aggregate and merge compartments. This is the primary entry to this class.
 
@@ -432,6 +453,8 @@ def aggregate_profiles(
         :type compression: str, optional
         :param float_format: decimal precision to use in writing output file, defaults to None
         :type float_format: str, optional
+        :param aggregate_args: additional arguments passed as a dictionary as input to pycytominer.aggregate()
+        :type aggregate_args: None, dict
         :return: Either a dataframe (if output_file="none") or will write to file
         :rtype: pd.DataFrame, optional
 
@@ -449,21 +472,19 @@ def aggregate_profiles(
         if output_file != "none":
             self.set_output_file(output_file)
 
-        aggregated = (
-            self.aggregate_compartment(
-                compartment="cells", compute_subsample=compute_subsample
-            )
-            .merge(
-                self.aggregate_compartment(compartment="cytoplasm"),
-                on=self.strata,
-                how="inner",
-            )
-            .merge(
-                self.aggregate_compartment(compartment="nuclei"),
-                on=self.strata,
-                how="inner",
-            )
-        )
+        compartment_idx = 0
+        for compartment in self.compartments:
+            if compartment_idx == 0:
+                aggregated = self.aggregate_compartment(
+                    compartment=compartment, compute_subsample=compute_subsample
+                )
+            else:
+                aggregated = aggregated.merge(
+                    self.aggregate_compartment(compartment=compartment),
+                    on=self.strata,
+                    how="inner",
+                )
+            compartment_idx += 1
 
         self.is_aggregated = True
 
diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index b24a1fa0..80ac14d2 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -117,7 +117,6 @@ def test_SingleCells_init():
     assert ap.file_or_conn == file
     assert ap.strata == ["Metadata_Plate", "Metadata_Well"]
     assert ap.merge_cols == ["TableNumber", "ImageNumber"]
-    assert ap.features == "infer"
     pd.testing.assert_frame_equal(image_df, ap.image_df)
     assert ap.subsample_frac == 1
     assert ap_subsample.subsample_frac == 1
@@ -341,6 +340,13 @@ def test_aggregate_profiles():
 
     pd.testing.assert_frame_equal(result, expected_result)
 
+    # Confirm aggregation after merging single cells
+    sc_aggregated_df = aggregate(
+            ap.merge_single_cells()
+        ).sort_index(axis="columns")
+
+    pd.testing.assert_frame_equal(result.sort_index(axis="columns"), sc_aggregated_df)
+
 
 def test_aggregate_subsampling_count_cells():
     count_df = ap_subsample.count_cells()