Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding MODZ Consensus Signature Operation #53

Merged
merged 11 commits into from
Nov 6, 2019
18 changes: 11 additions & 7 deletions pycytominer/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
from sqlalchemy import create_engine
from pycytominer.cyto_utils.output import output
from pycytominer.cyto_utils.util import check_compartments
from pycytominer.cyto_utils.util import check_compartments, check_aggregate_operation


class AggregateProfiles:
Expand Down Expand Up @@ -47,10 +47,7 @@ def __init__(
check_compartments(compartments)

# Check if correct operation is specified
assert operation in [
"mean",
"median",
], "operation must be one ['mean', 'median']"
operation = check_aggregate_operation(operation)

# Check that the subsample_frac is between 0 and 1
assert (
Expand Down Expand Up @@ -80,13 +77,15 @@ def __init__(
# Connect to sqlite engine
self.engine = create_engine(self.sql_file)
self.conn = self.engine.connect()

# Throw an error if both subsample_frac and subsample_n is set
self._check_subsampling()

if load_image_data:
self.load_image()

def _check_subsampling(self):
# Check that the user didn't specify both subset frac and
# Check that the user didn't specify both subset frac and subsample all
assert (
self.subsample_frac == 1 or self.subsample_n == "all"
), "Do not set both subsample_frac and subsample_n"
Expand Down Expand Up @@ -221,7 +220,9 @@ def aggregate_compartment(self, compartment, compute_subsample=False):

return object_df

def aggregate_profiles(self, compute_subsample="False", output_file="none", **kwargs):
def aggregate_profiles(
self, compute_subsample="False", output_file="none", **kwargs
):
"""
Aggregate and merge compartments. This is the primary entry to this class.

Expand Down Expand Up @@ -290,6 +291,9 @@ def aggregate(
Return:
Pandas DataFrame of aggregated features
"""
# Check that the operation is supported
operation = check_aggregate_operation(operation)

# Subset dataframe to only specified variables if provided
if features != "all":
strata_df = population_df.loc[:, strata]
Expand Down
33 changes: 13 additions & 20 deletions pycytominer/correlation_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.features import infer_cp_features
from pycytominer.cyto_utils.util import (
get_pairwise_correlation,
check_correlation_method,
)


def correlation_threshold(
Expand All @@ -17,8 +21,8 @@ def correlation_threshold(
Arguments:
population_df - pandas DataFrame that includes metadata and observation features
features - a list of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that do not
start with "Metadata_"
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_"
samples - list samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
threshold - float between (0, 1) to exclude features [default: 0.9]
Expand All @@ -28,42 +32,31 @@ def correlation_threshold(
Return:
list of features to exclude from the population_df
"""
method = method.lower()
# Check that the input method is supported
method = check_correlation_method(method)

assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"
assert method in [
"pearson",
"spearman",
"kendall",
], "method not supported, select one of ['pearson', 'spearman', 'kendall']"

# Subset dataframe and calculate correlation matrix across subset features
if samples != "none":
population_df = population_df.loc[samples, :]

if features == "infer":
features = infer_cp_features(population_df)
population_df = population_df.loc[:, features]
else:
population_df = population_df.loc[:, features]

data_cor_df = population_df.corr(method=method)

# Create a copy of the dataframe to generate upper triangle of zeros
data_cor_zerotri_df = data_cor_df.copy()

# Zero out upper triangle in correlation matrix
data_cor_zerotri_df.loc[:, :] = np.tril(data_cor_df, k=-1)
# Get correlation matrix and lower triangle of pairwise correlations in long format
data_cor_df, pairwise_df = get_pairwise_correlation(
population_df=population_df, method=method
)

# Get absolute sum of correlation across features
# The lower the index, the less correlation to the full data frame
# We want to drop features with highest correlation, so drop higher index
variable_cor_sum = data_cor_df.abs().sum().sort_values().index

# Acquire pairwise correlations in a long format
# Note that we are using the zero triangle DataFrame
pairwise_df = data_cor_zerotri_df.stack().reset_index()
pairwise_df.columns = ["pair_a", "pair_b", "correlation"]

# And subset to only variable combinations that pass the threshold
pairwise_df = pairwise_df.query("correlation > @threshold")

Expand Down
106 changes: 106 additions & 0 deletions pycytominer/cyto_utils/consensus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
Acquire consensus signatures for input samples
"""

import os
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.util import (
get_pairwise_correlation,
check_correlation_method,
)


def modz_base(population_df, method="spearman", min_weight=0.01, precision=4):
    """
    Perform a modified z score transformation. This code is modified from cmapPy.
    (see https://github.com/cytomining/pycytominer/issues/52). Note that this will
    apply the transformation to the FULL population_df.
    See modz() for replicate level procedures.

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features.
                    rows are samples and columns are features
    method - string indicating which correlation metric to use [default: "spearman"]
    min_weight - the minimum correlation to clip all non-negative values lower to
    precision - how many significant digits to round weights to

    Return:
    modz transformed dataframe - a consensus signature of the input population_df
    weighted by replicate correlation
    """
    assert population_df.shape[0] > 0, "population_df must include at least one sample"

    method = check_correlation_method(method=method)

    # Step 1: Extract pairwise correlations of samples
    # Transpose so samples are columns
    population_df = population_df.transpose()
    # Only the full correlation matrix is needed here; the long-format pairwise
    # frame returned alongside it is unused (previously it was rounded and
    # discarded — dead work)
    cor_df, _ = get_pairwise_correlation(population_df, method=method)

    # Step 2: Identify sample weights
    # Fill diagonal of correlation matrix with np.nan so a sample's perfect
    # self-correlation does not inflate its own weight
    np.fill_diagonal(cor_df.values, np.nan)

    # Remove negative values
    cor_df = cor_df.clip(lower=0)

    # Get average correlation for each profile (mean skips the NaN diagonal)
    raw_weights = cor_df.mean(axis=1)

    # Threshold weights (any value < min_weight will become min_weight)
    raw_weights = raw_weights.clip(lower=min_weight)

    # Normalize raw_weights so that they add to 1
    weights = raw_weights / raw_weights.sum()
    weights = weights.round(precision)

    # Step 3: Apply the weights to collapse samples into one signature
    if population_df.shape[1] == 1:
        # There is only one sample (note that columns are now samples);
        # NOTE(review): this branch returns a single-column DataFrame while the
        # multi-sample branch returns a Series — preserved from the original
        modz_df = population_df
    else:
        modz_df = population_df * weights
        modz_df = modz_df.sum(axis=1)

    return modz_df


def modz(
    population_df, replicate_columns, method="spearman", min_weight=0.01, precision=4
):
    """
    Collapse replicates into a consensus signature using a weighted transformation

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features.
                    rows are samples and columns are features
    replicate_columns - a string or list of columns in the population dataframe that
                        indicate replicate level information
    method - string indicating which correlation metric to use [default: "spearman"]
    min_weight - the minimum correlation to clip all non-negative values lower to
    precision - how many significant digits to round weights to

    Return:
    Consensus signatures for all replicates in the given DataFrame

    Raises:
    ValueError - if replicate_columns is neither a list nor a string
    AssertionError - if any replicate column is missing from population_df
    """
    population_features = population_df.columns.tolist()
    assert_error = "{} not in input dataframe".format(replicate_columns)
    if isinstance(replicate_columns, list):
        assert all([x in population_features for x in replicate_columns]), assert_error
    elif isinstance(replicate_columns, str):
        assert replicate_columns in population_features, assert_error
        # Wrap in a list rather than .split(): splitting on whitespace would
        # silently fragment a column name that contains spaces
        replicate_columns = [replicate_columns]
    else:
        # Bug fix: the original *returned* the ValueError instead of raising it,
        # so invalid input was silently handed back to the caller
        raise ValueError("replicate_columns must be a list or string")

    # Collapse each replicate group into a weighted consensus signature
    modz_df = population_df.groupby(replicate_columns).apply(
        lambda x: modz_base(
            x, method=method, min_weight=min_weight, precision=precision
        )
    )

    return modz_df
4 changes: 4 additions & 0 deletions pycytominer/cyto_utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,8 @@ def infer_cp_features(population_df):
)
]

assert (
len(features) > 0
), "No CP features found. Are you sure this dataframe is from CellProfiler?"

return features
78 changes: 75 additions & 3 deletions pycytominer/cyto_utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
"""

import os
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.features import infer_cp_features

default_metadata_file = os.path.join(
os.path.dirname(__file__), "..", "data", "metadata_feature_dictionary.txt"
Expand All @@ -12,9 +14,7 @@

def check_compartments(compartments):
valid_compartments = ["cells", "cytoplasm", "nuclei"]
error_str = "compartment not supported, use one of {}".format(
valid_compartments
)
error_str = "compartment not supported, use one of {}".format(valid_compartments)
if isinstance(compartments, list):
compartments = [x.lower() for x in compartments]
assert all([x in valid_compartments for x in compartments]), error_str
Expand Down Expand Up @@ -46,3 +46,75 @@ def load_known_metadata_dictionary(metadata_file=default_metadata_file):
metadata_dict[compartment] = [feature]

return metadata_dict


def check_correlation_method(method):
    """
    Confirm that the input method is currently supported

    Arguments:
    method - string indicating the correlation metric to use

    Return:
    Correctly formatted correlation method
    """
    # Normalize case before validating so "Pearson" and "pearson" both pass
    method = method.lower()
    supported = ["pearson", "spearman", "kendall"]

    error_msg = "method {} not supported, select one of {}".format(method, supported)
    assert method in supported, error_msg

    return method


def check_aggregate_operation(operation):
    """
    Confirm that the input operation for aggregation is currently supported

    Arguments:
    operation - string indicating the aggregation operation to provide

    Return:
    Correctly formatted operation method
    """
    # Normalize case before validating so "Mean" and "mean" both pass
    operation = operation.lower()
    supported = ["mean", "median"]

    error_msg = "operation {} not supported, select one of {}".format(
        operation, supported
    )
    assert operation in supported, error_msg

    return operation


def get_pairwise_correlation(population_df, method="pearson"):
    """
    Given a population dataframe, calculate all pairwise correlations

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features
    method - string indicating which correlation metric to use to test cutoff
             [default: "pearson"]

    Return:
    A two-tuple of (data_cor_df, pairwise_df):
    data_cor_df - the full symmetrical column-by-column correlation matrix
    pairwise_df - long-format DataFrame with columns
                  ["pair_a", "pair_b", "correlation"] listing each unique
                  column pair (lower triangle only) and its correlation
    """
    # Check that the input method is supported
    method = check_correlation_method(method)

    # Get a symmetrical correlation matrix
    data_cor_df = population_df.corr(method=method)

    # Create a copy of the dataframe to generate upper triangle of NaN
    data_cor_natri_df = data_cor_df.copy()

    # Replace upper triangle (including the diagonal) with NaN so each unordered
    # pair appears exactly once after stacking.
    # Bug fix: use the builtin `bool` — the `np.bool` alias is deprecated since
    # NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    data_cor_natri_df = data_cor_natri_df.where(
        np.tril(np.ones(data_cor_natri_df.shape), k=-1).astype(bool)
    )

    # Acquire pairwise correlations in a long format
    # Note that stack() silently drops the NaN (upper triangle) entries
    pairwise_df = data_cor_natri_df.stack().reset_index()
    pairwise_df.columns = ["pair_a", "pair_b", "correlation"]

    return data_cor_df, pairwise_df
Loading