diff --git a/pycytominer/aggregate.py b/pycytominer/aggregate.py index c76f6b19..70919d4b 100644 --- a/pycytominer/aggregate.py +++ b/pycytominer/aggregate.py @@ -6,7 +6,7 @@ import pandas as pd from sqlalchemy import create_engine from pycytominer.cyto_utils.output import output -from pycytominer.cyto_utils.util import check_compartments +from pycytominer.cyto_utils.util import check_compartments, check_aggregate_operation class AggregateProfiles: @@ -47,10 +47,7 @@ def __init__( check_compartments(compartments) # Check if correct operation is specified - assert operation in [ - "mean", - "median", - ], "operation must be one ['mean', 'median']" + operation = check_aggregate_operation(operation) # Check that the subsample_frac is between 0 and 1 assert ( @@ -80,13 +77,15 @@ def __init__( # Connect to sqlite engine self.engine = create_engine(self.sql_file) self.conn = self.engine.connect() + + # Throw an error if both subsample_frac and subsample_n is set self._check_subsampling() if load_image_data: self.load_image() def _check_subsampling(self): - # Check that the user didn't specify both subset frac and + # Check that the user didn't specify both subset frac and subsample all assert ( self.subsample_frac == 1 or self.subsample_n == "all" ), "Do not set both subsample_frac and subsample_n" @@ -221,7 +220,9 @@ def aggregate_compartment(self, compartment, compute_subsample=False): return object_df - def aggregate_profiles(self, compute_subsample="False", output_file="none", **kwargs): + def aggregate_profiles( + self, compute_subsample="False", output_file="none", **kwargs + ): """ Aggregate and merge compartments. This is the primary entry to this class. @@ -290,6 +291,9 @@ def aggregate( Return: Pandas DataFrame of aggregated features """ + # Check that the operation is supported + operation = check_aggregate_operation(operation) + # Subset dataframe to only specified variables if provided if features != "all": strata_df = population_df.loc[:, strata] diff --git a/pycytominer/correlation_threshold.py b/pycytominer/correlation_threshold.py index 9e7ff849..f4931aeb 100644 --- a/pycytominer/correlation_threshold.py +++ b/pycytominer/correlation_threshold.py @@ -6,6 +6,10 @@ import numpy as np import pandas as pd from pycytominer.cyto_utils.features import infer_cp_features +from pycytominer.cyto_utils.util import ( + get_pairwise_correlation, + check_correlation_method, +) def correlation_threshold( @@ -17,8 +21,8 @@ def correlation_threshold( Arguments: population_df - pandas DataFrame that includes metadata and observation features features - a list of features present in the population dataframe [default: "infer"] - if "infer", then assume cell painting features are those that do not - start with "Metadata_" + if "infer", then assume cell painting features are those that start with + "Cells_", "Nuclei_", or "Cytoplasm_" samples - list samples to perform operation on [default: "none"] - if "none", use all samples to calculate threshold - float between (0, 1) to exclude features [default: 0.9] @@ -28,14 +32,10 @@ def correlation_threshold( Return: list of features to exclude from the population_df """ - method = method.lower() + # Check that the input method is supported + method = check_correlation_method(method) assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)" - assert method in [ - "pearson", - "spearman", - "kendall", - ], "method not supported, select one of ['pearson', 'spearman', 'kendall']" # Subset dataframe and calculate correlation matrix across subset features if samples != "none": @@ -43,27 +43,20 @@ def correlation_threshold( if features == "infer": features = infer_cp_features(population_df) + population_df = population_df.loc[:, features] else: population_df = population_df.loc[:, features] - data_cor_df = population_df.corr(method=method) - - # Create a copy of the dataframe to generate upper triangle of zeros - data_cor_zerotri_df = data_cor_df.copy() - - # Zero out upper triangle in correlation matrix - data_cor_zerotri_df.loc[:, :] = np.tril(data_cor_df, k=-1) + # Get correlation matrix and lower triangle of pairwise correlations in long format + data_cor_df, pairwise_df = get_pairwise_correlation( + population_df=population_df, method=method + ) # Get absolute sum of correlation across features # The lower the index, the less correlation to the full data frame # We want to drop features with highest correlation, so drop higher index variable_cor_sum = data_cor_df.abs().sum().sort_values().index - # Acquire pairwise correlations in a long format - # Note that we are using the zero triangle DataFrame - pairwise_df = data_cor_zerotri_df.stack().reset_index() - pairwise_df.columns = ["pair_a", "pair_b", "correlation"] - # And subset to only variable combinations that pass the threshold pairwise_df = pairwise_df.query("correlation > @threshold") diff --git a/pycytominer/cyto_utils/consensus.py b/pycytominer/cyto_utils/consensus.py new file mode 100644 index 00000000..fdc4a8f1 --- /dev/null +++ b/pycytominer/cyto_utils/consensus.py @@ -0,0 +1,106 @@ +""" +Acquire consensus signatures for input samples +""" + +import os +import numpy as np +import pandas as pd +from pycytominer.cyto_utils.util import ( + get_pairwise_correlation, + check_correlation_method, +) + + +def modz_base(population_df, method="spearman", min_weight=0.01, precision=4): + """ + Perform a modified z score transformation. This code is modified from cmapPy. + (see https://github.com/cytomining/pycytominer/issues/52). Note that this will + apply the transformation to the FULL population_df. + See modz() for replicate level procedures. + + Arguments: + population_df - pandas DataFrame that includes metadata and observation features. + rows are samples and columns are features + method - string indicating which correlation metric to use [default: "spearman"] + min_weight - the minimum correlation to clip all non-negative values lower to + precision - how many significant digits to round weights to + + Return: + modz transformed dataframe - a consensus signature of the input population_df + weighted by replicate correlation + """ + assert population_df.shape[0] > 0, "population_df must include at least one sample" + + method = check_correlation_method(method=method) + + # Step 1: Extract pairwise correlations of samples + # Transpose so samples are columns + population_df = population_df.transpose() + cor_df, pair_df = get_pairwise_correlation(population_df, method=method) + + # Round correlation results + pair_df = pair_df.round(precision) + + # Step 2: Identify sample weights + # Fill diagonal of correlation_matrix with np.nan + np.fill_diagonal(cor_df.values, np.nan) + + # Remove negative values + cor_df = cor_df.clip(lower=0) + + # Get average correlation for each profile (will ignore NaN) + raw_weights = cor_df.mean(axis=1) + + # Threshold weights (any value < min_weight will become min_weight) + raw_weights = raw_weights.clip(lower=min_weight) + + # normalize raw_weights so that they add to 1 + weights = raw_weights / sum(raw_weights) + weights = weights.round(precision) + + # Step 3: Normalize + if population_df.shape[1] == 1: + # There is only one sample (note that columns are now samples) + modz_df = population_df + else: + modz_df = population_df * weights + modz_df = modz_df.sum(axis=1) + + return modz_df + + +def modz( + population_df, replicate_columns, method="spearman", min_weight=0.01, precision=4 +): + """ + Collapse replicates into a consensus signature using a weighted transformation + + Arguments: + population_df - pandas DataFrame that includes metadata and observation features. + rows are samples and columns are features + replicate_columns - a string or list of columns in the population dataframe that + indicate replicate level information + method - string indicating which correlation metric to use [default: "spearman"] + min_weight - the minimum correlation to clip all non-negative values lower to + precision - how many significant digits to round weights to + + Return: + Consensus signatures for all replicates in the given DataFrame + """ + population_features = population_df.columns.tolist() + assert_error = "{} not in input dataframe".format(replicate_columns) + if isinstance(replicate_columns, list): + assert all([x in population_features for x in replicate_columns]), assert_error + elif isinstance(replicate_columns, str): + assert replicate_columns in population_features, assert_error + replicate_columns = replicate_columns.split() + else: + return ValueError("replicate_columns must be a list or string") + + modz_df = population_df.groupby(replicate_columns).apply( + lambda x: modz_base( + x, method=method, min_weight=min_weight, precision=precision + ) + ) + + return modz_df diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py index 7efe62e4..0fdbcb7b 100644 --- a/pycytominer/cyto_utils/features.py +++ b/pycytominer/cyto_utils/features.py @@ -80,4 +80,8 @@ def infer_cp_features(population_df): ) ] + assert ( + len(features) > 0 + ), "No CP features found. Are you sure this dataframe is from CellProfiler?" + return features diff --git a/pycytominer/cyto_utils/util.py b/pycytominer/cyto_utils/util.py index 3ebb3eea..a69facf9 100644 --- a/pycytominer/cyto_utils/util.py +++ b/pycytominer/cyto_utils/util.py @@ -3,7 +3,9 @@ """ import os +import numpy as np import pandas as pd +from pycytominer.cyto_utils.features import infer_cp_features default_metadata_file = os.path.join( os.path.dirname(__file__), "..", "data", "metadata_feature_dictionary.txt" @@ -12,9 +14,7 @@ def check_compartments(compartments): valid_compartments = ["cells", "cytoplasm", "nuclei"] - error_str = "compartment not supported, use one of {}".format( - valid_compartments - ) + error_str = "compartment not supported, use one of {}".format(valid_compartments) if isinstance(compartments, list): compartments = [x.lower() for x in compartments] assert all([x in valid_compartments for x in compartments]), error_str @@ -46,3 +46,75 @@ def load_known_metadata_dictionary(metadata_file=default_metadata_file): metadata_dict[compartment] = [feature] return metadata_dict + + +def check_correlation_method(method): + """ + Confirm that the input method is currently supported + + Arguments: + method - string indicating the correlation metric to use + + Return: + Correctly formatted correlation method + """ + method = method.lower() + avail_methods = ["pearson", "spearman", "kendall"] + assert method in avail_methods, "method {} not supported, select one of {}".format( + method, avail_methods + ) + + return method + + +def check_aggregate_operation(operation): + """ + Confirm that the input operation for aggregation is currently supported + + Arguments: + operation - string indicating the aggregation operation to provide + + Return: + Correctly formatted operation method + """ + operation = operation.lower() + avail_ops = ["mean", "median"] + assert ( + operation in avail_ops + ), "operation {} not supported, select one of {}".format(operation, avail_ops) + + return operation + + +def get_pairwise_correlation(population_df, method="pearson"): + """ + Given a population dataframe, calculate all pairwise correlations + + Arguments: + population_df - pandas DataFrame that includes metadata and observation features + method - string indicating which correlation metric to use to test cutoff + [default: "pearson"] + + Return: + list of features to exclude from the population_df + """ + # Check that the input method is supported + method = check_correlation_method(method) + + # Get a symmetrical correlation matrix + data_cor_df = population_df.corr(method=method) + + # Create a copy of the dataframe to generate upper triangle of zeros + data_cor_natri_df = data_cor_df.copy() + + # Replace upper triangle in correlation matrix with NaN + data_cor_natri_df = data_cor_natri_df.where( + np.tril(np.ones(data_cor_natri_df.shape), k=-1).astype(np.bool) + ) + + # Acquire pairwise correlations in a long format + # Note that we are using the NaN upper triangle DataFrame + pairwise_df = data_cor_natri_df.stack().reset_index() + pairwise_df.columns = ["pair_a", "pair_b", "correlation"] + + return data_cor_df, pairwise_df diff --git a/pycytominer/tests/test_correlation_threshold.py b/pycytominer/tests/test_correlation_threshold.py index c619d209..08410cd4 100644 --- a/pycytominer/tests/test_correlation_threshold.py +++ b/pycytominer/tests/test_correlation_threshold.py @@ -1,4 +1,5 @@ import pandas as pd +import pytest from pycytominer.correlation_threshold import correlation_threshold # Build data to use in tests @@ -23,32 +24,80 @@ def test_correlation_threshold(): - """ - Testing correlation_threshold pycytominer function - """ correlation_threshold_result = correlation_threshold( population_df=data_df, + features=data_df.columns.tolist(), samples="none", threshold=0.9, method="pearson", ) - expected_result = ['y'] + expected_result = ["y"] assert correlation_threshold_result == expected_result + correlation_threshold_result = correlation_threshold( + population_df=data_df, + features=data_df.columns.tolist(), + samples="none", + threshold=0.2, + method="pearson", + ) + + expected_result = sorted(["y", "zz", "x"]) + + assert sorted(correlation_threshold_result) == expected_result + + +def test_correlation_threshold_uncorrelated(): + correlation_threshold_result = correlation_threshold( + population_df=data_uncorrelated_df, + features=data_uncorrelated_df.columns.tolist(), + samples="none", + threshold=0.9, + method="pearson", + ) + + assert len(correlation_threshold_result) == 0 + def test_correlation_threshold_samples(): - """ - Testing correlation_threshold pycytominer function - """ correlation_threshold_result = correlation_threshold( population_df=data_df, + features=data_df.columns.tolist(), samples=[0, 1, 3, 4, 5], threshold=0.9, method="pearson", ) - expected_result = ['y'] + expected_result = ["y"] + + assert correlation_threshold_result == expected_result + + +def test_correlation_threshold_featureinfer(): + with pytest.raises(AssertionError) as nocp: + correlation_threshold_result = correlation_threshold( + population_df=data_df, + features="infer", + samples="none", + threshold=0.9, + method="pearson", + ) + + assert "No CP features found." in str(nocp.value) + + data_cp_df = data_df.copy() + data_cp_df.columns = ["Cells_{}".format(x) for x in data_df.columns] + + correlation_threshold_result = correlation_threshold( + population_df=data_cp_df, + features="infer", + samples="none", + threshold=0.9, + method="pearson", + ) + + expected_result = ["Cells_y"] assert correlation_threshold_result == expected_result diff --git a/pycytominer/tests/test_cyto_utils/test_consensus.py b/pycytominer/tests/test_cyto_utils/test_consensus.py new file mode 100644 index 00000000..d181cc85 --- /dev/null +++ b/pycytominer/tests/test_cyto_utils/test_consensus.py @@ -0,0 +1,86 @@ +import os +import random +import numpy as np +import pandas as pd +from pycytominer.cyto_utils.consensus import modz_base, modz + +# No replicate information +data_df = pd.DataFrame({"x": [1, 1, -1], "y": [5, 5, -5], "z": [2, 2, -2]}) +data_df.index = ["sample_{}".format(x) for x in data_df.index] + +# Include replicate information +data_replicate_df = pd.concat( + [ + pd.DataFrame({"g": "a", "x": [1, 1, -1], "y": [5, 5, -5], "z": [2, 2, -2]}), + pd.DataFrame({"g": "b", "x": [1, 3, 5], "y": [8, 3, 1], "z": [5, -2, 1]}), + ] +).reset_index(drop=True) +data_replicate_df.index = ["sample_{}".format(x) for x in data_replicate_df.index] + +precision = 4 +replicate_columns = "g" + + +def test_modz_base(): + # The expected result is to completely remove influence of anticorrelated sample + consensus_df = modz_base(data_df, min_weight=0, precision=precision) + expected_result = pd.Series([1.0, 5.0, 2.0], index=data_df.columns) + pd.testing.assert_series_equal(expected_result, consensus_df) + + # With the min_weight = 1, then modz is mean + consensus_df = modz_base(data_df, min_weight=1, precision=precision) + expected_result = data_df.mean().round(precision) + pd.testing.assert_series_equal( + expected_result, consensus_df, check_less_precise=True + ) + + +def test_modz(): + # The expected result is to completely remove influence of anticorrelated sample + consensus_df = modz( + data_replicate_df, replicate_columns, min_weight=0, precision=precision + ) + expected_result = pd.DataFrame( + {"x": [1.0, 4.0], "y": [5.0, 2.0], "z": [2.0, -0.5]}, index=["a", "b"] + ) + expected_result.index.name = replicate_columns + pd.testing.assert_frame_equal(expected_result, consensus_df) + + # With the min_weight = 1, then modz is mean + consensus_df = modz( + data_replicate_df, replicate_columns, min_weight=1, precision=precision + ) + expected_result = data_replicate_df.groupby(replicate_columns).mean().round(4) + expected_result.index.name = replicate_columns + pd.testing.assert_frame_equal( + expected_result, consensus_df, check_less_precise=True + ) + + +def test_modz_multiple_columns(): + replicate_columns = ["g", "h"] + data_replicate_multi_df = data_replicate_df.assign(h=["c", "c", "c", "d", "d", "d"]) + # The expected result is to completely remove influence of anticorrelated sample + consensus_df = modz( + data_replicate_multi_df, replicate_columns, min_weight=0, precision=precision + ) + expected_result = pd.DataFrame( + { + "g": ["a", "b"], + "h": ["c", "d"], + "x": [1.0, 4.0], + "y": [5.0, 2.0], + "z": [2.0, -0.5], + } + ) + pd.testing.assert_frame_equal(expected_result, consensus_df.reset_index()) + + # With the min_weight = 1, then modz is mean + consensus_df = modz( + data_replicate_multi_df, replicate_columns, min_weight=1, precision=precision + ) + expected_result = data_replicate_multi_df.groupby(replicate_columns).mean().round(4) + expected_result.index.name = replicate_columns + pd.testing.assert_frame_equal( + expected_result, consensus_df, check_less_precise=True + ) diff --git a/pycytominer/tests/test_cyto_utils/test_feature_infer.py b/pycytominer/tests/test_cyto_utils/test_feature_infer.py index 187820aa..014d70d8 100644 --- a/pycytominer/tests/test_cyto_utils/test_feature_infer.py +++ b/pycytominer/tests/test_cyto_utils/test_feature_infer.py @@ -20,6 +20,16 @@ ).reset_index(drop=True) +non_cp_data_df = pd.DataFrame( + { + "x": [1, 3, 8, 5, 2, 2], + "y": [1, 2, 8, 5, 2, 1], + "z": [9, 3, 8, 9, 2, 9], + "zz": [0, -3, 8, 9, 6, 9], + } +).reset_index(drop=True) + + def test_feature_infer(): features = infer_cp_features(population_df=data_df) expected = [ @@ -30,3 +40,10 @@ def test_feature_infer(): ] assert features == expected + + +def test_feature_infer_nocp(): + with pytest.raises(AssertionError) as nocp: + features = infer_cp_features(population_df=non_cp_data_df) + + assert "No CP features found." in str(nocp.value) diff --git a/pycytominer/tests/test_cyto_utils/test_util.py b/pycytominer/tests/test_cyto_utils/test_util.py index a2c77c51..6854a30e 100644 --- a/pycytominer/tests/test_cyto_utils/test_util.py +++ b/pycytominer/tests/test_cyto_utils/test_util.py @@ -6,6 +6,9 @@ from pycytominer.cyto_utils.util import ( check_compartments, load_known_metadata_dictionary, + get_pairwise_correlation, + check_correlation_method, + check_aggregate_operation, ) tmpdir = tempfile.gettempdir() @@ -68,3 +71,44 @@ def test_load_known_metadata_dictionary(): "cytoplasm": meta_cols, } assert result == expected_result + + +def test_check_correlation_method(): + method = check_correlation_method(method="PeaRSon") + expected_method = "pearson" + + assert method == expected_method + + with pytest.raises(AssertionError) as nomethod: + method = check_correlation_method(method="DOES NOT EXIST") + + assert "not supported, select one of" in str(nomethod.value) + + +def test_check_aggregate_operation_method(): + operation = check_aggregate_operation(operation="MEAn") + expected_op = "mean" + + assert operation == expected_op + + with pytest.raises(AssertionError) as nomethod: + method = check_aggregate_operation(operation="DOES NOT EXIST") + + assert "not supported, select one of" in str(nomethod.value) + + +def test_get_pairwise_correlation(): + data_df = pd.concat( + [ + pd.DataFrame({"x": [1, 3, 8], "y": [5, 3, 1]}), + pd.DataFrame({"x": [1, 3, 5], "y": [8, 3, 1]}), + ] + ).reset_index(drop=True) + + cor_df, pair_df = get_pairwise_correlation(data_df, method="pearson") + + pd.testing.assert_frame_equal(cor_df, data_df.corr(method="pearson")) + + expected_result = -0.8 + x_y_cor = pair_df.query("correlation != 0").round(1).correlation.values[0] + assert x_y_cor == expected_result diff --git a/pycytominer/tests/test_get_na_columns.py b/pycytominer/tests/test_get_na_columns.py index 08842d57..304e7b17 100644 --- a/pycytominer/tests/test_get_na_columns.py +++ b/pycytominer/tests/test_get_na_columns.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import pytest from pycytominer.get_na_columns import get_na_columns data_df = pd.DataFrame( @@ -23,7 +24,7 @@ def test_get_na_columns(): assert get_na_columns_result == expected_result get_na_columns_result = get_na_columns( - population_df=data_df, features="infer", cutoff=0.1 + population_df=data_df, features=data_df.columns.tolist(), cutoff=0.1 ) expected_result = ["x", "y", "z", "zz"] assert sorted(get_na_columns_result) == expected_result @@ -60,3 +61,15 @@ def test_get_na_columns_sample(): ) expected_result = ["y", "zz"] assert sorted(get_na_columns_result) == expected_result + + +def test_get_na_columns_featureinfer(): + with pytest.raises(AssertionError) as nocp: + na_result = get_na_columns( + population_df=data_df, + samples="none", + features="infer", + cutoff=0.1 + ) + + assert "No CP features found." in str(nocp.value) diff --git a/pycytominer/tests/test_variance_threshold.py b/pycytominer/tests/test_variance_threshold.py index 01e5268b..6f70a0fd 100644 --- a/pycytominer/tests/test_variance_threshold.py +++ b/pycytominer/tests/test_variance_threshold.py @@ -1,4 +1,5 @@ import random +import pytest import numpy as np import pandas as pd from pycytominer.variance_threshold import variance_threshold, calculate_frequency @@ -62,13 +63,11 @@ def test_calculate_frequency(): def test_variance_threshold(): - """ - Testing pycyominer variance threshold calculation - """ - unique_cut = 0.01 excluded_features = variance_threshold( - population_df=data_unique_test_df, unique_cut=unique_cut + population_df=data_unique_test_df, + features=data_unique_test_df.columns.tolist(), + unique_cut=unique_cut ) expected_result = ["a"] @@ -76,7 +75,9 @@ def test_variance_threshold(): unique_cut = 0.03 excluded_features = variance_threshold( - population_df=data_unique_test_df, unique_cut=unique_cut + population_df=data_unique_test_df, + features=data_unique_test_df.columns.tolist(), + unique_cut=unique_cut ) expected_result = ["a", "b"] @@ -92,3 +93,28 @@ def test_variance_threshold(): ].index.tolist() assert len(excluded_features_freq) == 0 + + +def test_cvariance_threshold_featureinfer(): + unique_cut = 0.01 + with pytest.raises(AssertionError) as nocp: + excluded_features = variance_threshold( + population_df=data_unique_test_df, + features="infer", + unique_cut=unique_cut + ) + + assert "No CP features found." in str(nocp.value) + + data_cp_df = data_unique_test_df.copy() + data_cp_df.columns = ["Cells_{}".format(x) for x in data_unique_test_df.columns] + + excluded_features = variance_threshold( + population_df=data_cp_df, + features="infer", + unique_cut=unique_cut + ) + + expected_result = ["Cells_a"] + + assert excluded_features == expected_result diff --git a/pycytominer/variance_threshold.py b/pycytominer/variance_threshold.py index 69ab7cd4..aa4299d6 100644 --- a/pycytominer/variance_threshold.py +++ b/pycytominer/variance_threshold.py @@ -17,8 +17,8 @@ def variance_threshold( Arguments: population_df - pandas DataFrame that includes metadata and observation features features - a list of features present in the population dataframe [default: "infer"] - if "infer", then assume cell painting features are those that do not - start with "Cells", "Nuclei", or "Cytoplasm" + if "infer", then assume cell painting features are those that start with + "Cells_", "Nuclei_", or "Cytoplasm_" samples - list samples to perform operation on [default: "none"] - if "none", use all samples to calculate freq_cut - float of ratio (second most common feature value / most common) [default: 0.1] @@ -31,12 +31,13 @@ def variance_threshold( assert 0 <= freq_cut <= 1, "freq_cut variable must be between (0 and 1)" assert 0 <= unique_cut <= 1, "unique_cut variable must be between (0 and 1)" - # Subset dataframe and calculate correlation matrix across subset features + # Subset dataframe if samples != "none": population_df = population_df.loc[samples, :] if features == "infer": features = infer_cp_features(population_df) + population_df = population_df.loc[:, features] else: population_df = population_df.loc[:, features]