Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding MODZ Consensus Signature Operation #53

Merged
merged 11 commits into from
Nov 6, 2019
18 changes: 11 additions & 7 deletions pycytominer/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
from sqlalchemy import create_engine
from pycytominer.cyto_utils.output import output
from pycytominer.cyto_utils.util import check_compartments
from pycytominer.cyto_utils.util import check_compartments, check_aggregate_operation


class AggregateProfiles:
Expand Down Expand Up @@ -47,10 +47,7 @@ def __init__(
check_compartments(compartments)

# Check if correct operation is specified
assert operation in [
"mean",
"median",
], "operation must be one ['mean', 'median']"
operation = check_aggregate_operation(operation)

# Check that the subsample_frac is between 0 and 1
assert (
Expand Down Expand Up @@ -80,13 +77,15 @@ def __init__(
# Connect to sqlite engine
self.engine = create_engine(self.sql_file)
self.conn = self.engine.connect()

# Throw an error if both subsample_frac and subsample_n is set
self._check_subsampling()

if load_image_data:
self.load_image()

def _check_subsampling(self):
# Check that the user didn't specify both subset frac and
# Check that the user didn't specify both subset frac and subsample all
assert (
self.subsample_frac == 1 or self.subsample_n == "all"
), "Do not set both subsample_frac and subsample_n"
Expand Down Expand Up @@ -221,7 +220,9 @@ def aggregate_compartment(self, compartment, compute_subsample=False):

return object_df

def aggregate_profiles(self, compute_subsample="False", output_file="none", **kwargs):
def aggregate_profiles(
self, compute_subsample="False", output_file="none", **kwargs
):
"""
Aggregate and merge compartments. This is the primary entry to this class.

Expand Down Expand Up @@ -290,6 +291,9 @@ def aggregate(
Return:
Pandas DataFrame of aggregated features
"""
# Check that the operation is supported
operation = check_aggregate_operation(operation)

# Subset dataframe to only specified variables if provided
if features != "all":
strata_df = population_df.loc[:, strata]
Expand Down
33 changes: 13 additions & 20 deletions pycytominer/correlation_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.features import infer_cp_features
from pycytominer.cyto_utils.util import (
get_pairwise_correlation,
check_correlation_method,
)


def correlation_threshold(
Expand All @@ -17,8 +21,8 @@ def correlation_threshold(
Arguments:
population_df - pandas DataFrame that includes metadata and observation features
features - a list of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that do not
start with "Metadata_"
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_"
samples - list samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
threshold - float between (0, 1) to exclude features [default: 0.9]
Expand All @@ -28,42 +32,31 @@ def correlation_threshold(
Return:
list of features to exclude from the population_df
"""
method = method.lower()
# Check that the input method is supported
method = check_correlation_method(method)

assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"
assert method in [
"pearson",
"spearman",
"kendall",
], "method not supported, select one of ['pearson', 'spearman', 'kendall']"

# Subset dataframe and calculate correlation matrix across subset features
if samples != "none":
population_df = population_df.loc[samples, :]

if features == "infer":
features = infer_cp_features(population_df)
population_df = population_df.loc[:, features]
else:
population_df = population_df.loc[:, features]

data_cor_df = population_df.corr(method=method)

# Create a copy of the dataframe to generate upper triangle of zeros
data_cor_zerotri_df = data_cor_df.copy()

# Zero out upper triangle in correlation matrix
data_cor_zerotri_df.loc[:, :] = np.tril(data_cor_df, k=-1)
# Get correlation matrix and lower triangle of pairwise correlations in long format
data_cor_df, pairwise_df = get_pairwise_correlation(
population_df=population_df, method=method
)

# Get absolute sum of correlation across features
# The lower the index, the less correlation to the full data frame
# We want to drop features with highest correlation, so drop higher index
variable_cor_sum = data_cor_df.abs().sum().sort_values().index

# Acquire pairwise correlations in a long format
# Note that we are using the zero triangle DataFrame
pairwise_df = data_cor_zerotri_df.stack().reset_index()
pairwise_df.columns = ["pair_a", "pair_b", "correlation"]

# And subset to only variable combinations that pass the threshold
pairwise_df = pairwise_df.query("correlation > @threshold")

Expand Down
106 changes: 106 additions & 0 deletions pycytominer/cyto_utils/consensus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
Acquire consensus signatures for input samples
"""

import os
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.util import (
get_pairwise_correlation,
check_correlation_method,
)


def modz_base(population_df, method="spearman", min_weight=0.01, precision=4):
    """
    Perform a modified z score transformation. This code is modified from cmapPy.
    (see https://github.com/cytomining/pycytominer/issues/52). Note that this will
    apply the transformation to the FULL population_df.
    See modz() for replicate level procedures.

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features.
                    rows are samples and columns are features
    method - string indicating which correlation metric to use [default: "spearman"]
    min_weight - the minimum correlation to clip all non-negative values lower to
    precision - how many significant digits to round weights to

    Return:
    modz transformed dataframe - a consensus signature of the input population_df
    weighted by replicate correlation
    """
    assert population_df.shape[0] > 0, "population_df must include at least one sample"

    method = check_correlation_method(method=method)

    # Step 1: Extract pairwise correlations of samples
    # Transpose so samples are columns
    population_df = population_df.transpose()
    # Only the full correlation matrix is needed here; the long-format pairwise
    # frame returned alongside it is unused (previously it was rounded and
    # discarded — dead work)
    cor_df, _ = get_pairwise_correlation(population_df, method=method)

    # Step 2: Identify sample weights
    # Fill diagonal of correlation matrix with np.nan so a sample's perfect
    # self-correlation does not inflate its own weight
    np.fill_diagonal(cor_df.values, np.nan)

    # Remove negative values
    cor_df = cor_df.clip(lower=0)

    # Get average correlation for each profile (mean skips the NaN diagonal)
    raw_weights = cor_df.mean(axis=1)

    # Threshold weights (any value < min_weight will become min_weight)
    raw_weights = raw_weights.clip(lower=min_weight)

    # Normalize raw_weights so that they add to 1
    weights = raw_weights / raw_weights.sum()
    weights = weights.round(precision)

    # Step 3: Apply the weights to collapse samples into one signature
    if population_df.shape[1] == 1:
        # There is only one sample (note that columns are now samples);
        # NOTE(review): this branch returns a single-column DataFrame while the
        # multi-sample branch returns a Series — preserved from the original
        modz_df = population_df
    else:
        modz_df = population_df * weights
        modz_df = modz_df.sum(axis=1)

    return modz_df


def modz(
    population_df, replicate_columns, method="spearman", min_weight=0.01, precision=4
):
    """
    Collapse replicates into a consensus signature using a weighted transformation

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features.
                    rows are samples and columns are features
    replicate_columns - a string or list of columns in the population dataframe that
                        indicate replicate level information
    method - string indicating which correlation metric to use [default: "spearman"]
    min_weight - the minimum correlation to clip all non-negative values lower to
    precision - how many significant digits to round weights to

    Return:
    Consensus signatures for all replicates in the given DataFrame

    Raises:
    ValueError - if replicate_columns is neither a list nor a string
    AssertionError - if any replicate column is missing from population_df
    """
    population_features = population_df.columns.tolist()
    assert_error = "{} not in input dataframe".format(replicate_columns)
    if isinstance(replicate_columns, list):
        assert all([x in population_features for x in replicate_columns]), assert_error
    elif isinstance(replicate_columns, str):
        assert replicate_columns in population_features, assert_error
        # Wrap in a list rather than .split(): splitting on whitespace would
        # silently fragment a column name that contains spaces
        replicate_columns = [replicate_columns]
    else:
        # Bug fix: the original *returned* the ValueError instead of raising it,
        # so invalid input was silently handed back to the caller
        raise ValueError("replicate_columns must be a list or string")

    # Collapse each replicate group into a weighted consensus signature
    modz_df = population_df.groupby(replicate_columns).apply(
        lambda x: modz_base(
            x, method=method, min_weight=min_weight, precision=precision
        )
    )

    return modz_df
4 changes: 4 additions & 0 deletions pycytominer/cyto_utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,8 @@ def infer_cp_features(population_df):
)
]

assert (
len(features) > 0
), "No CP features found. Are you sure this dataframe is from CellProfiler?"

return features
78 changes: 75 additions & 3 deletions pycytominer/cyto_utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
"""

import os
import numpy as np
import pandas as pd
from pycytominer.cyto_utils.features import infer_cp_features

default_metadata_file = os.path.join(
os.path.dirname(__file__), "..", "data", "metadata_feature_dictionary.txt"
Expand All @@ -12,9 +14,7 @@

def check_compartments(compartments):
valid_compartments = ["cells", "cytoplasm", "nuclei"]
error_str = "compartment not supported, use one of {}".format(
valid_compartments
)
error_str = "compartment not supported, use one of {}".format(valid_compartments)
if isinstance(compartments, list):
compartments = [x.lower() for x in compartments]
assert all([x in valid_compartments for x in compartments]), error_str
Expand Down Expand Up @@ -46,3 +46,75 @@ def load_known_metadata_dictionary(metadata_file=default_metadata_file):
metadata_dict[compartment] = [feature]

return metadata_dict


def check_correlation_method(method):
    """
    Confirm that the input method is currently supported

    Arguments:
    method - string indicating the correlation metric to use

    Return:
    Correctly formatted correlation method
    """
    # Normalize case before validating so "Pearson" and "pearson" both pass
    method = method.lower()
    supported = ["pearson", "spearman", "kendall"]

    error_msg = "method {} not supported, select one of {}".format(method, supported)
    assert method in supported, error_msg

    return method


def check_aggregate_operation(operation):
    """
    Confirm that the input operation for aggregation is currently supported

    Arguments:
    operation - string indicating the aggregation operation to provide

    Return:
    Correctly formatted operation method
    """
    # Normalize case before validating so "Mean" and "mean" both pass
    operation = operation.lower()
    supported = ["mean", "median"]

    error_msg = "operation {} not supported, select one of {}".format(
        operation, supported
    )
    assert operation in supported, error_msg

    return operation


def get_pairwise_correlation(population_df, method="pearson"):
    """
    Given a population dataframe, calculate all pairwise correlations

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features
    method - string indicating which correlation metric to use to test cutoff
             [default: "pearson"]

    Return:
    A two-tuple of (data_cor_df, pairwise_df):
    data_cor_df - the full symmetrical column-by-column correlation matrix
    pairwise_df - long-format DataFrame with columns
                  ["pair_a", "pair_b", "correlation"] listing each unique
                  column pair (lower triangle only) and its correlation
    """
    # Check that the input method is supported
    method = check_correlation_method(method)

    # Get a symmetrical correlation matrix
    data_cor_df = population_df.corr(method=method)

    # Create a copy of the dataframe to generate upper triangle of NaN
    data_cor_natri_df = data_cor_df.copy()

    # Replace upper triangle (including the diagonal) with NaN so each unordered
    # pair appears exactly once after stacking.
    # Bug fix: use the builtin `bool` — the `np.bool` alias is deprecated since
    # NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    data_cor_natri_df = data_cor_natri_df.where(
        np.tril(np.ones(data_cor_natri_df.shape), k=-1).astype(bool)
    )

    # Acquire pairwise correlations in a long format
    # Note that stack() silently drops the NaN (upper triangle) entries
    pairwise_df = data_cor_natri_df.stack().reset_index()
    pairwise_df.columns = ["pair_a", "pair_b", "correlation"]

    return data_cor_df, pairwise_df
Loading