From 22d305ef6ce40787b2c96e4b3e60ce005cca9940 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Fri, 25 Sep 2020 15:18:38 -0400 Subject: [PATCH 01/15] move AggregateProfiles class to SingleCell class --- pycytominer/aggregate.py | 290 +----------------- .../test_cells.py} | 24 +- 2 files changed, 26 insertions(+), 288 deletions(-) rename pycytominer/tests/{test_aggregate_profiles.py => test_cyto_utils/test_cells.py} (95%) diff --git a/pycytominer/aggregate.py b/pycytominer/aggregate.py index 1d691a5c..8cb327c2 100644 --- a/pycytominer/aggregate.py +++ b/pycytominer/aggregate.py @@ -4,294 +4,19 @@ import numpy as np import pandas as pd -from sqlalchemy import create_engine from pycytominer.cyto_utils import ( output, - check_compartments, check_aggregate_operation, infer_cp_features, ) -class AggregateProfiles: - """ - Class to aggregate single cell morphological profiles - """ - - def __init__( - self, - sql_file, - strata=["Metadata_Plate", "Metadata_Well"], - features="infer", - operation="median", - output_file="none", - compartments=["cells", "cytoplasm", "nuclei"], - merge_cols=["TableNumber", "ImageNumber"], - load_image_data=True, - subsample_frac=1, - subsample_n="all", - subsampling_random_state="none", - ): - """ - Arguments: - sql_file - string or sqlalchemy connection - strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and aggregate - features - [default: "all"] or list indicating features that should be aggregated - operation - [default: "median"] a string indicating how the data is aggregated - currently only supports one of ['mean', 'median'] - output_file - [default: "none"] string if specified, write to location - compartments - list of compartments to process - merge_cols - column indicating which columns to merge images and compartments - subsample_frac - [default: 1] float (0 < subsample <= 1) indicating percentage of - single cells to select - subsample_n - [default: "all"] int indicating how many 
samples to include - subsampling_random_state - [default: "none"] the random state to init subsample - """ - # Check compartments specified - check_compartments(compartments) - - # Check if correct operation is specified - operation = check_aggregate_operation(operation) - - # Check that the subsample_frac is between 0 and 1 - assert ( - 0 < subsample_frac and 1 >= subsample_frac - ), "subsample_frac must be between 0 and 1" - - self.sql_file = sql_file - self.strata = strata - self.features = features - self.operation = operation.lower() - self.output_file = output_file - self.compartments = compartments - self.merge_cols = merge_cols - self.subsample_frac = subsample_frac - self.subsample_n = subsample_n - self.subset_data_df = "none" - self.subsampling_random_state = subsampling_random_state - self.is_aggregated = False - self.is_subset_computed = False - - if self.subsample_n != "all": - self.set_subsample_n(self.subsample_n) - - # Connect to sqlite engine - self.engine = create_engine(self.sql_file) - self.conn = self.engine.connect() - - # Throw an error if both subsample_frac and subsample_n is set - self._check_subsampling() - - if load_image_data: - self.load_image() - - def _check_subsampling(self): - # Check that the user didn't specify both subset frac and subsample all - assert ( - self.subsample_frac == 1 or self.subsample_n == "all" - ), "Do not set both subsample_frac and subsample_n" - - def set_output_file(self, output_file): - self.output_file = output_file - - def set_subsample_frac(self, subsample_frac): - self.subsample_frac = subsample_frac - self._check_subsampling() - - def set_subsample_n(self, subsample_n): - try: - self.subsample_n = int(subsample_n) - except ValueError: - raise ValueError("subsample n must be an integer or coercable") - self._check_subsampling() - - def set_subsample_random_state(self, random_state): - self.subsampling_random_state = random_state - - def load_image(self): - """ - Load image table from sqlite file - """ 
- # Extract image metadata - image_cols = "TableNumber, ImageNumber, {}".format(", ".join(self.strata)) - image_query = "select {} from image".format(image_cols) - self.image_df = pd.read_sql(sql=image_query, con=self.conn) - - def count_cells(self, compartment="cells", count_subset=False): - """ - Determine how many cells are measured per well. - - Arguments: - compartment - string indicating the compartment to subset - count_subset - [default: False] count the number of cells in subset partition - """ - check_compartments(compartment) - - if count_subset: - assert self.is_aggregated, "Make sure to aggregate_profiles() first!" - assert self.is_subset_computed, "Make sure to get_subsample() first!" - count_df = ( - self.subset_data_df.groupby(self.strata)["ObjectNumber"] - .count() - .reset_index() - .rename({"ObjectNumber": "cell_count"}, axis="columns") - ) - else: - query_cols = "TableNumber, ImageNumber, ObjectNumber" - query = "select {} from {}".format(query_cols, compartment) - count_df = self.image_df.merge( - pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols - ) - count_df = ( - count_df.groupby(self.strata)["ObjectNumber"] - .count() - .reset_index() - .rename({"ObjectNumber": "cell_count"}, axis="columns") - ) - - return count_df - - def subsample_profiles(self, x): - """ - Sample a Pandas DataFrame given the subsampling fraction - """ - if self.subsampling_random_state == "none": - random_state = np.random.randint(0, 10000, size=1)[0] - self.set_subsample_random_state(random_state) - - if self.subsample_frac == 1: - return pd.DataFrame.sample( - x, - n=self.subsample_n, - replace=True, - random_state=self.subsampling_random_state, - ) - else: - return pd.DataFrame.sample( - x, frac=self.subsample_frac, random_state=self.subsampling_random_state - ) - - def get_subsample(self, compartment="cells"): - """ - Extract subsample from sqlite file - - Arguments: - compartment - [default: "cells"] string indicating the compartment to subset 
- """ - check_compartments(compartment) - - query_cols = "TableNumber, ImageNumber, ObjectNumber" - query = "select {} from {}".format(query_cols, compartment) - - # Load query and merge with image_df - query_df = self.image_df.merge( - pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols - ) - - self.subset_data_df = ( - query_df.groupby(self.strata) - .apply(lambda x: self.subsample_profiles(x)) - .reset_index(drop=True) - ) - - self.is_subset_computed = True - - def aggregate_compartment(self, compartment, compute_subsample=False): - """ - Aggregate morphological profiles - - Arguments: - compartment - str indicating specific compartment to extract - - Return: - Either the merged object file or write object to disk - """ - check_compartments(compartment) - - compartment_query = "select * from {}".format(compartment) - - if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample: - self.get_subsample(compartment=compartment) - - population_df = self.image_df.merge( - pd.read_sql(sql=compartment_query, con=self.conn), - how="inner", - on=self.merge_cols, - ) - - object_df = aggregate( - population_df=population_df, - strata=self.strata, - features=self.features, - operation=self.operation, - subset_data_df=self.subset_data_df, - ) - - return object_df - - def aggregate_profiles( - self, - compute_subsample="False", - output_file="none", - compression=None, - float_format=None, - ): - """ - Aggregate and merge compartments. This is the primary entry to this class. - - Arguments: - compute_subsample - [default: False] boolean if subsample should be computed. - NOTE: Must be specified to perform subsampling. Will not - apply subsetting if set to False even if subsample is - initialized - output_file - [default: "none"] if provided, will write annotated profiles to file - if not specified, will return the annotated profiles. We recommend - that this output file be suffixed with "_augmented.csv". 
- compression - the mechanism to compress [default: None] - float_format - decimal precision to use in writing output file [default: None] - For example, use "%.3g" for 3 decimal precision. - - Return: - if output_file is set, then write to file. If not then return - """ - - if output_file != "none": - self.set_output_file(output_file) - - aggregated = ( - self.aggregate_compartment( - compartment="cells", compute_subsample=compute_subsample - ) - .merge( - self.aggregate_compartment(compartment="cytoplasm"), - on=self.strata, - how="inner", - ) - .merge( - self.aggregate_compartment(compartment="nuclei"), - on=self.strata, - how="inner", - ) - ) - - self.is_aggregated = True - - if self.output_file != "none": - output( - df=aggregated, - output_filename=self.output_file, - compression=compression, - float_format=float_format, - ) - else: - return aggregated - - def aggregate( population_df, strata=["Metadata_Plate", "Metadata_Well"], features="infer", operation="median", + output_file="none", subset_data_df="none", ): """ @@ -303,6 +28,9 @@ def aggregate( features - [default: "all"] or list indicating features that should be aggregated operation - [default: "median"] a string indicating how the data is aggregated currently only supports one of ['mean', 'median'] + output_file - [default: "none"] if provided, will write aggregated profiles to file + if not specified, will return the aggregated profiles. We recommend + naming the file based on the plate name. 
subset_data_df - [default: "none"] a pandas dataframe indicating how to subset the input Return: @@ -345,4 +73,14 @@ def aggregate( if col in population_df.columns: population_df = population_df.drop([col], axis="columns") + if output_file != "none": + output( + df=population_df, + output_filename=output_file, + compression=compression, + float_format=float_format, + ) + else: + return population_df + return population_df diff --git a/pycytominer/tests/test_aggregate_profiles.py b/pycytominer/tests/test_cyto_utils/test_cells.py similarity index 95% rename from pycytominer/tests/test_aggregate_profiles.py rename to pycytominer/tests/test_cyto_utils/test_cells.py index c18f73c1..97ec40c7 100644 --- a/pycytominer/tests/test_aggregate_profiles.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -5,7 +5,7 @@ import pandas as pd from sqlalchemy import create_engine from pycytominer import aggregate -from pycytominer.aggregate import AggregateProfiles +from pycytominer.cyto_utils.cells import SingleCells random.seed(123) @@ -64,16 +64,16 @@ def build_random_data( cytoplasm_df.to_sql("cytoplasm", con=test_engine, index=False, if_exists="replace") nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace") -# Setup AggregateProfiles Class -ap = AggregateProfiles(sql_file=file) -ap_subsample = AggregateProfiles( +# Setup SingleCells Class +ap = SingleCells(sql_file=file) +ap_subsample = SingleCells( sql_file=file, subsample_n=2, subsampling_random_state=123 ) -def test_AggregateProfiles_init(): +def test_SingleCells_init(): """ - Testing initialization of AggregateProfiles + Testing initialization of SingleCells """ assert ap.sql_file == file assert ap.strata == ["Metadata_Plate", "Metadata_Well"] @@ -92,11 +92,11 @@ def test_AggregateProfiles_init(): assert ap_subsample.subsampling_random_state == 123 -def test_AggregateProfiles_reset_variables(): +def test_SingleCells_reset_variables(): """ - Testing initialization of AggregateProfiles + Testing 
initialization of SingleCells """ - ap_switch = AggregateProfiles(sql_file=file) + ap_switch = SingleCells(sql_file=file) assert ap_switch.subsample_frac == 1 assert ap_switch.subsample_n == "all" assert ap_switch.subsampling_random_state == "none" @@ -121,7 +121,7 @@ def test_AggregateProfiles_reset_variables(): assert "subsample n must be an integer or coercable" in str(errorinfo.value.args[0]) -def test_AggregateProfiles_count(): +def test_SingleCells_count(): count_df = ap.count_cells() expected_count = pd.DataFrame( { @@ -307,8 +307,8 @@ def test_aggregate_count_cells_multiple_strata(): cytoplasm_df.to_sql("cytoplasm", con=test_engine, index=False, if_exists="replace") nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace") - # Setup AggregateProfiles Class - ap_strata = AggregateProfiles( + # Setup SingleCells Class + ap_strata = SingleCells( sql_file=file, subsample_n="4", strata=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], From 93759dad597636ea097864b97189cde70ec25d8a Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Fri, 9 Oct 2020 11:53:56 -0400 Subject: [PATCH 02/15] black on test --- pycytominer/tests/test_cyto_utils/test_cells.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 97ec40c7..4b293082 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -66,9 +66,7 @@ def build_random_data( # Setup SingleCells Class ap = SingleCells(sql_file=file) -ap_subsample = SingleCells( - sql_file=file, subsample_n=2, subsampling_random_state=123 -) +ap_subsample = SingleCells(sql_file=file, subsample_n=2, subsampling_random_state=123) def test_SingleCells_init(): From 3b86693f6eb68facc638efebd329830a1146e910 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Thu, 10 Dec 2020 09:50:34 -0500 Subject: [PATCH 03/15] add cells module --- 
pycytominer/cyto_utils/cells.py | 284 ++++++++++++++++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 pycytominer/cyto_utils/cells.py diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py new file mode 100644 index 00000000..dbf8de83 --- /dev/null +++ b/pycytominer/cyto_utils/cells.py @@ -0,0 +1,284 @@ +import numpy as np +import pandas as pd +from sqlalchemy import create_engine +from pycytominer import aggregate +from pycytominer.cyto_utils import ( + output, + check_compartments, + check_aggregate_operation, + infer_cp_features, +) + + +class SingleCells: + """ + Class to interact with single cell morphological profiles + """ + + def __init__( + self, + sql_file, + strata=["Metadata_Plate", "Metadata_Well"], + features="infer", + operation="median", + output_file="none", + compartments=["cells", "cytoplasm", "nuclei"], + merge_cols=["TableNumber", "ImageNumber"], + load_image_data=True, + subsample_frac=1, + subsample_n="all", + subsampling_random_state="none", + ): + """ + Arguments: + sql_file - string or sqlalchemy connection + strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and aggregate + features - [default: "all"] or list indicating features that should be aggregated + operation - [default: "median"] a string indicating how the data is aggregated + currently only supports one of ['mean', 'median'] + output_file - [default: "none"] string if specified, write to location + compartments - list of compartments to process + merge_cols - column indicating which columns to merge images and compartments + subsample_frac - [default: 1] float (0 < subsample <= 1) indicating percentage of + single cells to select + subsample_n - [default: "all"] int indicating how many samples to include + subsampling_random_state - [default: "none"] the random state to init subsample + """ + # Check compartments specified + check_compartments(compartments) + + # Check if correct operation 
is specified + operation = check_aggregate_operation(operation) + + # Check that the subsample_frac is between 0 and 1 + assert ( + 0 < subsample_frac and 1 >= subsample_frac + ), "subsample_frac must be between 0 and 1" + + self.sql_file = sql_file + self.strata = strata + self.features = features + self.operation = operation.lower() + self.output_file = output_file + self.compartments = compartments + self.merge_cols = merge_cols + self.subsample_frac = subsample_frac + self.subsample_n = subsample_n + self.subset_data_df = "none" + self.subsampling_random_state = subsampling_random_state + self.is_aggregated = False + self.is_subset_computed = False + + if self.subsample_n != "all": + self.set_subsample_n(self.subsample_n) + + # Connect to sqlite engine + self.engine = create_engine(self.sql_file) + self.conn = self.engine.connect() + + # Throw an error if both subsample_frac and subsample_n is set + self._check_subsampling() + + if load_image_data: + self.load_image() + + def _check_subsampling(self): + # Check that the user didn't specify both subset frac and subsample all + assert ( + self.subsample_frac == 1 or self.subsample_n == "all" + ), "Do not set both subsample_frac and subsample_n" + + def set_output_file(self, output_file): + self.output_file = output_file + + def set_subsample_frac(self, subsample_frac): + self.subsample_frac = subsample_frac + self._check_subsampling() + + def set_subsample_n(self, subsample_n): + try: + self.subsample_n = int(subsample_n) + except ValueError: + raise ValueError("subsample n must be an integer or coercable") + self._check_subsampling() + + def set_subsample_random_state(self, random_state): + self.subsampling_random_state = random_state + + def load_image(self): + """ + Load image table from sqlite file + """ + # Extract image metadata + image_cols = "TableNumber, ImageNumber, {}".format(", ".join(self.strata)) + image_query = "select {} from image".format(image_cols) + self.image_df = pd.read_sql(sql=image_query, 
con=self.conn) + + def count_cells(self, compartment="cells", count_subset=False): + """ + Determine how many cells are measured per well. + + Arguments: + compartment - string indicating the compartment to subset + count_subset - [default: False] count the number of cells in subset partition + """ + check_compartments(compartment) + + if count_subset: + assert self.is_aggregated, "Make sure to aggregate_profiles() first!" + assert self.is_subset_computed, "Make sure to get_subsample() first!" + count_df = ( + self.subset_data_df.groupby(self.strata)["ObjectNumber"] + .count() + .reset_index() + .rename({"ObjectNumber": "cell_count"}, axis="columns") + ) + else: + query_cols = "TableNumber, ImageNumber, ObjectNumber" + query = "select {} from {}".format(query_cols, compartment) + count_df = self.image_df.merge( + pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols + ) + count_df = ( + count_df.groupby(self.strata)["ObjectNumber"] + .count() + .reset_index() + .rename({"ObjectNumber": "cell_count"}, axis="columns") + ) + + return count_df + + def subsample_profiles(self, x): + """ + Sample a Pandas DataFrame given the subsampling fraction + """ + if self.subsampling_random_state == "none": + random_state = np.random.randint(0, 10000, size=1)[0] + self.set_subsample_random_state(random_state) + + if self.subsample_frac == 1: + return pd.DataFrame.sample( + x, + n=self.subsample_n, + replace=True, + random_state=self.subsampling_random_state, + ) + else: + return pd.DataFrame.sample( + x, frac=self.subsample_frac, random_state=self.subsampling_random_state + ) + + def get_subsample(self, compartment="cells"): + """ + Extract subsample from sqlite file + + Arguments: + compartment - [default: "cells"] string indicating the compartment to subset + """ + check_compartments(compartment) + + query_cols = "TableNumber, ImageNumber, ObjectNumber" + query = "select {} from {}".format(query_cols, compartment) + + # Load query and merge with image_df + 
query_df = self.image_df.merge( + pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols + ) + + self.subset_data_df = ( + query_df.groupby(self.strata) + .apply(lambda x: self.subsample_profiles(x)) + .reset_index(drop=True) + ) + + self.is_subset_computed = True + + def aggregate_compartment(self, compartment, compute_subsample=False): + """ + Aggregate morphological profiles + + Arguments: + compartment - str indicating specific compartment to extract + + Return: + Either the merged object file or write object to disk + """ + check_compartments(compartment) + + compartment_query = "select * from {}".format(compartment) + + if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample: + self.get_subsample(compartment=compartment) + + population_df = self.image_df.merge( + pd.read_sql(sql=compartment_query, con=self.conn), + how="inner", + on=self.merge_cols, + ) + + object_df = aggregate( + population_df=population_df, + strata=self.strata, + features=self.features, + operation=self.operation, + subset_data_df=self.subset_data_df, + ) + + return object_df + + def aggregate_profiles( + self, + compute_subsample="False", + output_file="none", + compression=None, + float_format=None, + ): + """ + Aggregate and merge compartments. This is the primary entry to this class. + + Arguments: + compute_subsample - [default: False] boolean if subsample should be computed. + NOTE: Must be specified to perform subsampling. Will not + apply subsetting if set to False even if subsample is + initialized + output_file - [default: "none"] if provided, will write annotated profiles to file + if not specified, will return the annotated profiles. We recommend + that this output file be suffixed with "_augmented.csv". + compression - the mechanism to compress [default: None] + float_format - decimal precision to use in writing output file [default: None] + For example, use "%.3g" for 3 decimal precision. 
+ + Return: + if output_file is set, then write to file. If not then return + """ + + if output_file != "none": + self.set_output_file(output_file) + + aggregated = ( + self.aggregate_compartment( + compartment="cells", compute_subsample=compute_subsample + ) + .merge( + self.aggregate_compartment(compartment="cytoplasm"), + on=self.strata, + how="inner", + ) + .merge( + self.aggregate_compartment(compartment="nuclei"), + on=self.strata, + how="inner", + ) + ) + + self.is_aggregated = True + + if self.output_file != "none": + output( + df=aggregated, + output_filename=self.output_file, + compression=compression, + float_format=float_format, + ) + else: + return aggregated From bdbf15bedfa2566bb103a9522cf30dac67c73018 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Mon, 14 Dec 2020 16:11:00 -0500 Subject: [PATCH 04/15] add sphinx comments and prep for file input modularity --- pycytominer/cyto_utils/cells.py | 169 ++++++++++++++++++++------------ 1 file changed, 104 insertions(+), 65 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index dbf8de83..c96e713a 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -2,6 +2,7 @@ import pandas as pd from sqlalchemy import create_engine from pycytominer import aggregate +from pycytominer import normalize from pycytominer.cyto_utils import ( output, check_compartments, @@ -10,17 +11,40 @@ ) -class SingleCells: - """ - Class to interact with single cell morphological profiles +class SingleCells(object): + """This is a class to interact with single cell morphological profiles. Interaction + includes aggregation, normalization, and output. 
+ + :param file_or_conn: A file string or database connection storing the location of single cell profiles + :type file_or_conn: str + :param strata: The columns to groupby and aggregate single cells, defaults to ["Metadata_Plate", "Metadata_Well"] + :type strata: list + :param features: The features that should be aggregated, defaults to "infer" + :type features: str, list + :param aggregation_operation: operation to perform single cell aggregation, defaults to "median" + :type aggregation_operation: str + :param output_file: If specified, the location to write the file, defaults to "none" + :type output_file: str + :param compartments: list of compartments to process, defaults to ["cells", "cytoplasm", "nuclei"] + :type compartments: list + :param merge_cols: columns indicating how to merge image and compartment data, defaults to ["TableNumber", "ImageNumber"] + :type merge_cols: list + :param load_image_data: if image data should be loaded into memory, defaults to True + :type load_image_data: bool + :param subsample_frac: indicating percentage of single cells to select (0 < subsample_frac <= 1), defaults to 1 + :type subsample_frac: float + :param subsample_n: indicate how many samples to subsample - do not specify both subsample_frac and subsample_n, defaults to "all" + :type subsample_n:, str, int + :param subsampling_random_state: the random state to init subsample, defaults to "none" + :type subsampling_random_state: str, int """ def __init__( self, - sql_file, + file_or_conn, strata=["Metadata_Plate", "Metadata_Well"], features="infer", - operation="median", + aggregation_operation="median", output_file="none", compartments=["cells", "cytoplasm", "nuclei"], merge_cols=["TableNumber", "ImageNumber"], @@ -29,36 +53,22 @@ def __init__( subsample_n="all", subsampling_random_state="none", ): - """ - Arguments: - sql_file - string or sqlalchemy connection - strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and 
aggregate - features - [default: "all"] or list indicating features that should be aggregated - operation - [default: "median"] a string indicating how the data is aggregated - currently only supports one of ['mean', 'median'] - output_file - [default: "none"] string if specified, write to location - compartments - list of compartments to process - merge_cols - column indicating which columns to merge images and compartments - subsample_frac - [default: 1] float (0 < subsample <= 1) indicating percentage of - single cells to select - subsample_n - [default: "all"] int indicating how many samples to include - subsampling_random_state - [default: "none"] the random state to init subsample - """ + """Constructor method""" # Check compartments specified check_compartments(compartments) # Check if correct operation is specified - operation = check_aggregate_operation(operation) + aggregation_operation = check_aggregate_operation(aggregation_operation) # Check that the subsample_frac is between 0 and 1 assert ( 0 < subsample_frac and 1 >= subsample_frac ), "subsample_frac must be between 0 and 1" - self.sql_file = sql_file + self.file_or_conn = file_or_conn self.strata = strata self.features = features - self.operation = operation.lower() + self.aggregation_operation = aggregation_operation.lower() self.output_file = output_file self.compartments = compartments self.merge_cols = merge_cols @@ -73,7 +83,7 @@ def __init__( self.set_subsample_n(self.subsample_n) # Connect to sqlite engine - self.engine = create_engine(self.sql_file) + self.engine = create_engine(self.file_or_conn) self.conn = self.engine.connect() # Throw an error if both subsample_frac and subsample_n is set @@ -83,19 +93,35 @@ def __init__( self.load_image() def _check_subsampling(self): + """Internal method checking if subsampling options were specified correctly""" # Check that the user didn't specify both subset frac and subsample all assert ( self.subsample_frac == 1 or self.subsample_n == "all" ), 
"Do not set both subsample_frac and subsample_n" def set_output_file(self, output_file): + """Setting operation to conveniently rename output file + + :param output_file: the new output file name + :type output_file: str + """ self.output_file = output_file def set_subsample_frac(self, subsample_frac): + """Setting operation to conveniently update the subsample fraction + + :param subsample_frac: indicating percentage of single cells to select (0 < subsample_frac <= 1), defaults to 1 + :type subsample_frac: float + """ self.subsample_frac = subsample_frac self._check_subsampling() def set_subsample_n(self, subsample_n): + """Setting operation to conveniently update the subsample n + + :param subsample_n: indicate how many samples to subsample - do not specify both subsample_frac and subsample_n, defaults to "all" + :type subsample_n:, str, int + """ try: self.subsample_n = int(subsample_n) except ValueError: @@ -103,24 +129,28 @@ def set_subsample_n(self, subsample_n): self._check_subsampling() def set_subsample_random_state(self, random_state): + """Setting operation to conveniently update the subsample random state + + :param random_state: the random state to init subsample, defaults to "none" + :type random_state:, str, int + """ self.subsampling_random_state = random_state def load_image(self): - """ - Load image table from sqlite file - """ + """Load image table from sqlite file""" # Extract image metadata image_cols = "TableNumber, ImageNumber, {}".format(", ".join(self.strata)) image_query = "select {} from image".format(image_cols) self.image_df = pd.read_sql(sql=image_query, con=self.conn) def count_cells(self, compartment="cells", count_subset=False): - """ - Determine how many cells are measured per well. + """Determine how many cells are measured per well. 
- Arguments: - compartment - string indicating the compartment to subset - count_subset - [default: False] count the number of cells in subset partition + :param compartment: string indicating the compartment to subset, defaults to "cells" + :type compartment: str + :param count_subset: whether or not count the number of cells as specified by the strata groups + :return: A pandas dataframe of cell counts in the experiment + :rtype: pd.DataFrame """ check_compartments(compartment) @@ -148,9 +178,13 @@ def count_cells(self, compartment="cells", count_subset=False): return count_df - def subsample_profiles(self, x): - """ - Sample a Pandas DataFrame given the subsampling fraction + def subsample_profiles(self, df): + """Sample a Pandas DataFrame given subsampling information + + :param df: A single cell profile dataframe + :type df: pd.DataFrame + :return: A subsampled pandas dataframe of single cell profiles + :rtype: pd.DataFrame """ if self.subsampling_random_state == "none": random_state = np.random.randint(0, 10000, size=1)[0] @@ -158,22 +192,21 @@ def subsample_profiles(self, x): if self.subsample_frac == 1: return pd.DataFrame.sample( - x, + df, n=self.subsample_n, replace=True, random_state=self.subsampling_random_state, ) else: return pd.DataFrame.sample( - x, frac=self.subsample_frac, random_state=self.subsampling_random_state + df, frac=self.subsample_frac, random_state=self.subsampling_random_state ) def get_subsample(self, compartment="cells"): - """ - Extract subsample from sqlite file + """Apply the subsampling procedure - Arguments: - compartment - [default: "cells"] string indicating the compartment to subset + :param compartment: string indicating the compartment to process, defaults to "cells" + :type compartment: str """ check_compartments(compartment) @@ -194,14 +227,14 @@ def get_subsample(self, compartment="cells"): self.is_subset_computed = True def aggregate_compartment(self, compartment, compute_subsample=False): - """ - Aggregate 
morphological profiles - - Arguments: - compartment - str indicating specific compartment to extract - - Return: - Either the merged object file or write object to disk + """Aggregate morphological profiles. Uses pycytominer.aggregate() + + :param compartment: string indicating the specific compartment, defaults to "cells" + :type compartment: str + :param compute_subsample: determine if subsample should be computed, defaults to False + :type compute_subsample: bool + :return: Aggregated single-cell profiles + :rtype: pd.DataFrame """ check_compartments(compartment) @@ -220,7 +253,7 @@ def aggregate_compartment(self, compartment, compute_subsample=False): population_df=population_df, strata=self.strata, features=self.features, - operation=self.operation, + operation=self.aggregation_operation, subset_data_df=self.subset_data_df, ) @@ -228,28 +261,34 @@ def aggregate_compartment(self, compartment, compute_subsample=False): def aggregate_profiles( self, - compute_subsample="False", + compute_subsample=False, output_file="none", compression=None, float_format=None, ): - """ - Aggregate and merge compartments. This is the primary entry to this class. - - Arguments: - compute_subsample - [default: False] boolean if subsample should be computed. - NOTE: Must be specified to perform subsampling. Will not - apply subsetting if set to False even if subsample is - initialized - output_file - [default: "none"] if provided, will write annotated profiles to file - if not specified, will return the annotated profiles. We recommend - that this output file be suffixed with "_augmented.csv". - compression - the mechanism to compress [default: None] - float_format - decimal precision to use in writing output file [default: None] - For example, use "%.3g" for 3 decimal precision. + """Aggregate and merge compartments. This is the primary entry to this class. 
+ + :param compute_subsample: Determine if subsample should be computed, defaults to False + :type compute_subsample: bool + :param output_file: the name of a file to output, defaults to "none": + :type output_file: str, optional + :param compression: the mechanism to compress, defaults to None + :type compression: str, optional + :param float_format: decimal precision to use in writing output file, defaults to None + :type float_format: str, optional Return: if output_file is set, then write to file. If not then return + + .. note:: + compute_subsample must be specified to perform subsampling. The function + aggregate_profiles(compute_subsample=False) will not apply subsetting even if + subsample is initialized + + .. note:: + We recommend that, if provided, the output file be suffixed with "_augmented" + + :Example: """ if output_file != "none": From 4cd0896956f205acea1aa2103e3133c2a7f4aff6 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Mon, 14 Dec 2020 16:11:54 -0500 Subject: [PATCH 05/15] update tests for updated cells function args --- .../tests/test_cyto_utils/test_cells.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 4b293082..3fb70e98 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -65,15 +65,17 @@ def build_random_data( nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace") # Setup SingleCells Class -ap = SingleCells(sql_file=file) -ap_subsample = SingleCells(sql_file=file, subsample_n=2, subsampling_random_state=123) +ap = SingleCells(file_or_conn=file) +ap_subsample = SingleCells( + file_or_conn=file, subsample_n=2, subsampling_random_state=123 +) def test_SingleCells_init(): """ Testing initialization of SingleCells """ - assert ap.sql_file == file + assert ap.file_or_conn == file assert ap.strata == ["Metadata_Plate",
"Metadata_Well"] assert ap.merge_cols == ["TableNumber", "ImageNumber"] assert ap.features == "infer" @@ -84,7 +86,7 @@ def test_SingleCells_init(): assert ap_subsample.subsample_n == 2 assert ap.subset_data_df == "none" assert ap.output_file == "none" - assert ap.operation == "median" + assert ap.aggregation_operation == "median" assert not ap.is_aggregated assert ap.subsampling_random_state == "none" assert ap_subsample.subsampling_random_state == 123 @@ -94,7 +96,7 @@ def test_SingleCells_reset_variables(): """ Testing initialization of SingleCells """ - ap_switch = SingleCells(sql_file=file) + ap_switch = SingleCells(file_or_conn=file) assert ap_switch.subsample_frac == 1 assert ap_switch.subsample_n == "all" assert ap_switch.subsampling_random_state == "none" @@ -188,7 +190,7 @@ def test_aggregate_subsampling_count_cells(): ) pd.testing.assert_frame_equal(count_df, expected_count, check_names=False) - profiles = ap_subsample.aggregate_profiles() + profiles = ap_subsample.aggregate_profiles(compute_subsample=True) count_df = ap_subsample.count_cells(count_subset=True) expected_count = pd.DataFrame( @@ -307,7 +309,7 @@ def test_aggregate_count_cells_multiple_strata(): # Setup SingleCells Class ap_strata = SingleCells( - sql_file=file, + file_or_conn=file, subsample_n="4", strata=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], ) @@ -323,7 +325,7 @@ def test_aggregate_count_cells_multiple_strata(): ) pd.testing.assert_frame_equal(count_df, expected_count, check_names=False) - profiles = ap_strata.aggregate_profiles() + profiles = ap_strata.aggregate_profiles(compute_subsample=True) count_df = ap_strata.count_cells(count_subset=True) expected_count = pd.DataFrame( From a55bf74dd2188604ebad7a31118d72c575e7a817 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Mon, 14 Dec 2020 18:29:35 -0500 Subject: [PATCH 06/15] add function to merge single cells given linking columns --- pycytominer/cyto_utils/cells.py | 128 +++++++++++++++++++++++++++++--- 1 file changed, 
119 insertions(+), 9 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index c96e713a..53676418 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -8,8 +8,14 @@ check_compartments, check_aggregate_operation, infer_cp_features, + get_default_linking_cols, + get_default_compartments, + assert_linking_cols_complete, ) +default_compartments = get_default_compartments() +default_linking_cols = get_default_linking_cols() + class SingleCells(object): """This is a class to interact with single cell morphological profiles. Interaction @@ -27,6 +33,8 @@ class SingleCells(object): :type output_file: str :param compartments: list of compartments to process, defaults to ["cells", "cytoplasm", "nuclei"] :type compartments: list + :param compartment_linking_cols: dictionary identifying how to merge columns across tables, default noted below: + :type compartment_linking_cols: dict :param merge_cols: columns indicating how to merge image and compartment data, defaults to ["TableNumber", "ImageNumber"] :type merge_cols: list :param load_image_data: if image data should be loaded into memory, defaults to True @@ -37,6 +45,17 @@ class SingleCells(object): :type subsample_n:, str, int :param subsampling_random_state: the random state to init subsample, defaults to "none" :type subsampling_random_state: str, int + + .. note:: + the argument compartment_linking_cols is designed to work with CellProfiler output, + as curated by cytominer-database. 
The default is: { + "cytoplasm": { + "cells": "Cytoplasm_Parent_Cells", + "nuclei": "Cytoplasm_Parent_Nuclei", + }, + "cells": {"cytoplasm": "ObjectNumber"}, + "nuclei": {"cytoplasm": "ObjectNumber"}, + } """ def __init__( @@ -46,7 +65,8 @@ def __init__( features="infer", aggregation_operation="median", output_file="none", - compartments=["cells", "cytoplasm", "nuclei"], + compartments=default_compartments, + compartment_linking_cols=default_linking_cols, merge_cols=["TableNumber", "ImageNumber"], load_image_data=True, subsample_frac=1, @@ -68,9 +88,9 @@ def __init__( self.file_or_conn = file_or_conn self.strata = strata self.features = features + self.load_image_data = load_image_data self.aggregation_operation = aggregation_operation.lower() self.output_file = output_file - self.compartments = compartments self.merge_cols = merge_cols self.subsample_frac = subsample_frac self.subsample_n = subsample_n @@ -78,6 +98,13 @@ def __init__( self.subsampling_random_state = subsampling_random_state self.is_aggregated = False self.is_subset_computed = False + self.compartments = compartments + self.compartment_linking_cols = compartment_linking_cols + + # Confirm that the compartments and linking cols are formatted properly + assert_linking_cols_complete( + compartments=self.compartments, linking_cols=self.compartment_linking_cols + ) if self.subsample_n != "all": self.set_subsample_n(self.subsample_n) @@ -89,7 +116,7 @@ def __init__( # Throw an error if both subsample_frac and subsample_n is set self._check_subsampling() - if load_image_data: + if self.load_image_data: self.load_image() def _check_subsampling(self): @@ -226,6 +253,11 @@ def get_subsample(self, compartment="cells"): self.is_subset_computed = True + def load_compartment(self, compartment): + compartment_query = "select * from {}".format(compartment) + df = pd.read_sql(sql=compartment_query, con=self.conn) + return df + def aggregate_compartment(self, compartment, compute_subsample=False): """Aggregate
morphological profiles. Uses pycytominer.aggregate() @@ -238,13 +270,11 @@ def aggregate_compartment(self, compartment, compute_subsample=False): """ check_compartments(compartment) - compartment_query = "select * from {}".format(compartment) - if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample: self.get_subsample(compartment=compartment) population_df = self.image_df.merge( - pd.read_sql(sql=compartment_query, con=self.conn), + self.load_compartment(compartment=compartment), how="inner", on=self.merge_cols, ) @@ -259,6 +289,87 @@ def aggregate_compartment(self, compartment, compute_subsample=False): return object_df + def merge_single_cells( + self, + sc_output_file="none", + compression=None, + float_format=None, + single_cell_normalize=False, + normalize_args=None, + ): + """Given the linking columns, merge single cell data. Normalization is also supported + + :param sc_output_file: the name of a file to output, defaults to "none": + :type sc_output_file: str, optional + :param compression: the mechanism to compress, defaults to None + :type compression: str, optional + :param float_format: decimal precision to use in writing output file, defaults to None + :type float_format: str, optional + :param single_cell_normalize: determine if the single cell data should also be normalized + :type single_cell_normalize: bool + :param normalize_args: additional arguments passed as a dictionary as input to pycytominer.normalize() + :return: Either a dataframe (if output_file="none") or will write to file + :rtype: pd.DataFrame, optional + """ + + # Load the single cell dataframe by merging on the specific linking columns + sc_df = "" + linking_check_cols = [] + for left_compartment in default_linking_cols: + for right_compartment in default_linking_cols[left_compartment]: + # Make sure only one merge per combination occurs + linking_check = "-".join(sorted([left_compartment, right_compartment])) + if linking_check in linking_check_cols: + 
continue + + # Specify how to indicate merge suffixes + merge_suffix = [ + "_{comp_l}".format(comp_l=left_compartment), + "_{comp_r}".format(comp_r=right_compartment), + ] + + left_link_col = default_linking_cols[left_compartment][ + right_compartment + ] + right_link_col = default_linking_cols[right_compartment][ + left_compartment + ] + + if isinstance(sc_df, str): + sc_df = self.load_compartment(compartment=left_compartment).merge( + self.load_compartment(compartment=right_compartment), + left_on=self.merge_cols + [left_link_col], + right_on=self.merge_cols + [right_link_col], + suffixes=merge_suffix, + ) + else: + sc_df = sc_df.merge( + self.load_compartment(compartment=right_compartment), + left_on=self.merge_cols + [left_link_col], + right_on=self.merge_cols + [right_link_col], + suffixes=merge_suffix, + ) + + linking_check_cols.append(linking_check) + + # Add image data to single cell dataframe + if not self.load_image_data: + self.load_image() + + sc_df = self.image_df.merge(sc_df, on=self.merge_cols, how="right") + if single_cell_normalize: + sc_df = normalize(profiles=sc_df, **normalize_args) + + if sc_output_file != "none": + output( + df=sc_df, + output_filename=sc_output_file, + compression=compression, + float_format=float_format, + ) + else: + return sc_df + def aggregate_profiles( self, compute_subsample=False, @@ -276,9 +387,8 @@ def aggregate_profiles( :type compression: str, optional :param float_format: decimal precision to use in writing output file, defaults to None :type float_format: str, optional - - Return: - if output_file is set, then write to file. If not then return + :return: Either a dataframe (if output_file="none") or will write to file + :rtype: pd.DataFrame, optional .. note:: compute_subsample must be specified to perform subsampling. 
The function From ae2ac5e72bb63855336e52c6dc188d9f0976b101 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Mon, 14 Dec 2020 18:31:04 -0500 Subject: [PATCH 07/15] add get_default_compartments() function --- pycytominer/cyto_utils/__init__.py | 5 +++++ pycytominer/cyto_utils/util.py | 4 ++++ pycytominer/tests/test_cyto_utils/test_util.py | 6 ++++++ 3 files changed, 15 insertions(+) diff --git a/pycytominer/cyto_utils/__init__.py b/pycytominer/cyto_utils/__init__.py index 4396f93e..b26c4a43 100644 --- a/pycytominer/cyto_utils/__init__.py +++ b/pycytominer/cyto_utils/__init__.py @@ -1,12 +1,17 @@ from .output import output from .util import ( check_compartments, + get_default_compartments, load_known_metadata_dictionary, check_correlation_method, check_aggregate_operation, check_consensus_operation, get_pairwise_correlation, ) +from .single_cell_ingest_utils import ( + get_default_linking_cols, + assert_linking_cols_complete, +) from .load import ( load_profiles, load_platemap, diff --git a/pycytominer/cyto_utils/util.py b/pycytominer/cyto_utils/util.py index 4c806a34..121303ca 100644 --- a/pycytominer/cyto_utils/util.py +++ b/pycytominer/cyto_utils/util.py @@ -12,6 +12,10 @@ ) +def get_default_compartments(): + return ["cells", "cytoplasm", "nuclei"] + + def check_compartments(compartments): valid_compartments = ["cells", "cytoplasm", "nuclei"] error_str = "compartment not supported, use one of {}".format(valid_compartments) diff --git a/pycytominer/tests/test_cyto_utils/test_util.py b/pycytominer/tests/test_cyto_utils/test_util.py index 8786dc86..5aca0385 100644 --- a/pycytominer/tests/test_cyto_utils/test_util.py +++ b/pycytominer/tests/test_cyto_utils/test_util.py @@ -5,6 +5,7 @@ import pandas as pd from pycytominer.cyto_utils.util import ( check_compartments, + get_default_compartments, load_known_metadata_dictionary, get_pairwise_correlation, check_correlation_method, @@ -52,6 +53,11 @@ def test_check_compartments_not_valid(): assert "compartment not supported" 
in str(ae.value) +def test_get_default_compartments(): + default_comparments = get_default_compartments() + assert ["cells", "cytoplasm", "nuclei"] == default_comparments + + def test_load_known_metadata_dictionary(): meta_cols = ["ObjectNumber", "ImageNumber", "TableNumber"] meta_df = pd.DataFrame( From 45cd6158eb62ae4f38a9b6de42114e81fe8af4f3 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Mon, 14 Dec 2020 18:31:24 -0500 Subject: [PATCH 08/15] add test to load compartment --- pycytominer/tests/test_cyto_utils/test_cells.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 3fb70e98..0a068bad 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -133,6 +133,11 @@ def test_SingleCells_count(): pd.testing.assert_frame_equal(count_df, expected_count, check_names=False) +def test_load_compartment(): + loaded_compartment_df = ap.load_compartment(compartment="cells") + pd.testing.assert_frame_equal(loaded_compartment_df, cells_df) + + def test_aggregate_comparment(): df = image_df.merge(cells_df, how="inner", on=["TableNumber", "ImageNumber"]) result = aggregate(df) From f0720c52ab320c65988d896332565da38842b0fe Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Mon, 14 Dec 2020 18:31:44 -0500 Subject: [PATCH 09/15] add file to test single cell ingestion processing --- .../test_single_cell_ingest_utils.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py diff --git a/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py b/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py new file mode 100644 index 00000000..f96f9d47 --- /dev/null +++ b/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py @@ -0,0 +1,56 @@ +import pytest +from pycytominer.cyto_utils import ( + 
get_default_linking_cols, + get_default_compartments, + assert_linking_cols_complete, +) + +default_compartments = get_default_compartments() +default_linking_cols = { + "cytoplasm": { + "cells": "Cytoplasm_Parent_Cells", + "nuclei": "Cytoplasm_Parent_Nuclei", + }, + "cells": {"cytoplasm": "ObjectNumber"}, + "nuclei": {"cytoplasm": "ObjectNumber"}, +} + + +def test_default_linking_cols(): + linking_cols = get_default_linking_cols() + assert linking_cols == default_linking_cols + + +def test_assert_linking_cols_complete(): + assert_linking_cols_complete() + assert_linking_cols_complete( + linking_cols=default_linking_cols, compartments=default_compartments + ) + + with pytest.raises(AssertionError) as err: + assert_linking_cols_complete( + linking_cols=default_linking_cols, compartments=["cells", "cytoplasm"] + ) + + assert "nuclei compartment not found." in str(err.value) + + error_linking_cols = { + "cytoplasm": {"cells": "Cytoplasm_Parent_Cells"}, + "cells": {"cytoplasm": "ObjectNumber"}, + "nuclei": {"cytoplasm": "ObjectNumber"}, + } + with pytest.raises(AssertionError) as err: + assert_linking_cols_complete( + linking_cols=error_linking_cols, compartments=default_compartments + ) + assert "Missing column identifier between cytoplasm-nuclei" in str(err.value) + + with pytest.raises(AssertionError) as err: + assert_linking_cols_complete( + linking_cols=default_linking_cols, + compartments=["cells", "cytoplasm", "nuclei", "sandwich"], + ) + assert ( + "All compartments must be specified in the linking_cols, {'sandwich'} is missing" + in str(err.value) + ) From 764c629f1b1bde3fe7251b99396c423881f4e727 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 15 Dec 2020 10:12:12 -0500 Subject: [PATCH 10/15] add single cell ingest util file --- .../cyto_utils/single_cell_ingest_utils.py | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 pycytominer/cyto_utils/single_cell_ingest_utils.py diff --git 
a/pycytominer/cyto_utils/single_cell_ingest_utils.py b/pycytominer/cyto_utils/single_cell_ingest_utils.py new file mode 100644 index 00000000..374fcb2d --- /dev/null +++ b/pycytominer/cyto_utils/single_cell_ingest_utils.py @@ -0,0 +1,67 @@ +from collections import Counter +from pycytominer.cyto_utils import check_compartments, get_default_compartments + + +def get_default_linking_cols(): + """Define the standard experiment linking columns between tables + + :return: Dictionary of compartment-specific column names used to link compartments across tables + :rtype: dict + + .. note:: + every dictionary pair has a 1 to 1 correspondence (e.g. cytoplasm-cells and cells-cytoplasm both must exist) + """ + linking_cols = { + "cytoplasm": { + "cells": "Cytoplasm_Parent_Cells", + "nuclei": "Cytoplasm_Parent_Nuclei", + }, + "cells": {"cytoplasm": "ObjectNumber"}, + "nuclei": {"cytoplasm": "ObjectNumber"}, + } + + return linking_cols + + +def assert_linking_cols_complete(linking_cols="default", compartments="default"): + """Confirm that the linking cols and compartments are compatible + + :return: None; an AssertionError is raised if the linking cols and compartments are incompatible + :rtype: None + + .. note:: + assert_linking_cols_complete() does not check if columns are present + """ + if linking_cols == "default": + linking_cols = get_default_linking_cols() + + if compartments == "default": + compartments = get_default_compartments() + + comp_err = "compartment not found.
Check the specified compartments" + + linking_check = [] + unique_linking_cols = [] + for x in linking_cols: + unique_linking_cols.append(x) + assert x in compartments, "{com} {err}".format(com=x, err=comp_err) + for y in linking_cols[x]: + unique_linking_cols.append(y) + assert y in compartments, "{com} {err}".format(com=y, err=comp_err) + linking_check.append("-".join(sorted([x, y]))) + + # Make sure that each combination has been specified exactly twice + linking_counter = Counter(linking_check) + for combo in linking_counter: + assert ( + linking_counter[combo] == 2 + ), "Missing column identifier between {combo}".format(combo=combo) + + # Confirm that every compartment has been specified in the linking_cols + unique_linking_cols = sorted(list(set(unique_linking_cols))) + diff_column = set(compartments).difference(unique_linking_cols) + assert unique_linking_cols == sorted( + compartments + ), "All compartments must be specified in the linking_cols, {miss} is missing".format( + miss=diff_column + ) From 29a702ab27ea7624f422103d83019142de8208f4 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 15 Dec 2020 11:07:30 -0500 Subject: [PATCH 11/15] one line import --- pycytominer/cyto_utils/cells.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 53676418..95146277 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -1,8 +1,7 @@ import numpy as np import pandas as pd from sqlalchemy import create_engine -from pycytominer import aggregate -from pycytominer import normalize +from pycytominer import aggregate, normalize from pycytominer.cyto_utils import ( output, check_compartments, From 689ebc19fd13e235f7282c631c20865b98c900bc Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 15 Dec 2020 11:08:45 -0500 Subject: [PATCH 12/15] closes #110 --- pycytominer/cyto_utils/util.py | 22 +++++++++++---- .../tests/test_cyto_utils/test_util.py | 28 
+++++++++++++------ 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/pycytominer/cyto_utils/util.py b/pycytominer/cyto_utils/util.py index 121303ca..a55c6dbe 100644 --- a/pycytominer/cyto_utils/util.py +++ b/pycytominer/cyto_utils/util.py @@ -1,8 +1,9 @@ """ -Miscellaneous utility function +Miscellaneous utility functions """ import os +import warnings import numpy as np import pandas as pd from pycytominer.cyto_utils.features import infer_cp_features @@ -17,14 +18,23 @@ def get_default_compartments(): def check_compartments(compartments): - valid_compartments = ["cells", "cytoplasm", "nuclei"] - error_str = "compartment not supported, use one of {}".format(valid_compartments) + default_compartments = get_default_compartments() + if isinstance(compartments, list): compartments = [x.lower() for x in compartments] - assert all([x in valid_compartments for x in compartments]), error_str elif isinstance(compartments, str): - compartments = compartments.lower() - assert compartments in valid_compartments, error_str + compartments = [compartments.lower()] + + non_canonical_compartments = [] + for compartment in compartments: + if compartment not in default_compartments: + non_canonical_compartments.append(compartment) + + if len(non_canonical_compartments) > 0: + warn_str = "Non-canonical compartment detected: {x}".format( + x=", ".join(non_canonical_compartments) + ) + warnings.warn(warn_str) def load_known_metadata_dictionary(metadata_file=default_metadata_file): diff --git a/pycytominer/tests/test_cyto_utils/test_util.py b/pycytominer/tests/test_cyto_utils/test_util.py index 5aca0385..3f47ca20 100644 --- a/pycytominer/tests/test_cyto_utils/test_util.py +++ b/pycytominer/tests/test_cyto_utils/test_util.py @@ -2,6 +2,7 @@ import random import pytest import tempfile +import warnings import pandas as pd from pycytominer.cyto_utils.util import ( check_compartments, @@ -37,20 +38,31 @@ def test_check_compartments(): def test_check_compartments_not_valid(): - 
with pytest.raises(AssertionError) as ae: + warn_expected_string = "Non-canonical compartment detected: something" + warnings.simplefilter("always") + with warnings.catch_warnings(record=True) as w: not_valid = ["SOMETHING"] output = check_compartments(not_valid) - assert "compartment not supported" in str(ae.value) + assert issubclass(w[-1].category, UserWarning) + assert warn_expected_string in str(w[-1].message) - with pytest.raises(AssertionError) as ae: - not_valid = "SOMETHING" + with warnings.catch_warnings(record=True) as w: + not_valid = "SOMETHING" # Also works with strings output = check_compartments(not_valid) - assert "compartment not supported" in str(ae.value) + assert issubclass(w[-1].category, UserWarning) + assert warn_expected_string in str(w[-1].message) - with pytest.raises(AssertionError) as ae: - not_valid = ["Cells", "Cytoplasm", "SOMETHING"] + with warnings.catch_warnings(record=True) as w: + not_valid = ["CelLs", "CytopLasM", "SOMETHING"] output = check_compartments(not_valid) - assert "compartment not supported" in str(ae.value) + assert issubclass(w[-1].category, UserWarning) + assert warn_expected_string in str(w[-1].message) + + with warnings.catch_warnings(record=True) as w: + not_valid = ["CelLs", "CytopLasM", "SOMETHING", "NOTHING"] + output = check_compartments(not_valid) + assert issubclass(w[-1].category, UserWarning) + assert "{x}, nothing".format(x=warn_expected_string) in str(w[-1].message) def test_get_default_compartments(): From c3e0ce3aa10063480351d6ea1adf590a418ccbe0 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 15 Dec 2020 15:57:47 -0500 Subject: [PATCH 13/15] fix column name issue when merging - do not consider linking columns as morphology features --- pycytominer/cyto_utils/__init__.py | 1 + pycytominer/cyto_utils/cells.py | 68 +++++-- .../cyto_utils/single_cell_ingest_utils.py | 24 +++ .../tests/test_cyto_utils/test_cells.py | 183 +++++++++++++++--- .../test_single_cell_ingest_utils.py | 26 +++ 5 files 
changed, 269 insertions(+), 33 deletions(-) diff --git a/pycytominer/cyto_utils/__init__.py b/pycytominer/cyto_utils/__init__.py index b26c4a43..f5960a04 100644 --- a/pycytominer/cyto_utils/__init__.py +++ b/pycytominer/cyto_utils/__init__.py @@ -11,6 +11,7 @@ from .single_cell_ingest_utils import ( get_default_linking_cols, assert_linking_cols_complete, + provide_linking_cols_feature_name_update, ) from .load import ( load_profiles, diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 95146277..ebe74bc1 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -10,6 +10,7 @@ get_default_linking_cols, get_default_compartments, assert_linking_cols_complete, + provide_linking_cols_feature_name_update, ) default_compartments = get_default_compartments() @@ -105,6 +106,11 @@ def __init__( compartments=self.compartments, linking_cols=self.compartment_linking_cols ) + # Build a dictionary to update linking column feature names + self.linking_col_rename = provide_linking_cols_feature_name_update( + self.compartment_linking_cols + ) + if self.subsample_n != "all": self.set_subsample_n(self.subsample_n) @@ -184,10 +190,10 @@ def count_cells(self, compartment="cells", count_subset=False): assert self.is_aggregated, "Make sure to aggregate_profiles() first!" assert self.is_subset_computed, "Make sure to get_subsample() first!" 
count_df = ( - self.subset_data_df.groupby(self.strata)["ObjectNumber"] + self.subset_data_df.groupby(self.strata)["Metadata_ObjectNumber"] .count() .reset_index() - .rename({"ObjectNumber": "cell_count"}, axis="columns") + .rename({"Metadata_ObjectNumber": "cell_count"}, axis="columns") ) else: query_cols = "TableNumber, ImageNumber, ObjectNumber" @@ -217,17 +223,21 @@ def subsample_profiles(self, df): self.set_subsample_random_state(random_state) if self.subsample_frac == 1: - return pd.DataFrame.sample( + + output_df = pd.DataFrame.sample( df, n=self.subsample_n, replace=True, random_state=self.subsampling_random_state, ) else: - return pd.DataFrame.sample( + output_df = pd.DataFrame.sample( df, frac=self.subsample_frac, random_state=self.subsampling_random_state ) + output_df = output_df.rename(self.linking_col_rename, axis="columns") + return output_df + def get_subsample(self, compartment="cells"): """Apply the subsampling procedure @@ -276,7 +286,7 @@ def aggregate_compartment(self, compartment, compute_subsample=False): self.load_compartment(compartment=compartment), how="inner", on=self.merge_cols, - ) + ).rename(self.linking_col_rename, axis="columns") object_df = aggregate( population_df=population_df, @@ -314,8 +324,9 @@ def merge_single_cells( # Load the single cell dataframe by merging on the specific linking columns sc_df = "" linking_check_cols = [] - for left_compartment in default_linking_cols: - for right_compartment in default_linking_cols[left_compartment]: + merge_suffix_rename = [] + for left_compartment in self.compartment_linking_cols: + for right_compartment in self.compartment_linking_cols[left_compartment]: # Make sure only one merge per combination occurs linking_check = "-".join(sorted([left_compartment, right_compartment])) if linking_check in linking_check_cols: @@ -326,11 +337,11 @@ def merge_single_cells( "_{comp_l}".format(comp_l=left_compartment), "_{comp_r}".format(comp_r=right_compartment), ] - - left_link_col = 
default_linking_cols[left_compartment][ + merge_suffix_rename += merge_suffix + left_link_col = self.compartment_linking_cols[left_compartment][ right_compartment ] - right_link_col = default_linking_cols[right_compartment][ + right_link_col = self.compartment_linking_cols[right_compartment][ left_compartment ] @@ -351,12 +362,47 @@ def merge_single_cells( linking_check_cols.append(linking_check) + # Add metadata prefix to merged suffixes + full_merge_suffix_rename = [] + full_merge_suffix_original = [] + for col_name in self.merge_cols + list(self.linking_col_rename.keys()): + full_merge_suffix_original.append(col_name) + full_merge_suffix_rename.append("Metadata_{x}".format(x=col_name)) + + for col_name in self.merge_cols + list(self.linking_col_rename.keys()): + for suffix in set(merge_suffix_rename): + full_merge_suffix_original.append("{x}{y}".format(x=col_name, y=suffix)) + full_merge_suffix_rename.append( + "Metadata_{x}{y}".format(x=col_name, y=suffix) + ) + + self.full_merge_suffix_rename = dict( + zip(full_merge_suffix_original, full_merge_suffix_rename) + ) + # Add image data to single cell dataframe if not self.load_image_data: self.load_image() - sc_df = self.image_df.merge(sc_df, on=self.merge_cols, how="right") + sc_df = ( + self.image_df.merge(sc_df, on=self.merge_cols, how="right") + .rename(self.linking_col_rename, axis="columns") + .rename(self.full_merge_suffix_rename, axis="columns") + ) if single_cell_normalize: + # Inferring features is tricky with non-canonical data + if normalize_args is None: + normalize_args = {} + features = infer_cp_features(sc_df, compartments=self.compartments) + elif "features" not in normalize_args: + features = infer_cp_features(sc_df, compartments=self.compartments) + elif normalize_args["features"] == "infer": + features = infer_cp_features(sc_df, compartments=self.compartments) + else: + features = normalize_args["features"] + + normalize_args["features"] = features + sc_df = normalize(profiles=sc_df,
**normalize_args) if sc_output_file != "none": diff --git a/pycytominer/cyto_utils/single_cell_ingest_utils.py b/pycytominer/cyto_utils/single_cell_ingest_utils.py index 374fcb2d..555dc7b2 100644 --- a/pycytominer/cyto_utils/single_cell_ingest_utils.py +++ b/pycytominer/cyto_utils/single_cell_ingest_utils.py @@ -65,3 +65,27 @@ def assert_linking_cols_complete(linking_cols="default", compartments="default") ), "All compartments must be specified in the linking_cols, {miss} is missing".format( miss=diff_column ) + + +def provide_linking_cols_feature_name_update(linking_cols="default"): + """Output a dictionary to use to update pandas dataframe column names. The linking + cols must be Metadata. + + :return: Dictionary of the linking column names to update after they are used + :rtype: dict + """ + if linking_cols == "default": + linking_cols = get_default_linking_cols() + + metadata_update_cols = [] + for col in linking_cols: + for right_col in linking_cols[col]: + metadata_update_cols.append(linking_cols[col][right_col]) + + update_name = dict( + zip( + metadata_update_cols, + ["Metadata_{x}".format(x=y) for y in metadata_update_cols], + ) + ) + return update_name diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 0a068bad..b24a1fa0 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -4,8 +4,14 @@ import tempfile import pandas as pd from sqlalchemy import create_engine -from pycytominer import aggregate + +from pycytominer import aggregate, normalize from pycytominer.cyto_utils.cells import SingleCells +from pycytominer.cyto_utils import ( + get_default_linking_cols, + get_default_compartments, + infer_cp_features, +) random.seed(123) @@ -47,7 +53,10 @@ def build_random_data( # Setup data cells_df = build_random_data(compartment="cells") -cytoplasm_df = build_random_data(compartment="cytoplasm") +cytoplasm_df = 
build_random_data(compartment="cytoplasm").assign( + Cytoplasm_Parent_Cells=(list(range(1, 51)) * 2)[::-1], + Cytoplasm_Parent_Nuclei=(list(range(1, 51)) * 2)[::-1], +) nuclei_df = build_random_data(compartment="nuclei") image_df = pd.DataFrame( { @@ -64,11 +73,41 @@ def build_random_data( cytoplasm_df.to_sql("cytoplasm", con=test_engine, index=False, if_exists="replace") nuclei_df.to_sql("nuclei", con=test_engine, index=False, if_exists="replace") +# Create a new table with a fourth compartment +new_file = "sqlite:///{}/test_new.sqlite".format(tmpdir) +new_compartment_df = build_random_data(compartment="new") + +test_new_engine = create_engine(new_file) +test_new_conn = test_new_engine.connect() + +image_df.to_sql("image", con=test_new_engine, index=False, if_exists="replace") +cells_df.to_sql("cells", con=test_new_engine, index=False, if_exists="replace") +new_cytoplasm_df = cytoplasm_df.assign( + Cytoplasm_Parent_New=(list(range(1, 51)) * 2)[::-1] +) +new_cytoplasm_df.to_sql( + "cytoplasm", con=test_new_engine, index=False, if_exists="replace" +) +nuclei_df.to_sql("nuclei", con=test_new_engine, index=False, if_exists="replace") +new_compartment_df.to_sql("new", con=test_new_engine, index=False, if_exists="replace") + +new_compartments = ["cells", "cytoplasm", "nuclei", "new"] + +new_linking_cols = get_default_linking_cols() +new_linking_cols["cytoplasm"]["new"] = "Cytoplasm_Parent_New" +new_linking_cols["new"] = {"cytoplasm": "ObjectNumber"} + # Setup SingleCells Class ap = SingleCells(file_or_conn=file) ap_subsample = SingleCells( file_or_conn=file, subsample_n=2, subsampling_random_state=123 ) +ap_new = SingleCells( + file_or_conn=new_file, + load_image_data=False, + compartments=new_compartments, + compartment_linking_cols=new_linking_cols, +) def test_SingleCells_init(): @@ -90,6 +129,8 @@ def test_SingleCells_init(): assert not ap.is_aggregated assert ap.subsampling_random_state == "none" assert ap_subsample.subsampling_random_state == 123 + assert 
ap.compartment_linking_cols == get_default_linking_cols() + assert ap.compartments == get_default_compartments() def test_SingleCells_reset_variables(): @@ -137,6 +178,123 @@ def test_load_compartment(): loaded_compartment_df = ap.load_compartment(compartment="cells") pd.testing.assert_frame_equal(loaded_compartment_df, cells_df) + # Test non-canonical compartment loading + pd.testing.assert_frame_equal(new_compartment_df, ap_new.load_compartment("new")) + + +def test_merge_single_cells(): + sc_merged_df = ap.merge_single_cells() + + # Assert that the image data was merged + assert all(x in sc_merged_df.columns for x in ["Metadata_Plate", "Metadata_Well"]) + + # Assert that metadata columns were renamed appropriately + for x in ap.full_merge_suffix_rename: + assert ap.full_merge_suffix_rename[x] == "Metadata_{x}".format(x=x) + + # Perform a manual merge + manual_merge = cytoplasm_df.merge( + cells_df, + left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"], + right_on=["TableNumber", "ImageNumber", "ObjectNumber"], + suffixes=["_cytoplasm", "_cells"], + ).merge( + nuclei_df, + left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"], + right_on=["TableNumber", "ImageNumber", "ObjectNumber"], + suffixes=["_cytoplasm", "_nuclei"], + ) + + manual_merge = image_df.merge(manual_merge, on=ap.merge_cols, how="right").rename( + ap.full_merge_suffix_rename, axis="columns" + ) + + # Confirm that the merge correctly reversed the object number (opposite from Parent) + assert ( + sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] + == sc_merged_df.Metadata_ObjectNumber.tolist() + ) + assert ( + manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] + == sc_merged_df.Metadata_ObjectNumber.tolist() + ) + assert ( + manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] + == sc_merged_df.Metadata_ObjectNumber.tolist() + ) + assert ( + manual_merge.Metadata_ObjectNumber_cells.tolist() + == sc_merged_df.Metadata_ObjectNumber.tolist() + ) + + # 
Confirm the merge and adding merge options + for method in ["standardize", "robustize"]: + for samples in ["all", "Metadata_ImageNumber == 'x'"]: + for features in ["infer", ["Cytoplasm_a", "Cells_a"]]: + + norm_method_df = ap.merge_single_cells( + single_cell_normalize=True, + normalize_args={ + "method": method, + "samples": samples, + "features": features, + }, + ) + + manual_merge_normalize = normalize( + manual_merge, method=method, samples=samples, features=features + ) + + pd.testing.assert_frame_equal(norm_method_df, manual_merge_normalize) + + # Test non-canonical compartment merging + new_sc_merge_df = ap_new.merge_single_cells() + + assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4 + assert ( + new_compartment_df.ObjectNumber.tolist()[::-1] + == new_sc_merge_df.Metadata_ObjectNumber_new.tolist() + ) + + norm_new_method_df = ap_new.merge_single_cells( + single_cell_normalize=True, + normalize_args={ + "method": "standardize", + "samples": "all", + "features": "infer", + }, + ) + + norm_new_method_no_feature_infer_df = ap_new.merge_single_cells( + single_cell_normalize=True, + normalize_args={ + "method": "standardize", + "samples": "all", + }, + ) + + default_feature_infer_df = ap_new.merge_single_cells(single_cell_normalize=True) + + pd.testing.assert_frame_equal(norm_new_method_df, default_feature_infer_df) + pd.testing.assert_frame_equal( + norm_new_method_df, norm_new_method_no_feature_infer_df + ) + + new_compartment_cols = infer_cp_features( + new_compartment_df, compartments=ap_new.compartments + ) + traditional_norm_df = normalize( + ap_new.image_df.merge(new_compartment_df, on=ap.merge_cols), + features=new_compartment_cols, + samples="all", + method="standardize", + ) + + pd.testing.assert_frame_equal( + norm_new_method_df.loc[:, new_compartment_cols].abs().describe(), + traditional_norm_df.loc[:, new_compartment_cols].abs().describe(), + ) + def test_aggregate_comparment(): df = image_df.merge(cells_df, how="inner", 
on=["TableNumber", "ImageNumber"]) @@ -217,26 +375,7 @@ def test_aggregate_subsampling_profile(): "ImageNumber": sorted(["x", "y"] * 2), "Metadata_Plate": ["plate"] * 4, "Metadata_Well": sorted(["A01", "A02"] * 2), - "ObjectNumber": [46, 3] * 2, - } - ) - - expected_result = pd.DataFrame( - { - "Metadata_Plate": ["plate", "plate"], - "Metadata_Well": ["A01", "A02"], - "Cells_a": [110.0, 680.5], - "Cells_b": [340.5, 201.5], - "Cells_c": [285.0, 481.0], - "Cells_d": [352.0, 549.0], - "Cytoplasm_a": [407.5, 705.5], - "Cytoplasm_b": [650.0, 439.5], - "Cytoplasm_c": [243.5, 78.5], - "Cytoplasm_d": [762.5, 625.0], - "Nuclei_a": [683.5, 171.0], - "Nuclei_b": [50.5, 625.0], - "Nuclei_c": [431.0, 483.0], - "Nuclei_d": [519.0, 286.5], + "Metadata_ObjectNumber": [46, 3] * 2, } ) diff --git a/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py b/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py index f96f9d47..196faf02 100644 --- a/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py +++ b/pycytominer/tests/test_cyto_utils/test_single_cell_ingest_utils.py @@ -3,6 +3,7 @@ get_default_linking_cols, get_default_compartments, assert_linking_cols_complete, + provide_linking_cols_feature_name_update, ) default_compartments = get_default_compartments() @@ -54,3 +55,28 @@ def test_assert_linking_cols_complete(): "All compartments must be specified in the linking_cols, {'sandwich'} is missing" in str(err.value) ) + + +def test_provide_linking_cols_feature_name_update(): + expected_result = { + "Cytoplasm_Parent_Cells": "Metadata_Cytoplasm_Parent_Cells", + "Cytoplasm_Parent_Nuclei": "Metadata_Cytoplasm_Parent_Nuclei", + "ObjectNumber": "Metadata_ObjectNumber", + } + + result = provide_linking_cols_feature_name_update() + assert result == expected_result + + new_linking_cols = get_default_linking_cols() + new_linking_cols["cytoplasm"]["new"] = "Cytoplasm_Parent_New" + new_linking_cols["new"] = {"cytoplasm": "ObjectNumber"} + result = 
provide_linking_cols_feature_name_update(new_linking_cols) + + expected_result = { + "Cytoplasm_Parent_Cells": "Metadata_Cytoplasm_Parent_Cells", + "Cytoplasm_Parent_Nuclei": "Metadata_Cytoplasm_Parent_Nuclei", + "Cytoplasm_Parent_New": "Metadata_Cytoplasm_Parent_New", + "ObjectNumber": "Metadata_ObjectNumber", + } + + assert result == expected_result From adcb1aa3d5e4601b751129aea057b5675ce146fc Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 15 Dec 2020 16:55:25 -0500 Subject: [PATCH 14/15] enable infer_cp_feature with string and abstract out function --- pycytominer/cyto_utils/__init__.py | 1 + pycytominer/cyto_utils/features.py | 12 ++++++++++++ pycytominer/cyto_utils/util.py | 10 +++++----- .../tests/test_cyto_utils/test_features_util.py | 13 +++++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 pycytominer/tests/test_cyto_utils/test_features_util.py diff --git a/pycytominer/cyto_utils/__init__.py b/pycytominer/cyto_utils/__init__.py index f5960a04..690dd09f 100644 --- a/pycytominer/cyto_utils/__init__.py +++ b/pycytominer/cyto_utils/__init__.py @@ -23,6 +23,7 @@ count_na_features, infer_cp_features, drop_outlier_features, + convert_compartment_format_to_list, ) from .write_gct import write_gct from .modz import modz diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py index a6b77449..4e871edc 100644 --- a/pycytominer/cyto_utils/features.py +++ b/pycytominer/cyto_utils/features.py @@ -72,6 +72,9 @@ def infer_cp_features( """ Given a dataframe, output features that we expect to be cell painting features """ + compartments = convert_compartment_format_to_list(compartments) + compartments = [x.title() for x in compartments] + features = [] for col in population_df.columns.tolist(): if any([col.startswith(x.title()) for x in compartments]): @@ -140,3 +143,12 @@ def drop_outlier_features( ].index.tolist() return outlier_features + + +def convert_compartment_format_to_list(compartments): + if 
isinstance(compartments, list): + compartments = [x.lower() for x in compartments] + elif isinstance(compartments, str): + compartments = [compartments.lower()] + + return compartments diff --git a/pycytominer/cyto_utils/util.py b/pycytominer/cyto_utils/util.py index a55c6dbe..b8057687 100644 --- a/pycytominer/cyto_utils/util.py +++ b/pycytominer/cyto_utils/util.py @@ -6,7 +6,10 @@ import warnings import numpy as np import pandas as pd -from pycytominer.cyto_utils.features import infer_cp_features +from pycytominer.cyto_utils.features import ( + infer_cp_features, + convert_compartment_format_to_list, +) default_metadata_file = os.path.join( os.path.dirname(__file__), "..", "data", "metadata_feature_dictionary.txt" @@ -20,10 +23,7 @@ def get_default_compartments(): def check_compartments(compartments): default_compartments = get_default_compartments() - if isinstance(compartments, list): - compartments = [x.lower() for x in compartments] - elif isinstance(compartments, str): - compartments = [compartments.lower()] + compartments = convert_compartment_format_to_list(compartments) non_canonical_compartments = [] for compartment in compartments: diff --git a/pycytominer/tests/test_cyto_utils/test_features_util.py b/pycytominer/tests/test_cyto_utils/test_features_util.py new file mode 100644 index 00000000..5191fcf7 --- /dev/null +++ b/pycytominer/tests/test_cyto_utils/test_features_util.py @@ -0,0 +1,13 @@ +import os +import random +import pytest +import pandas as pd +from pycytominer.cyto_utils.features import convert_compartment_format_to_list + + +def test_convert_compartment_format_to_list(): + compartments = convert_compartment_format_to_list(["cells", "CYTOplasm", "nuclei"]) + assert compartments == ["cells", "cytoplasm", "nuclei"] + + compartments = convert_compartment_format_to_list("FoO") + assert compartments == ["foo"] From f9adf9407f587519f6479bf875fe722be65799c6 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 15 Dec 2020 16:56:30 -0500 Subject: 
[PATCH 15/15] remove features init argument enable aggregation_args and no longer hardcoded aggregation --- pycytominer/cyto_utils/cells.py | 63 ++++++++++++------- .../tests/test_cyto_utils/test_cells.py | 8 ++- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index ebe74bc1..ba5c0a9e 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -25,8 +25,6 @@ class SingleCells(object): :type file_or_conn: str :param strata: The columns to groupby and aggregate single cells, defaults to ["Metadata_Plate", "Metadata_Well"] :type strata: list - :param features: The features that should be aggregated, defaults to "infer" - :type features: str, list :param aggregation_operation: operation to perform single cell aggregation, defaults to "median" :type aggregation_operation: str :param output_file: If specified, the location to write the file, defaults to "none" @@ -62,7 +60,6 @@ def __init__( self, file_or_conn, strata=["Metadata_Plate", "Metadata_Well"], - features="infer", aggregation_operation="median", output_file="none", compartments=default_compartments, @@ -87,7 +84,6 @@ def __init__( self.file_or_conn = file_or_conn self.strata = strata - self.features = features self.load_image_data = load_image_data self.aggregation_operation = aggregation_operation.lower() self.output_file = output_file @@ -267,13 +263,17 @@ def load_compartment(self, compartment): df = pd.read_sql(sql=compartment_query, con=self.conn) return df - def aggregate_compartment(self, compartment, compute_subsample=False): + def aggregate_compartment( + self, compartment, compute_subsample=False, aggregate_args=None + ): """Aggregate morphological profiles. 
Uses pycytominer.aggregate() :param compartment: string indicating the specific compartment, defaults to "cells" :type compartment: str :param compute_subsample: determine if subsample should be computed, defaults to False :type compute_subsample: bool + :param aggregate_args: additional arguments passed as a dictionary as input to pycytominer.aggregate() + :type aggregate_args: None, dict :return: Aggregated single-cell profiles :rtype: pd.DataFrame """ @@ -282,18 +282,36 @@ def aggregate_compartment(self, compartment, compute_subsample=False): if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample: self.get_subsample(compartment=compartment) + # Load image data if not already loaded + if not self.load_image_data: + self.load_image() + self.load_image_data = True + population_df = self.image_df.merge( self.load_compartment(compartment=compartment), how="inner", on=self.merge_cols, ).rename(self.linking_col_rename, axis="columns") + # Infering features is tricky with non-canonical data + if aggregate_args is None: + aggregate_args = {} + features = infer_cp_features(population_df, compartments=compartment) + elif "features" not in aggregate_args: + features = infer_cp_features(population_df, compartments=compartment) + elif aggregate_args["features"] == "infer": + features = infer_cp_features(population_df, compartments=compartment) + else: + features = aggregate_args["features"] + + aggregate_args["features"] = features + object_df = aggregate( population_df=population_df, strata=self.strata, - features=self.features, operation=self.aggregation_operation, subset_data_df=self.subset_data_df, + **aggregate_args ) return object_df @@ -317,6 +335,7 @@ def merge_single_cells( :param single_cell_normalize: determine if the single cell data should also be normalized :type single_cell_normalize: bool :param normalize_args: additional arguments passed as a dictionary as input to pycytominer.normalize() + :type normalize_args: None, dict :return: 
Either a dataframe (if output_file="none") or will write to file :rtype: pd.DataFrame, optional """ @@ -383,6 +402,7 @@ def merge_single_cells( # Add image data to single cell dataframe if not self.load_image_data: self.load_image() + self.load_image_data = True sc_df = ( self.image_df.merge(sc_df, on=self.merge_cols, how="right") @@ -421,6 +441,7 @@ def aggregate_profiles( output_file="none", compression=None, float_format=None, + aggregate_args=None, ): """Aggregate and merge compartments. This is the primary entry to this class. @@ -432,6 +453,8 @@ def aggregate_profiles( :type compression: str, optional :param float_format: decimal precision to use in writing output file, defaults to None :type float_format: str, optional + :param aggregate_args: additional arguments passed as a dictionary as input to pycytominer.aggregate() + :type aggregate_args: None, dict :return: Either a dataframe (if output_file="none") or will write to file :rtype: pd.DataFrame, optional @@ -449,21 +472,19 @@ def aggregate_profiles( if output_file != "none": self.set_output_file(output_file) - aggregated = ( - self.aggregate_compartment( - compartment="cells", compute_subsample=compute_subsample - ) - .merge( - self.aggregate_compartment(compartment="cytoplasm"), - on=self.strata, - how="inner", - ) - .merge( - self.aggregate_compartment(compartment="nuclei"), - on=self.strata, - how="inner", - ) - ) + compartment_idx = 0 + for compartment in self.compartments: + if compartment_idx == 0: + aggregated = self.aggregate_compartment( + compartment=compartment, compute_subsample=compute_subsample + ) + else: + aggregated = aggregated.merge( + self.aggregate_compartment(compartment=compartment), + on=self.strata, + how="inner", + ) + compartment_idx += 1 self.is_aggregated = True diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index b24a1fa0..80ac14d2 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ 
b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -117,7 +117,6 @@ def test_SingleCells_init(): assert ap.file_or_conn == file assert ap.strata == ["Metadata_Plate", "Metadata_Well"] assert ap.merge_cols == ["TableNumber", "ImageNumber"] - assert ap.features == "infer" pd.testing.assert_frame_equal(image_df, ap.image_df) assert ap.subsample_frac == 1 assert ap_subsample.subsample_frac == 1 @@ -341,6 +340,13 @@ def test_aggregate_profiles(): pd.testing.assert_frame_equal(result, expected_result) + # Confirm aggregation after merging single cells + sc_aggregated_df = aggregate( + ap.merge_single_cells() + ).sort_index(axis="columns") + + pd.testing.assert_frame_equal(result.sort_index(axis="columns"), sc_aggregated_df) + def test_aggregate_subsampling_count_cells(): count_df = ap_subsample.count_cells()