Skip to content

Commit

Permalink
Merge pull request #121 from gwaygenomics/update-subsample
Browse files Browse the repository at this point in the history
Subsampling at the point of single cell merging
  • Loading branch information
gwaybio authored Jan 7, 2021
2 parents 91afadc + b5ba767 commit 90fda6b
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 9 deletions.
36 changes: 27 additions & 9 deletions pycytominer/cyto_utils/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def count_cells(self, compartment="cells", count_subset=False):

return count_df

def subsample_profiles(self, df):
def subsample_profiles(self, df, rename_col=True):
"""Sample a Pandas DataFrame given subsampling information
:param df: A single cell profile dataframe
Expand All @@ -231,10 +231,12 @@ def subsample_profiles(self, df):
df, frac=self.subsample_frac, random_state=self.subsampling_random_state
)

output_df = output_df.rename(self.linking_col_rename, axis="columns")
if rename_col:
output_df = output_df.rename(self.linking_col_rename, axis="columns")

return output_df

def get_subsample(self, compartment="cells"):
def get_subsample(self, df=None, compartment="cells", rename_col=True):
"""Apply the subsampling procedure
:param compartment: string indicating the compartment to process, defaults to "cells"
Expand All @@ -246,13 +248,14 @@ def get_subsample(self, compartment="cells"):
query = "select {} from {}".format(query_cols, compartment)

# Load query and merge with image_df
query_df = self.image_df.merge(
pd.read_sql(sql=query, con=self.conn), how="inner", on=self.merge_cols
)
if df is None:
df = pd.read_sql(sql=query, con=self.conn)

query_df = self.image_df.merge(df, how="inner", on=self.merge_cols)

self.subset_data_df = (
query_df.groupby(self.strata)
.apply(lambda x: self.subsample_profiles(x))
.apply(lambda x: self.subsample_profiles(x, rename_col=rename_col))
.reset_index(drop=True)
)

Expand Down Expand Up @@ -318,6 +321,7 @@ def aggregate_compartment(

def merge_single_cells(
self,
compute_subsample=False,
sc_output_file="none",
compression_options=None,
float_format=None,
Expand Down Expand Up @@ -365,7 +369,21 @@ def merge_single_cells(
]

if isinstance(sc_df, str):
sc_df = self.load_compartment(compartment=left_compartment).merge(
initial_df = self.load_compartment(compartment=left_compartment)

if compute_subsample:
# Sample cells proportionally by self.strata
self.get_subsample(df=initial_df, rename_col=False)

subset_logic_df = self.subset_data_df.drop(
self.image_df.columns, axis="columns"
)

initial_df = subset_logic_df.merge(
initial_df, how="left", on=subset_logic_df.columns.tolist()
).reindex(initial_df.columns, axis="columns")

sc_df = initial_df.merge(
self.load_compartment(compartment=right_compartment),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
Expand Down Expand Up @@ -447,7 +465,7 @@ def aggregate_profiles(
:param compute_subsample: Determine if subsample should be computed, defaults to False
:type compute_subsample: bool
:param output_file: the name of a file to output, defaults to "none":
:param output_file: the name of a file to output, defaults to "none"
:type output_file: str, optional
:param compression: the mechanism to compress, defaults to None
:type compression: str, optional
Expand Down
46 changes: 46 additions & 0 deletions pycytominer/tests/test_cyto_utils/test_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,52 @@ def test_merge_single_cells():
)


def test_merge_single_cells_subsample():
    """Check merge_single_cells with fractional and fixed-count subsampling."""
    # Keyword arguments shared by every merge_single_cells call in this test
    merge_kwargs = dict(
        sc_output_file="none",
        compute_subsample=True,
        compression_options=None,
        float_format=None,
        single_cell_normalize=True,
        normalize_args=None,
    )
    expected_image_cols = ["Metadata_Plate", "Metadata_Well"]

    for frac in [0.1, 0.5, 0.9]:
        sampler = SingleCells(file_or_conn=file, subsample_frac=frac)
        merged = sampler.merge_single_cells(**merge_kwargs)

        # Image-level metadata columns must be present after the merge
        merged_cols = merged.columns
        assert all(col in merged_cols for col in expected_image_cols)

        # Each entry of the rename mapping must point at its
        # "Metadata_"-prefixed counterpart
        rename_map = sampler.full_merge_suffix_rename
        for suffix in rename_map:
            assert rename_map[suffix] == "Metadata_{x}".format(x=suffix)

        # The merged row count must equal the requested fraction of all cells
        n_rows = merged.shape[0]
        assert n_rows == cells_df.shape[0] * frac

    for n_per_stratum in [2, 5, 10]:
        sampler = SingleCells(file_or_conn=file, subsample_n=n_per_stratum)
        merged = sampler.merge_single_cells(**merge_kwargs)

        # The mean row count per strata combination, truncated to an int,
        # must equal the requested sample size
        strata_counts = merged.loc[:, sampler.strata].value_counts()
        mean_count = strata_counts.values.mean()
        assert n_per_stratum == int(mean_count)


def test_aggregate_comparment():
df = image_df.merge(cells_df, how="inner", on=["TableNumber", "ImageNumber"])
result = aggregate(df)
Expand Down

0 comments on commit 90fda6b

Please sign in to comment.