Add support for MIBI runs acquired using autogain feature (#455)
* Initial auto_detector testing

* Add support for auto detector normalization (same functionality as when there are < 10 obs for outlier detection)

* Add autogain flag to 4b notebook

* Document autogain for create_fitted_pulse_heights_file

* Correct min_obs threshold if autogain is True

* Move autogain setting to the top of the notebook so it is compatible with the detector plotting PR

* Update tests to pass normalize kwargs more correctly and add specific autogain vs non-autogain testing

* Fix dirname to join for autogain mph_run_dir

* Fix dirname call
alex-l-kong authored Jan 10, 2024
1 parent 8b09afb commit b1bf08a
Showing 4 changed files with 68 additions and 20 deletions.
41 changes: 34 additions & 7 deletions src/toffy/normalize.py
@@ -560,7 +560,7 @@ def smooth_outliers(vals, outlier_idx, smooth_range=2):
return smoothed_vals


def fit_mass_mph_curve(mph_vals, mass, save_dir, obj_func, min_obs=10):
def fit_mass_mph_curve(mph_vals, mass, save_dir, obj_func, min_obs=10, outlier_fraction=0.1):
"""Fits a curve for the MPH over time for the specified mass.
Args:
@@ -569,13 +569,17 @@ def fit_mass_mph_curve(mph_vals, mass, save_dir, obj_func, min_obs=10):
save_dir (str): the directory to save the fit parameters
obj_func (str): the function to use for constructing the fit
min_obs (int): the minimum number of observations to fit a curve, otherwise uses median
outlier_fraction (float): the fractional deviation from predicted value required in
order to classify a data point as an outlier
"""
fov_order = np.linspace(0, len(mph_vals) - 1, len(mph_vals))
save_path = os.path.join(save_dir, str(mass) + "_mph_fit.jpg")

if len(mph_vals) > min_obs:
# find outliers in the MPH vals
outlier_idx = identify_outliers(x_vals=fov_order, y_vals=mph_vals, obj_func=obj_func)
outlier_idx = identify_outliers(
x_vals=fov_order, y_vals=mph_vals, obj_func=obj_func, outlier_fraction=outlier_fraction
)

# replace with local smoothing around that point
smoothed_vals = smooth_outliers(vals=mph_vals, outlier_idx=outlier_idx)
@@ -597,7 +601,6 @@ def fit_mass_mph_curve(mph_vals, mass, save_dir, obj_func, min_obs=10):
plot_fit=True,
save_path=save_path,
)

else:
# default to using the median instead for short runs with small number of FOVs
mph_median = np.median(mph_vals)
@@ -629,7 +632,8 @@ def create_fitted_mass_mph_vals(pulse_height_df, obj_func_dir):
Args:
pulse_height_df (pd.DataFrame): contains the MPH value per mass for all FOVs
obj_func_dir (str): directory containing the curves generated for each mass
obj_func_dir (str): directory containing the curves generated for each mass.
Set to None if autogain flag is set for calling functions.
Returns:
pd.DataFrame: updated dataframe with fitted version of each MPH value for each mass
@@ -645,9 +649,10 @@ def create_fitted_mass_mph_vals(pulse_height_df, obj_func_dir):
fov_order = np.linspace(0, num_fovs - 1, num_fovs)

for mass in masses:
mass_idx = pulse_height_df["mass"] == mass

# if channel-specific prediction function does not exist, set to 0
mass_path = os.path.join(obj_func_dir, str(mass) + "_norm_func.json")
mass_idx = pulse_height_df["mass"] == mass

if not os.path.exists(mass_path):
pulse_height_df.loc[mass_idx, "pulse_height_fit"] = 0.0
@@ -667,14 +672,18 @@ def create_fitted_mass_mph_vals(pulse_height_df, obj_func_dir):
return pulse_height_df


def create_fitted_pulse_heights_file(pulse_height_dir, panel_info, norm_dir, mass_obj_func):
def create_fitted_pulse_heights_file(
pulse_height_dir, panel_info, norm_dir, mass_obj_func, autogain=False, **kwargs
):
"""Create a single file containing the pulse heights after fitting a curve per mass.
Args:
pulse_height_dir (str): path to directory containing pulse height csvs
panel_info (pd.DataFrame): the panel for this dataset
norm_dir (str): the directory where normalized images will be saved
mass_obj_func (str): the objective function used to fit the MPH over time per mass
autogain (bool): whether the run had autogain turned on or not
**kwargs (dict): additional parameters to pass to `fit_mass_mph_curve` for outlier detection
Returns:
pd.DataFrame: the combined pulse heights file
Expand Down Expand Up @@ -704,7 +713,19 @@ def create_fitted_pulse_heights_file(pulse_height_dir, panel_info, norm_dir, mas
warnings.warn("Skipping normalization for mass %s with all zero pulse heights" % mass)
continue

fit_mass_mph_curve(mph_vals=mph_vals, mass=mass, save_dir=fit_dir, obj_func=mass_obj_func)
# for non-autogain runs, use the default min_obs at 10 for curve fitting
# for autogain runs, the median of all observed MPH values should always be used
min_obs = 10**10 if autogain else kwargs.get("min_obs", 10)
outlier_fraction = kwargs.get("outlier_fraction", 0.1)

fit_mass_mph_curve(
mph_vals=mph_vals,
mass=mass,
save_dir=fit_dir,
obj_func=mass_obj_func,
min_obs=min_obs,
outlier_fraction=outlier_fraction,
)

# update pulse_height_df to include fitted mph values
pulse_height_df = create_fitted_mass_mph_vals(
@@ -782,6 +803,8 @@ def normalize_image_data(
mass_obj_func="poly_2",
extreme_vals=(0.4, 1.1),
norm_func_path=os.path.join("..", "tuning_curves", "avg_norm_func_2600.json"),
autogain=False,
**kwargs,
):
"""Normalizes image data based on median pulse height from the run and a tuning curve.
@@ -794,6 +817,8 @@
mass_obj_func (str): class of function to use for modeling MPH over time per mass
extreme_vals (tuple): determines the range for norm vals which will raise a warning
norm_func_path (str): file containing the saved weights for the normalization function
autogain (bool): whether the run had autogain turned on or not
**kwargs (dict): additional parameters to pass to `fit_mass_mph_curve` for outlier detection
"""
# error checks
if not os.path.exists(norm_func_path):
@@ -817,6 +842,8 @@
panel_info=panel_info,
norm_dir=norm_dir,
mass_obj_func=mass_obj_func,
autogain=autogain,
kwargs=kwargs,
)
# add channel name to pulse_height_df
renamed_panel = panel_info.rename({"Mass": "mass"}, axis=1)
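Not part of the diff: a minimal sketch of the branch this commit introduces. Setting `autogain=True` inflates `min_obs` far beyond any realistic FOV count, so `fit_mass_mph_curve` never enters the curve-fitting branch and instead models each mass's MPH as a flat median. The helper name and values below are illustrative paraphrases, not library code.

import numpy as np

def mph_fit_strategy(mph_vals, autogain, min_obs=10):
    """Illustrative paraphrase of the autogain handling in create_fitted_pulse_heights_file."""
    # autogain runs should always fall back to the median of observed MPH values
    effective_min_obs = 10**10 if autogain else min_obs
    if len(mph_vals) > effective_min_obs:
        return "curve fit with outlier smoothing"
    return "flat median ({:.1f})".format(np.median(mph_vals))

mph_vals = np.array([4100.0, 4050.0, 4075.0, 3990.0, 4120.0])
print(mph_fit_strategy(mph_vals, autogain=True))   # always the median path
print(mph_fit_strategy(mph_vals, autogain=False))  # also median here; a curve is fit only with > 10 FOVs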
11 changes: 6 additions & 5 deletions templates/4b_normalize_image_data.ipynb
@@ -38,7 +38,7 @@
"### You'll first need to specify the location of the relevant files to enable image normalization\n",
" - `run_name` should contain the exact name of the MIBI run to extract from\n",
" - `panel_path` should point to a panel csv specifying the targets on your panel (see [panel format](https://github.com/angelolab/toffy#panel-format) for more information)\n",
" - `tuning_curve_file` should point to a tuning curve contained in `toffy/tuning_curves` (`avg_norm_func_450.json`, `avg_norm_func_1300.json`, or `avg_norm_func_2600.json`).\n",
" - `tuning_curve_file` should point to a tuning curve contained in `toffy/tuning_curves` (`avg_norm_func_450.json`, `avg_norm_func_1300.json`, or `avg_norm_func_2600.json`)\n",
" - `autogain`: if the Ionpath autogain setting was used for this run\n",
"\n",
"`avg_norm_func_2600.json` is the curve that should be used when running the MIBI with Ionpath's default settings. `avg_norm_func_450.json` and `avg_norm_func_1300.json` are curves generated for Angelo Lab-specific parameters. Use the power supply settings to determine which curve is most applicable."
@@ -61,7 +61,7 @@
"tuning_curve_file = 'avg_norm_func_2600.json'\n",
"\n",
"# Autogain setting of run\n",
"autogain = True"
"autogain = False"
]
},
{
@@ -91,8 +91,8 @@
"# otherwise, verify the voltage across all FOVs is constant\n",
"if autogain:\n",
" normalize.plot_detector_voltage(\n",
" base_dir=os.path.join(bin_base_dir, run_name),\n",
" mph_run_dir=os.path.dirname(mph_run_dir, run_name)\n",
" run_folder=os.path.join(bin_base_dir, run_name),\n",
" mph_run_dir=os.path.dirname(mph_run_dir)\n",
" )\n",
"else:\n",
" normalize.check_detector_voltage(os.path.join(bin_base_dir, run_name))"
@@ -172,7 +172,8 @@
"source": [
"normalize.normalize_image_data(img_dir=os.path.join(rosetta_base_dir, run_name), norm_dir=normalized_run_dir, pulse_height_dir=mph_run_dir,\n",
" panel_info=panel, img_sub_folder=img_sub_folder,\n",
" norm_func_path=os.path.join('..', 'tuning_curves', tuning_curve_file))"
" norm_func_path=os.path.join('..', 'tuning_curves', tuning_curve_file),\n",
" autogain=autogain)"
]
}
],
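Taken together, the notebook changes make the single `autogain` flag set near the top drive both the detector-voltage pre-check and the normalization call. A condensed sketch of that flow, assuming the other notebook variables (`bin_base_dir`, `rosetta_base_dir`, `mph_run_dir`, `normalized_run_dir`, `panel`, `img_sub_folder`, `run_name`, `tuning_curve_file`) are defined in the earlier setup cells as usual:

import os
from toffy import normalize

autogain = True  # set once near the top of the notebook

# pre-check: autogain runs vary detector gain per FOV, so plot the voltage instead of requiring it to be constant
if autogain:
    normalize.plot_detector_voltage(
        run_folder=os.path.join(bin_base_dir, run_name),
        mph_run_dir=os.path.dirname(mph_run_dir),
    )
else:
    normalize.check_detector_voltage(os.path.join(bin_base_dir, run_name))

# normalization: the same flag is forwarded so per-mass MPH fitting falls back to the median
normalize.normalize_image_data(
    img_dir=os.path.join(rosetta_base_dir, run_name),
    norm_dir=normalized_run_dir,
    pulse_height_dir=mph_run_dir,
    panel_info=panel,
    img_sub_folder=img_sub_folder,
    norm_func_path=os.path.join('..', 'tuning_curves', tuning_curve_file),
    autogain=autogain,
)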
32 changes: 26 additions & 6 deletions tests/normalize_test.py
@@ -499,8 +499,9 @@ def test_create_fitted_mass_mph_vals(tmpdir, skip_norm_func):


@parametrize("test_zeros", [False, True])
@parametrize("autogain", [False, True])
@parametrize_with_cases("metrics", cases=test_cases.CombineRunMetricFiles)
def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, metrics):
def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, autogain, metrics):
# create metric files
pulse_dir = os.path.join(tmpdir, "pulse_heights")
os.makedirs(pulse_dir)
@@ -519,6 +520,7 @@ def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, metrics):

panel = test_cases.panel
fovs = natsort.natsorted(test_cases.fovs)
kwargs = {"min_obs": 4, "outlier_fraction": 2}

if test_zeros:
with pytest.warns(UserWarning, match="Skipping normalization"):
@@ -527,14 +529,21 @@ def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, metrics):
panel_info=panel,
norm_dir=tmpdir,
mass_obj_func="poly_3",
autogain=autogain,
**kwargs,
)
else:
df = normalize.create_fitted_pulse_heights_file(
pulse_height_dir=pulse_dir, panel_info=panel, norm_dir=tmpdir, mass_obj_func="poly_3"
pulse_height_dir=pulse_dir,
panel_info=panel,
norm_dir=tmpdir,
mass_obj_func="poly_3",
autogain=autogain,
**kwargs,
)

# all four FOVs included
assert len(np.unique(df["fov"].values)) == 4
assert len(np.unique(df["fov"].values)) == 5

# FOVs are ordered in proper order
ordered_fovs = df.loc[df["mass"] == 10, "fov"].values.astype("str")
@@ -545,8 +554,15 @@ def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, metrics):
assert np.all(df[df["mass"] == 5]["pulse_height"].values == 0)
df = df[df["mass"] != 5].copy()

# fitted values are distinct from original
assert np.all(df["pulse_height"].values != df["pulse_height_fit"])
# with autogain, max one FOV should contain the original value (depends on # of FOVs)
if autogain:
for mass in df["mass"].unique():
mass_ph = df.loc[df["mass"] == mass, "pulse_height"].values
mass_ph_med = np.median(mass_ph)
assert np.sum(mass_ph[mass_ph == mass_ph_med]) <= 1
# fitted values are distinct from original without autogain
else:
assert np.all(df["pulse_height"].values != df["pulse_height_fit"].values)


@parametrize("test_zeros", [False, True])
@@ -644,8 +660,9 @@ def test_normalize_fov(tmpdir, test_zeros, test_high_norm, test_low_norm):
)


@parametrize("autogain", [False, True])
@parametrize_with_cases("metrics", cases=test_cases.CombineRunMetricFiles)
def test_normalize_image_data(tmpdir, metrics):
def test_normalize_image_data(tmpdir, autogain, metrics):
# create directory of pulse height csvs
pulse_height_dir = os.path.join(tmpdir, "pulse_height_dir")
os.makedirs(pulse_height_dir)
@@ -688,6 +705,7 @@ def test_normalize_image_data(tmpdir, metrics):
pulse_height_dir=pulse_height_dir,
panel_info=panel,
norm_func_path=func_path,
autogain=autogain,
)

assert np.array_equal(io_utils.list_folders(norm_dir, "fov").sort(), fovs.sort())
@@ -700,6 +718,7 @@ def test_normalize_image_data(tmpdir, metrics):
pulse_height_dir=pulse_height_dir,
panel_info=panel,
norm_func_path="bad_path",
autogain=autogain,
)

# mismatch between FOVs
@@ -713,6 +732,7 @@ def test_normalize_image_data(tmpdir, metrics):
pulse_height_dir=pulse_height_dir,
panel_info=panel,
norm_func_path=func_path,
autogain=autogain,
)


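For reference, the expectation the new autogain assertions encode: under autogain every `pulse_height_fit` value for a mass equals the per-mass median, so with an odd number of FOVs at most one raw pulse height can coincide with its fitted value. A small self-contained illustration with invented numbers (not taken from the test fixtures):

import numpy as np
import pandas as pd

# invented example frame mirroring the columns the test checks
df = pd.DataFrame({
    "mass": [10] * 5,
    "fov": ["fov{}".format(12 - i) for i in range(5)],
    "pulse_height": [4100.0, 4050.0, 4075.0, 3990.0, 4120.0],
})
df["pulse_height_fit"] = np.median(df["pulse_height"])  # what the autogain path produces

# every fitted value is the per-mass median, and only one raw value happens to equal it
assert np.all(df["pulse_height_fit"] == 4075.0)
assert np.sum(df["pulse_height"] == df["pulse_height_fit"]) == 1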
4 changes: 2 additions & 2 deletions tests/utils/normalize_test_cases.py
@@ -5,7 +5,7 @@
masses = np.arange(5, 15)
channels = ["chan_{}".format(i) for i in range(len(masses))]
panel = pd.DataFrame({"Mass": masses, "Target": channels})
fovs = ["fov{}".format(12 - i) for i in range(4)]
fovs = ["fov{}".format(12 - i) for i in range(5)]


def generate_tuning_data(channel_counts):
@@ -108,7 +108,7 @@ def case_default_metrics(self):
# create full directory of files, include proficient data which should be ignored
metrics = []
metrics_prof = []
for i in range(0, 4):
for i in range(0, 5):
metric_name = "pulse_heights_{}.csv".format(i)
metric_prof_name = "pulse_heights_{}_proficient.csv".format(i)
metric_values = {