Add support for MIBI runs acquired using autogain feature (#455)
* Initial auto_detector testing

* Add support for auto detector normalization (same functionality as when there are < 10 obs for outlier detection)

* Add autogain flag to 4b notebook

* Document autogain for create_fitted_pulse_heights_file

* Correct min_obs threshold if autogain is True

* Move autogain setting to the top of the notebook so it is compatible with the detector plotting PR

* Update tests to pass normalize kwargs more correctly and add specific autogain vs non-autogain testing

* Fix dirname to join for autogain mph_run_dir

* Fix dirname call
alex-l-kong authored Jan 10, 2024
1 parent 8b09afb commit b1bf08a
Showing 4 changed files with 68 additions and 20 deletions.
41 changes: 34 additions & 7 deletions src/toffy/normalize.py
@@ -560,7 +560,7 @@ def smooth_outliers(vals, outlier_idx, smooth_range=2):
return smoothed_vals


def fit_mass_mph_curve(mph_vals, mass, save_dir, obj_func, min_obs=10):
def fit_mass_mph_curve(mph_vals, mass, save_dir, obj_func, min_obs=10, outlier_fraction=0.1):
"""Fits a curve for the MPH over time for the specified mass.
Args:
@@ -569,13 +569,17 @@ def fit_mass_mph_curve(mph_vals, mass, save_dir, obj_func, min_obs=10):
save_dir (str): the directory to save the fit parameters
obj_func (str): the function to use for constructing the fit
min_obs (int): the minimum number of observations to fit a curve, otherwise uses median
outlier_fraction (float): the fractional deviation from predicted value required in
order to classify a data point as an outlier
"""
fov_order = np.linspace(0, len(mph_vals) - 1, len(mph_vals))
save_path = os.path.join(save_dir, str(mass) + "_mph_fit.jpg")

if len(mph_vals) > min_obs:
# find outliers in the MPH vals
outlier_idx = identify_outliers(x_vals=fov_order, y_vals=mph_vals, obj_func=obj_func)
outlier_idx = identify_outliers(
x_vals=fov_order, y_vals=mph_vals, obj_func=obj_func, outlier_fraction=outlier_fraction
)

# replace with local smoothing around that point
smoothed_vals = smooth_outliers(vals=mph_vals, outlier_idx=outlier_idx)
@@ -597,7 +601,6 @@ def fit_mass_mph_curve(mph_vals, mass, save_dir, obj_func, min_obs=10):
plot_fit=True,
save_path=save_path,
)

else:
# default to using the median instead for short runs with small number of FOVs
mph_median = np.median(mph_vals)
@@ -629,7 +632,8 @@ def create_fitted_mass_mph_vals(pulse_height_df, obj_func_dir):
Args:
pulse_height_df (pd.DataFrame): contains the MPH value per mass for all FOVs
obj_func_dir (str): directory containing the curves generated for each mass
obj_func_dir (str): directory containing the curves generated for each mass.
Set to None if autogain flag is set for calling functions.
Returns:
pd.DataFrame: updated dataframe with fitted version of each MPH value for each mass
@@ -645,9 +649,10 @@ def create_fitted_mass_mph_vals(pulse_height_df, obj_func_dir):
fov_order = np.linspace(0, num_fovs - 1, num_fovs)

for mass in masses:
mass_idx = pulse_height_df["mass"] == mass

# if channel-specific prediction function does not exist, set to 0
mass_path = os.path.join(obj_func_dir, str(mass) + "_norm_func.json")
mass_idx = pulse_height_df["mass"] == mass

if not os.path.exists(mass_path):
pulse_height_df.loc[mass_idx, "pulse_height_fit"] = 0.0
@@ -667,14 +672,18 @@ def create_fitted_mass_mph_vals(pulse_height_df, obj_func_dir):
return pulse_height_df


def create_fitted_pulse_heights_file(pulse_height_dir, panel_info, norm_dir, mass_obj_func):
def create_fitted_pulse_heights_file(
pulse_height_dir, panel_info, norm_dir, mass_obj_func, autogain=False, **kwargs
):
"""Create a single file containing the pulse heights after fitting a curve per mass.
Args:
pulse_height_dir (str): path to directory containing pulse height csvs
panel_info (pd.DataFrame): the panel for this dataset
norm_dir (str): the directory where normalized images will be saved
mass_obj_func (str): the objective function used to fit the MPH over time per mass
autogain (bool): whether the run had autogain turned on or not
**kwargs (dict): additional parameters to pass to `fit_mass_mph_curve` for outlier detection
Returns:
pd.DataFrame: the combined pulse heights file
Expand Down Expand Up @@ -704,7 +713,19 @@ def create_fitted_pulse_heights_file(pulse_height_dir, panel_info, norm_dir, mas
warnings.warn("Skipping normalization for mass %s with all zero pulse heights" % mass)
continue

fit_mass_mph_curve(mph_vals=mph_vals, mass=mass, save_dir=fit_dir, obj_func=mass_obj_func)
# for non-autogain runs, use the default min_obs at 10 for curve fitting
# for autogain runs, the median of all observed MPH values should always be used
min_obs = 10**10 if autogain else kwargs.get("min_obs", 10)
outlier_fraction = kwargs.get("outlier_fraction", 0.1)

fit_mass_mph_curve(
mph_vals=mph_vals,
mass=mass,
save_dir=fit_dir,
obj_func=mass_obj_func,
min_obs=min_obs,
outlier_fraction=outlier_fraction,
)

# update pulse_height_df to include fitted mph values
pulse_height_df = create_fitted_mass_mph_vals(
@@ -782,6 +803,8 @@ def normalize_image_data(
mass_obj_func="poly_2",
extreme_vals=(0.4, 1.1),
norm_func_path=os.path.join("..", "tuning_curves", "avg_norm_func_2600.json"),
autogain=False,
**kwargs,
):
"""Normalizes image data based on median pulse height from the run and a tuning curve.
@@ -794,6 +817,8 @@
mass_obj_func (str): class of function to use for modeling MPH over time per mass
extreme_vals (tuple): determines the range for norm vals which will raise a warning
norm_func_path (str): file containing the saved weights for the normalization function
autogain (bool): whether the run had autogain turned on or not
**kwargs (dict): additional parameters to pass to `fit_mass_mph_curve` for outlier detection
"""
# error checks
if not os.path.exists(norm_func_path):
@@ -817,6 +842,8 @@
panel_info=panel_info,
norm_dir=norm_dir,
mass_obj_func=mass_obj_func,
autogain=autogain,
kwargs=kwargs,
)
# add channel name to pulse_height_df
renamed_panel = panel_info.rename({"Mass": "mass"}, axis=1)
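Not part of the diff: a minimal sketch of the branch this commit introduces. Setting `autogain=True` inflates `min_obs` far beyond any realistic FOV count, so `fit_mass_mph_curve` never enters the curve-fitting branch and instead models each mass's MPH as a flat median. The helper name and values below are illustrative paraphrases, not library code.

import numpy as np

def mph_fit_strategy(mph_vals, autogain, min_obs=10):
    """Illustrative paraphrase of the autogain handling in create_fitted_pulse_heights_file."""
    # autogain runs should always fall back to the median of observed MPH values
    effective_min_obs = 10**10 if autogain else min_obs
    if len(mph_vals) > effective_min_obs:
        return "curve fit with outlier smoothing"
    return "flat median ({:.1f})".format(np.median(mph_vals))

mph_vals = np.array([4100.0, 4050.0, 4075.0, 3990.0, 4120.0])
print(mph_fit_strategy(mph_vals, autogain=True))   # always the median path
print(mph_fit_strategy(mph_vals, autogain=False))  # also median here; a curve is fit only with > 10 FOVs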
11 changes: 6 additions & 5 deletions templates/4b_normalize_image_data.ipynb
@@ -38,7 +38,7 @@
"### You'll first need to specify the location of the relevant files to enable image normalization\n",
" - `run_name` should contain the exact name of the MIBI run to extract from\n",
" - `panel_path` should point to a panel csv specifying the targets on your panel (see [panel format](https://github.com/angelolab/toffy#panel-format) for more information)\n",
" - `tuning_curve_file` should point to a tuning curve contained in `toffy/tuning_curves` (`avg_norm_func_450.json`, `avg_norm_func_1300.json`, or `avg_norm_func_2600.json`).\n",
" - `tuning_curve_file` should point to a tuning curve contained in `toffy/tuning_curves` (`avg_norm_func_450.json`, `avg_norm_func_1300.json`, or `avg_norm_func_2600.json`)\n",
" - `autogain`: if the Ionpath autogain setting was used for this run\n",
"\n",
"`avg_norm_func_2600.json` is the curve that should be used when running the MIBI with Ionpath's default settings. `avg_norm_func_450.json` and `avg_norm_func_1300.json` are curves generated for Angelo Lab-specific parameters. Use the power supply settings to determine which curve is most applicable."
@@ -61,7 +61,7 @@
"tuning_curve_file = 'avg_norm_func_2600.json'\n",
"\n",
"# Autogain setting of run\n",
"autogain = True"
"autogain = False"
]
},
{
@@ -91,8 +91,8 @@
"# otherwise, verify the voltage across all FOVs is constant\n",
"if autogain:\n",
" normalize.plot_detector_voltage(\n",
" base_dir=os.path.join(bin_base_dir, run_name),\n",
" mph_run_dir=os.path.dirname(mph_run_dir, run_name)\n",
" run_folder=os.path.join(bin_base_dir, run_name),\n",
" mph_run_dir=os.path.dirname(mph_run_dir)\n",
" )\n",
"else:\n",
" normalize.check_detector_voltage(os.path.join(bin_base_dir, run_name))"
@@ -172,7 +172,8 @@
"source": [
"normalize.normalize_image_data(img_dir=os.path.join(rosetta_base_dir, run_name), norm_dir=normalized_run_dir, pulse_height_dir=mph_run_dir,\n",
" panel_info=panel, img_sub_folder=img_sub_folder,\n",
" norm_func_path=os.path.join('..', 'tuning_curves', tuning_curve_file))"
" norm_func_path=os.path.join('..', 'tuning_curves', tuning_curve_file),\n",
" autogain=autogain)"
]
}
],
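Taken together, the notebook changes make the single `autogain` flag set near the top drive both the detector-voltage pre-check and the normalization call. A condensed sketch of that flow, assuming the other notebook variables (`bin_base_dir`, `rosetta_base_dir`, `mph_run_dir`, `normalized_run_dir`, `panel`, `img_sub_folder`, `run_name`, `tuning_curve_file`) are defined in the earlier setup cells as usual:

import os
from toffy import normalize

autogain = True  # set once near the top of the notebook

# pre-check: autogain runs vary detector gain per FOV, so plot the voltage instead of requiring it to be constant
if autogain:
    normalize.plot_detector_voltage(
        run_folder=os.path.join(bin_base_dir, run_name),
        mph_run_dir=os.path.dirname(mph_run_dir),
    )
else:
    normalize.check_detector_voltage(os.path.join(bin_base_dir, run_name))

# normalization: the same flag is forwarded so per-mass MPH fitting falls back to the median
normalize.normalize_image_data(
    img_dir=os.path.join(rosetta_base_dir, run_name),
    norm_dir=normalized_run_dir,
    pulse_height_dir=mph_run_dir,
    panel_info=panel,
    img_sub_folder=img_sub_folder,
    norm_func_path=os.path.join('..', 'tuning_curves', tuning_curve_file),
    autogain=autogain,
)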
32 changes: 26 additions & 6 deletions tests/normalize_test.py
@@ -499,8 +499,9 @@ def test_create_fitted_mass_mph_vals(tmpdir, skip_norm_func):


@parametrize("test_zeros", [False, True])
@parametrize("autogain", [False, True])
@parametrize_with_cases("metrics", cases=test_cases.CombineRunMetricFiles)
def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, metrics):
def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, autogain, metrics):
# create metric files
pulse_dir = os.path.join(tmpdir, "pulse_heights")
os.makedirs(pulse_dir)
@@ -519,6 +520,7 @@ def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, metrics):

panel = test_cases.panel
fovs = natsort.natsorted(test_cases.fovs)
kwargs = {"min_obs": 4, "outlier_fraction": 2}

if test_zeros:
with pytest.warns(UserWarning, match="Skipping normalization"):
@@ -527,14 +529,21 @@ def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, metrics):
panel_info=panel,
norm_dir=tmpdir,
mass_obj_func="poly_3",
autogain=autogain,
**kwargs,
)
else:
df = normalize.create_fitted_pulse_heights_file(
pulse_height_dir=pulse_dir, panel_info=panel, norm_dir=tmpdir, mass_obj_func="poly_3"
pulse_height_dir=pulse_dir,
panel_info=panel,
norm_dir=tmpdir,
mass_obj_func="poly_3",
autogain=autogain,
**kwargs,
)

# all four FOVs included
assert len(np.unique(df["fov"].values)) == 4
assert len(np.unique(df["fov"].values)) == 5

# FOVs are ordered in proper order
ordered_fovs = df.loc[df["mass"] == 10, "fov"].values.astype("str")
@@ -545,8 +554,15 @@ def test_create_fitted_pulse_heights_file(tmpdir, test_zeros, metrics):
assert np.all(df[df["mass"] == 5]["pulse_height"].values == 0)
df = df[df["mass"] != 5].copy()

# fitted values are distinct from original
assert np.all(df["pulse_height"].values != df["pulse_height_fit"])
# with autogain, max one FOV should contain the original value (depends on # of FOVs)
if autogain:
for mass in df["mass"].unique():
mass_ph = df.loc[df["mass"] == mass, "pulse_height"].values
mass_ph_med = np.median(mass_ph)
assert np.sum(mass_ph[mass_ph == mass_ph_med]) <= 1
# fitted values are distinct from original without autogain
else:
assert np.all(df["pulse_height"].values != df["pulse_height_fit"].values)


@parametrize("test_zeros", [False, True])
@@ -644,8 +660,9 @@ def test_normalize_fov(tmpdir, test_zeros, test_high_norm, test_low_norm):
)


@parametrize("autogain", [False, True])
@parametrize_with_cases("metrics", cases=test_cases.CombineRunMetricFiles)
def test_normalize_image_data(tmpdir, metrics):
def test_normalize_image_data(tmpdir, autogain, metrics):
# create directory of pulse height csvs
pulse_height_dir = os.path.join(tmpdir, "pulse_height_dir")
os.makedirs(pulse_height_dir)
@@ -688,6 +705,7 @@ def test_normalize_image_data(tmpdir, metrics):
pulse_height_dir=pulse_height_dir,
panel_info=panel,
norm_func_path=func_path,
autogain=autogain,
)

assert np.array_equal(io_utils.list_folders(norm_dir, "fov").sort(), fovs.sort())
@@ -700,6 +718,7 @@ def test_normalize_image_data(tmpdir, metrics):
pulse_height_dir=pulse_height_dir,
panel_info=panel,
norm_func_path="bad_path",
autogain=autogain,
)

# mismatch between FOVs
@@ -713,6 +732,7 @@ def test_normalize_image_data(tmpdir, metrics):
pulse_height_dir=pulse_height_dir,
panel_info=panel,
norm_func_path=func_path,
autogain=autogain,
)


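For reference, the expectation the new autogain assertions encode: under autogain every `pulse_height_fit` value for a mass equals the per-mass median, so with an odd number of FOVs at most one raw pulse height can coincide with its fitted value. A small self-contained illustration with invented numbers (not taken from the test fixtures):

import numpy as np
import pandas as pd

# invented example frame mirroring the columns the test checks
df = pd.DataFrame({
    "mass": [10] * 5,
    "fov": ["fov{}".format(12 - i) for i in range(5)],
    "pulse_height": [4100.0, 4050.0, 4075.0, 3990.0, 4120.0],
})
df["pulse_height_fit"] = np.median(df["pulse_height"])  # what the autogain path produces

# every fitted value is the per-mass median, and only one raw value happens to equal it
assert np.all(df["pulse_height_fit"] == 4075.0)
assert np.sum(df["pulse_height"] == df["pulse_height_fit"]) == 1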
4 changes: 2 additions & 2 deletions tests/utils/normalize_test_cases.py
@@ -5,7 +5,7 @@
masses = np.arange(5, 15)
channels = ["chan_{}".format(i) for i in range(len(masses))]
panel = pd.DataFrame({"Mass": masses, "Target": channels})
fovs = ["fov{}".format(12 - i) for i in range(4)]
fovs = ["fov{}".format(12 - i) for i in range(5)]


def generate_tuning_data(channel_counts):
@@ -108,7 +108,7 @@ def case_default_metrics(self):
# create full directory of files, include proficient data which should be ignored
metrics = []
metrics_prof = []
for i in range(0, 4):
for i in range(0, 5):
metric_name = "pulse_heights_{}.csv".format(i)
metric_prof_name = "pulse_heights_{}_proficient.csv".format(i)
metric_values = {