angelolab · ngreenwald · Jul 20, 2023 · Apr 14, 2023 · Apr 14, 2023 · Apr 15, 2023
diff --git a/conftest.py b/conftest.py
@@ -1 +1,17 @@
-# empty file so that pyetst adds top-level directory
+from typing import Generator
+
+import numpy as np
+import pytest
+
+
+@pytest.fixture(scope="module")
+def rng() -> Generator[np.random.Generator, None, None]:
+ """
+ Create a seeded Random Number Generator for tests which require randomized data.
+
+ Yields:
+ np.random.Generator: A generator built with `np.random.default_rng(12345)`; the fixture is
+ module-scoped, so it is created once per test module that requests it.
+ """
+ rng: np.random.Generator = np.random.default_rng(12345)
+ yield rng
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,7 +34,7 @@ style = "pep440"
 metadata = false
 
 [tool.poetry.dependencies]
-python = "^3.8"
+python = "^3.8,<3.12"
 alpineer = "^0.1.5"
 mibi-bin-tools = "^0.2.8"
 ipywidgets = "^8"
@@ -44,6 +44,7 @@ seaborn = "^0.12"
 scikit-learn = "^1"
 watchdog = "^3"
 tqdm = "^4"
+scipy = "^1.10.1"
 
 [tool.poetry.group.test]
 optional = true
@@ -66,6 +67,7 @@ black = "^22.10.0"
 isort = "^5.10.1"
 jupyterlab = "^3.6.1"
 jupyter-contrib-nbextensions = "^0.7.0"
+loguru = "^0.7.0"
 
 ## TYPE CHECKING ##
 

diff --git a/src/toffy/fov_watcher.py b/src/toffy/fov_watcher.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 from typing import Callable, Tuple
 
-from matplotlib import pyplot as plt
 from watchdog.events import FileCreatedEvent, FileSystemEventHandler
 from watchdog.observers import Observer
 

diff --git a/src/toffy/qc_comp.py b/src/toffy/qc_comp.py
@@ -1,17 +1,20 @@
 import copy
+import itertools
 import os
 import pathlib
+import re
 from shutil import rmtree
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 
-import matplotlib.pyplot as plt
+import natsort as ns
 import numpy as np
 import pandas as pd
 import seaborn as sns
-import seaborn.objects as so
 from alpineer import image_utils, io_utils, load_utils, misc_utils
+from pandas.core.groupby import DataFrameGroupBy
 from requests.exceptions import HTTPError
 from scipy.ndimage import gaussian_filter
+from scipy.stats import rankdata
 
 from toffy import settings
 from toffy.mibitracker_utils import MibiRequests, MibiTrackerError
@@ -278,7 +281,7 @@ def compute_qc_metrics(
 
  Args:
  extracted_imgs_path (str):
- the directory when extracted images are stored
+ the directory where extracted images are stored
  fov_name (str):
  the name of the FOV to extract from `bin_file_path`, needs to correspond with JSON name
  gaussian_blur (bool):
@@ -291,8 +294,7 @@ def compute_qc_metrics(
  path to save csvs of the qc metrics to
 
  Returns:
- None | Dict[str, pd.DataFrame]:
- If save_csv is False, returns qc metrics. Otherwise, no return
+ None
  """
 
  # path validation checks
@@ -325,10 +327,6 @@ def compute_qc_metrics_direct(image_data, fov_name, gaussian_blur=False, blur_fa
  set to 0 to use raw inputs without Gaussian blurring
  ignored if `gaussian_blur` set to `False`
 
- Returns:
- Dict[str, pd.DataFrame]:
- Returns qc metrics
-
  """
 
  # there's only 1 FOV and 1 type ('pulse'), so subset on that
@@ -545,3 +543,233 @@ def format_img_data(img_data):
  img_data = img_data.rename({"fovs": "fov", "rows": "x", "cols": "y", "channels": "channel"})
 
  return img_data
+
+
+def _get_r_c(fov_name: str, search_term: re.Pattern) -> pd.Series:
+ """Gets the row and column value from a FOV's name containing RnCm.
+
+ Args:
+ fov_name (str): A single FOV name (one element of the `fov` column), passed to `re.search`.
+ search_term (re.Pattern): The regex pattern for RnCm; groups 1 and 2 must capture `n` and `m`.
+
+ Returns:
+ pd.Series: Returns `n` and `m` as a series of two ints (no match raises `AttributeError`).
+ """
+ r, c = map(int, re.search(search_term, fov_name).group(1, 2))
+ return pd.Series([r, c])
+
+
+def qc_tma_metrics(
+ extracted_imgs_path: Union[str, pathlib.Path],
+ qc_tma_metrics_dir: Union[str, pathlib.Path],
+ tma: str,
+) -> None:
+ """
+ Calculates the QC metrics for a user specified TMA and writes one combined csv per QC suffix.
+
+ Args:
+ extracted_imgs_path (Union[str, pathlib.Path]): The directory where the extracted images are stored.
+ qc_tma_metrics_dir (Union[str, pathlib.Path]): The directory where to place the QC TMA metrics.
+ tma (str): The substring of the FOV folder names used to select the FOVs to gather.
+ """
+ # Get all the FOVs that match the input `tma` string
+ fovs = io_utils.list_folders(extracted_imgs_path, substrs=tma)
+
+ # Create regex pattern for searching RnCm (an optional literal `+` may precede each number)
+ search_term: re.Pattern = re.compile(r"R\+?(\d+)C\+?(\d+)")
+
+ # Compute the qc metrics for each fov (in natural sort order), saved to `qc_tma_metrics_dir`
+ for fov in ns.natsorted(fovs):
+ compute_qc_metrics(
+ extracted_imgs_path=extracted_imgs_path, fov_name=fov, save_csv=qc_tma_metrics_dir
+ )
+
+ # Combine the qc metrics for all fovs per TMA, skipping previously written "combined" files
+ for ms in settings.QC_SUFFIXES:
+ metric_files: List[str] = io_utils.list_files(qc_tma_metrics_dir, substrs=f"{ms}.csv")
+ metric_files: List[str] = [mf for mf in metric_files if "combined" not in mf]
+
+ # Aggregate the per-FOV csvs; NOTE(review): matched by suffix only, not by `tma` - confirm
+ combined_metric_df: pd.DataFrame = pd.concat(
+ (pd.read_csv(os.path.join(qc_tma_metrics_dir, mf)) for mf in metric_files),
+ ignore_index=True,
+ )
+
+ # Extract the Row and Column from each FOV name (the lambda's `row` is one `fov` value)
+ combined_metric_df[["row", "column"]] = combined_metric_df["fov"].apply(
+ lambda row: _get_r_c(row, search_term)
+ )
+ combined_metric_df.to_csv(
+ os.path.join(qc_tma_metrics_dir, f"{tma}_combined_{ms}.csv"), index=False
+ )
+
+
+def _create_r_c_tma_matrix(
+ group: pd.DataFrame, x_size: int, y_size: int, qc_col: str
+) -> pd.Series:
+ """
+ Creates the FOV / TMA matrix.
+
+ Args:
+ group (pd.DataFrame): Each group consists of an individual channel, and all of its associated FOVs.
+ x_size (int): The length of the matrix's first axis, which is indexed by `column - 1`.
+ y_size (int): The length of the matrix's second axis, which is indexed by `row - 1`.
+ qc_col (str): The column to get the QC data from.
+
+ Returns:
+ pd.Series[np.ndarray]: Returns a single-element series containing the matrix.
+ """
+ # Start from an all-NaN matrix so that positions without a FOV remain NaN
+ rc_array: np.ndarray = np.full(shape=(x_size, y_size), fill_value=np.nan)
+ rc_array[group["column"] - 1, group["row"] - 1] = group[qc_col]
+
+ return pd.Series([rc_array])
+
+
+def qc_tma_metrics_rank(
+ qc_tma_metrics_dir: Union[str, pathlib.Path],
+ tma: str,
+ qc_metrics: Optional[List[str]] = None,
+ channel_exclude: Optional[List[str]] = None,
+) -> Dict[str, np.ndarray]:
+ """
+ Creates the average rank for a given TMA across all FOVs and unfiltered / unexcluded channels.
+ By default the following channels are excluded: Au, Fe, Na, Ta, Noodle.
+
+
+ Args:
+ qc_tma_metrics_dir (Union[str, pathlib.Path]): The directory containing the combined QC TMA metric csvs.
+ tma (str): The TMA to gather FOVs in.
+ qc_metrics (List[str], optional): The QC metrics to create plots for. Can be a subset of the
+ following:
+
+ * Non-zero mean intensity
+ * Total intensity
+ * 99.9% intensity value. Defaults to None.
+ channel_exclude (List[str], optional): An optional list of channels to further filter out. Defaults to None.
+
+ Returns:
+ Dict[str, np.ndarray]: A dictionary mapping each QC column to a numpy array of the
+ per-channel ranks averaged over channels, indexed as `[column - 1, row - 1]`."""
+ # Sort the loaded combined csv files based on QC_SUFFIXES
+ combined_metric_tmas = ns.natsorted(
+ io_utils.list_files(qc_tma_metrics_dir, substrs=f"{tma}_combined"),
+ key=lambda m: (i for i, qc_s in enumerate(settings.QC_SUFFIXES) if qc_s in m),
+ )
+ # Then filter out unused suffixes (assumes the sorted files line up with QC_COLUMNS)
+ if qc_metrics is not None:
+ filtered_qcs: List[bool] = [qcm in qc_metrics for qcm in settings.QC_COLUMNS]
+ qc_cols = list(itertools.compress(settings.QC_COLUMNS, filtered_qcs))
+ combined_metric_tmas = list(itertools.compress(combined_metric_tmas, filtered_qcs))
+ else:
+ qc_cols: List[str] = settings.QC_COLUMNS
+
+ cmt_data = dict()
+ for cmt, qc_col in zip(combined_metric_tmas, qc_cols):
+ # Open and filter the default ignored channels
+ cmt_df: pd.DataFrame = pd.read_csv(os.path.join(qc_tma_metrics_dir, cmt))
+ cmt_df: pd.DataFrame = cmt_df[~cmt_df["channel"].isin(settings.QC_CHANNEL_IGNORE)]
+
+ # Verify that the excluded channels exist in the combined metric tma DataFrame
+ # Then remove the excluded channels
+ if channel_exclude is not None:
+ misc_utils.verify_in_list(
+ channels_to_exclude=channel_exclude,
+ combined_metric_tma_df_channels=cmt_df["channel"].unique(),
+ )
+ cmt_df: pd.DataFrame = cmt_df[~cmt_df["channel"].isin(channel_exclude)]
+
+ # Get matrix dimensions
+ y_size: int = cmt_df["column"].max()
+ x_size: int = cmt_df["row"].max()
+
+ # Create one (column, row) TMA matrix per channel / for the heatmap
+ channel_tmas: pd.DataFrame = cmt_df.groupby(by="channel", sort=True).apply(
+ lambda group: _create_r_c_tma_matrix(group, y_size, x_size, qc_col)
+ )
+ channel_matrices: np.ndarray = np.array(
+ [c_tma[0] for c_tma in channel_tmas.values],
+ )
+
+ # Rank all FOVs within each channel: one flattened row per channel, so channels never mix.
+ ranked_channels: np.ndarray = rankdata(
+ a=channel_matrices.reshape(len(channel_tmas), -1),
+ method="average",
+ nan_policy="omit",
+ axis=1,
+ ).reshape(channel_matrices.shape)
+
+ # Average the rank for each channel.
+ avg_ranked_tma: np.ndarray = ranked_channels.mean(axis=0)
+
+ cmt_data[qc_col] = avg_ranked_tma
+
+ return cmt_data
+
+
+def batch_effect_qc_metrics(
+ cohort_data_dir: Union[str, pathlib.Path],
+ qc_cohort_metrics_dir: Union[str, pathlib.Path],
+ tissues: List[str],
+) -> None:
+ """
+ Computes QC metrics for a specified set of tissues and saves the tissue specific QC files
+ in the `qc_cohort_metrics_dir`. Calculates the following metrics for the specified tissues,
+ and the metrics for the individual FOVs within that cohort:
+ * Non-zero mean intensity
+ * Total intensity
+ * 99.9% intensity value
+
+ Args:
+ cohort_data_dir (Union[str, pathlib.Path]): The directory which contains the FOVs for a cohort of interest.
+ qc_cohort_metrics_dir (Union[str, pathlib.Path]): The directory where the cohort metrics will be saved to.
+ tissues (List[str]): A list of tissue substrings; folders whose name contains one are used.
+
+ Raises:
+ ValueError: Errors if `tissues` is either None, or a list of size 0.
+ """
+ if tissues is None or len(tissues) < 1:
+ raise ValueError("The tissues must be specified")
+
+ # Input validation: cohort_data_dir, qc_cohort_metrics_dir
+ io_utils.validate_paths([cohort_data_dir, qc_cohort_metrics_dir])
+
+ samples = io_utils.list_folders(dir_name=cohort_data_dir, substrs=tissues)
+
+ tissue_to_sample_mapping: Dict[str, List[str]] = {}
+ # Group the sample folders under every tissue substring that their name contains
+ for sample in samples:
+ for tissue in tissues:
+ if tissue in sample:
+ if tissue in tissue_to_sample_mapping.keys():
+ tissue_to_sample_mapping[tissue].append(sample)
+ else:
+ tissue_to_sample_mapping[tissue] = [sample]
+
+ # Use a set of the samples to avoid duplicate QC metric calculations (a sample may match several tissues)
+ sample_set = set(list(itertools.chain.from_iterable(tissue_to_sample_mapping.values())))
+
+ # Compute the QC metrics for all unique samples that match with the user's tissue input.
+ for sample in ns.natsorted(sample_set):
+ compute_qc_metrics(
+ extracted_imgs_path=cohort_data_dir, fov_name=sample, save_csv=qc_cohort_metrics_dir
+ )
+
+ # Combined metrics per Tissue (note: this loop rebinds the name `samples` for each tissue)
+ for tissue, samples in tissue_to_sample_mapping.items():
+ for ms in settings.QC_SUFFIXES:
+ metric_files: List[str] = io_utils.list_files(
+ qc_cohort_metrics_dir, substrs=[f"{sample}_{ms}.csv" for sample in samples]
+ )
+ # Skip previously written "combined" files so they are not aggregated again
+ metric_files = list(filter(lambda mf: "combined" not in mf, metric_files))
+
+ # Define an aggregated metric DataFrame from this tissue's per-sample csvs
+ combined_metric_tissue_df: pd.DataFrame = pd.concat(
+ (pd.read_csv(os.path.join(qc_cohort_metrics_dir, mf)) for mf in metric_files)
+ )
+
+ combined_metric_tissue_df.to_csv(
+ os.path.join(qc_cohort_metrics_dir, f"{tissue}_combined_{ms}.csv"),
+ index=False,
+ )