Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QC for TMAs and Control Samples #358

Merged
merged 32 commits into from
Jul 20, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
36fe2b4
added QC TMA logic
srivarra Apr 14, 2023
8176170
python 3.11?
srivarra Apr 14, 2023
822bb25
fixed type hints, python3.8 default
srivarra Apr 15, 2023
cf2f21b
undid variable name change
srivarra Apr 15, 2023
30a9fd7
removed unused imports + sorted used imports
srivarra Apr 15, 2023
5f469e8
typo
srivarra Apr 17, 2023
998a65b
itertools.product is very cool
srivarra Apr 19, 2023
7a337f7
figure formatting fix
srivarra Apr 26, 2023
9a5d328
Merge branch 'main' into qc/location_specific_image_artifacts
srivarra Apr 26, 2023
ed611b1
poetry lock file fix
srivarra Apr 26, 2023
c073faf
Merge branch 'main' into qc/location_specific_image_artifacts
srivarra Apr 27, 2023
88619f6
small fix
srivarra Apr 27, 2023
966bdfb
Merge branch 'main' into qc/location_specific_image_artifacts
srivarra May 24, 2023
72ddf08
Merge branch 'main' into qc/location_specific_image_artifacts
srivarra Jun 1, 2023
796cad5
refactored code, refactored tests, added notebook
srivarra Jun 16, 2023
138e8cc
fixed nb 3c qc_metrics_plots import
srivarra Jun 16, 2023
b3ec017
small documentation fixes
srivarra Jun 16, 2023
014ec7a
requested changes pt. 1
srivarra Jun 20, 2023
009fbe0
Merge branch 'main' into qc/location_specific_image_artifacts
srivarra Jul 5, 2023
b71af62
Merge branch 'main' into qc/location_specific_image_artifacts
srivarra Jul 5, 2023
d7d8419
poetry lock update
srivarra Jul 5, 2023
c52fce9
made requested changes, cleaned up code
srivarra Jul 6, 2023
79cf261
notebook adjustments
srivarra Jul 7, 2023
0f94923
colormap fix, adjusted ranking and csv filtering
srivarra Jul 13, 2023
885ed57
replaced qc_tma image
srivarra Jul 14, 2023
6dd2fc2
merge main
srivarra Jul 14, 2023
8b6364d
Merge branch 'main' into qc/location_specific_image_artifacts
srivarra Jul 14, 2023
86740d5
updated notebook, batch -> longitudinal
srivarra Jul 17, 2023
81d1c4d
added units, fixed some documentation
srivarra Jul 20, 2023
a7ff851
Merge branch 'main' into qc/location_specific_image_artifacts
srivarra Jul 20, 2023
5f65934
plotting and notebook adjustments
srivarra Jul 20, 2023
24ef0ff
more small adjustments
srivarra Jul 20, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion conftest.py
Original file line number Diff line number Diff line change
@@ -1 +1,17 @@
# empty file so that pyetst adds top-level directory
from typing import Generator

import numpy as np
import pytest


@pytest.fixture(scope="module")
def rng() -> Generator[np.random.Generator, None, None]:
"""
Create a new Random Number Generator for tests which require randomized data.

Yields:
Generator[np.random.Generator, None, None]: The generator used for creating randomized
numbers.
"""
rng: np.random.Generator = np.random.default_rng(12345)
yield rng
784 changes: 429 additions & 355 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ style = "pep440"
metadata = false

[tool.poetry.dependencies]
python = "^3.8"
python = "^3.8,<3.12"
alpineer = "^0.1.5"
mibi-bin-tools = "^0.2.8"
ipywidgets = "^8"
Expand All @@ -44,6 +44,7 @@ seaborn = "^0.12"
scikit-learn = "^1"
watchdog = "^3"
tqdm = "^4"
scipy = "^1.10.1"

[tool.poetry.group.test]
optional = true
Expand All @@ -66,6 +67,7 @@ black = "^22.10.0"
isort = "^5.10.1"
jupyterlab = "^3.6.1"
jupyter-contrib-nbextensions = "^0.7.0"
loguru = "^0.7.0"

## TYPE CHECKING ##

Expand Down
1 change: 0 additions & 1 deletion src/toffy/fov_watcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pathlib import Path
from typing import Callable, Tuple

from matplotlib import pyplot as plt
from watchdog.events import FileCreatedEvent, FileSystemEventHandler
from watchdog.observers import Observer

Expand Down
248 changes: 238 additions & 10 deletions src/toffy/qc_comp.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
import copy
import itertools
import os
import pathlib
import re
from shutil import rmtree
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union

import matplotlib.pyplot as plt
import natsort as ns
import numpy as np
import pandas as pd
import seaborn as sns
import seaborn.objects as so
from alpineer import image_utils, io_utils, load_utils, misc_utils
from pandas.core.groupby import DataFrameGroupBy
from requests.exceptions import HTTPError
from scipy.ndimage import gaussian_filter
from scipy.stats import rankdata

from toffy import settings
from toffy.mibitracker_utils import MibiRequests, MibiTrackerError
Expand Down Expand Up @@ -278,7 +281,7 @@ def compute_qc_metrics(

Args:
extracted_imgs_path (str):
the directory when extracted images are stored
the directory where extracted images are stored
fov_name (str):
the name of the FOV to extract from `bin_file_path`, needs to correspond with JSON name
gaussian_blur (bool):
Expand All @@ -291,8 +294,7 @@ def compute_qc_metrics(
path to save csvs of the qc metrics to

Returns:
None | Dict[str, pd.DataFrame]:
If save_csv is False, returns qc metrics. Otherwise, no return
None
"""

# path validation checks
Expand Down Expand Up @@ -325,10 +327,6 @@ def compute_qc_metrics_direct(image_data, fov_name, gaussian_blur=False, blur_fa
set to 0 to use raw inputs without Gaussian blurring
ignored if `gaussian_blur` set to `False`

Returns:
Dict[str, pd.DataFrame]:
Returns qc metrics

"""

# there's only 1 FOV and 1 type ('pulse'), so subset on that
Expand Down Expand Up @@ -545,3 +543,233 @@ def format_img_data(img_data):
img_data = img_data.rename({"fovs": "fov", "rows": "x", "cols": "y", "channels": "channel"})

return img_data


def _get_r_c(fov_name: pd.Series, search_term: re.Pattern) -> pd.Series:
"""Gets the row and column value from a FOV's name containing RnCm.

Args:
fov_name (pd.Series): The FOV's name.
search_term (re.Pattern): The regex pattern for searching for RnCm.

Returns:
pd.Series: Returns `n` and `m` as a series.
"""
r, c = map(int, re.search(search_term, fov_name).group(1, 2))
return pd.Series([r, c])


def qc_tma_metrics(
extracted_imgs_path: Union[str, pathlib.Path],
qc_tma_metrics_dir: Union[str, pathlib.Path],
tma: str,
) -> None:
"""
Calculates the QC metrics for a user specified TMA.

Args:
extracted_imgs_path (Union[str,pathlib.Path]): The directory where the extracted images are stored.
qc_tma_metrics_dir (Union[str, pathlib.path]): The directory where to place the QC TMA metrics.
tma (str): The FOVs with the TMA in the folder name to gather.
"""
# Get all the FOVs that match the input `tma` string
fovs = io_utils.list_folders(extracted_imgs_path, substrs=tma)

# Create regex pattern for searching RnCm
search_term: re.Pattern = re.compile(r"R\+?(\d+)C\+?(\d+)")

# Get qc metrics for each fov
for fov in ns.natsorted(fovs):
compute_qc_metrics(
extracted_imgs_path=extracted_imgs_path, fov_name=fov, save_csv=qc_tma_metrics_dir
)

# Combine the qc metrics for all fovs per TMA
for ms in settings.QC_SUFFIXES:
metric_files: List[str] = io_utils.list_files(qc_tma_metrics_dir, substrs=f"{ms}.csv")
metric_files: List[str] = [mf for mf in metric_files if "combined" not in mf]

# Define an aggregated metric DataFrame
combined_metric_df: pd.DataFrame = pd.concat(
(pd.read_csv(os.path.join(qc_tma_metrics_dir, mf)) for mf in metric_files),
ignore_index=True,
)

# Extract the Row and Column
combined_metric_df[["row", "column"]] = combined_metric_df["fov"].apply(
lambda row: _get_r_c(row, search_term)
)
combined_metric_df.to_csv(
os.path.join(qc_tma_metrics_dir, f"{tma}_combined_{ms}.csv"), index=False
)


def _create_r_c_tma_matrix(
group: DataFrameGroupBy, x_size: int, y_size: int, qc_col: str
) -> pd.Series:
"""
Creates the FOV / TMA matrix.

Args:
group (DataFrameGroupBy): Each group consists of an individual channel, and all of it's associated FOVs.
x_size (int): The number of columns in the matrix.
y_size (int): The number of rows in the matrix.
qc_col (str): The column to get the the QC data.

Returns:
pd.Series[np.ndarray]: Returns the a series containing the matrix.
"""

rc_array: np.ndarray = np.full(shape=(x_size, y_size), fill_value=np.nan)
rc_array[group["column"] - 1, group["row"] - 1] = group[qc_col]

return pd.Series([rc_array])


def qc_tma_metrics_rank(
qc_tma_metrics_dir: Union[str, pathlib.Path],
tma: str,
qc_metrics: List[str] = None,
channel_exclude: List[str] = None,
) -> Dict[str, np.ndarray]:
"""
Creates the average rank for a given TMA across all FOVs and unfiltered / unexcluded channels.
By default the following channels are excluded: Au, Fe, Na, Ta, Noodle.


Args:
qc_tma_metrics_dir (Union[str, pathlib.Path]): The direcftory where to place the QC TMA metrics.
tma (str): The TMA to gather FOVs in.
qc_metrics (List[str], optional): The QC metrics to create plots for. Can be a subset of the
following:

* Non-zero mean intensity
* Total intensity
* 99.9% intensity value. Defaults to None.
channel_exclude (List[str], optional): An optional list of channels to further filter out. Defaults to None.

Returns:
Dict[str, np.ndarray]: A dictionary containing the QC column and the a numpy array
representing the average ranks for a given TMA."""
# Sort the loaded combined csv files based on QC_SUFFIXES
combined_metric_tmas = ns.natsorted(
io_utils.list_files(qc_tma_metrics_dir, substrs=f"{tma}_combined"),
key=lambda m: (i for i, qc_s in enumerate(settings.QC_SUFFIXES) if qc_s in m),
)
# Then filter out unused suffixes
if qc_metrics is not None:
filtered_qcs: List[bool] = [qcm in qc_metrics for qcm in settings.QC_COLUMNS]
qc_cols = list(itertools.compress(settings.QC_COLUMNS, filtered_qcs))
combined_metric_tmas = list(itertools.compress(combined_metric_tmas, filtered_qcs))
else:
qc_cols: List[str] = settings.QC_COLUMNS

cmt_data = dict()
for cmt, qc_col in zip(combined_metric_tmas, qc_cols):
# Open and filter the default ignored channels
cmt_df: pd.DataFrame = pd.read_csv(os.path.join(qc_tma_metrics_dir, cmt))
cmt_df: pd.DataFrame = cmt_df[~cmt_df["channel"].isin(settings.QC_CHANNEL_IGNORE)]

# Verify that the excluded channels exist in the combined metric tma DataFrame
# Then remove the excluded channels
if channel_exclude is not None:
misc_utils.verify_in_list(
channels_to_exclude=channel_exclude,
combined_metric_tma_df_channels=cmt_df["channel"].unique(),
)
cmt_df: pd.DataFrame = cmt_df[~cmt_df["channel"].isin(channel_exclude)]

# Get matrix dimensions
y_size: int = cmt_df["column"].max()
x_size: int = cmt_df["row"].max()

# Create the TMA matrix / for the heatmap
channel_tmas: pd.DataFrame = cmt_df.groupby(by="channel", sort=True).apply(
lambda group: _create_r_c_tma_matrix(group, y_size, x_size, qc_col)
)
channel_matrices: np.ndarray = np.array(
[c_tma[0] for c_tma in channel_tmas.values],
)

# Rank all FOVs for each channel.
ranked_channels: np.ndarray = rankdata(
a=channel_matrices.reshape((x_size * y_size), -1),
method="average",
nan_policy="omit",
axis=0,
).reshape(len(channel_tmas), x_size, y_size)

# Average the rank for each channel.
avg_ranked_tma: np.ndarray = ranked_channels.mean(axis=0)

cmt_data[qc_col] = avg_ranked_tma

return cmt_data


def batch_effect_qc_metrics(
cohort_data_dir: Union[str, pathlib.Path],
qc_cohort_metrics_dir: Union[str, pathlib.Path],
tissues: List[str],
) -> None:
"""
Computes QC metrics for a specified set of tissues and saves the tissue specific QC files
in the `qc_cohort_metrics_dir`. Calculates the following metrics for the specified tissues,
and the metrics for the invidual FOVs within that cohort:
* Non-zero mean intensity
* Total intensity
* 99.9% intensity value

Args:
cohort_data_dir (Union[str, pathlib.Path]): The directory which contains the FOVs for a cohort of interest.
qc_cohort_metrics_dir (Union[str,pathlib.Path]): The directory where the cohort metrics will be saved to.
tissues (List[str]): A list of tissues to find QC metrics for.

Raises:
ValueError: Errors if `tissues` is either None, or a list of size 0.
"""
if tissues is None or len(tissues) < 1:
raise ValueError("The tissues must be specified")

# Input validation: cohort_data_dir, qc_cohort_metrics_dir
io_utils.validate_paths([cohort_data_dir, qc_cohort_metrics_dir])

samples = io_utils.list_folders(dir_name=cohort_data_dir, substrs=tissues)

tissue_to_sample_mapping: Dict[str, List[str]] = {}

for sample in samples:
for tissue in tissues:
if tissue in sample:
if tissue in tissue_to_sample_mapping.keys():
tissue_to_sample_mapping[tissue].append(sample)
else:
tissue_to_sample_mapping[tissue] = [sample]
srivarra marked this conversation as resolved.
Show resolved Hide resolved

# Use a set of the samples to avoid duplicate QC metric calculations
sample_set = set(list(itertools.chain.from_iterable(tissue_to_sample_mapping.values())))

# Compute the QC metrics for all unique samples that match with the user's tissue input.
for sample in ns.natsorted(sample_set):
compute_qc_metrics(
extracted_imgs_path=cohort_data_dir, fov_name=sample, save_csv=qc_cohort_metrics_dir
)

# Combined metrics per Tissue
for tissue, samples in tissue_to_sample_mapping.items():
for ms in settings.QC_SUFFIXES:
metric_files: List[str] = io_utils.list_files(
qc_cohort_metrics_dir, substrs=[f"{sample}_{ms}.csv" for sample in samples]
)

metric_files = list(filter(lambda mf: "combined" not in mf, metric_files))

# Define an aggregated metric DataFrame
combined_metric_tissue_df: pd.DataFrame = pd.concat(
(pd.read_csv(os.path.join(qc_cohort_metrics_dir, mf)) for mf in metric_files)
)

combined_metric_tissue_df.to_csv(
srivarra marked this conversation as resolved.
Show resolved Hide resolved
os.path.join(qc_cohort_metrics_dir, f"{tissue}_combined_{ms}.csv"),
index=False,
)
Loading