
Refactor/cell type prediction #99

Merged (6 commits, Oct 7, 2024)
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -24,7 +24,7 @@ Welcome to the documentation of spatialproteomics!
notebooks/ExampleWorkflow
notebooks/Slicing
notebooks/Segmentation
notebooks/Cropping
notebooks/ArtifactRemoval
notebooks/Plotting
notebooks/ImageProcessing
notebooks/Extracting
@@ -5,7 +5,7 @@
"id": "c463dc52-4a63-4a2e-9803-2e25ed1ec6ae",
"metadata": {},
"source": [
"# Custom Cropping\n",
"# Interactive Removal of Artifacts\n",
"\n",
"Sometimes only part of the tissue is usable for analysis, for example due to artifacts or poor tissue quality in a certain region. In such cases, it makes sense to mask out the problematic regions: the image can still be used for downstream analysis, but the artifacts no longer skew the results. `Spatialproteomics` allows users to upload a binary mask indicating which parts of the image are suitable for further analysis and which are not.\n",
"\n",
@@ -974,7 +974,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.10.9"
}
},
"nbformat": 4,
1,844 changes: 1,721 additions & 123 deletions docs/notebooks/CellTypePrediction.ipynb

Large diffs are not rendered by default.

193 changes: 118 additions & 75 deletions docs/notebooks/ExampleWorkflow.ipynb

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions docs/notebooks/ImageProcessing.ipynb
@@ -33,8 +33,7 @@
"\n",
"import spatialproteomics\n",
"import matplotlib.pyplot as plt\n",
"import xarray as xr\n",
"xr.set_options(display_style='text')"
"import xarray as xr"
]
},
{
@@ -142,7 +141,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The marker above seems to have worked fairly well, but you can observe some unspecific binding. To boost the signal-to-noise ratio, we can threshold the image. This basically means that every value below that threshold gets set to zero. You can either threshold using an absolute intensity, or using a quantile value."
"The marker above seems to have worked fairly well, but you can observe some unspecific binding. To boost the signal-to-noise ratio, we can threshold the image. This means that every value below the threshold is set to zero, and all remaining intensities are shifted downwards by the threshold value (you can deactivate this shift with `pp.threshold(shift=False)`, which only sets values below the threshold to 0 and keeps the other values unchanged). You can threshold using either an absolute intensity or a quantile value."
]
},
{
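The two thresholding behaviours described in the notebook can be sketched in plain NumPy (a minimal illustration of the arithmetic, not the library implementation; `threshold_channel` is a hypothetical helper name):

```python
import numpy as np

def threshold_channel(img, threshold, shift=True):
    # shift=True: subtract the threshold and clip at zero, so values below
    # the threshold become 0 and the remaining intensities move down by it
    # shift=False: only zero out values below the threshold
    img = np.asarray(img, dtype=float)
    if shift:
        return np.clip(img - threshold, 0.0, None)
    out = img.copy()
    out[out < threshold] = 0.0
    return out

print(threshold_channel([1.0, 5.0, 10.0], 4.0))               # [0. 1. 6.]
print(threshold_channel([1.0, 5.0, 10.0], 4.0, shift=False))  # [0. 5. 10.]
```

A quantile-based threshold would simply compute `threshold` as `np.quantile(img, q)` before applying the same logic.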
10 changes: 5 additions & 5 deletions docs/notebooks/Interoperability.ipynb
@@ -507,9 +507,9 @@
],
"source": [
"# putting the expression matrix into an anndata object\n",
"adata = ds.ext.convert_to_anndata(expression_matrix_key=\"_arcsinh_mean\", \n",
" additional_layers={\"arcsinh_sum\": \"_arcsinh_sum\", \"raw_mean\": \"_raw_mean\", \"raw_sum\": \"_raw_sum\"}, \n",
" additional_uns={\"label_colors\": \"_labels\"})\n",
"adata = ds.tl.convert_to_anndata(expression_matrix_key=\"_arcsinh_mean\", \n",
" additional_layers={\"arcsinh_sum\": \"_arcsinh_sum\", \"raw_mean\": \"_raw_mean\", \"raw_sum\": \"_raw_sum\"}, \n",
" additional_uns={\"label_colors\": \"_labels\"})\n",
"adata"
]
},
@@ -567,7 +567,7 @@
}
],
"source": [
"spatialdata_object = ds.ext.convert_to_spatialdata(expression_matrix_key=\"_arcsinh_mean\")\n",
"spatialdata_object = ds.tl.convert_to_spatialdata(expression_matrix_key=\"_arcsinh_mean\")\n",
"spatialdata_object"
]
},
@@ -622,7 +622,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.10.9"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion docs/notebooks/Neighborhoods.ipynb
@@ -32,7 +32,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Neighborhood Analsis with the ImageContainer\n",
"## Neighborhood Analysis with the ImageContainer\n",
"\n",
"As a first step, we open three datasets and store them inside of a dictionary."
]
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ name = "spatialproteomics"
packages = [
{ include = "spatialproteomics" },
]
version = "0.5.7"
version = "0.6.0"
description = "spatialproteomics provides tools for the analysis of highly multiplexed immunofluorescence data"
readme = "README.md"
authors = ["Harald Vohringer", "Matthias Meyer-Bender"]
6 changes: 6 additions & 0 deletions spatialproteomics/container.py
@@ -50,9 +50,15 @@ def load_image_data(

if labels is not None:
assert segmentation is not None, "Labels may only be provided in conjunction with a segmentation."
assert (
labels.shape[0] == np.unique(segmentation).shape[0] - 1
), f"Number of labels must match number of segments. Got {labels.shape[0]} labels, but segmentation contained {np.unique(segmentation).shape[0] - 1} cells."

if neighborhood is not None:
assert labels is not None, "Neighborhoods may only be provided in conjunction with labels."
assert (
neighborhood.shape[0] == labels.shape[0]
), f"Number of neighborhoods must match number of labels. Got {neighborhood.shape[0]} neighborhoods, but {labels.shape[0]} labels."

im = xr.DataArray(
image,
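The two new assertions amount to a consistency check between segmentation, labels, and neighborhoods: one label per cell (unique segment IDs minus the 0 background) and one neighborhood entry per label. A standalone sketch of that validation logic (`validate_inputs` is a hypothetical name, not the library function):

```python
import numpy as np

def validate_inputs(segmentation, labels=None, neighborhood=None):
    # one label per cell: count unique segmentation values, excluding background (0)
    if labels is not None:
        n_cells = np.unique(segmentation).shape[0] - 1
        assert labels.shape[0] == n_cells, (
            f"Got {labels.shape[0]} labels, but segmentation contained {n_cells} cells."
        )
    # one neighborhood entry per label
    if neighborhood is not None:
        assert labels is not None, "Neighborhoods require labels."
        assert neighborhood.shape[0] == labels.shape[0]

# segmentation with background (0) and two cells (1 and 2) passes with two labels
seg = np.array([[0, 1], [2, 2]])
validate_inputs(seg, labels=np.array(["T cell", "B cell"]))
```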
47 changes: 46 additions & 1 deletion spatialproteomics/la/label.py
@@ -7,7 +7,11 @@

from ..base_logger import logger
from ..constants import COLORS, Dims, Features, Labels, Layers, Props
from ..la.utils import _format_labels
from ..la.utils import (
_format_labels,
_get_markers_from_subtype_dict,
_predict_cell_subtypes,
)


@xr.register_dataset_accessor("la")
@@ -925,3 +929,44 @@ def add_properties(
)

return xr.merge([da, self._obj])

def predict_cell_subtypes(self, subtype_dict: dict, overwrite_existing_labels: bool = True) -> xr.Dataset:
"""
Predict cell subtypes based on the binarized marker intensities.

Parameters
----------
subtype_dict : dict
Hierarchical dictionary defining the cell subtypes and the markers associated with them.
overwrite_existing_labels : bool
If True, existing labels will be overwritten by the new, more granular cell type predictions. Default is True.

Returns
-------
xr.Dataset
The updated image container with the predicted cell subtypes.
"""
# check if we have labels in the object
assert Layers.LA_PROPERTIES in self._obj, "No cell type labels found in the object. Please add labels first."

# first, we want to recursively check if all markers are binarized
# if not, we throw an error and ask the user to binarize the markers first
binarized_markers = [
x.replace("_binarized", "") for x in self._obj.pp.get_layer_as_df().columns if "_binarized" in x
]
markers_for_subtype_prediction = _get_markers_from_subtype_dict(subtype_dict)
# checking if all markers are binarized
assert all([marker in binarized_markers for marker in markers_for_subtype_prediction]), (
"All markers must be binarized before predicting cell subtypes. Please use the la.threshold_labels() method to binarize the markers first. Missing markers: "
+ ", ".join([marker for marker in markers_for_subtype_prediction if marker not in binarized_markers])
)

# predicting all of the different levels
subtype_df = _predict_cell_subtypes(self._obj.pp.get_layer_as_df(), subtype_dict)

# adding the subtypes to the object
obj = self._obj.copy()
obj = obj.pp.add_obs_from_dataframe(subtype_df)

final_layer = subtype_df.columns[-1]
if overwrite_existing_labels:
obj = obj.pp.drop_layers(Layers.LA_PROPERTIES, suppress_warnings=True)
subtype_df["cell"] = subtype_df.index
obj = obj.la.add_labels_from_dataframe(subtype_df, cell_col="cell", label_col=final_layer)

return obj
90 changes: 90 additions & 0 deletions spatialproteomics/la/utils.py
@@ -32,3 +32,93 @@ def _format_labels(labels):
formatted_labels, _, _ = relabel_sequential(formatted_labels)

return formatted_labels


def _get_markers_from_subtype_dict(subtype_dict):
markers = []

def extract_markers(subtypes):
for subtype in subtypes:
# Add markers from the current subtype
markers.extend(subtype["markers"])
# Recursively extract markers from nested subtypes
if "subtypes" in subtype:
extract_markers(subtype["subtypes"])

for cell_type, details in subtype_dict.items():
if "subtypes" in details:
extract_markers(details["subtypes"])

return markers


def _predict_cell_subtypes(df, subtype_dict):
"""
Predicts cell subtypes based on a hierarchical dictionary of cell types and their markers.

Parameters
----------
df : pandas.DataFrame
DataFrame containing cell data with binarized marker columns.
subtype_dict : dict
Dictionary defining the hierarchy of cell types and their markers.
Each key is a cell type, and its value is a dictionary with keys:
- 'markers': List of markers associated with the cell type.
- 'subtypes': List of dictionaries defining subtypes, each with 'name' and 'markers'.

Returns
-------
pandas.DataFrame
DataFrame with new columns for each level of cell type annotations,
named 'labels_0', 'labels_1', etc.
"""
# Create a copy of the dataframe to avoid modifying the original
df = df.copy()

# Level-wise breadth-first search to iterate through the hierarchy
level = 0
queue = [(None, subtype_dict)] # Queue holds (parent_type, current_dict)

while queue:
next_queue = []
for parent_type, current_dict in queue:
for cell_type, cell_info in current_dict.items():
# Get subtypes if present
subtypes = cell_info.get("subtypes", [])

# If subtypes exist, add each to the queue
for subtype in subtypes:
next_queue.append((cell_type, {subtype["name"]: subtype}))

# Get markers for this cell type
markers = cell_info.get("markers", [])

# Check which cells match the markers and parent type
for marker in markers:
if parent_type is None:
# Root level, assign to cells based on marker positivity
condition = df[f"{marker}_binarized"] == 1
else:
# Must match both parent type and marker
# Make this dependent on the previous level
previous_label_column = f"labels_{level - 1}" if level > 0 else "_labels"
condition = (df[f"{marker}_binarized"] == 1) & (df[previous_label_column] == parent_type)

# Update labels for current level (adding a new column)
new_label_column = f"labels_{level}"
df.loc[condition, new_label_column] = cell_type

# Replace NaNs in the new label column with the annotations from the previous level
if level > 0:
previous_label_column = f"labels_{level - 1}"
new_label_column = f"labels_{level}"
df[new_label_column] = df[new_label_column].fillna(df[previous_label_column])
else:
# If level is 0, copy the root labels to the new label column
df["labels_0"] = df["_labels"]

# Update queue and increase level
queue = next_queue
level += 1

# selecting only the new cell type annotations
df = df[[x for x in df.columns if "labels_" in x]]
return df
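The level-wise refinement above can be condensed into a small standalone sketch (simplified from the function above, not the library code; the cell types, markers, and toy data are illustrative only):

```python
import pandas as pd

def predict_subtypes_sketch(df, subtype_dict):
    # level 0 copies the existing coarse labels; each deeper level refines
    # cells whose parent label matches and whose marker is positive
    df = df.copy()
    df["labels_0"] = df["_labels"]
    level = 1
    queue = [(name, info.get("subtypes", [])) for name, info in subtype_dict.items()]
    while any(subs for _, subs in queue):
        next_queue = []
        prev, cur = f"labels_{level - 1}", f"labels_{level}"
        df[cur] = df[prev]  # start from the previous level's annotation
        for parent, subs in queue:
            for sub in subs:
                for marker in sub.get("markers", []):
                    hit = (df[f"{marker}_binarized"] == 1) & (df[prev] == parent)
                    df.loc[hit, cur] = sub["name"]
                next_queue.append((sub["name"], sub.get("subtypes", [])))
        queue = next_queue
        level += 1
    return df[[c for c in df.columns if c.startswith("labels_")]]

# toy data: two T cells (one CD4+, one CD8+) and a B cell
df = pd.DataFrame({
    "_labels": ["T", "T", "B"],
    "CD4_binarized": [1, 0, 0],
    "CD8_binarized": [0, 1, 0],
})
subtype_dict = {
    "T": {"subtypes": [{"name": "T_CD4", "markers": ["CD4"]},
                       {"name": "T_CD8", "markers": ["CD8"]}]},
    "B": {},
}
print(predict_subtypes_sketch(df, subtype_dict))
```

Cells that match no subtype marker keep their previous level's label, which mirrors the `fillna` fallback in the full implementation.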