V2.0 #96

Merged: 86 commits, May 3, 2023
Commits
1d566ec
wip
ckmah Jul 26, 2022
971adf5
cleanup
ckmah Aug 16, 2022
091617c
simplify tensor decomposition for signature analysis
ckmah Aug 16, 2022
beaa4d9
modularize counting neighbors
ckmah Aug 16, 2022
1b4bec0
Merge branch 'colocation' of https://github.com/ckmah/bento-tools int…
ckmah Aug 16, 2022
051b31d
modularize neighbor counting
ckmah Aug 16, 2022
d9e9321
simplify decomposition and use rasterio for shapes
ckmah Aug 16, 2022
17820d4
cleanup diff plots
ckmah Aug 29, 2022
7851f08
shape features
ckmah Aug 29, 2022
6bc5a9e
resolve merge conflicts
ckmah Aug 29, 2022
0232082
refactor sample features to be combinatorial, cleanup api
ckmah Aug 31, 2022
ffd79dd
feature refactor
ckmah Sep 15, 2022
f88d3d4
unstable
ckmah Oct 11, 2022
e4576ac
pretty colors
ckmah Oct 18, 2022
2d4329c
wip enrichment, embedding, plotting and more
ckmah Oct 26, 2022
9888d85
wip start splitting up plot functions
ckmah Oct 29, 2022
3045d9d
wip many things
ckmah Nov 12, 2022
0ac4590
wip
ckmah Dec 13, 2022
16a1b5c
reworked some api and updated rna flow method
ckmah Jan 3, 2023
ce15288
add minisom dep
ckmah Jan 3, 2023
673e722
add robust CLR transform to flow; flowmap auto cluster flow embedding
ckmah Jan 11, 2023
dd8ae28
remove clr from flow, add shape compositions, cleanup shape features,…
ckmah Jan 18, 2023
09bcd56
cleanup flow, colocation, syncing utility functions
ckmah Feb 1, 2023
2efe13e
bugfixes
ckmah Feb 9, 2023
823f759
n_neighbors flow increase usability
amonell Feb 10, 2023
5fc20bb
Merge pull request #89 from ckmah/colocation
ckmah Feb 14, 2023
e91d854
auto install poetry and dev bento-tools
ckmah Feb 14, 2023
c540403
bug fix for geopandas df IO of non-shapely objects
noor01 Feb 19, 2023
a7312c7
obs_stats use stripplots
ckmah Feb 20, 2023
ef7bd8d
geo format convert handle None
ckmah Feb 28, 2023
0217658
plotting improvements
ckmah Feb 28, 2023
51ff501
flowmap polygons allow inner rings
ckmah Feb 28, 2023
85925f7
plot flowmap elbow plot fast, cleanup imports
ckmah Feb 28, 2023
09c44c1
mistakenly deleted code
ckmah Feb 28, 2023
6095040
build codespace container with poetry and package dev enabled
ckmah Feb 17, 2023
a23bd8b
dependency cleanup
ckmah Feb 17, 2023
1c611be
cleanup tools api
ckmah Feb 17, 2023
c200e01
geometry cleanup
ckmah Feb 28, 2023
9a7092e
remove lock file from repo
ckmah Feb 28, 2023
265a245
more code cleanup
ckmah Feb 28, 2023
8499a1d
remove preprocessing module
ckmah Mar 1, 2023
80beaad
rename flow to flux
ckmah Mar 1, 2023
da9d900
cleanup savefig and add xia2019 gs
ckmah Mar 13, 2023
9269207
Pending changes exported from your codespace
ckmah Mar 13, 2023
5432d20
Merge branch 'codespace-ckmah-supreme-robot-x97xw9gp56hvv7' into v2.0
ckmah Mar 14, 2023
89597f8
extract pt registration fn
ckmah Mar 21, 2023
2efafbf
modularize enrichment fns
ckmah Mar 21, 2023
10a3b55
obs stats nucleus optional
ckmah Mar 21, 2023
f5a4d5d
fix cbars overlapping ticklabels
ckmah Mar 21, 2023
94a0734
refactor plotting API
ckmah Mar 21, 2023
11aa465
update deps
ckmah Mar 21, 2023
bb02a9d
update api docs
ckmah Mar 21, 2023
4045b9d
inset colorbar
ckmah Mar 31, 2023
1d9534e
list full feature descriptions
ckmah Mar 31, 2023
b99725d
bugfixes, colors, better default params, formatting, comments
ckmah Mar 31, 2023
d589897
update tests
ckmah Mar 31, 2023
aded187
revamp tutorials
ckmah Mar 31, 2023
0c0e326
more doc updates
ckmah Apr 1, 2023
41e28e6
fix fluxmap save fig
ckmah Apr 3, 2023
63b5d2a
typechecking
ckmah Apr 3, 2023
b3bdde2
cleanup and bump version
ckmah Apr 3, 2023
1107d6f
doc update
ckmah Apr 3, 2023
22dbe37
Merge branch 'master' into v2.0
ckmah Apr 4, 2023
a233227
Update python-package.yml
ckmah Apr 4, 2023
814f10c
Update python-package.yml
ckmah Apr 4, 2023
7d67384
Update python-package.yml
ckmah Apr 4, 2023
e172e0a
add install info for non-python deps
ckmah Apr 4, 2023
727f817
Merge branch 'v2.0' of github.com:ckmah/bento-tools into v2.0
ckmah Apr 4, 2023
fe000bd
typo
ckmah Apr 4, 2023
26d4e90
Merge pull request #97 from ckmah/ckmah-unit-tests
ckmah Apr 4, 2023
0a9082f
Update python-package.yml
ckmah Apr 4, 2023
bbe63bf
cleanup test warnings
ckmah Apr 4, 2023
ef839a6
Merge branch 'v2.0' of github.com:ckmah/bento-tools into v2.0
ckmah Apr 4, 2023
f6525db
Update python-package.yml
ckmah Apr 4, 2023
2325a66
plotting tests
ckmah Apr 5, 2023
3ae4dcf
Merge branch 'v2.0' of github.com:ckmah/bento-tools into v2.0
ckmah Apr 5, 2023
d64c875
syntax errors in tests
ckmah Apr 5, 2023
e8b952c
handle slicing metadata by type, ignore empty
ckmah Apr 24, 2023
ee38d89
hide obnoxious progress bars
ckmah Apr 24, 2023
ef90f37
features do not overwrite by default
ckmah Apr 24, 2023
bfb382c
fix typos
ckmah Apr 24, 2023
1299972
upgrade anndata to >=0.8
ckmah Apr 24, 2023
e32336d
quantile plot handles nan values
ckmah Apr 27, 2023
0b754a4
features handle missing and recompute flag, add tests
ckmah Apr 27, 2023
fa70926
fix typos, docs, explicit params
ckmah Apr 27, 2023
90f4ab4
Merge pull request #98 from ckmah/bugfix-avery
ckmah May 1, 2023
Files changed
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -41,7 +41,7 @@
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "pip3 install --user -r requirements.txt",
"postCreateCommand": "pip3 install poetry==1.2.0; pip3 install -e .",

// Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
"remoteUser": "vscode"
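The new `postCreateCommand` pins Poetry and installs the package in editable mode. A quick sanity check once the container is up (a sketch; assumes the distribution name `bento-tools`, as published on PyPI):

import importlib.metadata

# Should print the installed version if `pip3 install -e .` succeeded
print(importlib.metadata.version("bento-tools"))
import bento  # and the package should import cleanly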
17 changes: 7 additions & 10 deletions .github/workflows/python-package.yml
@@ -15,33 +15,30 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os : [ubuntu-18.04, macos-11, macos-12, windows-2019]
os : [ubuntu-22.04, macos-11, macos-12, windows-2019]
python-version: ['3.8', '3.9']
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3.5.0
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v3.1.3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install .[torch,docs]
- name: Lint with flake8
if: ${{ matrix.os == 'ubuntu-18.04' && matrix.python-version == '3.8' }}
python -m pip install .[docs]
- name: Lint & test coverage
if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.8' }}
run: |
pip install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Generate Report
if: ${{ matrix.os == 'ubuntu-18.04' && matrix.python-version == '3.8' }}
run: |
pip install coverage
coverage run -m unittest
- name: Upload Coverage to Codecov
if: ${{ matrix.os == 'ubuntu-18.04' && matrix.python-version == '3.8' }}
if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.8' }}
uses: codecov/codecov-action@v1
with:
fail_ci_if_error: true
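The failing lint step is deliberately narrow: `--select=E9,F63,F7,F82` gates the build only on syntax errors and undefined names, while the second `--exit-zero` run reports style issues without failing. A hypothetical snippet of the kind of defect the strict pass catches (not from the repo):

def summarize(counts):
    total = sum(counts)
    # F821 (selected via F82): `undefind_total` is an undefined name,
    # so this file would fail the strict flake8 pass above
    return undefind_total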
7 changes: 7 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# Poetry
poetry.lock

.vscode/

# Byte-compiled / optimized / DLL files
@@ -132,3 +135,7 @@ dmypy.json
# Pyre type checker
.pyre/
docs/build.zip
tests/data/processed/data.pt
tests/data/processed/pre_filter.pt
tests/data/processed/pre_transform.pt
.DS_Store
27 changes: 4 additions & 23 deletions README.md
@@ -1,34 +1,15 @@

[![PyPI version](https://badge.fury.io/py/bento-tools.svg)](https://badge.fury.io/py/bento-tools)
[![codecov](https://codecov.io/gh/ckmah/bento-tools/branch/master/graph/badge.svg?token=XVHDKNDCDT)](https://codecov.io/gh/ckmah/bento-tools)
[![Documentation Status](https://readthedocs.org/projects/bento-tools/badge/?version=latest)](https://bento-tools.readthedocs.io/en/latest/?badge=latest)
![PyPI - Downloads](https://img.shields.io/pypi/dm/bento-tools)
[![GitHub stars](https://badgen.net/github/stars/ckmah/bento-tools)](https://GitHub.com/Naereen/ckmah/bento-tools)

> ### :warning: Significant upgrades coming soon, with additional analysis and data ingestion methods!

<img src="docs/source/_static/bento-name.png" alt="Bento Logo" width=350>

Bento is a Python toolkit for performing subcellular analysis of spatial transcriptomics data.

# Get started
Install with Python >=3.8 and <3.11:
```bash
pip install bento-tools
```

Check out the [documentation](https://bento-tools.readthedocs.io/en/latest/) for the installation guide, tutorials, API and more! Read and cite [our preprint](https://doi.org/10.1101/2022.06.10.495510) if you use Bento in your work.


# Main Features

<img src="docs/source/_static/tutorial_img/bento_workflow.png" alt="Bento Analysis Workflow" width=800>
# Bento

Bento is a Python toolkit for performing subcellular analysis of spatial transcriptomics data. The package is part of the [Scverse ecosystem](https://scverse.org/packages/#ecosystem). Check out the [documentation](https://bento-tools.readthedocs.io/en/latest/) for installation instructions, tutorials, and API. Cite [our preprint](https://doi.org/10.1101/2022.06.10.495510) if you use Bento in your work. Thanks!

- Store molecular coordinates and segmentation masks
- Visualize spatial transcriptomics data at subcellular resolution
- Compute subcellular spatial features
- Predict localization patterns and signatures
- Factor decomposition for high-dimensional spatial feature sets
<img src="docs/source/_static/tutorial_img/bento_tools.png" alt="Bento Workflow" width="800">

---
[![GitHub license](https://img.shields.io/github/license/ckmah/bento-tools.svg)](https://github.com/ckmah/bento-tools/blob/master/LICENSE)
8 changes: 5 additions & 3 deletions bento/__init__.py
@@ -1,6 +1,8 @@
from . import datasets
from . import datasets as ds
from . import io
from . import plotting as pl
from . import preprocessing as pp
from . import tools as tl
from ._utils import PATTERN_NAMES, TENSOR_DIM_NAMES
from . import _utils as ut
from . import geometry as geo
from .plotting import _colors as colors
from ._utils import sync
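With the reorganized `__init__`, submodules hang off short aliases and `sync` is available at the top level. A usage sketch (the `bt` alias and the workflow shown are illustrative, not part of this diff):

import bento as bt

adata = bt.ds.sample_data()  # datasets module, aliased as `ds` above
bt.sync(adata)               # `sync` re-exported from bento._utils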
18 changes: 18 additions & 0 deletions bento/_constants.py
@@ -0,0 +1,18 @@
PATTERN_COLORS = ["#17becf", "#1f77b4", "#7f7f7f", "#ff7f0e", "#d62728"]
PATTERN_NAMES = ["cell_edge", "cytoplasmic", "none", "nuclear", "nuclear_edge"]
PATTERN_PROBS = [f"{p}_p" for p in PATTERN_NAMES]
PATTERN_FEATURES = [
"cell_inner_proximity",
"nucleus_inner_proximity",
"nucleus_outer_proximity",
"cell_inner_asymmetry",
"nucleus_inner_asymmetry",
"nucleus_outer_asymmetry",
"l_max",
"l_max_gradient",
"l_min_gradient",
"l_monotony",
"l_half_radius",
"point_dispersion_norm",
"nucleus_dispersion_norm",
]
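`PATTERN_COLORS` is ordered to match `PATTERN_NAMES` (cyan, blue, gray, orange, red, per the comment removed from `_utils.py` below), so a name-to-color mapping is a direct `zip`. A small sketch:

from bento._constants import PATTERN_COLORS, PATTERN_NAMES, PATTERN_PROBS

pattern_palette = dict(zip(PATTERN_NAMES, PATTERN_COLORS))
print(PATTERN_PROBS)  # ['cell_edge_p', 'cytoplasmic_p', 'none_p', ...]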
203 changes: 182 additions & 21 deletions bento/_utils.py
@@ -1,20 +1,13 @@
import inspect
from functools import wraps

from anndata import AnnData

import warnings
import geopandas as gpd
import pandas as pd
import seaborn as sns
from anndata import AnnData
from functools import wraps
from typing import Iterable
from shapely import wkt

PATTERN_NAMES = ["cell_edge", "cytoplasmic", "none", "nuclear", "nuclear_edge"]
PATTERN_PROBS = [f"{p}_p" for p in PATTERN_NAMES]
TENSOR_DIM_NAMES = ["layers", "cells", "genes"]

# Colors correspond to order of PATTERN_NAMES: cyan, blue, gray, orange, red
PATTERN_COLORS = ['#17becf', '#1f77b4', '#7f7f7f', '#ff7f0e', '#d62728']

# Colors to represent each dimension (features, cells, genes); Set2 palette n_colors=3
DIM_COLORS = ['#66c2a5', '#fc8d62', '#8da0cb']
# ['#AD6A6C', '#f5b841', '#0cf2c9']

def get_default_args(func):
signature = inspect.signature(func)
@@ -28,7 +21,7 @@ def get_default_args(func):
def track(func):
"""
Track changes in AnnData object after applying function.

1. First remembers a shallow list of AnnData attributes by listing keys from obs, var, etc.
2. Perform arbitrary task
3. List attributes again, perform simple diff between list of old and new attributes
@@ -70,7 +63,6 @@ def wrapper(*args, **kwds):

modified = False
for attr in old_attr.keys():

if attr == "n_obs" or attr == "n_vars":
continue

@@ -146,16 +138,185 @@ def pheno_to_color(pheno, palette):
List of converted colors for each sample, formatted as RGBA tuples.

"""
import seaborn as sns

if type(palette) is str:
if isinstance(palette, str):
palette = sns.color_palette(palette)
else:
palette = palette

values = list(set(pheno))
values.sort()
palette = sns.color_palette(palette, n_colors=len(values))
study2color = dict(zip(values, palette))
sample_colors = [study2color[v] for v in pheno]
return study2color, sample_colors
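# Usage sketch (illustrative; any seaborn palette name or color list works):
#   label2color, sample_colors = pheno_to_color(["A", "B", "A"], palette="muted")
#   label2color maps each unique label to a color; sample_colors has one
#   color tuple per entry of `pheno`, e.g. for coloring rows in a plot.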


def sync(data, copy=False):
"""
Sync existing point sets and associated metadata with data.obs_names and data.var_names

Parameters
----------
data : AnnData
Spatial formatted AnnData object
copy : bool, optional
"""
adata = data.copy() if copy else data

if "point_sets" not in adata.uns.keys():
adata.uns["point_sets"] = dict(points=[])

# Iterate over point sets
for point_key in adata.uns["point_sets"]:
points = adata.uns[point_key]

# Subset for cells
cells = adata.obs_names.tolist()
in_cells = points["cell"].isin(cells)

# Subset for genes
in_genes = [True] * points.shape[0]
if "gene" in points.columns:
genes = adata.var_names.tolist()
in_genes = points["gene"].isin(genes)

# Combine boolean masks
valid_mask = (in_cells & in_genes).values

# Sync points using mask
points = points.loc[valid_mask]

# Remove unused categories for categorical columns
for col in points.columns:
if points[col].dtype == "category":
points[col].cat.remove_unused_categories(inplace=True)

adata.uns[point_key] = points

# Sync point metadata using mask
for metadata_key in adata.uns["point_sets"][point_key]:
if metadata_key not in adata.uns:
warnings.warn(
f"Skipping: metadata {metadata_key} not found in adata.uns"
)
continue

metadata = adata.uns[metadata_key]
# Slice DataFrame if not empty
if isinstance(metadata, pd.DataFrame) and not metadata.empty:
adata.uns[metadata_key] = metadata.loc[valid_mask, :]

# Slice Iterable if not empty
elif isinstance(metadata, list) and any(metadata):
adata.uns[metadata_key] = [
m for i, m in enumerate(metadata) if valid_mask[i]
]
elif isinstance(metadata, Iterable) and metadata.shape[0] > 0:
adata.uns[metadata_key] = adata.uns[metadata_key][valid_mask]
else:
warnings.warn(f"Metadata {metadata_key} is not a DataFrame or Iterable")

return adata if copy else None
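# Usage sketch: call after subsetting the AnnData so registered point sets
# and their per-point metadata stay aligned with obs/var names
# ("leiden" is a hypothetical obs column):
#   adata = adata[adata.obs["leiden"] == "0"].copy()
#   sync(adata)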


def _register_points(data, point_key, metadata_keys):
required_cols = ["x", "y", "cell"]

if point_key not in data.uns.keys():
raise ValueError(f"Key {point_key} not found in data.uns")

points = data.uns[point_key]

if not all([col in points.columns for col in required_cols]):
raise ValueError(
f"Point DataFrame must have columns {', '.join(required_cols)}"
)

# Check for valid cells
cells = data.obs_names.tolist()
if not points["cell"].isin(cells).all():
raise ValueError("Invalid cells in point DataFrame")

# Initialize/add to point registry
if "point_sets" not in data.uns.keys():
data.uns["point_sets"] = dict()

if point_key not in data.uns["point_sets"].keys():
data.uns["point_sets"][point_key] = []

if len(metadata_keys) == 0:
return

# Register metadata
for key in metadata_keys:
# Check for valid metadata
if key not in data.uns.keys():
raise ValueError(f"Key {key} not found in data.uns")

n_points = data.uns[point_key].shape[0]
metadata_len = data.uns[key].shape[0]
if metadata_len != n_points:
raise ValueError(
f"Metadata {key} must have same length as points {point_key}"
)

# Add metadata key to registry
if key not in data.uns["point_sets"][point_key]:
data.uns["point_sets"][point_key].append(key)


def register_points(point_key: str, metadata_keys: list):
"""Decorator function to register points to the current `AnnData` object.
This keeps track of point sets and keeps them in sync with `AnnData` object.

Parameters
----------
point_key : str
Key where points are stored in `data.uns`
metadata_keys : list
Keys where point metadata are stored in `data.uns`
"""

def decorator(func):
@wraps(func)
def wrapper(*args, **kwds):
kwargs = get_default_args(func)
kwargs.update(kwds)

func(*args, **kwds)
data = args[0]
# Check for required columns
return _register_points(data, point_key, metadata_keys)

return wrapper

return decorator
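# Usage sketch (hypothetical keys): a tool that writes points to
# data.uns["my_points"] and a same-length table to data.uns["my_scores"]
# registers both so sync() keeps them aligned:
#   @register_points("my_points", ["my_scores"])
#   def my_tool(data):
#       ...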


def sc_format(data, copy=False):
"""
Convert data.obs GeoPandas columns to string for compatibility with scanpy.
"""
adata = data.copy() if copy else data

shape_names = data.obs.columns.str.endswith("_shape")

for col in data.obs.columns[shape_names]:
adata.obs[col] = adata.obs[col].astype(str)

return adata if copy else None


def geo_format(data, copy=False):
"""
Convert data.obs scanpy columns to GeoPandas compatible types.
"""
adata = data.copy() if copy else data

shape_names = adata.obs.columns[adata.obs.columns.str.endswith("_shape")]

adata.obs[shape_names] = adata.obs[shape_names].apply(
lambda col: gpd.GeoSeries(
col.astype(str).apply(lambda val: wkt.loads(val) if val != "None" else None)
)
)

return adata if copy else None
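`sc_format` and `geo_format` are inverses over `obs` columns whose names end in `_shape`: geometries are serialized to their WKT string form (what `str()` of a shapely object yields) for scanpy/h5ad IO, then parsed back with `shapely.wkt.loads`, with `"None"` handled explicitly. A round-trip sketch (assumes a spatially formatted AnnData as elsewhere in this module):

import bento as bt

bt.ut.sc_format(adata)   # shapely -> WKT strings, safe to write as h5ad
bt.ut.geo_format(adata)  # WKT strings -> shapely geometries, None-safe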
6 changes: 5 additions & 1 deletion bento/datasets/__init__.py
@@ -1 +1,5 @@
from ._datasets import get_dataset_info, load_dataset, sample_data
from ._datasets import (
get_dataset_info,
load_dataset,
sample_data,
)