From 8e9b5a5a05e2046dee46e1d63f1362b0fd850d80 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Mon, 27 Jun 2022 11:17:12 -0400
Subject: [PATCH 01/12] add license info for the PBA+ algorithm

---
 3rdparty/LICENSE.pba+ | 21 +++++++++++++++++++++
 LICENSE-3rdparty.md   |  6 ++++++
 2 files changed, 27 insertions(+)
 create mode 100644 3rdparty/LICENSE.pba+

diff --git a/3rdparty/LICENSE.pba+ b/3rdparty/LICENSE.pba+
new file mode 100644
index 000000000..9d0b4030a
--- /dev/null
+++ b/3rdparty/LICENSE.pba+
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 School of Computing, National University of Singapore
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/LICENSE-3rdparty.md b/LICENSE-3rdparty.md
index 0a3789af7..984d0e952 100644
--- a/LICENSE-3rdparty.md
+++ b/LICENSE-3rdparty.md
@@ -281,3 +281,9 @@ StainTools
   - https://github.com/Peter554/StainTools/blob/master/LICENSE.txt
 - Copyright: Peter Byfield
 - Usage: reference for stain color normalization algorithm
+
+PBA+
+- License: MIT License
+  - https://github.com/orzzzjq/Parallel-Banding-Algorithm-plus/blob/master/LICENSE
+- Copyright: School of Computing, National University of Singapore
+- Usage: PBA+ is used to implement the Euclidean distance transform.

From 0462a2c6935e7f240b5f32b23d8dc611a59f1bf7 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Mon, 27 Jun 2022 13:09:54 -0400
Subject: [PATCH 02/12] Add distance_transform_edt implementation based on PBA+

---
 .../core/operations/morphology/__init__.py    |   3 +
 .../morphology/_distance_transform.py         | 173 +++++++
 .../core/operations/morphology/_pba_2d.py     | 301 +++++++++++++
 .../core/operations/morphology/_pba_3d.py     | 264 +++++++++++
 .../morphology/cuda/pba_kernels_2d.h          | 425 ++++++++++++++++++
 .../morphology/cuda/pba_kernels_3d.h          | 209 +++++++++
 6 files changed, 1375 insertions(+)
 create mode 100644 python/cucim/src/cucim/core/operations/morphology/__init__.py
 create mode 100644 python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
 create mode 100644 python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
 create mode 100644 python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
 create mode 100644 python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h
 create mode 100644 python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h

diff --git a/python/cucim/src/cucim/core/operations/morphology/__init__.py b/python/cucim/src/cucim/core/operations/morphology/__init__.py
new file mode 100644
index 000000000..a9c676edb
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/__init__.py
@@ -0,0 +1,3 @@
+from ._distance_transform import distance_transform_edt
+
+__all__ = ["distance_transform_edt"]
diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
new file mode 100644
index 000000000..372227e85
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
@@ -0,0 +1,173 @@
+import numpy as np
+
+from ._pba_2d import _pba_2d
+from ._pba_3d import _pba_3d
+
+
+# TODO: make a 64-bit packed coordinates variant in 3D for large shapes
+#       support sampling distances
+#       support chamfer, chessboard and l1/manhattan distances too?
+#       remove __mul24 from the pba2d code? (is there any benefit on recent GPUs?)  # noqa
+
+
+def distance_transform_edt(image, sampling=None, return_distances=True,
+                           return_indices=False, distances=None, indices=None,
+                           *, block_params=None, float64_distances=False):
+    """Exact Euclidean distance transform.
+
+    This function calculates the distance transform of the `input`, by
+    replacing each foreground (non-zero) element, with its shortest distance to
+    the background (any zero-valued element).
+
+    In addition to the distance transform, the feature transform can be
+    calculated. In this case the index of the closest background element to
+    each foreground element is returned in a separate array.
+
+    Parameters
+    ----------
+    image : array_like
+        Input data to transform. Can be any type but will be converted into
+        binary: 1 wherever image equates to True, 0 elsewhere.
+    sampling : float, or sequence of float, optional
+        Spacing of elements along each dimension. If a sequence, must be of
+        length equal to the image rank; if a single number, this is used for
+        all axes. If not specified, a grid spacing of unity is implied.
+    return_distances : bool, optional
+        Whether to calculate the distance transform.
+    return_indices : bool, optional
+        Whether to calculate the feature transform.
+    distances : float32 cupy.ndarray, optional
+        An output array to store the calculated distance transform, instead of
+        returning it. `return_distances` must be True. It must be the same
+        shape as `image`.
+    indices : int32 cupy.ndarray, optional
+        An output array to store the calculated feature transform, instead of
+        returning it. `return_indicies` must be True. Its shape must be
+        `(image.ndim,) + image.shape`.
+
+    Other Parameters
+    ----------------
+    block_params : 3-tuple of int
+        The m1, m2, m3 algorithm parameters as described in [2]_. If None,
+        suitable defaults will be chosen. Note: This parameter is specific to
+        cuCIM and does not exist in SciPy.
+    float64_distances : bool, optional
+        If True, use double precision in the distance computation (to match
+        SciPy behavior). Otherwise, single precision will be used for
+        efficiency. Note: This parameter is specific to cuCIM and does not
+        exist in SciPy.
+
+    Returns
+    -------
+    distances : float64 ndarray, optional
+        The calculated distance transform. Returned only when
+        `return_distances` is True and `distances` is not supplied. It will
+        have the same shape as `image`.
+    indices : int32 ndarray, optional
+        The calculated feature transform. It has an image-shaped array for each
+        dimension of the image. See example below. Returned only when
+        `return_indices` is True and `indices` is not supplied.
+
+    Notes
+    -----
+    The Euclidean distance transform gives values of the Euclidean distance::
+
+                    n
+      y_i = sqrt(sum (x[i]-b[i])**2)
+                    i
+
+    where b[i] is the background point (value 0) with the smallest Euclidean
+    distance to input points x[i], and n is the number of dimensions.
+
+    The parallel banding algorithm implemented here was originally described in
+    [1]_. The kernels used here correspond to the revised PBA+ implementation
+    that is described on the author's website [2]_. The source code of the
+    author's PBA+ implementation is available at [3]_.
+
+    References
+    ----------
+    ..[1] Thanh-Tung Cao, Ke Tang, Anis Mohamed, and Tiow-Seng Tan. 2010.
+        Parallel Banding Algorithm to compute exact distance transform with the
+        GPU. In Proceedings of the 2010 ACM SIGGRAPH symposium on Interactive
+        3D Graphics and Games (I3D ’10). Association for Computing Machinery,
+        New York, NY, USA, 83–90.
+        DOI:https://doi.org/10.1145/1730804.1730818
+    .. [2] https://www.comp.nus.edu.sg/~tants/pba.html
+    .. [3] https://github.com/orzzzjq/Parallel-Banding-Algorithm-plus
+
+    Examples
+    --------
+    >>> import cupy as cp
+    >>> from cucim.core.operations import morphology
+    >>> a = cp.array(([0,1,1,1,1],
+    ...               [0,0,1,1,1],
+    ...               [0,1,1,1,1],
+    ...               [0,1,1,1,0],
+    ...               [0,1,1,0,0]))
+    >>> morphology.distance_transform_edt(a)
+    array([[ 0.    ,  1.    ,  1.4142,  2.2361,  3.    ],
+           [ 0.    ,  0.    ,  1.    ,  2.    ,  2.    ],
+           [ 0.    ,  1.    ,  1.4142,  1.4142,  1.    ],
+           [ 0.    ,  1.    ,  1.4142,  1.    ,  0.    ],
+           [ 0.    ,  1.    ,  1.    ,  0.    ,  0.    ]])
+
+    With a sampling of 2 units along x, 1 along y:
+
+    >>> morphology.distance_transform_edt(a, sampling=[2,1])
+    array([[ 0.    ,  1.    ,  2.    ,  2.8284,  3.6056],
+           [ 0.    ,  0.    ,  1.    ,  2.    ,  3.    ],
+           [ 0.    ,  1.    ,  2.    ,  2.2361,  2.    ],
+           [ 0.    ,  1.    ,  2.    ,  1.    ,  0.    ],
+           [ 0.    ,  1.    ,  1.    ,  0.    ,  0.    ]])
+
+    Asking for indices as well:
+
+    >>> edt, inds = morphology.distance_transform_edt(a, return_indices=True)
+    >>> inds
+    array([[[0, 0, 1, 1, 3],
+            [1, 1, 1, 1, 3],
+            [2, 2, 1, 3, 3],
+            [3, 3, 4, 4, 3],
+            [4, 4, 4, 4, 4]],
+           [[0, 0, 1, 1, 4],
+            [0, 1, 1, 1, 4],
+            [0, 0, 1, 4, 4],
+            [0, 0, 3, 3, 4],
+            [0, 0, 3, 3, 4]]])
+
+    """
+    if distances is not None:
+        raise NotImplementedError(
+            "preallocated distances image is not supported"
+        )
+    if indices is not None:
+        raise NotImplementedError(
+            "preallocated indices image is not supported"
+        )
+    if sampling is not None:
+        sampling = np.unique(np.atleast_1d(sampling))
+        if len(sampling) != 1:
+            raise NotImplementedError(
+                "non-uniform values in sampling is not currently supported"
+            )
+
+    if image.ndim == 3:
+        pba_func = _pba_3d
+    elif image.ndim == 2:
+        pba_func = _pba_2d
+    else:
+        raise NotImplementedError(
+            "Only 2D and 3D distance transforms are supported.")
+
+    vals = pba_func(
+        image,
+        sampling=sampling,
+        return_distances=return_distances,
+        return_indices=return_indices,
+        block_params=block_params
+    )
+
+    if return_distances and sampling is not None:
+        vals[0] = vals[0] * sampling
+
+    return vals
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
new file mode 100644
index 000000000..01007072c
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
@@ -0,0 +1,301 @@
+import math
+import os
+from functools import reduce
+
+import cupy
+
+pba2d_defines_template = """
+
+// MARKER is used to mark blank pixels in the texture.
+// Any uncolored pixels will have x = MARKER.
+// Input texture should have x = MARKER for all pixels other than sites
+#define MARKER      {marker}
+#define BLOCKSIZE   {block_size_2d}
+#define pixel_int2_t {pixel_int2_t}                // typically short2 (int2 for images with > 32k pixels per side)
+#define make_pixel(x, y)  {make_pixel_func}(x, y)  // typically make_short2 (make_int2 images with > 32k pixels per side
+
+"""  # noqa
+
+
+def _init_marker(int_dtype):
+    """use a minimum value that is appropriate to the integer dtype"""
+    if int_dtype == cupy.int16:
+        # marker = cupy.iinfo(int_dtype).min
+        marker = -32768
+    elif int_dtype == cupy.int32:
+        # divide by two so we don't have to promote other intermediate int variables to 64-bit int
+        marker = -2147483648 // 2
+    else:
+        raise ValueError("expected int_dtype to be either cupy.int16 or cupy.int32")
+    return marker
+
+
+@cupy.memoize(True)
+def get_pba2d_src(block_size_2d=64, marker=-32768, pixel_int2_t='short2'):
+    make_pixel_func = 'make_' + pixel_int2_t
+
+    pba2d_code = pba2d_defines_template.format(
+        block_size_2d=block_size_2d,
+        marker=marker,
+        pixel_int2_t=pixel_int2_t,
+        make_pixel_func=make_pixel_func
+    )
+    kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda')
+    with open(os.path.join(kernel_directory, 'pba_kernels_2d.h'), 'rt') as f:
+        pba2d_kernels = '\n'.join(f.readlines())
+
+    pba2d_code += pba2d_kernels
+    return pba2d_code
+
+
+def _get_block_size(check_warp_size=False):
+    if check_warp_size:
+        dev = cupy.cuda.runtime.getDevice()
+        device_properties = cupy.cuda.runtime.getDeviceProperties(dev)
+        return int(device_properties['warpSize'])
+    else:
+        return 32
+
+
+def _pack_int2(arr, marker=-32768, int_dtype=cupy.int16):
+    if arr.ndim != 2:
+        raise ValueError("only 2d arr suppported")
+    input_x = cupy.zeros(arr.shape, dtype=int_dtype)
+    input_y = cupy.zeros(arr.shape, dtype=int_dtype)
+    cond = arr == 0
+    y, x = cupy.where(cond)
+    input_x[cond] = x
+    mask = arr != 0
+    input_x[mask] = marker  # 1 << 32
+    input_y[cond] = y
+    input_y[mask] = marker  # 1 << 32
+    int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype]*2})
+    # in C++ code x is the contiguous axis and corresponds to width
+    #             y is the non-contiguous axis and corresponds to height
+    # given that, store input_x as the last axis here
+    return cupy.squeeze(
+        cupy.stack((input_x, input_y), axis=-1).view(int2_dtype)
+    )
+
+
+def _unpack_int2(img, make_copy=False, int_dtype=cupy.int16):
+    temp = img.view(int_dtype).reshape(img.shape + (2,))
+    if make_copy:
+        temp = temp.copy()
+    return temp
+
+
+def _determine_padding(shape, block_size):
+    # all kernels assume equal size along both axes, so pad up to equal size if
+    # shape is not isotropic
+    round_up = False
+    orig_sy, orig_sx = shape
+    if orig_sx % block_size != 0:
+        # round up size to a multiple of the band size
+        round_up = True
+        sx = block_size * math.ceil(orig_sx / block_size)
+    else:
+        sx = orig_sx
+    if orig_sy % block_size != 0:
+        # round up size to a multiple of the band size
+        round_up = True
+        sy = block_size * math.ceil(orig_sy / block_size)
+    else:
+        sy = orig_sy
+    if sx != sy or round_up:
+        smax = max(sx, sy)
+        padding_width = ((0, smax - orig_sy), (0, smax - orig_sx))
+    else:
+        padding_width = None
+    return padding_width
+
+
+def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
+            block_params=None, check_warp_size=False, *,
+            float64_distances=False):
+
+    # input_arr: a 2D image
+    #    For each site at (x, y), the pixel at coordinate (x, y) should contain
+    #    the pair (x, y). Pixels that are not sites should contain the pair
+    #    (MARKER, MARKER)
+
+    # See the following on float2, etc. use in cupy:
+    #     https://github.com/cupy/cupy/issues/1535
+
+    # Note: could query warp size here, but for now just assume 32 to avoid
+    #       overhead of querying properties
+    block_size = _get_block_size(check_warp_size)
+
+    padded_size = max(arr.shape)
+    if block_params is None:
+        # should be <= size / 64. sy must be a multiple of m1
+        m1 = max(1, min(padded_size // block_size, 32))
+        # size must be a multiple of m2
+        m2 = max(1, min(padded_size // block_size, 32))
+        # should be <= 64. texture size must be a multiple of m3
+        m3 = min(min(m1, m2), 2)
+    else:
+        m1, m2, m3 = block_params
+
+    if m1 > padded_size // block_size:
+        raise ValueError("m1 too large. must be <= arr.shape[0] // 32")
+    if m2 > padded_size // block_size:
+        raise ValueError("m2 too large. must be <= arr.shape[1] // 32")
+    for m in (m1, m2, m3):
+        if padded_size % m1 != 0:
+            raise ValueError(
+                f"Largest dimension of image ({padded_size}) must be evenly "
+                f"disivible by each element of block_params: {block_params}."
+            )
+
+    shape_max = max(arr.shape)
+    if shape_max <= 32768:
+        int_dtype = cupy.int16
+        pixel_int2_type = 'short2'
+    else:
+        if shape_max > (1 << 24):
+            # limit to coordinate range to 2**24 due to use of __mul24 in
+            # coordinate TOID macro
+            raise ValueError(
+                f"maximum axis size of {1 << 24} exceeded, for image with "
+                f"shape {arr.shape}"
+            )
+        int_dtype = cupy.int32
+        pixel_int2_type = 'int2'
+
+    marker = _init_marker(int_dtype)
+
+    orig_sy, orig_sx = arr.shape
+    padding_width = _determine_padding(arr.shape, block_size)
+    if padding_width is not None:
+        arr = cupy.pad(arr, padding_width, mode='constant', constant_values=1)
+    size = arr.shape[0]
+    print(f"{orig_sy=}, {orig_sx=}, {padding_width=}, {arr.shape=}")
+
+    input_arr = _pack_int2(arr, marker=marker, int_dtype=int_dtype)
+    output = cupy.zeros_like(input_arr)
+
+    int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype]*2})
+    margin = cupy.empty((2 * m1 * size,), dtype=int2_dtype)
+
+    # phase 1 of PBA. m1 must divide texture size and be <= 64
+    pba2d = cupy.RawModule(
+        code=get_pba2d_src(
+            block_size_2d=block_size,
+            marker=marker,
+            pixel_int2_t=pixel_int2_type,
+        )
+    )
+    kernelFloodDown = pba2d.get_function('kernelFloodDown')
+    kernelFloodUp = pba2d.get_function('kernelFloodUp')
+    kernelPropagateInterband = pba2d.get_function('kernelPropagateInterband')
+    kernelUpdateVertical = pba2d.get_function('kernelUpdateVertical')
+    kernelProximatePoints = pba2d.get_function('kernelProximatePoints')
+    kernelCreateForwardPointers = pba2d.get_function(
+        'kernelCreateForwardPointers'
+    )
+    kernelMergeBands = pba2d.get_function('kernelMergeBands')
+    kernelDoubleToSingleList = pba2d.get_function('kernelDoubleToSingleList')
+    kernelColor = pba2d.get_function('kernelColor')
+
+    block = (block_size, 1, 1)
+    grid = (math.ceil(size / block[0]), m1, 1)
+    bandSize1 = size // m1
+    # kernelFloodDown modifies input_arr in-place
+    kernelFloodDown(
+        grid,
+        block,
+        (input_arr, input_arr, size, bandSize1),
+    )
+    # kernelFloodUp modifies input_arr in-place
+    kernelFloodUp(
+        grid,
+        block,
+        (input_arr, input_arr, size, bandSize1),
+    )
+    # kernelFloodUp fills values into margin
+    kernelPropagateInterband(
+        grid,
+        block,
+        (input_arr, margin, size, bandSize1),
+    )
+    # kernelUpdateVertical stores output into an intermediate array of
+    # transposed shape
+    kernelUpdateVertical(
+        grid,
+        block,
+        (input_arr, margin, output, size, bandSize1),
+    )
+
+    # phase 2
+    block = (block_size, 1, 1)
+    grid = (math.ceil(size / block[0]), m2, 1)
+    bandSize2 = size // m2
+    kernelProximatePoints(
+        grid,
+        block,
+        (output, input_arr, size, bandSize2),
+    )
+    kernelCreateForwardPointers(
+        grid,
+        block,
+        (input_arr, input_arr, size, bandSize2),
+    )
+    # Repeatly merging two bands into one
+    noBand = m2
+    while noBand > 1:
+        grid = (math.ceil(size / block[0]), noBand // 2)
+        kernelMergeBands(
+            grid,
+            block,
+            (output, input_arr, input_arr, size, size // noBand),
+        )
+        noBand //= 2
+    # Replace the forward link with the X coordinate of the seed to remove
+    # the need of looking at the other texture. We need it for coloring.
+    grid = (math.ceil(size / block[0]), size);
+    kernelDoubleToSingleList(
+        grid,
+        block,
+        (output, input_arr, input_arr, size),
+    )
+
+    # Phase 3 of PBA
+    block = (block_size, m3, 1)
+    grid = (math.ceil(size / block[0]), 1, 1)
+    kernelColor(
+        grid,
+        block,
+        (input_arr, output, size),
+    )
+
+    output = _unpack_int2(output, make_copy=False, int_dtype=int_dtype)
+    # make sure to crop any padding that was added here!
+    x = output[:orig_sy, :orig_sx, 0]
+    y = output[:orig_sy, :orig_sx, 1]
+
+    # raise NotImplementedError("TODO")
+    vals = ()
+    if return_distances:
+        y0, x0 = cupy.meshgrid(
+            *(
+                cupy.arange(s, dtype=cupy.int32)
+                for s in (orig_sy, orig_sx)
+            ),
+            indexing='ij',
+            sparse=True,
+        )
+        tmp = (x - x0)
+        dist = tmp * tmp
+        tmp = (y - y0)
+        dist += tmp * tmp
+        if float64_distances:
+            dist = cupy.sqrt(dist)
+        else:
+            dist = dist.astype(cupy.float32)
+            cupy.sqrt(dist, out=dist)
+        vals = vals + (dist,)
+    if return_indices:
+        indices = cupy.stack((x, y), axis=0)
+        vals = vals + (indices,)
+    return vals
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
new file mode 100644
index 000000000..2ba7851dc
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -0,0 +1,264 @@
+import math
+import os
+from functools import reduce
+
+import cupy
+import numpy as np
+
+from ._pba_2d import _get_block_size
+
+
+pba3d_defines_template = """
+
+#define MARKER     {marker}
+#define MAX_INT    {max_int}
+#define BLOCKSIZE  {block_size_3d}
+
+"""
+
+# For efficiency, the original PBA+ packs three 10-bit integers and two binary
+# flags into a single 32-bit integer. The defines in
+# `pba3d_defines_encode_32bit` handle this format.
+pba3d_defines_encode_32bit = """
+// Sites     : ENCODE(x, y, z, 0, 0)
+// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+#define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
+#define DECODE(value, x, y, z) \
+    x = ((value) >> 20) & 0x3ff; \
+    y = ((value) >> 10) & 0x3ff; \
+    z = (value) & 0x3ff
+
+#define NOTSITE(value)  (((value) >> 31) & 1)
+#define HASNEXT(value)  (((value) >> 30) & 1)
+
+#define GET_X(value)    (((value) >> 20) & 0x3ff)
+#define GET_Y(value)    (((value) >> 10) & 0x3ff)
+#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
+
+""" # noqa
+
+
+# 64bit version of ENCODE/DECODE to allow a 20-bit integer per coordinate axis.
+pba3d_defines_encode_64bit = """
+// Sites     : ENCODE(x, y, z, 0, 0)
+// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+#define ENCODE(x, y, z, a, b)  (((x) << 40) | ((y) << 20) | (z) | ((a) << 63) | ((b) << 62))
+#define DECODE(value, x, y, z) \
+    x = ((value) >> 40) & 0x3ff; \
+    y = ((value) >> 20) & 0x3ff; \
+    z = (value) & 0x3ff
+
+#define NOTSITE(value)  (((value) >> 63) & 1)
+#define HASNEXT(value)  (((value) >> 62) & 1)
+
+#define GET_X(value)    (((value) >> 40) & 0x3ff)
+#define GET_Y(value)    (((value) >> 20) & 0x3ff)
+#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
+
+""" # noqa
+
+
+@cupy.memoize(True)
+def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647):
+    pba3d_code = pba3d_defines_template.format(
+        block_size_3d=block_size_3d, marker=marker, max_int=max_int
+    )
+    pba3d_code += pba3d_defines_encode_32bit
+    kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda')
+    with open(os.path.join(kernel_directory, 'pba_kernels_3d.h'), 'rt') as f:
+        pba3d_kernels = '\n'.join(f.readlines())
+    pba3d_code += pba3d_kernels
+    return pba3d_code
+
+
+# TODO: custom kernel for encode3d
+def encode3d(arr, marker=-2147483648, bit_depth=32):
+    if arr.ndim != 3:
+        raise ValueError("only 3d arr suppported")
+    if bit_depth not in [32, 64]:
+        raise ValueError("only bit_depth of 32 or 64 is supported")
+    if any(s > 1024 for s in arr.shape):
+        raise ValueError(
+            "encoding is for maximum of 1024 along each dimension")
+    input = cupy.zeros(arr.shape, dtype=np.int32, order='C')
+    cond = arr == 0
+    z, y, x = cupy.where(cond)
+    # z, y, x so that x is the contiguous axis (must match the TOID macro in the C++ code!)
+    input[cond] = (((x) << 20) | ((y) << 10) | (z))
+    input[arr != 0] = marker  # 1 << 32
+    return input
+
+
+# TODO: custom kernel for decode3d
+def decode3d(output):
+    # Note: z, y, x reversed vs. DECODE define in C above due to C-ordered memory layout
+    x = (output >> 20) & 0x3ff
+    y = (output >> 10) & 0x3ff
+    z = output & 0x3ff
+    return (x, y, z)
+
+
+def _determine_padding(shape, block_size, m1, m2, m3, blockx, blocky):
+    # TODO: can possibly revise to consider only particular factors for LCM on
+    #       a given axis
+    LCM = math.lcm(block_size, m1, m2, m3, blockx, blocky)
+    orig_sz, orig_sy, orig_sx = shape
+    round_up = False
+    if orig_sx % LCM != 0:
+        # round up size to a multiple of the band size
+        round_up = True
+        sx = LCM * math.ceil(orig_sx / LCM)
+    else:
+        sx = orig_sx
+    if orig_sy % LCM != 0:
+        # round up size to a multiple of the band size
+        round_up = True
+        sy = LCM * math.ceil(orig_sy / LCM)
+    else:
+        sy = orig_sy
+    if orig_sz % LCM != 0:
+        # round up size to a multiple of the band size
+        round_up = True
+        sz = LCM * math.ceil(orig_sz / LCM)
+    else:
+        sz = orig_sz
+
+    aniso = not (sx == sy == sz)
+    if aniso or round_up:
+        smax = max(sz, sy, sx)
+        padding_width = (
+            (0, smax - orig_sz), (0, smax - orig_sy), (0, smax - orig_sx)
+        )
+    else:
+        padding_width = None
+    return padding_width
+
+
+def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
+            block_params=None, check_warp_size=False, *,
+            float64_distances=False):
+    if arr.ndim != 3:
+        raise ValueError(f"expected a 3D array, got {arr.ndim}D")
+
+    if block_params is None:
+        m1 = 1
+        m2 = 1
+        m3 = 2
+    else:
+        m1, m2, m3 = block_params
+
+    # reduce blockx for small inputs
+    s_min = min(arr.shape)
+    if s_min <= 4:
+        blockx = 4
+    elif s_min <= 8:
+        blockx = 8
+    elif s_min <= 16:
+        blockx = 16
+    else:
+        blockx = 32
+    blocky = 4
+
+    block_size = _get_block_size(check_warp_size)
+
+    orig_sz, orig_sy, orig_sx = arr.shape
+    padding_width = _determine_padding(
+        arr.shape, block_size, m1, m2, m3, blockx, blocky
+    )
+    if padding_width is not None:
+        arr = cupy.pad(arr, padding_width, mode='constant', constant_values=1)
+    size = arr.shape[0]
+
+    # pba algorithm was implemented to use 32-bit integer to store compressed
+    # coordinates. input_arr will be C-contiguous, int32
+    input_arr = encode3d(arr)
+    buffer_idx = 0
+    output = cupy.zeros_like(input_arr)
+    pba_images = [input_arr, output]
+
+    block = (blockx, blocky, 1)
+    grid = (size // block[0], size // block[1], 1)
+
+    # MAKE SURE NO GRID/BLOCK SIZES WILL CAUSE OUT-OF-RANGE ACCESS
+    # block_size = min(32, min_pow2)
+    marker = -2147483648
+
+
+    pba3d = cupy.RawModule(
+        code=get_pba3d_src(block_size_3d=block_size, marker=marker)
+    )
+
+    kernelFloodZ = pba3d.get_function('kernelFloodZ')
+    kernelMaurerAxis = pba3d.get_function('kernelMaurerAxis')
+    kernelColorAxis = pba3d.get_function('kernelColorAxis')
+
+    kernelFloodZ(
+        grid,
+        block,
+        (pba_images[buffer_idx], pba_images[1 - buffer_idx], size)
+    )
+    buffer_idx = 1 - buffer_idx
+
+    block = (blockx, blocky, 1)
+    grid = (size // block[0], size // block[1], 1)
+    kernelMaurerAxis(
+        grid,
+        block,
+        (pba_images[buffer_idx], pba_images[1 - buffer_idx], size),
+    )
+
+    block = (block_size, m3, 1)
+    grid = (size // block[0], size, 1)
+    kernelColorAxis(
+        grid,
+        block,
+        (pba_images[1 - buffer_idx], pba_images[buffer_idx], size),
+    )
+
+    block = (blockx, blocky, 1)
+    grid = (size // block[0], size // block[1], 1)
+    kernelMaurerAxis(
+        grid,
+        block,
+        (pba_images[buffer_idx], pba_images[1 - buffer_idx], size),
+    )
+
+    block = (block_size, m3, 1)
+    grid = (size // block[0], size, 1)
+    kernelColorAxis(
+        grid,
+        block,
+        (pba_images[1 - buffer_idx], pba_images[buffer_idx], size),
+    )
+
+    output = pba_images[buffer_idx]
+
+    if return_distances or return_indices:
+        x, y, z = decode3d(output[:orig_sz, :orig_sy, :orig_sx])
+
+    vals = ()
+    if return_distances:
+        # TODO: kernel for more efficient distance computation
+        orig_shape = (orig_sz, orig_sy, orig_sx)
+        z0, y0, x0 = cupy.meshgrid(
+            *(cupy.arange(s, dtype=cupy.int32) for s in orig_shape),
+            indexing='ij',
+            sparse=True
+        )
+        tmp = (x - x0)
+        dist = tmp * tmp
+        tmp = (y - y0)
+        dist += tmp * tmp
+        tmp = (z - z0)
+        dist += tmp * tmp
+        if float64_distances:
+            dist = cupy.sqrt(dist)
+        else:
+            dist = dist.astype(cupy.float32)
+            cupy.sqrt(dist, out=dist)
+        vals = vals + (dist,)
+    if return_indices:
+        indices = cupy.stack((z, y, x), axis=0)
+        vals = vals + (indices,)
+    return vals
+
diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h
new file mode 100644
index 000000000..3ea84e3b2
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h
@@ -0,0 +1,425 @@
+
+#ifndef MARKER
+    
+#define MARKER -32768
+#define BLOCKSIZE 32
+#define pixel_int2_t short2
+#define make_pixel(x, y)  make_short2(x, y)
+
+#endif  // MARKER
+
+
+#define TOID(x, y, size)  (__mul24((y), (size)) + (x))
+
+#define LL long long
+__device__ bool dominate(LL x1, LL y1, LL x2, LL y2, LL x3, LL y3, LL x0)
+{
+    LL k1 = y2 - y1, k2 = y3 - y2;
+    return (k1 * (y1 + y2) + (x2 - x1) * ((x1 + x2) - (x0 << 1))) * k2 > \
+            (k2 * (y2 + y3) + (x3 - x2) * ((x2 + x3) - (x0 << 1))) * k1;
+}
+#undef LL
+
+
+extern "C"{
+
+__global__ void kernelFloodDown(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
+    int id = TOID(tx, ty, size);
+
+    pixel_int2_t pixel1, pixel2;
+
+    pixel1 = make_pixel(MARKER, MARKER);
+
+    for (int i = 0; i < bandSize; i++, id += size) {
+        pixel2 = input[id];
+
+        if (pixel2.x != MARKER)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+__global__ void kernelFloodUp(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = (blockIdx.y+1) * bandSize - 1;
+    int id = TOID(tx, ty, size);
+
+    pixel_int2_t pixel1, pixel2;
+    int dist1, dist2;
+
+    pixel1 = make_pixel(MARKER, MARKER);
+
+    for (int i = 0; i < bandSize; i++, id -= size) {
+        dist1 = abs(pixel1.y - ty + i);
+
+        pixel2 = input[id];
+        dist2 = abs(pixel2.y - ty + i);
+
+        if (dist2 < dist1)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+__global__ void kernelPropagateInterband(pixel_int2_t *input, pixel_int2_t *margin_out, int size, int bandSize)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int inc = bandSize * size;
+    int ny, nid, nDist;
+    pixel_int2_t pixel;
+
+    // Top row, look backward
+    int ty = blockIdx.y * bandSize;
+    int topId = TOID(tx, ty, size);
+    int bottomId = TOID(tx, ty + bandSize - 1, size);
+    int tid = blockIdx.y * size + tx;
+    int bid = tid + (size * size / bandSize);
+
+    pixel = input[topId];
+    int myDist = abs(pixel.y - ty);
+    margin_out[tid] = pixel;
+
+    for (nid = bottomId - inc; nid >= 0; nid -= inc) {
+        pixel = input[nid];
+
+        if (pixel.x != MARKER) {
+            nDist = abs(pixel.y - ty);
+
+            if (nDist < myDist)
+                margin_out[tid] = pixel;
+
+            break;
+        }
+    }
+
+    // Last row, look downward
+    ty = ty + bandSize - 1;
+    pixel = input[bottomId];
+    myDist = abs(pixel.y - ty);
+    margin_out[bid] = pixel;
+
+    for (ny = ty + 1, nid = topId + inc; ny < size; ny += bandSize, nid += inc) {
+        pixel = input[nid];
+
+        if (pixel.x != MARKER) {
+            nDist = abs(pixel.y - ty);
+
+            if (nDist < myDist)
+                margin_out[bid] = pixel;
+
+            break;
+        }
+    }
+}
+
+__global__ void kernelUpdateVertical(pixel_int2_t *color, pixel_int2_t *margin, pixel_int2_t *output, int size, int bandSize)
+{
+    __shared__ pixel_int2_t block[BLOCKSIZE][BLOCKSIZE];
+
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
+
+    pixel_int2_t top = margin[blockIdx.y * size + tx];
+    pixel_int2_t bottom = margin[(blockIdx.y + size / bandSize) * size + tx];
+    pixel_int2_t pixel;
+
+    int dist, myDist;
+
+    int id = TOID(tx, ty, size);
+
+    int n_step = bandSize / blockDim.x;
+    for(int step = 0; step < n_step; ++step) {
+        int y_start = blockIdx.y * bandSize + step * blockDim.x;
+        int y_end = y_start + blockDim.x;
+
+        for (ty = y_start; ty < y_end; ++ty, id += size) {
+            pixel = color[id];
+            myDist = abs(pixel.y - ty);
+
+            dist = abs(top.y - ty);
+            if (dist < myDist) { myDist = dist; pixel = top; }
+
+            dist = abs(bottom.y - ty);
+            if (dist < myDist) pixel = bottom;
+
+            // temporary result is stored in block
+            block[threadIdx.x][ty - y_start] = make_pixel(pixel.y, pixel.x);
+        }
+
+        __syncthreads();
+
+        // block is written to a transposed location in the output
+
+        int tid = TOID(blockIdx.y * bandSize + step * blockDim.x + threadIdx.x, \
+                        blockIdx.x * blockDim.x, size);
+
+        for(int i = 0; i < blockDim.x; ++i, tid += size) {
+            output[tid] = block[i][threadIdx.x];
+        }
+
+        __syncthreads();
+    }
+}
+
+__global__ void kernelProximatePoints(pixel_int2_t *input, pixel_int2_t *stack, int size, int bandSize)
+{
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = __mul24(blockIdx.y, bandSize);
+    int id = TOID(tx, ty, size);
+    int lasty = -1;
+    pixel_int2_t last1, last2, current;
+
+    last1.y = -1; last2.y = -1;
+
+    for (int i = 0; i < bandSize; i++, id += size) {
+        current = input[id];
+
+        if (current.x != MARKER) {
+            while (last2.y >= 0) {
+                if (!dominate(last1.x, last2.y, last2.x, \
+                    lasty, current.x, current.y, tx))
+                    break;
+
+                lasty = last2.y; last2 = last1;
+
+                if (last1.y >= 0)
+                    last1 = stack[TOID(tx, last1.y, size)];
+            }
+
+            last1 = last2; last2 = make_pixel(current.x, lasty); lasty = current.y;
+
+            stack[id] = last2;
+        }
+    }
+
+    // Store the pointer to the tail at the last pixel of this band
+    if (lasty != ty + bandSize - 1)
+        stack[TOID(tx, ty + bandSize - 1, size)] = make_pixel(MARKER, lasty);
+}
+
+__global__ void kernelCreateForwardPointers(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize)
+{
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = __mul24(blockIdx.y+1, bandSize) - 1;
+    int id = TOID(tx, ty, size);
+    int lasty = -1, nexty;
+    pixel_int2_t current;
+
+    // Get the tail pointer
+    current = input[id];
+
+    if (current.x == MARKER)
+        nexty = current.y;
+    else
+        nexty = ty;
+
+    for (int i = 0; i < bandSize; i++, id -= size)
+        if (ty - i == nexty) {
+            current = make_pixel(lasty, input[id].y);
+            output[id] = current;
+
+            lasty = nexty;
+            nexty = current.y;
+        }
+
+    // Store the pointer to the head at the first pixel of this band
+    if (lasty != ty - bandSize + 1)
+        output[id + size] = make_pixel(lasty, MARKER);
+}
+
+__global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size, int bandSize)
+{
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int band1 = blockIdx.y * 2;
+    int band2 = band1 + 1;
+    int firsty, lasty;
+    pixel_int2_t last1, last2, current;
+    // last1 and last2: x component store the x coordinate of the site,
+    // y component store the backward pointer
+    // current: y component store the x coordinate of the site,
+    // x component store the forward pointer
+
+    // Get the two last items of the first list
+    lasty = __mul24(band2, bandSize) - 1;
+    last2 = make_pixel(color[TOID(tx, lasty, size)].x,
+        link[TOID(tx, lasty, size)].y);
+
+    if (last2.x == MARKER) {
+        lasty = last2.y;
+
+        if (lasty >= 0)
+            last2 = make_pixel(color[TOID(tx, lasty, size)].x,
+            link[TOID(tx, lasty, size)].y);
+        else
+            last2 = make_pixel(MARKER, MARKER);
+    }
+
+    if (last2.y >= 0) {
+        // Second item at the top of the stack
+        last1 = make_pixel(color[TOID(tx, last2.y, size)].x,
+            link[TOID(tx, last2.y, size)].y);
+    }
+
+    // Get the first item of the second band
+    firsty = __mul24(band2, bandSize);
+    current = make_pixel(link[TOID(tx, firsty, size)].x,
+        color[TOID(tx, firsty, size)].x);
+
+    if (current.y == MARKER) {
+        firsty = current.x;
+
+        if (firsty >= 0)
+            current = make_pixel(link[TOID(tx, firsty, size)].x,
+            color[TOID(tx, firsty, size)].x);
+        else
+            current = make_pixel(MARKER, MARKER);
+    }
+
+    // Count the number of item in the second band that survive so far.
+    // Once it reaches 2, we can stop.
+    int top = 0;
+
+    while (top < 2 && current.y >= 0) {
+        // While there's still something on the left
+        while (last2.y >= 0) {
+
+            if (!dominate(last1.x, last2.y, last2.x, \
+                lasty, current.y, firsty, tx))
+                break;
+
+            lasty = last2.y; last2 = last1;
+            top--;
+
+            if (last1.y >= 0)
+                last1 = make_pixel(color[TOID(tx, last1.y, size)].x,
+                link[TOID(tx, last1.y, size)].y);
+        }
+
+        // Update the current pointer
+        output[TOID(tx, firsty, size)] = make_pixel(current.x, lasty);
+
+        if (lasty >= 0)
+            output[TOID(tx, lasty, size)] = make_pixel(firsty, last2.y);
+
+        last1 = last2; last2 = make_pixel(current.y, lasty); lasty = firsty;
+        firsty = current.x;
+
+        top = max(1, top + 1);
+
+        // Advance the current pointer to the next one
+        if (firsty >= 0)
+            current = make_pixel(link[TOID(tx, firsty, size)].x,
+            color[TOID(tx, firsty, size)].x);
+        else
+            current = make_pixel(MARKER, MARKER);
+    }
+
+    // Update the head and tail pointer.
+    firsty = __mul24(band1, bandSize);
+    lasty = __mul24(band2, bandSize);
+    current = link[TOID(tx, firsty, size)];
+
+    if (current.y == MARKER && current.x < 0) { // No head?
+        last1 = link[TOID(tx, lasty, size)];
+
+        if (last1.y == MARKER)
+            current.x = last1.x;
+        else
+            current.x = lasty;
+
+        output[TOID(tx, firsty, size)] = current;
+    }
+
+    firsty = __mul24(band1, bandSize) + bandSize - 1;
+    lasty = __mul24(band2, bandSize) + bandSize - 1;
+    current = link[TOID(tx, lasty, size)];
+
+    if (current.x == MARKER && current.y < 0) { // No tail?
+        last1 = link[TOID(tx, firsty, size)];
+
+        if (last1.x == MARKER)
+            current.y = last1.y;
+        else
+            current.y = firsty;
+
+        output[TOID(tx, lasty, size)] = current;
+    }
+}
+
+__global__ void kernelDoubleToSingleList(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size)
+{
+    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int ty = blockIdx.y;
+    int id = TOID(tx, ty, size);
+
+    output[id] = make_pixel(color[id].x, link[id].y);
+}
+
+__global__ void kernelColor(pixel_int2_t *input, pixel_int2_t *output, int size)
+{
+    __shared__ pixel_int2_t block[BLOCKSIZE][BLOCKSIZE];
+
+    int col = threadIdx.x;
+    int tid = threadIdx.y;
+    int tx = __mul24(blockIdx.x, blockDim.x) + col;
+    int dx, dy, lasty;
+    unsigned int best, dist;
+    pixel_int2_t last1, last2;
+
+    lasty = size - 1;
+
+    last2 = input[TOID(tx, lasty, size)];
+
+    if (last2.x == MARKER) {
+        lasty = last2.y;
+        last2 = input[TOID(tx, lasty, size)];
+    }
+
+    if (last2.y >= 0)
+        last1 = input[TOID(tx, last2.y, size)];
+
+    int y_start, y_end, n_step = size / blockDim.x;
+    for(int step = 0; step < n_step; ++step) {
+        y_start = size - step * blockDim.x - 1;
+        y_end = size - (step + 1) * blockDim.x;
+
+        for (int ty = y_start - tid; ty >= y_end; ty -= blockDim.y) {
+            dx = last2.x - tx; dy = lasty - ty;
+            best = dist = __mul24(dx, dx) + __mul24(dy, dy);
+
+            while (last2.y >= 0) {
+                dx = last1.x - tx; dy = last2.y - ty;
+                dist = __mul24(dx, dx) + __mul24(dy, dy);
+
+                if (dist > best)
+                    break;
+
+                best = dist; lasty = last2.y; last2 = last1;
+
+                if (last2.y >= 0)
+                    last1 = input[TOID(tx, last2.y, size)];
+            }
+
+            block[threadIdx.x][ty - y_end] = make_pixel(lasty, last2.x);
+        }
+
+        __syncthreads();
+
+        // note: transposes back to original shape here
+        if(!threadIdx.y) {
+            int id = TOID(y_end + threadIdx.x, blockIdx.x * blockDim.x, size);
+            for(int i = 0; i < blockDim.x; ++i, id+=size) {
+                output[id] = block[i][threadIdx.x];
+            }
+        }
+
+        __syncthreads();
+    }
+}
+} // extern C
diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
new file mode 100644
index 000000000..bcb748fe6
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
@@ -0,0 +1,209 @@
+
+#ifndef MARKER
+
+#define MARKER     -2147483648
+#define MAX_INT    2147483647
+#define BLOCKSIZE  32
+
+#endif  // MARKER
+
+#define LL long long
+__device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3, LL y_3, LL z_3, LL x_0, LL z_0)
+{
+    LL k_1 = y_2 - y_1, k_2 = y_3 - y_2;
+
+    return (((y_1 + y_2) * k_1 + ((x_2 - x_1) * (x_1 + x_2 - (x_0 << 1)) + (z_2 - z_1) * (z_1 + z_2 - (z_0 << 1)))) * k_2 > \
+            ((y_2 + y_3) * k_2 + ((x_3 - x_2) * (x_2 + x_3 - (x_0 << 1)) + (z_3 - z_2) * (z_2 + z_3 - (z_0 << 1)))) * k_1);
+}
+#undef LL
+
+#define TOID(x, y, z, size)    ((((z) * (size)) + (y)) * (size) + (x))
+
+
+#ifndef ENCODE
+
+// Sites     : ENCODE(x, y, z, 0, 0)
+// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+#define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
+#define DECODE(value, x, y, z) \
+    x = ((value) >> 20) & 0x3ff; \
+    y = ((value) >> 10) & 0x3ff; \
+    z = (value) & 0x3ff
+
+#define NOTSITE(value)  (((value) >> 31) & 1)
+#define HASNEXT(value)  (((value) >> 30) & 1)
+
+#define GET_X(value)    (((value) >> 20) & 0x3ff)
+#define GET_Y(value)    (((value) >> 10) & 0x3ff)
+#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
+
+#endif // ENCODE
+
+
+extern "C"{
+
+__global__ void kernelFloodZ(int *input, int *output, int size)
+{
+
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+    int tz = 0;
+
+    int plane = size * size;
+    int id = TOID(tx, ty, tz, size);
+    int pixel1, pixel2;
+
+    pixel1 = ENCODE(0,0,0,1,0);
+
+    // Sweep down
+    for (int i = 0; i < size; i++, id += plane) {
+        pixel2 = input[id];
+
+        if (!NOTSITE(pixel2))
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+
+    int dist1, dist2, nz;
+
+    id -= plane + plane;
+
+    // Sweep up
+    for (int i = size - 2; i >= 0; i--, id -= plane) {
+        nz = GET_Z(pixel1);
+        dist1 = abs(nz - (tz + i));
+
+        pixel2 = output[id];
+        nz = GET_Z(pixel2);
+        dist2 = abs(nz - (tz + i));
+
+        if (dist2 < dist1)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+
+__global__ void kernelMaurerAxis(int *input, int *stack, int size)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int tz = blockIdx.y * blockDim.y + threadIdx.y;
+    int ty = 0;
+
+    int id = TOID(tx, ty, tz, size);
+
+    int lasty = 0;
+    int x1, y1, z1, x2, y2, z2, nx, ny, nz;
+    int p = ENCODE(0,0,0,1,0), s1 = ENCODE(0,0,0,1,0), s2 = ENCODE(0,0,0,1,0);
+    int flag = 0;
+
+    for (ty = 0; ty < size; ++ty, id += size) {
+        p = input[id];
+
+        if (!NOTSITE(p)) {
+
+            while (HASNEXT(s2)) {
+                DECODE(s1, x1, y1, z1);
+                DECODE(s2, x2, y2, z2);
+                DECODE(p, nx, ny, nz);
+
+                if (!dominate(x1, y2, z1, x2, lasty, z2, nx, ty, nz, tx, tz))
+                    break;
+
+                lasty = y2; s2 = s1; y2 = y1;
+
+                if (HASNEXT(s2))
+                    s1 = stack[TOID(tx, y2, tz, size)];
+            }
+
+            DECODE(p, nx, ny, nz);
+            s1 = s2;
+            s2 = ENCODE(nx, lasty, nz, 0, flag);
+            y2 = lasty;
+            lasty = ty;
+
+            stack[id] = s2;
+
+            flag = 1;
+        }
+    }
+
+    if (NOTSITE(p))
+        stack[TOID(tx, ty - 1, tz, size)] = ENCODE(0, lasty, 0, 1, flag);
+}
+
+__global__ void kernelColorAxis(int *input, int *output, int size)
+{
+    __shared__ int block[BLOCKSIZE][BLOCKSIZE];
+
+    int col = threadIdx.x;
+    int tid = threadIdx.y;
+    int tx = blockIdx.x * blockDim.x + col;
+    int tz = blockIdx.y;
+
+    int x1, y1, z1, x2, y2, z2;
+    int last1 = ENCODE(0,0,0,1,0), last2 = ENCODE(0,0,0,1,0), lasty;
+    long long dx, dy, dz, best, dist;
+
+    lasty = size - 1;
+
+    last2 = input[TOID(tx, lasty, tz, size)];
+    DECODE(last2, x2, y2, z2);
+
+    if (NOTSITE(last2)) {
+        lasty = y2;
+        if(HASNEXT(last2)) {
+            last2 = input[TOID(tx, lasty, tz, size)];
+            DECODE(last2, x2, y2, z2);
+        }
+    }
+
+    if (HASNEXT(last2)) {
+        last1 = input[TOID(tx, y2, tz, size)];
+        DECODE(last1, x1, y1, z1);
+    }
+
+    int y_start, y_end, n_step = size / blockDim.x;
+    for(int step = 0; step < n_step; ++step) {
+        y_start = size - step * blockDim.x - 1;
+        y_end = size - (step + 1) * blockDim.x;
+
+        for (int ty = y_start - tid; ty >= y_end; ty -= blockDim.y) {
+            dx = x2 - tx; dy = lasty - ty; dz = z2 - tz;
+            best = dx * dx + dy * dy + dz * dz;
+
+            while (HASNEXT(last2)) {
+                dx = x1 - tx; dy = y2 - ty; dz = z1 - tz;
+                dist = dx * dx + dy * dy + dz * dz;
+
+                if(dist > best) break;
+
+                best = dist; lasty = y2; last2 = last1;
+                DECODE(last2, x2, y2, z2);
+
+                if (HASNEXT(last2)) {
+                    last1 = input[TOID(tx, y2, tz, size)];
+                    DECODE(last1, x1, y1, z1);
+                }
+            }
+
+            block[threadIdx.x][ty - y_end] = ENCODE(lasty, x2, z2, NOTSITE(last2), 0);
+        }
+
+        __syncthreads();
+
+        if(!threadIdx.y) {
+            int id = TOID(y_end + threadIdx.x, blockIdx.x * blockDim.x, tz, size);
+            for(int i = 0; i < blockDim.x; i++, id+=size) {
+                output[id] = block[i][threadIdx.x];
+            }
+        }
+
+        __syncthreads();
+    }
+}
+
+
+} // extern C
\ No newline at end of file

From 6a9997ff9805fc31112793697a075cb6e7867fe9 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Mon, 27 Jun 2022 22:01:52 -0400
Subject: [PATCH 03/12] add tests for 2d distance transform

fix automated setting of m1, m2, m3 parameters for 2D PBA+
---
 .../morphology/_distance_transform.py         | 10 +-
 .../core/operations/morphology/_pba_2d.py     | 37 +++-----
 .../tests/test_distance_transform.py          | 91 +++++++++++++++++++
 3 files changed, 114 insertions(+), 24 deletions(-)
 create mode 100644 python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py

diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
index 372227e85..d3acf5ec7 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
@@ -79,6 +79,10 @@ def distance_transform_edt(image, sampling=None, return_distances=True,
     where b[i] is the background point (value 0) with the smallest Euclidean
     distance to input points x[i], and n is the number of dimensions.
 
+    Note that the `indices` output may differ from the one given by
+    `scipy.ndimage.distance_transform_edt` in the case of input pixels that are
+    equidistant from multiple background points.
+
     The parallel banding algorithm implemented here was originally described in
     [1]_. The kernels used here correspond to the revised PBA+ implementation
     that is described on the author's website [2]_. The source code of the
@@ -150,6 +154,7 @@ def distance_transform_edt(image, sampling=None, return_distances=True,
             raise NotImplementedError(
                 "non-uniform values in sampling is not currently supported"
             )
+        sampling = float(sampling)
 
     if image.ndim == 3:
         pba_func = _pba_3d
@@ -168,6 +173,9 @@ def distance_transform_edt(image, sampling=None, return_distances=True,
     )
 
     if return_distances and sampling is not None:
-        vals[0] = vals[0] * sampling
+        vals= (vals[0] * sampling,) + vals[1:]
+
+    if len(vals) == 1:
+        vals = vals[0]
 
     return vals
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
index 01007072c..f26ff3760 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
@@ -85,26 +85,12 @@ def _unpack_int2(img, make_copy=False, int_dtype=cupy.int16):
     return temp
 
 
-def _determine_padding(shape, block_size):
+def _determine_padding(shape, padded_size, block_size):
     # all kernels assume equal size along both axes, so pad up to equal size if
     # shape is not isotropic
-    round_up = False
     orig_sy, orig_sx = shape
-    if orig_sx % block_size != 0:
-        # round up size to a multiple of the band size
-        round_up = True
-        sx = block_size * math.ceil(orig_sx / block_size)
-    else:
-        sx = orig_sx
-    if orig_sy % block_size != 0:
-        # round up size to a multiple of the band size
-        round_up = True
-        sy = block_size * math.ceil(orig_sy / block_size)
-    else:
-        sy = orig_sy
-    if sx != sy or round_up:
-        smax = max(sx, sy)
-        padding_width = ((0, smax - orig_sy), (0, smax - orig_sx))
+    if orig_sx != padded_size or orig_sy != padded_size:
+        padding_width = ((0, padded_size - orig_sy), (0, padded_size - orig_sx))
     else:
         padding_width = None
     return padding_width
@@ -126,12 +112,18 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
     #       overhead of querying properties
     block_size = _get_block_size(check_warp_size)
 
-    padded_size = max(arr.shape)
+    padded_size = math.ceil(max(arr.shape) / block_size) * block_size
     if block_params is None:
         # should be <= size / 64. sy must be a multiple of m1
         m1 = max(1, min(padded_size // block_size, 32))
+
         # size must be a multiple of m2
         m2 = max(1, min(padded_size // block_size, 32))
+        # m2 must also be a power of two
+        m2 = 2**math.floor(math.log2(m2))
+        if padded_size % m2 != 0:
+            raise RuntimeError("error in setting default m2")
+
         # should be <= 64. texture size must be a multiple of m3
         m3 = min(min(m1, m2), 2)
     else:
@@ -142,10 +134,10 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
     if m2 > padded_size // block_size:
         raise ValueError("m2 too large. must be <= arr.shape[1] // 32")
     for m in (m1, m2, m3):
-        if padded_size % m1 != 0:
+        if padded_size % m != 0:
             raise ValueError(
                 f"Largest dimension of image ({padded_size}) must be evenly "
-                f"disivible by each element of block_params: {block_params}."
+                f"disivible by each element of block_params: {(m1, m2, m3)}."
             )
 
     shape_max = max(arr.shape)
@@ -166,11 +158,10 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
     marker = _init_marker(int_dtype)
 
     orig_sy, orig_sx = arr.shape
-    padding_width = _determine_padding(arr.shape, block_size)
+    padding_width = _determine_padding(arr.shape, padded_size, block_size)
     if padding_width is not None:
         arr = cupy.pad(arr, padding_width, mode='constant', constant_values=1)
     size = arr.shape[0]
-    print(f"{orig_sy=}, {orig_sx=}, {padding_width=}, {arr.shape=}")
 
     input_arr = _pack_int2(arr, marker=marker, int_dtype=int_dtype)
     output = cupy.zeros_like(input_arr)
@@ -296,6 +287,6 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
             cupy.sqrt(dist, out=dist)
         vals = vals + (dist,)
     if return_indices:
-        indices = cupy.stack((x, y), axis=0)
+        indices = cupy.stack((y, x), axis=0)
         vals = vals + (indices,)
     return vals
diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
new file mode 100644
index 000000000..59e95469d
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
@@ -0,0 +1,91 @@
+from copy import copy
+
+import cupy as cp
+import numpy as np
+import pytest
+from cupy import testing
+import scipy.ndimage as ndi_cpu
+
+from cucim.core.operations.morphology import distance_transform_edt
+
+
+
+def binary_image(shape, pct_true=50):
+    rng = cp.random.default_rng(123)
+    x = rng.integers(0, 100, size=shape,  dtype=cp.uint8)
+    return x >=pct_true
+
+
+def assert_percentile_equal(arr1, arr2, pct=95):
+    """Assert that at least pct% of the entries in arr1 and arr2 are equal."""
+    pct_mismatch = (100 - pct) / 100
+    arr1 = cp.asnumpy(arr1)
+    arr2 = cp.asnumpy(arr2)
+    mismatch = np.sum(arr1 != arr2) / arr1.size
+    assert mismatch < pct_mismatch
+
+
+@pytest.mark.parametrize('block_params', [(1, 1, 1), None])
+@pytest.mark.parametrize('sampling', [None, (1.5, 1.5)])
+@pytest.mark.parametrize('shape', [(256, 256), (537, 236)])
+@pytest.mark.parametrize('density', [20, 50, 80])
+def test_distance_transform_edt_2d(shape, sampling, block_params, density):
+
+    kwargs_scipy = dict(sampling=sampling)
+    kwargs_cucim = dict(sampling=sampling, block_params=block_params)
+    img = binary_image(shape, pct_true=density)
+    out = distance_transform_edt(img, **kwargs_cucim)
+    expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img), **kwargs_scipy)
+    cp.testing.assert_allclose(out, expected)
+
+
+@pytest.mark.parametrize('sx', list(range(32)))
+@pytest.mark.parametrize('sy', list(range(16)))
+def test_distance_transform_edt_2d_aniso_block_params(sx, sy):
+    """ensure default block_params is robust to anisotropic shape."""
+    shape = (128 + sx, 128 + sy)
+    img = binary_image(shape, pct_true=80)
+    out = distance_transform_edt(img)
+    expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img))
+    cp.testing.assert_allclose(out, expected)
+
+
+@pytest.mark.parametrize('value', [0, 1, 3])
+def test_distance_transform_edt_2d_uniform_valued(value):
+    """ensure default block_params is robust to anisotropic shape."""
+    img = cp.full((64, 64), value, dtype=cp.uint8)
+    # ensure there is at least 1 pixel at background intensity
+    img[13, 13] = 0
+    out = distance_transform_edt(img)
+    expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img))
+    cp.testing.assert_allclose(out, expected)
+
+
+@pytest.mark.parametrize('return_indices', [False, True])
+@pytest.mark.parametrize('return_distances', [False, True])
+@pytest.mark.parametrize('sampling', [None, (1.5, 1.5)])
+@pytest.mark.parametrize('shape', [(256, 256), (65, 128)])
+def test_distance_transform_edt_2d_returns(shape, sampling, return_distances,
+                                           return_indices):
+
+    if not (return_indices or return_distances):
+        return
+
+    kwargs = dict(
+        sampling=sampling,
+        return_distances=return_distances,
+        return_indices=return_indices,
+    )
+    img = binary_image(shape, pct_true=50)
+    out = distance_transform_edt(img, **kwargs)
+    expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img), **kwargs)
+    if return_indices and return_distances:
+        assert len(out) == 2
+        cp.testing.assert_allclose(out[0], expected[0])
+        # May differ at a small % of coordinates where multiple points were
+        # equidistant.
+        assert_percentile_equal(out[1], expected[1], pct=95)
+    elif return_distances:
+        cp.testing.assert_allclose(out, expected)
+    elif return_indices:
+        assert_percentile_equal(out, expected, pct=95)

From 3d44899eb11d1c1878b5c1a855b6a2a5cb6c50ae Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Mon, 27 Jun 2022 23:10:41 -0400
Subject: [PATCH 04/12] Update 3D kernels to allow use of size > 1024 per axis

When switching to 64-bit int can go up to size 2**20 on each axis.
---
 .../core/operations/morphology/_pba_3d.py     | 75 +++++++++------
 .../morphology/cuda/pba_kernels_3d.h          | 91 +++++++++++--------
 2 files changed, 100 insertions(+), 66 deletions(-)

diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
index 2ba7851dc..597df93f9 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -22,6 +22,9 @@
 pba3d_defines_encode_32bit = """
 // Sites     : ENCODE(x, y, z, 0, 0)
 // Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+#define ENCODED_INT_TYPE int
+#define ZERO 0
+#define ONE 1
 #define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
 #define DECODE(value, x, y, z) \
     x = ((value) >> 20) & 0x3ff; \
@@ -42,28 +45,35 @@
 pba3d_defines_encode_64bit = """
 // Sites     : ENCODE(x, y, z, 0, 0)
 // Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
-#define ENCODE(x, y, z, a, b)  (((x) << 40) | ((y) << 20) | (z) | ((a) << 63) | ((b) << 62))
+#define ENCODED_INT_TYPE long long
+#define ZERO 0L
+#define ONE 1L
+#define ENCODE(x, y, z, a, b)  (((x) << 40) | ((y) << 20) | (z) | ((a) << 61) | ((b) << 60))
 #define DECODE(value, x, y, z) \
-    x = ((value) >> 40) & 0x3ff; \
-    y = ((value) >> 20) & 0x3ff; \
-    z = (value) & 0x3ff
+    x = ((value) >> 40) & 0xfffff; \
+    y = ((value) >> 20) & 0xfffff; \
+    z = (value) & 0xfffff
 
-#define NOTSITE(value)  (((value) >> 63) & 1)
-#define HASNEXT(value)  (((value) >> 62) & 1)
+#define NOTSITE(value)  (((value) >> 61) & 1)
+#define HASNEXT(value)  (((value) >> 60) & 1)
 
-#define GET_X(value)    (((value) >> 40) & 0x3ff)
-#define GET_Y(value)    (((value) >> 20) & 0x3ff)
-#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
+#define GET_X(value)    (((value) >> 40) & 0xfffff)
+#define GET_Y(value)    (((value) >> 20) & 0xfffff)
+#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0xfffff))
 
 """ # noqa
 
 
 @cupy.memoize(True)
-def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647):
+def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647,
+                  size_max=1024):
     pba3d_code = pba3d_defines_template.format(
         block_size_3d=block_size_3d, marker=marker, max_int=max_int
     )
-    pba3d_code += pba3d_defines_encode_32bit
+    if size_max > 1024:
+        pba3d_code += pba3d_defines_encode_64bit
+    else:
+        pba3d_code += pba3d_defines_encode_32bit
     kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda')
     with open(os.path.join(kernel_directory, 'pba_kernels_3d.h'), 'rt') as f:
         pba3d_kernels = '\n'.join(f.readlines())
@@ -72,29 +82,38 @@ def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647):
 
 
 # TODO: custom kernel for encode3d
-def encode3d(arr, marker=-2147483648, bit_depth=32):
+def encode3d(arr, marker=-2147483648, bit_depth=32, size_max=1024):
     if arr.ndim != 3:
         raise ValueError("only 3d arr suppported")
     if bit_depth not in [32, 64]:
         raise ValueError("only bit_depth of 32 or 64 is supported")
-    if any(s > 1024 for s in arr.shape):
-        raise ValueError(
-            "encoding is for maximum of 1024 along each dimension")
-    input = cupy.zeros(arr.shape, dtype=np.int32, order='C')
+    if size_max > 1024:
+        dtype = np.int64
+    else:
+        dtype = np.int32
+    image = cupy.zeros(arr.shape, dtype=dtype, order='C')
     cond = arr == 0
     z, y, x = cupy.where(cond)
     # z, y, x so that x is the contiguous axis (must match the TOID macro in the C++ code!)
-    input[cond] = (((x) << 20) | ((y) << 10) | (z))
-    input[arr != 0] = marker  # 1 << 32
-    return input
+    if size_max > 1024:
+        image[cond] = (((x) << 40) | ((y) << 20) | (z))
+    else:
+        image[cond] = (((x) << 20) | ((y) << 10) | (z))
+    image[arr != 0] = marker  # 1 << 32
+    return image
 
 
 # TODO: custom kernel for decode3d
-def decode3d(output):
+def decode3d(output, size_max=1024):
     # Note: z, y, x reversed vs. DECODE define in C above due to C-ordered memory layout
-    x = (output >> 20) & 0x3ff
-    y = (output >> 10) & 0x3ff
-    z = output & 0x3ff
+    if size_max > 1024:
+        x = (output >> 40) & 0xfffff
+        y = (output >> 20) & 0xfffff
+        z = output & 0xfffff
+    else:
+        x = (output >> 20) & 0x3ff
+        y = (output >> 10) & 0x3ff
+        z = output & 0x3ff
     return (x, y, z)
 
 
@@ -171,7 +190,8 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
 
     # pba algorithm was implemented to use 32-bit integer to store compressed
     # coordinates. input_arr will be C-contiguous, int32
-    input_arr = encode3d(arr)
+    size_max = max(arr.shape)  # TODO: update
+    input_arr = encode3d(arr, size_max=size_max)
     buffer_idx = 0
     output = cupy.zeros_like(input_arr)
     pba_images = [input_arr, output]
@@ -185,7 +205,8 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
 
 
     pba3d = cupy.RawModule(
-        code=get_pba3d_src(block_size_3d=block_size, marker=marker)
+        code=get_pba3d_src(block_size_3d=block_size, marker=marker,
+                           size_max=size_max)
     )
 
     kernelFloodZ = pba3d.get_function('kernelFloodZ')
@@ -232,9 +253,9 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
     )
 
     output = pba_images[buffer_idx]
-
     if return_distances or return_indices:
-        x, y, z = decode3d(output[:orig_sz, :orig_sy, :orig_sx])
+        x, y, z = decode3d(output[:orig_sz, :orig_sy, :orig_sx],
+                           size_max=size_max)
 
     vals = ()
     if return_distances:
diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
index bcb748fe6..f6e90d2fb 100644
--- a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
+++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
@@ -1,11 +1,44 @@
+// Euclidean Distance Transform
+// 
+// Kernels for the 3D version of the Parallel Banding Algorithm (PBA+). 
+// 
+// MIT license: see 3rdparty/LICENSE.pba+
 
-#ifndef MARKER
 
+// START OF DEFINITIONS LIKELY OVERRIDDEN BY THE PYTHON SCRIPT
+
+#ifndef MARKER
 #define MARKER     -2147483648
+#endif  // MARKER
+
+#ifndef MAX_INT
 #define MAX_INT    2147483647
+#endif
+
+#ifndef BLOCKSIZE
 #define BLOCKSIZE  32
+#endif
 
-#endif  // MARKER
+// #ifndef ENCODE
+//
+// // Sites     : ENCODE(x, y, z, 0, 0)
+// // Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+// #define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
+// #define DECODE(value, x, y, z) \
+//     x = ((value) >> 20) & 0x3ff; \
+//     y = ((value) >> 10) & 0x3ff; \
+//     z = (value) & 0x3ff
+
+// #define NOTSITE(value)  (((value) >> 31) & 1)
+// #define HASNEXT(value)  (((value) >> 30) & 1)
+
+// #define GET_X(value)    (((value) >> 20) & 0x3ff)
+// #define GET_Y(value)    (((value) >> 10) & 0x3ff)
+// #define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
+
+// #endif // ENCODE
+
+// END OF DEFINITIONS POTENTIALLY DEFINED IN THE PYTHON SCRIPT
 
 #define LL long long
 __device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3, LL y_3, LL z_3, LL x_0, LL z_0)
@@ -20,29 +53,9 @@ __device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3,
 #define TOID(x, y, z, size)    ((((z) * (size)) + (y)) * (size) + (x))
 
 
-#ifndef ENCODE
-
-// Sites     : ENCODE(x, y, z, 0, 0)
-// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
-#define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
-#define DECODE(value, x, y, z) \
-    x = ((value) >> 20) & 0x3ff; \
-    y = ((value) >> 10) & 0x3ff; \
-    z = (value) & 0x3ff
-
-#define NOTSITE(value)  (((value) >> 31) & 1)
-#define HASNEXT(value)  (((value) >> 30) & 1)
-
-#define GET_X(value)    (((value) >> 20) & 0x3ff)
-#define GET_Y(value)    (((value) >> 10) & 0x3ff)
-#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
-
-#endif // ENCODE
-
-
 extern "C"{
 
-__global__ void kernelFloodZ(int *input, int *output, int size)
+__global__ void kernelFloodZ(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *output, int size)
 {
 
     int tx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -51,9 +64,9 @@ __global__ void kernelFloodZ(int *input, int *output, int size)
 
     int plane = size * size;
     int id = TOID(tx, ty, tz, size);
-    int pixel1, pixel2;
+    ENCODED_INT_TYPE pixel1, pixel2;
 
-    pixel1 = ENCODE(0,0,0,1,0);
+    pixel1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO);
 
     // Sweep down
     for (int i = 0; i < size; i++, id += plane) {
@@ -65,7 +78,7 @@ __global__ void kernelFloodZ(int *input, int *output, int size)
         output[id] = pixel1;
     }
 
-    int dist1, dist2, nz;
+    ENCODED_INT_TYPE dist1, dist2, nz;
 
     id -= plane + plane;
 
@@ -86,7 +99,7 @@ __global__ void kernelFloodZ(int *input, int *output, int size)
 }
 
 
-__global__ void kernelMaurerAxis(int *input, int *stack, int size)
+__global__ void kernelMaurerAxis(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *stack, int size)
 {
     int tx = blockIdx.x * blockDim.x + threadIdx.x;
     int tz = blockIdx.y * blockDim.y + threadIdx.y;
@@ -94,10 +107,10 @@ __global__ void kernelMaurerAxis(int *input, int *stack, int size)
 
     int id = TOID(tx, ty, tz, size);
 
-    int lasty = 0;
-    int x1, y1, z1, x2, y2, z2, nx, ny, nz;
-    int p = ENCODE(0,0,0,1,0), s1 = ENCODE(0,0,0,1,0), s2 = ENCODE(0,0,0,1,0);
-    int flag = 0;
+    ENCODED_INT_TYPE lasty = 0;
+    ENCODED_INT_TYPE x1, y1, z1, x2, y2, z2, nx, ny, nz;
+    ENCODED_INT_TYPE p = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), s1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), s2 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO);
+    ENCODED_INT_TYPE flag = 0;
 
     for (ty = 0; ty < size; ++ty, id += size) {
         p = input[id];
@@ -120,31 +133,31 @@ __global__ void kernelMaurerAxis(int *input, int *stack, int size)
 
             DECODE(p, nx, ny, nz);
             s1 = s2;
-            s2 = ENCODE(nx, lasty, nz, 0, flag);
+            s2 = ENCODE(nx, lasty, nz, ZERO, flag);
             y2 = lasty;
             lasty = ty;
 
             stack[id] = s2;
 
-            flag = 1;
+            flag = ONE;
         }
     }
 
     if (NOTSITE(p))
-        stack[TOID(tx, ty - 1, tz, size)] = ENCODE(0, lasty, 0, 1, flag);
+        stack[TOID(tx, ty - 1, tz, size)] = ENCODE(ZERO, lasty, ZERO, ONE, flag);
 }
 
-__global__ void kernelColorAxis(int *input, int *output, int size)
+__global__ void kernelColorAxis(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *output, int size)
 {
-    __shared__ int block[BLOCKSIZE][BLOCKSIZE];
+    __shared__ ENCODED_INT_TYPE block[BLOCKSIZE][BLOCKSIZE];
 
     int col = threadIdx.x;
     int tid = threadIdx.y;
     int tx = blockIdx.x * blockDim.x + col;
     int tz = blockIdx.y;
 
-    int x1, y1, z1, x2, y2, z2;
-    int last1 = ENCODE(0,0,0,1,0), last2 = ENCODE(0,0,0,1,0), lasty;
+    ENCODED_INT_TYPE x1, y1, z1, x2, y2, z2;
+    ENCODED_INT_TYPE last1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), last2 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), lasty;
     long long dx, dy, dz, best, dist;
 
     lasty = size - 1;
@@ -189,7 +202,7 @@ __global__ void kernelColorAxis(int *input, int *output, int size)
                 }
             }
 
-            block[threadIdx.x][ty - y_end] = ENCODE(lasty, x2, z2, NOTSITE(last2), 0);
+            block[threadIdx.x][ty - y_end] = ENCODE(lasty, x2, z2, NOTSITE(last2), ZERO);
         }
 
         __syncthreads();

From 8509bfff884ba1968879e7b13f01d39f1abbb94d Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Tue, 28 Jun 2022 10:00:30 -0400
Subject: [PATCH 05/12] expand tests and extend to 3D cases

---
 .../tests/test_distance_transform.py          | 111 +++++++++++-------
 1 file changed, 69 insertions(+), 42 deletions(-)

diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
index 59e95469d..60a692f48 100644
--- a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
+++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
@@ -25,67 +25,94 @@ def assert_percentile_equal(arr1, arr2, pct=95):
     assert mismatch < pct_mismatch
 
 
-@pytest.mark.parametrize('block_params', [(1, 1, 1), None])
-@pytest.mark.parametrize('sampling', [None, (1.5, 1.5)])
-@pytest.mark.parametrize('shape', [(256, 256), (537, 236)])
-@pytest.mark.parametrize('density', [20, 50, 80])
-def test_distance_transform_edt_2d(shape, sampling, block_params, density):
-
-    kwargs_scipy = dict(sampling=sampling)
-    kwargs_cucim = dict(sampling=sampling, block_params=block_params)
+@pytest.mark.parametrize('return_indices', [False, True])
+@pytest.mark.parametrize('return_distances', [False, True])
+@pytest.mark.parametrize('shape, sampling',
+    [
+        ((256, 128), None),
+        ((384, 256), (1.5, 1.5)),
+        ((14, 32, 50), None),
+        ((50, 32, 24), (2, 2, 2)),
+    ]
+)
+@pytest.mark.parametrize('density', [5, 50, 95])
+@pytest.mark.parametrize('block_params', [None, (1, 1, 1)])
+def test_distance_transform_edt(shape, sampling, return_distances,
+                                return_indices, density, block_params):
+
+    if not (return_indices or return_distances):
+        return
+
+    kwargs_scipy = dict(
+        sampling=sampling,
+        return_distances=return_distances,
+        return_indices=return_indices,
+    )
+    kwargs_cucim = copy(kwargs_scipy)
+    kwargs_cucim['block_params'] = block_params
     img = binary_image(shape, pct_true=density)
     out = distance_transform_edt(img, **kwargs_cucim)
     expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img), **kwargs_scipy)
+    if return_indices and return_distances:
+        assert len(out) == 2
+        cp.testing.assert_allclose(out[0], expected[0])
+        # May differ at a small % of coordinates where multiple points were
+        # equidistant.
+        assert_percentile_equal(out[1], expected[1], pct=95)
+    elif return_distances:
+        cp.testing.assert_allclose(out, expected)
+    elif return_indices:
+        assert_percentile_equal(out, expected, pct=95)
+
+
+@pytest.mark.parametrize('value', [0, 1, 3])
+@pytest.mark.parametrize('ndim', [2, 3])
+def test_distance_transform_edt_uniform_valued(value, ndim):
+    """ensure default block_params is robust to anisotropic shape."""
+    img = cp.full((48, ) * ndim, value, dtype=cp.uint8)
+    # ensure there is at least 1 pixel at background intensity
+    img[(slice(24, 25),) * ndim] = 0
+    out = distance_transform_edt(img)
+    expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img))
     cp.testing.assert_allclose(out, expected)
 
 
-@pytest.mark.parametrize('sx', list(range(32)))
+@pytest.mark.parametrize('sx', list(range(16)))
 @pytest.mark.parametrize('sy', list(range(16)))
-def test_distance_transform_edt_2d_aniso_block_params(sx, sy):
+def test_distance_transform_edt_2d_aniso(sx, sy):
     """ensure default block_params is robust to anisotropic shape."""
-    shape = (128 + sx, 128 + sy)
+    shape = (128 + sy, 128 + sx)
     img = binary_image(shape, pct_true=80)
     out = distance_transform_edt(img)
     expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img))
     cp.testing.assert_allclose(out, expected)
 
 
-@pytest.mark.parametrize('value', [0, 1, 3])
-def test_distance_transform_edt_2d_uniform_valued(value):
+@pytest.mark.parametrize('sx', list(range(4)))
+@pytest.mark.parametrize('sy', list(range(4)))
+@pytest.mark.parametrize('sz', list(range(4)))
+def test_distance_transform_edt_3d_aniso(sx, sy, sz):
     """ensure default block_params is robust to anisotropic shape."""
-    img = cp.full((64, 64), value, dtype=cp.uint8)
-    # ensure there is at least 1 pixel at background intensity
-    img[13, 13] = 0
+    shape = (16 + sz, 32 + sy, 48 + sx)
+    img = binary_image(shape, pct_true=80)
     out = distance_transform_edt(img)
     expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img))
     cp.testing.assert_allclose(out, expected)
 
 
-@pytest.mark.parametrize('return_indices', [False, True])
-@pytest.mark.parametrize('return_distances', [False, True])
-@pytest.mark.parametrize('sampling', [None, (1.5, 1.5)])
-@pytest.mark.parametrize('shape', [(256, 256), (65, 128)])
-def test_distance_transform_edt_2d_returns(shape, sampling, return_distances,
-                                           return_indices):
+@pytest.mark.parametrize('ndim', [1, 4, 5])
+def test_distance_transform_edt_unsupported_ndim(ndim):
+    """ensure default block_params is robust to anisotropic shape."""
+    with pytest.raises(NotImplementedError):
+        distance_transform_edt(cp.zeros((8,) * ndim))
 
-    if not (return_indices or return_distances):
-        return
 
-    kwargs = dict(
-        sampling=sampling,
-        return_distances=return_distances,
-        return_indices=return_indices,
-    )
-    img = binary_image(shape, pct_true=50)
-    out = distance_transform_edt(img, **kwargs)
-    expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img), **kwargs)
-    if return_indices and return_distances:
-        assert len(out) == 2
-        cp.testing.assert_allclose(out[0], expected[0])
-        # May differ at a small % of coordinates where multiple points were
-        # equidistant.
-        assert_percentile_equal(out[1], expected[1], pct=95)
-    elif return_distances:
-        cp.testing.assert_allclose(out, expected)
-    elif return_indices:
-        assert_percentile_equal(out, expected, pct=95)
+@pytest.mark.skip(reason="excessive memory requirement")
+def test_distance_transform_edt_3d_int64():
+    """ensure default block_params is robust to anisotropic shape."""
+    shape = (1280, 1280, 1280)
+    img = binary_image(shape, pct_true=80)
+    distance_transform_edt(img)
+    # Note: no validation vs. scipy.ndimage due to excessive run time
+    return
+

From 0933d739d0717a3c47e940efd7b25cf1a63a2f34 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Tue, 28 Jun 2022 10:52:54 -0400
Subject: [PATCH 06/12] test error on non-uniform sampling

---
 .../morphology/_distance_transform.py         | 11 +++---
 .../core/operations/morphology/_pba_2d.py     |  5 +++
 .../core/operations/morphology/_pba_3d.py     |  5 +++
 .../tests/test_distance_transform.py          | 34 +++++++++++++++++--
 4 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
index d3acf5ec7..7e51a22bc 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
@@ -148,13 +148,16 @@ def distance_transform_edt(image, sampling=None, return_distances=True,
         raise NotImplementedError(
             "preallocated indices image is not supported"
         )
+    scalar_sampling = None
     if sampling is not None:
         sampling = np.unique(np.atleast_1d(sampling))
-        if len(sampling) != 1:
+        if len(sampling) == 1:
+            scalar_sampling = float(sampling)
+            sampling = None
+        else:
             raise NotImplementedError(
                 "non-uniform values in sampling is not currently supported"
             )
-        sampling = float(sampling)
 
     if image.ndim == 3:
         pba_func = _pba_3d
@@ -172,8 +175,8 @@ def distance_transform_edt(image, sampling=None, return_distances=True,
         block_params=block_params
     )
 
-    if return_distances and sampling is not None:
-        vals= (vals[0] * sampling,) + vals[1:]
+    if return_distances and scalar_sampling is not None:
+        vals= (vals[0] * scalar_sampling,) + vals[1:]
 
     if len(vals) == 1:
         vals = vals[0]
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
index f26ff3760..57fac41c0 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
@@ -112,6 +112,11 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
     #       overhead of querying properties
     block_size = _get_block_size(check_warp_size)
 
+    if sampling is not None:
+        raise NotImplementedError("sampling not yet supported")
+        # if len(sampling) != 2:
+        #     raise ValueError("sampling must be a sequence of two values.")
+
     padded_size = math.ceil(max(arr.shape) / block_size) * block_size
     if block_params is None:
         # should be <= size / 64. sy must be a multiple of m1
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
index 597df93f9..832817459 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -159,6 +159,11 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
     if arr.ndim != 3:
         raise ValueError(f"expected a 3D array, got {arr.ndim}D")
 
+    if sampling is not None:
+        raise NotImplementedError("sampling not yet supported")
+        # if len(sampling) != 3:
+        #     raise ValueError("sampling must be a sequence of three values.")
+
     if block_params is None:
         m1 = 1
         m2 = 1
diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
index 60a692f48..a607f492f 100644
--- a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
+++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
@@ -37,8 +37,9 @@ def assert_percentile_equal(arr1, arr2, pct=95):
 )
 @pytest.mark.parametrize('density', [5, 50, 95])
 @pytest.mark.parametrize('block_params', [None, (1, 1, 1)])
-def test_distance_transform_edt(shape, sampling, return_distances,
-                                return_indices, density, block_params):
+def test_distance_transform_edt(
+    shape, sampling, return_distances, return_indices, density, block_params
+):
 
     if not (return_indices or return_distances):
         return
@@ -65,6 +66,35 @@ def test_distance_transform_edt(shape, sampling, return_distances,
         assert_percentile_equal(out, expected, pct=95)
 
 
+@pytest.mark.parametrize('return_indices', [False, True])
+@pytest.mark.parametrize('return_distances', [False, True])
+@pytest.mark.parametrize('shape, sampling',
+    [
+        ((384, 256), (1, 3)),
+        ((50, 32, 24), (1, 2, 4)),
+    ]
+)
+@pytest.mark.parametrize('density', [5, 50, 95])
+def test_distance_transform_edt_nonuniform_sampling(
+    shape, sampling, return_distances, return_indices, density
+):
+
+    if not (return_indices or return_distances):
+        return
+
+    kwargs_scipy = dict(
+        sampling=sampling,
+        return_distances=return_distances,
+        return_indices=return_indices,
+    )
+    kwargs_cucim = copy(kwargs_scipy)
+    img = binary_image(shape, pct_true=density)
+    if sampling is not None and len(np.unique(sampling)) != 1:
+        with pytest.raises(NotImplementedError):
+            distance_transform_edt(img, **kwargs_cucim)
+        return
+
+
 @pytest.mark.parametrize('value', [0, 1, 3])
 @pytest.mark.parametrize('ndim', [2, 3])
 def test_distance_transform_edt_uniform_valued(value, ndim):

From e98b611fe9422cf8229eafa80ecd8809cabfcfb1 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Tue, 28 Jun 2022 11:47:37 -0400
Subject: [PATCH 07/12] flake8/isort fixes

---
 .../morphology/_distance_transform.py         |  3 +--
 .../core/operations/morphology/_pba_2d.py     | 23 +++++++++++--------
 .../core/operations/morphology/_pba_3d.py     | 18 ++++-----------
 .../tests/test_distance_transform.py          | 13 +++++------
 4 files changed, 24 insertions(+), 33 deletions(-)

diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
index 7e51a22bc..24c34fff6 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
@@ -3,7 +3,6 @@
 from ._pba_2d import _pba_2d
 from ._pba_3d import _pba_3d
 
-
 # TODO: make a 64-bit packed coordinates variant in 3D for large shapes
 #       support sampling distances
 #       support chamfer, chessboard and l1/manhattan distances too?
@@ -176,7 +175,7 @@ def distance_transform_edt(image, sampling=None, return_distances=True,
     )
 
     if return_distances and scalar_sampling is not None:
-        vals= (vals[0] * scalar_sampling,) + vals[1:]
+        vals = (vals[0] * scalar_sampling,) + vals[1:]
 
     if len(vals) == 1:
         vals = vals[0]
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
index 57fac41c0..edb10a983 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
@@ -1,6 +1,5 @@
 import math
 import os
-from functools import reduce
 
 import cupy
 
@@ -23,10 +22,13 @@ def _init_marker(int_dtype):
         # marker = cupy.iinfo(int_dtype).min
         marker = -32768
     elif int_dtype == cupy.int32:
-        # divide by two so we don't have to promote other intermediate int variables to 64-bit int
+        # divide by two so we don't have to promote other intermediate int
+        # variables to 64-bit int
         marker = -2147483648 // 2
     else:
-        raise ValueError("expected int_dtype to be either cupy.int16 or cupy.int32")
+        raise ValueError(
+            "expected int_dtype to be either cupy.int16 or cupy.int32"
+        )
     return marker
 
 
@@ -62,6 +64,7 @@ def _pack_int2(arr, marker=-32768, int_dtype=cupy.int16):
         raise ValueError("only 2d arr suppported")
     input_x = cupy.zeros(arr.shape, dtype=int_dtype)
     input_y = cupy.zeros(arr.shape, dtype=int_dtype)
+    # TODO: create custom kernel for setting values in input_x, input_y
     cond = arr == 0
     y, x = cupy.where(cond)
     input_x[cond] = x
@@ -69,7 +72,7 @@ def _pack_int2(arr, marker=-32768, int_dtype=cupy.int16):
     input_x[mask] = marker  # 1 << 32
     input_y[cond] = y
     input_y[mask] = marker  # 1 << 32
-    int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype]*2})
+    int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype] * 2})
     # in C++ code x is the contiguous axis and corresponds to width
     #             y is the non-contiguous axis and corresponds to height
     # given that, store input_x as the last axis here
@@ -90,7 +93,9 @@ def _determine_padding(shape, padded_size, block_size):
     # shape is not isotropic
     orig_sy, orig_sx = shape
     if orig_sx != padded_size or orig_sy != padded_size:
-        padding_width = ((0, padded_size - orig_sy), (0, padded_size - orig_sx))
+        padding_width = (
+            (0, padded_size - orig_sy), (0, padded_size - orig_sx)
+        )
     else:
         padding_width = None
     return padding_width
@@ -105,9 +110,6 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
     #    the pair (x, y). Pixels that are not sites should contain the pair
     #    (MARKER, MARKER)
 
-    # See the following on float2, etc. use in cupy:
-    #     https://github.com/cupy/cupy/issues/1535
-
     # Note: could query warp size here, but for now just assume 32 to avoid
     #       overhead of querying properties
     block_size = _get_block_size(check_warp_size)
@@ -171,7 +173,7 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
     input_arr = _pack_int2(arr, marker=marker, int_dtype=int_dtype)
     output = cupy.zeros_like(input_arr)
 
-    int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype]*2})
+    int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype] * 2})
     margin = cupy.empty((2 * m1 * size,), dtype=int2_dtype)
 
     # phase 1 of PBA. m1 must divide texture size and be <= 64
@@ -249,7 +251,7 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
         noBand //= 2
     # Replace the forward link with the X coordinate of the seed to remove
     # the need of looking at the other texture. We need it for coloring.
-    grid = (math.ceil(size / block[0]), size);
+    grid = (math.ceil(size / block[0]), size)
     kernelDoubleToSingleList(
         grid,
         block,
@@ -273,6 +275,7 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
     # raise NotImplementedError("TODO")
     vals = ()
     if return_distances:
+        # TODO: custom kernel for more efficient distance computation
         y0, x0 = cupy.meshgrid(
             *(
                 cupy.arange(s, dtype=cupy.int32)
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
index 832817459..80415ab3e 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -1,13 +1,11 @@
 import math
 import os
-from functools import reduce
 
 import cupy
 import numpy as np
 
 from ._pba_2d import _get_block_size
 
-
 pba3d_defines_template = """
 
 #define MARKER     {marker}
@@ -94,7 +92,8 @@ def encode3d(arr, marker=-2147483648, bit_depth=32, size_max=1024):
     image = cupy.zeros(arr.shape, dtype=dtype, order='C')
     cond = arr == 0
     z, y, x = cupy.where(cond)
-    # z, y, x so that x is the contiguous axis (must match the TOID macro in the C++ code!)
+    # z, y, x so that x is the contiguous axis
+    # (must match TOID macro in the C++ code!)
     if size_max > 1024:
         image[cond] = (((x) << 40) | ((y) << 20) | (z))
     else:
@@ -105,7 +104,6 @@ def encode3d(arr, marker=-2147483648, bit_depth=32, size_max=1024):
 
 # TODO: custom kernel for decode3d
 def decode3d(output, size_max=1024):
-    # Note: z, y, x reversed vs. DECODE define in C above due to C-ordered memory layout
     if size_max > 1024:
         x = (output >> 40) & 0xfffff
         y = (output >> 20) & 0xfffff
@@ -203,15 +201,8 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
 
     block = (blockx, blocky, 1)
     grid = (size // block[0], size // block[1], 1)
-
-    # MAKE SURE NO GRID/BLOCK SIZES WILL CAUSE OUT-OF-RANGE ACCESS
-    # block_size = min(32, min_pow2)
-    marker = -2147483648
-
-
     pba3d = cupy.RawModule(
-        code=get_pba3d_src(block_size_3d=block_size, marker=marker,
-                           size_max=size_max)
+        code=get_pba3d_src(block_size_3d=block_size, size_max=size_max)
     )
 
     kernelFloodZ = pba3d.get_function('kernelFloodZ')
@@ -264,7 +255,7 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
 
     vals = ()
     if return_distances:
-        # TODO: kernel for more efficient distance computation
+        # TODO: custom kernel for more efficient distance computation
         orig_shape = (orig_sz, orig_sy, orig_sx)
         z0, y0, x0 = cupy.meshgrid(
             *(cupy.arange(s, dtype=cupy.int32) for s in orig_shape),
@@ -287,4 +278,3 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
         indices = cupy.stack((z, y, x), axis=0)
         vals = vals + (indices,)
     return vals
-
diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
index a607f492f..f88d8f2e1 100644
--- a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
+++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
@@ -3,17 +3,15 @@
 import cupy as cp
 import numpy as np
 import pytest
-from cupy import testing
 import scipy.ndimage as ndi_cpu
 
 from cucim.core.operations.morphology import distance_transform_edt
 
 
-
 def binary_image(shape, pct_true=50):
     rng = cp.random.default_rng(123)
-    x = rng.integers(0, 100, size=shape,  dtype=cp.uint8)
-    return x >=pct_true
+    x = rng.integers(0, 100, size=shape, dtype=cp.uint8)
+    return x >= pct_true
 
 
 def assert_percentile_equal(arr1, arr2, pct=95):
@@ -27,7 +25,8 @@ def assert_percentile_equal(arr1, arr2, pct=95):
 
 @pytest.mark.parametrize('return_indices', [False, True])
 @pytest.mark.parametrize('return_distances', [False, True])
-@pytest.mark.parametrize('shape, sampling',
+@pytest.mark.parametrize(
+    'shape, sampling',
     [
         ((256, 128), None),
         ((384, 256), (1.5, 1.5)),
@@ -68,7 +67,8 @@ def test_distance_transform_edt(
 
 @pytest.mark.parametrize('return_indices', [False, True])
 @pytest.mark.parametrize('return_distances', [False, True])
-@pytest.mark.parametrize('shape, sampling',
+@pytest.mark.parametrize(
+    'shape, sampling',
     [
         ((384, 256), (1, 3)),
         ((50, 32, 24), (1, 2, 4)),
@@ -145,4 +145,3 @@ def test_distance_transform_edt_3d_int64():
     distance_transform_edt(img)
     # Note: no validation vs. scipy.ndimage due to excessive run time
     return
-

From e5a37a9593813b678edee97c4a6445f5f95bb3a2 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Tue, 28 Jun 2022 12:02:04 -0400
Subject: [PATCH 08/12] minor update to kernel headers

---
 .../morphology/cuda/pba_kernels_2d.h          | 62 +++++++++++++------
 .../morphology/cuda/pba_kernels_3d.h          | 51 +++++++++------
 2 files changed, 77 insertions(+), 36 deletions(-)

diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h
index 3ea84e3b2..61677c682 100644
--- a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h
+++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h
@@ -1,15 +1,41 @@
+// Euclidean Distance Transform
+// 
+// Kernels for the 2D version of the Parallel Banding Algorithm (PBA+). 
+// 
+// MIT license: see 3rdparty/LICENSE.pba+
+// Copyright: (c) 2019 School of Computing, National University of Singapore
+//
+// Modifications by Gregory Lee (2022) (NVIDIA)
+// - add user-defined pixel_int2_t to enable
+// - replace __mul24 operations with standard multiplication operator
+
+
+// START OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT
+
+// The values included in this header file are those defined in the original
+// PBA+ implementation
+
+// However, the Python code generation can potentially generate a different
+// ENCODE/DECODE that use 20 bits per coordinates instead of 10 bits per
+// coordinate with ENCODED_INT_TYPE as `long long`.
 
 #ifndef MARKER
-    
 #define MARKER -32768
+#endif
+
+#ifndef BLOCKSIZE
 #define BLOCKSIZE 32
+#endif
+
+#ifndef pixel_int2_t
 #define pixel_int2_t short2
 #define make_pixel(x, y)  make_short2(x, y)
+#endif
 
-#endif  // MARKER
+// END OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT
 
 
-#define TOID(x, y, size)  (__mul24((y), (size)) + (x))
+#define TOID(x, y, size)  ((y) * (size) + (x))
 
 #define LL long long
 __device__ bool dominate(LL x1, LL y1, LL x2, LL y2, LL x3, LL y3, LL x0)
@@ -169,8 +195,8 @@ __global__ void kernelUpdateVertical(pixel_int2_t *color, pixel_int2_t *margin,
 
 __global__ void kernelProximatePoints(pixel_int2_t *input, pixel_int2_t *stack, int size, int bandSize)
 {
-    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
-    int ty = __mul24(blockIdx.y, bandSize);
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
     int id = TOID(tx, ty, size);
     int lasty = -1;
     pixel_int2_t last1, last2, current;
@@ -205,8 +231,8 @@ __global__ void kernelProximatePoints(pixel_int2_t *input, pixel_int2_t *stack,
 
 __global__ void kernelCreateForwardPointers(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize)
 {
-    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
-    int ty = __mul24(blockIdx.y+1, bandSize) - 1;
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = (blockIdx.y+1) * bandSize - 1;
     int id = TOID(tx, ty, size);
     int lasty = -1, nexty;
     pixel_int2_t current;
@@ -235,7 +261,7 @@ __global__ void kernelCreateForwardPointers(pixel_int2_t *input, pixel_int2_t *o
 
 __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size, int bandSize)
 {
-    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
     int band1 = blockIdx.y * 2;
     int band2 = band1 + 1;
     int firsty, lasty;
@@ -246,7 +272,7 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_
     // x component store the forward pointer
 
     // Get the two last items of the first list
-    lasty = __mul24(band2, bandSize) - 1;
+    lasty = band2 * bandSize - 1;
     last2 = make_pixel(color[TOID(tx, lasty, size)].x,
         link[TOID(tx, lasty, size)].y);
 
@@ -267,7 +293,7 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_
     }
 
     // Get the first item of the second band
-    firsty = __mul24(band2, bandSize);
+    firsty = band2 * bandSize;
     current = make_pixel(link[TOID(tx, firsty, size)].x,
         color[TOID(tx, firsty, size)].x);
 
@@ -321,8 +347,8 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_
     }
 
     // Update the head and tail pointer.
-    firsty = __mul24(band1, bandSize);
-    lasty = __mul24(band2, bandSize);
+    firsty = band1 * bandSize;
+    lasty = band2 * bandSize;
     current = link[TOID(tx, firsty, size)];
 
     if (current.y == MARKER && current.x < 0) { // No head?
@@ -336,8 +362,8 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_
         output[TOID(tx, firsty, size)] = current;
     }
 
-    firsty = __mul24(band1, bandSize) + bandSize - 1;
-    lasty = __mul24(band2, bandSize) + bandSize - 1;
+    firsty = band1 * bandSize + bandSize - 1;
+    lasty = band2 * bandSize + bandSize - 1;
     current = link[TOID(tx, lasty, size)];
 
     if (current.x == MARKER && current.y < 0) { // No tail?
@@ -354,7 +380,7 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_
 
 __global__ void kernelDoubleToSingleList(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size)
 {
-    int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
     int ty = blockIdx.y;
     int id = TOID(tx, ty, size);
 
@@ -367,7 +393,7 @@ __global__ void kernelColor(pixel_int2_t *input, pixel_int2_t *output, int size)
 
     int col = threadIdx.x;
     int tid = threadIdx.y;
-    int tx = __mul24(blockIdx.x, blockDim.x) + col;
+    int tx = blockIdx.x * blockDim.x + col;
     int dx, dy, lasty;
     unsigned int best, dist;
     pixel_int2_t last1, last2;
@@ -391,11 +417,11 @@ __global__ void kernelColor(pixel_int2_t *input, pixel_int2_t *output, int size)
 
         for (int ty = y_start - tid; ty >= y_end; ty -= blockDim.y) {
             dx = last2.x - tx; dy = lasty - ty;
-            best = dist = __mul24(dx, dx) + __mul24(dy, dy);
+            best = dist = dx * dx + dy * dy;
 
             while (last2.y >= 0) {
                 dx = last1.x - tx; dy = last2.y - ty;
-                dist = __mul24(dx, dx) + __mul24(dy, dy);
+                dist = dx * dx + dy * dy;
 
                 if (dist > best)
                     break;
diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
index f6e90d2fb..c09f4b51f 100644
--- a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
+++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
@@ -3,9 +3,20 @@
 // Kernels for the 3D version of the Parallel Banding Algorithm (PBA+). 
 // 
 // MIT license: see 3rdparty/LICENSE.pba+
+//
+// Modifications by Gregory Lee (2022) (NVIDIA)
+// - allow user-defined ENCODED_INT_TYPE, ENCODE, DECODE
+
+
+// START OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT
+
+// The values included in this header file are those defined in the original
+// PBA+ implementation
 
+// However, the Python code generation can potentially generate a different
+// ENCODE/DECODE that use 20 bits per coordinates instead of 10 bits per
+// coordinate with ENCODED_INT_TYPE as `long long`.
 
-// START OF DEFINITIONS LIKELY OVERRIDDEN BY THE PYTHON SCRIPT
 
 #ifndef MARKER
 #define MARKER     -2147483648
@@ -19,26 +30,30 @@
 #define BLOCKSIZE  32
 #endif
 
-// #ifndef ENCODE
-//
-// // Sites     : ENCODE(x, y, z, 0, 0)
-// // Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
-// #define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
-// #define DECODE(value, x, y, z) \
-//     x = ((value) >> 20) & 0x3ff; \
-//     y = ((value) >> 10) & 0x3ff; \
-//     z = (value) & 0x3ff
+#ifndef ENCODE
+
+// Sites     : ENCODE(x, y, z, 0, 0)
+// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+#define ENCODED_INT_TYPE int
+#define ZERO 0
+#define ONE 1
+#define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
+#define DECODE(value, x, y, z) \
+    x = ((value) >> 20) & 0x3ff; \
+    y = ((value) >> 10) & 0x3ff; \
+    z = (value) & 0x3ff
+
+#define NOTSITE(value)  (((value) >> 31) & 1)
+#define HASNEXT(value)  (((value) >> 30) & 1)
 
-// #define NOTSITE(value)  (((value) >> 31) & 1)
-// #define HASNEXT(value)  (((value) >> 30) & 1)
+#define GET_X(value)    (((value) >> 20) & 0x3ff)
+#define GET_Y(value)    (((value) >> 10) & 0x3ff)
+#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
 
-// #define GET_X(value)    (((value) >> 20) & 0x3ff)
-// #define GET_Y(value)    (((value) >> 10) & 0x3ff)
-// #define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
+#endif // ENCODE
 
-// #endif // ENCODE
+// END OF DEFINITIONS DEFINED IN THE PYTHON SCRIPT
 
-// END OF DEFINITIONS POTENTIALLY DEFINED IN THE PYTHON SCRIPT
 
 #define LL long long
 __device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3, LL y_3, LL z_3, LL x_0, LL z_0)
@@ -219,4 +234,4 @@ __global__ void kernelColorAxis(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *outpu
 }
 
 
-} // extern C
\ No newline at end of file
+} // extern C

From e83d9a7cef28d20873f4617c785ca92846c6c1bb Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Tue, 28 Jun 2022 12:45:42 -0400
Subject: [PATCH 09/12] update outdated comment

---
 .../cucim/core/operations/morphology/_distance_transform.py  | 5 ++---
 python/cucim/src/cucim/core/operations/morphology/_pba_3d.py | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
index 24c34fff6..e781938b2 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
@@ -3,10 +3,9 @@
 from ._pba_2d import _pba_2d
 from ._pba_3d import _pba_3d
 
-# TODO: make a 64-bit packed coordinates variant in 3D for large shapes
-#       support sampling distances
+# TODO: support sampling distances
+#       support the distances and indices output arguments
 #       support chamfer, chessboard and l1/manhattan distances too?
-#       remove __mul24 from the pba2d code? (is there any benefit on recent GPUs?)  # noqa
 
 
 def distance_transform_edt(image, sampling=None, return_distances=True,
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
index 80415ab3e..bc85d0a79 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -193,7 +193,7 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
 
     # pba algorithm was implemented to use 32-bit integer to store compressed
     # coordinates. input_arr will be C-contiguous, int32
-    size_max = max(arr.shape)  # TODO: update
+    size_max = max(arr.shape)
     input_arr = encode3d(arr, size_max=size_max)
     buffer_idx = 0
     output = cupy.zeros_like(input_arr)

From c67b7c4fc0d3e6aa15911e5f2fcaf599bde9f03d Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Wed, 29 Jun 2022 16:41:41 -0400
Subject: [PATCH 10/12] add fallback implementation of math.lcm for Python <
 3.9

---
 .../core/operations/morphology/_pba_3d.py     | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
index bc85d0a79..2429a2f59 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -1,4 +1,6 @@
+import functools
 import math
+import numbers
 import os
 
 import cupy
@@ -6,6 +8,35 @@
 
 from ._pba_2d import _get_block_size
 
+try:
+    # math.lcm was introduced in Python 3.9
+    from math import lcm
+except ImportError:
+
+    """Fallback implementation of least common multiple (lcm)
+
+    TODO: remove once minimum Python requirement is >= 3.9
+    """
+
+    def _lcm(a, b):
+        return abs(b * (a // math.gcd(a, b)))
+
+    @functools.lru_cache()
+    def lcm(*args):
+        nargs = len(args)
+        if not all(isinstance(a, numbers.Integral) for a in args):
+            raise TypeError("all arguments must be integers")
+        if nargs == 0:
+            return 1
+        res = int(args[0])
+        if nargs == 1:
+            return abs(res)
+        for i in range(1, nargs):
+            x = int(args[i])
+            res = _lcm(res, x)
+        return res
+
+
 pba3d_defines_template = """
 
 #define MARKER     {marker}

From cb06d787aa3acc630283af07c4410af0720ff86b Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Wed, 29 Jun 2022 18:00:49 -0400
Subject: [PATCH 11/12] make sure fallback lcm function is actually used if
 needed

---
 python/cucim/src/cucim/core/operations/morphology/_pba_3d.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
index 2429a2f59..dab484e90 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -149,7 +149,7 @@ def decode3d(output, size_max=1024):
 def _determine_padding(shape, block_size, m1, m2, m3, blockx, blocky):
     # TODO: can possibly revise to consider only particular factors for LCM on
     #       a given axis
-    LCM = math.lcm(block_size, m1, m2, m3, blockx, blocky)
+    LCM = lcm(block_size, m1, m2, m3, blockx, blocky)
     orig_sz, orig_sy, orig_sx = shape
     round_up = False
     if orig_sx % LCM != 0:

From 2a190b34829ef7b912cefc5499b47eeda2cc6645 Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Wed, 27 Jul 2022 12:33:11 -0700
Subject: [PATCH 12/12] Apply suggestions from code review

remove irrelevant text (copy/paste error)
---
 .../core/operations/morphology/tests/test_distance_transform.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
index f88d8f2e1..ab46ae1fb 100644
--- a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
+++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
@@ -132,14 +132,12 @@ def test_distance_transform_edt_3d_aniso(sx, sy, sz):
 
 @pytest.mark.parametrize('ndim', [1, 4, 5])
 def test_distance_transform_edt_unsupported_ndim(ndim):
-    """ensure default block_params is robust to anisotropic shape."""
     with pytest.raises(NotImplementedError):
         distance_transform_edt(cp.zeros((8,) * ndim))
 
 
 @pytest.mark.skip(reason="excessive memory requirement")
 def test_distance_transform_edt_3d_int64():
-    """ensure default block_params is robust to anisotropic shape."""
     shape = (1280, 1280, 1280)
     img = binary_image(shape, pct_true=80)
     distance_transform_edt(img)