From 8e9b5a5a05e2046dee46e1d63f1362b0fd850d80 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Mon, 27 Jun 2022 11:17:12 -0400 Subject: [PATCH 01/12] add license info for the PBA+ algorithm --- 3rdparty/LICENSE.pba+ | 21 +++++++++++++++++++++ LICENSE-3rdparty.md | 6 ++++++ 2 files changed, 27 insertions(+) create mode 100644 3rdparty/LICENSE.pba+ diff --git a/3rdparty/LICENSE.pba+ b/3rdparty/LICENSE.pba+ new file mode 100644 index 000000000..9d0b4030a --- /dev/null +++ b/3rdparty/LICENSE.pba+ @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 School of Computing, National University of Singapore + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/LICENSE-3rdparty.md b/LICENSE-3rdparty.md index 0a3789af7..984d0e952 100644 --- a/LICENSE-3rdparty.md +++ b/LICENSE-3rdparty.md @@ -281,3 +281,9 @@ StainTools - https://github.com/Peter554/StainTools/blob/master/LICENSE.txt - Copyright: Peter Byfield - Usage: reference for stain color normalization algorithm + +PBA+ +- License: MIT License + - https://github.com/orzzzjq/Parallel-Banding-Algorithm-plus/blob/master/LICENSE +- Copyright: School of Computing, National University of Singapore +- Usage: PBA+ is used to implement the Euclidean distance transform. From 0462a2c6935e7f240b5f32b23d8dc611a59f1bf7 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Mon, 27 Jun 2022 13:09:54 -0400 Subject: [PATCH 02/12] Add distance_transform_edt implementation based on PBA+ --- .../core/operations/morphology/__init__.py | 3 + .../morphology/_distance_transform.py | 173 +++++++ .../core/operations/morphology/_pba_2d.py | 301 +++++++++++++ .../core/operations/morphology/_pba_3d.py | 264 +++++++++++ .../morphology/cuda/pba_kernels_2d.h | 425 ++++++++++++++++++ .../morphology/cuda/pba_kernels_3d.h | 209 +++++++++ 6 files changed, 1375 insertions(+) create mode 100644 python/cucim/src/cucim/core/operations/morphology/__init__.py create mode 100644 python/cucim/src/cucim/core/operations/morphology/_distance_transform.py create mode 100644 python/cucim/src/cucim/core/operations/morphology/_pba_2d.py create mode 100644 python/cucim/src/cucim/core/operations/morphology/_pba_3d.py create mode 100644 python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h create mode 100644 python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h diff --git a/python/cucim/src/cucim/core/operations/morphology/__init__.py b/python/cucim/src/cucim/core/operations/morphology/__init__.py new file mode 100644 index 000000000..a9c676edb --- /dev/null +++ b/python/cucim/src/cucim/core/operations/morphology/__init__.py @@ -0,0 +1,3 @@ +from ._distance_transform import distance_transform_edt + +__all__ = ["distance_transform_edt"] diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py new file mode 100644 index 000000000..372227e85 --- /dev/null +++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py @@ -0,0 +1,173 @@ +import numpy as np + +from ._pba_2d import _pba_2d +from ._pba_3d import _pba_3d + + +# TODO: make a 64-bit packed coordinates variant in 3D for large shapes +# support sampling distances +# support chamfer, chessboard and l1/manhattan distances too? +# remove __mul24 from the pba2d code? (is there any benefit on recent GPUs?) # noqa + + +def distance_transform_edt(image, sampling=None, return_distances=True, + return_indices=False, distances=None, indices=None, + *, block_params=None, float64_distances=False): + """Exact Euclidean distance transform. + + This function calculates the distance transform of the `input`, by + replacing each foreground (non-zero) element, with its shortest distance to + the background (any zero-valued element). + + In addition to the distance transform, the feature transform can be + calculated. In this case the index of the closest background element to + each foreground element is returned in a separate array. + + Parameters + ---------- + image : array_like + Input data to transform. Can be any type but will be converted into + binary: 1 wherever image equates to True, 0 elsewhere. + sampling : float, or sequence of float, optional + Spacing of elements along each dimension. If a sequence, must be of + length equal to the image rank; if a single number, this is used for + all axes. If not specified, a grid spacing of unity is implied. + return_distances : bool, optional + Whether to calculate the distance transform. + return_indices : bool, optional + Whether to calculate the feature transform. + distances : float32 cupy.ndarray, optional + An output array to store the calculated distance transform, instead of + returning it. `return_distances` must be True. It must be the same + shape as `image`. + indices : int32 cupy.ndarray, optional + An output array to store the calculated feature transform, instead of + returning it. `return_indicies` must be True. Its shape must be + `(image.ndim,) + image.shape`. + + Other Parameters + ---------------- + block_params : 3-tuple of int + The m1, m2, m3 algorithm parameters as described in [2]_. If None, + suitable defaults will be chosen. Note: This parameter is specific to + cuCIM and does not exist in SciPy. + float64_distances : bool, optional + If True, use double precision in the distance computation (to match + SciPy behavior). Otherwise, single precision will be used for + efficiency. Note: This parameter is specific to cuCIM and does not + exist in SciPy. + + Returns + ------- + distances : float64 ndarray, optional + The calculated distance transform. Returned only when + `return_distances` is True and `distances` is not supplied. It will + have the same shape as `image`. + indices : int32 ndarray, optional + The calculated feature transform. It has an image-shaped array for each + dimension of the image. See example below. Returned only when + `return_indices` is True and `indices` is not supplied. + + Notes + ----- + The Euclidean distance transform gives values of the Euclidean distance:: + + n + y_i = sqrt(sum (x[i]-b[i])**2) + i + + where b[i] is the background point (value 0) with the smallest Euclidean + distance to input points x[i], and n is the number of dimensions. + + The parallel banding algorithm implemented here was originally described in + [1]_. The kernels used here correspond to the revised PBA+ implementation + that is described on the author's website [2]_. The source code of the + author's PBA+ implementation is available at [3]_. + + References + ---------- + ..[1] Thanh-Tung Cao, Ke Tang, Anis Mohamed, and Tiow-Seng Tan. 2010. + Parallel Banding Algorithm to compute exact distance transform with the + GPU. In Proceedings of the 2010 ACM SIGGRAPH symposium on Interactive + 3D Graphics and Games (I3D ’10). Association for Computing Machinery, + New York, NY, USA, 83–90. + DOI:https://doi.org/10.1145/1730804.1730818 + .. [2] https://www.comp.nus.edu.sg/~tants/pba.html + .. [3] https://github.com/orzzzjq/Parallel-Banding-Algorithm-plus + + Examples + -------- + >>> import cupy as cp + >>> from cucim.core.operations import morphology + >>> a = cp.array(([0,1,1,1,1], + ... [0,0,1,1,1], + ... [0,1,1,1,1], + ... [0,1,1,1,0], + ... [0,1,1,0,0])) + >>> morphology.distance_transform_edt(a) + array([[ 0. , 1. , 1.4142, 2.2361, 3. ], + [ 0. , 0. , 1. , 2. , 2. ], + [ 0. , 1. , 1.4142, 1.4142, 1. ], + [ 0. , 1. , 1.4142, 1. , 0. ], + [ 0. , 1. , 1. , 0. , 0. ]]) + + With a sampling of 2 units along x, 1 along y: + + >>> morphology.distance_transform_edt(a, sampling=[2,1]) + array([[ 0. , 1. , 2. , 2.8284, 3.6056], + [ 0. , 0. , 1. , 2. , 3. ], + [ 0. , 1. , 2. , 2.2361, 2. ], + [ 0. , 1. , 2. , 1. , 0. ], + [ 0. , 1. , 1. , 0. , 0. ]]) + + Asking for indices as well: + + >>> edt, inds = morphology.distance_transform_edt(a, return_indices=True) + >>> inds + array([[[0, 0, 1, 1, 3], + [1, 1, 1, 1, 3], + [2, 2, 1, 3, 3], + [3, 3, 4, 4, 3], + [4, 4, 4, 4, 4]], + [[0, 0, 1, 1, 4], + [0, 1, 1, 1, 4], + [0, 0, 1, 4, 4], + [0, 0, 3, 3, 4], + [0, 0, 3, 3, 4]]]) + + """ + if distances is not None: + raise NotImplementedError( + "preallocated distances image is not supported" + ) + if indices is not None: + raise NotImplementedError( + "preallocated indices image is not supported" + ) + if sampling is not None: + sampling = np.unique(np.atleast_1d(sampling)) + if len(sampling) != 1: + raise NotImplementedError( + "non-uniform values in sampling is not currently supported" + ) + + if image.ndim == 3: + pba_func = _pba_3d + elif image.ndim == 2: + pba_func = _pba_2d + else: + raise NotImplementedError( + "Only 2D and 3D distance transforms are supported.") + + vals = pba_func( + image, + sampling=sampling, + return_distances=return_distances, + return_indices=return_indices, + block_params=block_params + ) + + if return_distances and sampling is not None: + vals[0] = vals[0] * sampling + + return vals diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py new file mode 100644 index 000000000..01007072c --- /dev/null +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py @@ -0,0 +1,301 @@ +import math +import os +from functools import reduce + +import cupy + +pba2d_defines_template = """ + +// MARKER is used to mark blank pixels in the texture. +// Any uncolored pixels will have x = MARKER. +// Input texture should have x = MARKER for all pixels other than sites +#define MARKER {marker} +#define BLOCKSIZE {block_size_2d} +#define pixel_int2_t {pixel_int2_t} // typically short2 (int2 for images with > 32k pixels per side) +#define make_pixel(x, y) {make_pixel_func}(x, y) // typically make_short2 (make_int2 images with > 32k pixels per side + +""" # noqa + + +def _init_marker(int_dtype): + """use a minimum value that is appropriate to the integer dtype""" + if int_dtype == cupy.int16: + # marker = cupy.iinfo(int_dtype).min + marker = -32768 + elif int_dtype == cupy.int32: + # divide by two so we don't have to promote other intermediate int variables to 64-bit int + marker = -2147483648 // 2 + else: + raise ValueError("expected int_dtype to be either cupy.int16 or cupy.int32") + return marker + + +@cupy.memoize(True) +def get_pba2d_src(block_size_2d=64, marker=-32768, pixel_int2_t='short2'): + make_pixel_func = 'make_' + pixel_int2_t + + pba2d_code = pba2d_defines_template.format( + block_size_2d=block_size_2d, + marker=marker, + pixel_int2_t=pixel_int2_t, + make_pixel_func=make_pixel_func + ) + kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda') + with open(os.path.join(kernel_directory, 'pba_kernels_2d.h'), 'rt') as f: + pba2d_kernels = '\n'.join(f.readlines()) + + pba2d_code += pba2d_kernels + return pba2d_code + + +def _get_block_size(check_warp_size=False): + if check_warp_size: + dev = cupy.cuda.runtime.getDevice() + device_properties = cupy.cuda.runtime.getDeviceProperties(dev) + return int(device_properties['warpSize']) + else: + return 32 + + +def _pack_int2(arr, marker=-32768, int_dtype=cupy.int16): + if arr.ndim != 2: + raise ValueError("only 2d arr suppported") + input_x = cupy.zeros(arr.shape, dtype=int_dtype) + input_y = cupy.zeros(arr.shape, dtype=int_dtype) + cond = arr == 0 + y, x = cupy.where(cond) + input_x[cond] = x + mask = arr != 0 + input_x[mask] = marker # 1 << 32 + input_y[cond] = y + input_y[mask] = marker # 1 << 32 + int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype]*2}) + # in C++ code x is the contiguous axis and corresponds to width + # y is the non-contiguous axis and corresponds to height + # given that, store input_x as the last axis here + return cupy.squeeze( + cupy.stack((input_x, input_y), axis=-1).view(int2_dtype) + ) + + +def _unpack_int2(img, make_copy=False, int_dtype=cupy.int16): + temp = img.view(int_dtype).reshape(img.shape + (2,)) + if make_copy: + temp = temp.copy() + return temp + + +def _determine_padding(shape, block_size): + # all kernels assume equal size along both axes, so pad up to equal size if + # shape is not isotropic + round_up = False + orig_sy, orig_sx = shape + if orig_sx % block_size != 0: + # round up size to a multiple of the band size + round_up = True + sx = block_size * math.ceil(orig_sx / block_size) + else: + sx = orig_sx + if orig_sy % block_size != 0: + # round up size to a multiple of the band size + round_up = True + sy = block_size * math.ceil(orig_sy / block_size) + else: + sy = orig_sy + if sx != sy or round_up: + smax = max(sx, sy) + padding_width = ((0, smax - orig_sy), (0, smax - orig_sx)) + else: + padding_width = None + return padding_width + + +def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, + block_params=None, check_warp_size=False, *, + float64_distances=False): + + # input_arr: a 2D image + # For each site at (x, y), the pixel at coordinate (x, y) should contain + # the pair (x, y). Pixels that are not sites should contain the pair + # (MARKER, MARKER) + + # See the following on float2, etc. use in cupy: + # https://github.com/cupy/cupy/issues/1535 + + # Note: could query warp size here, but for now just assume 32 to avoid + # overhead of querying properties + block_size = _get_block_size(check_warp_size) + + padded_size = max(arr.shape) + if block_params is None: + # should be <= size / 64. sy must be a multiple of m1 + m1 = max(1, min(padded_size // block_size, 32)) + # size must be a multiple of m2 + m2 = max(1, min(padded_size // block_size, 32)) + # should be <= 64. texture size must be a multiple of m3 + m3 = min(min(m1, m2), 2) + else: + m1, m2, m3 = block_params + + if m1 > padded_size // block_size: + raise ValueError("m1 too large. must be <= arr.shape[0] // 32") + if m2 > padded_size // block_size: + raise ValueError("m2 too large. must be <= arr.shape[1] // 32") + for m in (m1, m2, m3): + if padded_size % m1 != 0: + raise ValueError( + f"Largest dimension of image ({padded_size}) must be evenly " + f"disivible by each element of block_params: {block_params}." + ) + + shape_max = max(arr.shape) + if shape_max <= 32768: + int_dtype = cupy.int16 + pixel_int2_type = 'short2' + else: + if shape_max > (1 << 24): + # limit to coordinate range to 2**24 due to use of __mul24 in + # coordinate TOID macro + raise ValueError( + f"maximum axis size of {1 << 24} exceeded, for image with " + f"shape {arr.shape}" + ) + int_dtype = cupy.int32 + pixel_int2_type = 'int2' + + marker = _init_marker(int_dtype) + + orig_sy, orig_sx = arr.shape + padding_width = _determine_padding(arr.shape, block_size) + if padding_width is not None: + arr = cupy.pad(arr, padding_width, mode='constant', constant_values=1) + size = arr.shape[0] + print(f"{orig_sy=}, {orig_sx=}, {padding_width=}, {arr.shape=}") + + input_arr = _pack_int2(arr, marker=marker, int_dtype=int_dtype) + output = cupy.zeros_like(input_arr) + + int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype]*2}) + margin = cupy.empty((2 * m1 * size,), dtype=int2_dtype) + + # phase 1 of PBA. m1 must divide texture size and be <= 64 + pba2d = cupy.RawModule( + code=get_pba2d_src( + block_size_2d=block_size, + marker=marker, + pixel_int2_t=pixel_int2_type, + ) + ) + kernelFloodDown = pba2d.get_function('kernelFloodDown') + kernelFloodUp = pba2d.get_function('kernelFloodUp') + kernelPropagateInterband = pba2d.get_function('kernelPropagateInterband') + kernelUpdateVertical = pba2d.get_function('kernelUpdateVertical') + kernelProximatePoints = pba2d.get_function('kernelProximatePoints') + kernelCreateForwardPointers = pba2d.get_function( + 'kernelCreateForwardPointers' + ) + kernelMergeBands = pba2d.get_function('kernelMergeBands') + kernelDoubleToSingleList = pba2d.get_function('kernelDoubleToSingleList') + kernelColor = pba2d.get_function('kernelColor') + + block = (block_size, 1, 1) + grid = (math.ceil(size / block[0]), m1, 1) + bandSize1 = size // m1 + # kernelFloodDown modifies input_arr in-place + kernelFloodDown( + grid, + block, + (input_arr, input_arr, size, bandSize1), + ) + # kernelFloodUp modifies input_arr in-place + kernelFloodUp( + grid, + block, + (input_arr, input_arr, size, bandSize1), + ) + # kernelFloodUp fills values into margin + kernelPropagateInterband( + grid, + block, + (input_arr, margin, size, bandSize1), + ) + # kernelUpdateVertical stores output into an intermediate array of + # transposed shape + kernelUpdateVertical( + grid, + block, + (input_arr, margin, output, size, bandSize1), + ) + + # phase 2 + block = (block_size, 1, 1) + grid = (math.ceil(size / block[0]), m2, 1) + bandSize2 = size // m2 + kernelProximatePoints( + grid, + block, + (output, input_arr, size, bandSize2), + ) + kernelCreateForwardPointers( + grid, + block, + (input_arr, input_arr, size, bandSize2), + ) + # Repeatly merging two bands into one + noBand = m2 + while noBand > 1: + grid = (math.ceil(size / block[0]), noBand // 2) + kernelMergeBands( + grid, + block, + (output, input_arr, input_arr, size, size // noBand), + ) + noBand //= 2 + # Replace the forward link with the X coordinate of the seed to remove + # the need of looking at the other texture. We need it for coloring. + grid = (math.ceil(size / block[0]), size); + kernelDoubleToSingleList( + grid, + block, + (output, input_arr, input_arr, size), + ) + + # Phase 3 of PBA + block = (block_size, m3, 1) + grid = (math.ceil(size / block[0]), 1, 1) + kernelColor( + grid, + block, + (input_arr, output, size), + ) + + output = _unpack_int2(output, make_copy=False, int_dtype=int_dtype) + # make sure to crop any padding that was added here! + x = output[:orig_sy, :orig_sx, 0] + y = output[:orig_sy, :orig_sx, 1] + + # raise NotImplementedError("TODO") + vals = () + if return_distances: + y0, x0 = cupy.meshgrid( + *( + cupy.arange(s, dtype=cupy.int32) + for s in (orig_sy, orig_sx) + ), + indexing='ij', + sparse=True, + ) + tmp = (x - x0) + dist = tmp * tmp + tmp = (y - y0) + dist += tmp * tmp + if float64_distances: + dist = cupy.sqrt(dist) + else: + dist = dist.astype(cupy.float32) + cupy.sqrt(dist, out=dist) + vals = vals + (dist,) + if return_indices: + indices = cupy.stack((x, y), axis=0) + vals = vals + (indices,) + return vals diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py new file mode 100644 index 000000000..2ba7851dc --- /dev/null +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py @@ -0,0 +1,264 @@ +import math +import os +from functools import reduce + +import cupy +import numpy as np + +from ._pba_2d import _get_block_size + + +pba3d_defines_template = """ + +#define MARKER {marker} +#define MAX_INT {max_int} +#define BLOCKSIZE {block_size_3d} + +""" + +# For efficiency, the original PBA+ packs three 10-bit integers and two binary +# flags into a single 32-bit integer. The defines in +# `pba3d_defines_encode_32bit` handle this format. +pba3d_defines_encode_32bit = """ +// Sites : ENCODE(x, y, z, 0, 0) +// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER +#define ENCODE(x, y, z, a, b) (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30)) +#define DECODE(value, x, y, z) \ + x = ((value) >> 20) & 0x3ff; \ + y = ((value) >> 10) & 0x3ff; \ + z = (value) & 0x3ff + +#define NOTSITE(value) (((value) >> 31) & 1) +#define HASNEXT(value) (((value) >> 30) & 1) + +#define GET_X(value) (((value) >> 20) & 0x3ff) +#define GET_Y(value) (((value) >> 10) & 0x3ff) +#define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff)) + +""" # noqa + + +# 64bit version of ENCODE/DECODE to allow a 20-bit integer per coordinate axis. +pba3d_defines_encode_64bit = """ +// Sites : ENCODE(x, y, z, 0, 0) +// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER +#define ENCODE(x, y, z, a, b) (((x) << 40) | ((y) << 20) | (z) | ((a) << 63) | ((b) << 62)) +#define DECODE(value, x, y, z) \ + x = ((value) >> 40) & 0x3ff; \ + y = ((value) >> 20) & 0x3ff; \ + z = (value) & 0x3ff + +#define NOTSITE(value) (((value) >> 63) & 1) +#define HASNEXT(value) (((value) >> 62) & 1) + +#define GET_X(value) (((value) >> 40) & 0x3ff) +#define GET_Y(value) (((value) >> 20) & 0x3ff) +#define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff)) + +""" # noqa + + +@cupy.memoize(True) +def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647): + pba3d_code = pba3d_defines_template.format( + block_size_3d=block_size_3d, marker=marker, max_int=max_int + ) + pba3d_code += pba3d_defines_encode_32bit + kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda') + with open(os.path.join(kernel_directory, 'pba_kernels_3d.h'), 'rt') as f: + pba3d_kernels = '\n'.join(f.readlines()) + pba3d_code += pba3d_kernels + return pba3d_code + + +# TODO: custom kernel for encode3d +def encode3d(arr, marker=-2147483648, bit_depth=32): + if arr.ndim != 3: + raise ValueError("only 3d arr suppported") + if bit_depth not in [32, 64]: + raise ValueError("only bit_depth of 32 or 64 is supported") + if any(s > 1024 for s in arr.shape): + raise ValueError( + "encoding is for maximum of 1024 along each dimension") + input = cupy.zeros(arr.shape, dtype=np.int32, order='C') + cond = arr == 0 + z, y, x = cupy.where(cond) + # z, y, x so that x is the contiguous axis (must match the TOID macro in the C++ code!) + input[cond] = (((x) << 20) | ((y) << 10) | (z)) + input[arr != 0] = marker # 1 << 32 + return input + + +# TODO: custom kernel for decode3d +def decode3d(output): + # Note: z, y, x reversed vs. DECODE define in C above due to C-ordered memory layout + x = (output >> 20) & 0x3ff + y = (output >> 10) & 0x3ff + z = output & 0x3ff + return (x, y, z) + + +def _determine_padding(shape, block_size, m1, m2, m3, blockx, blocky): + # TODO: can possibly revise to consider only particular factors for LCM on + # a given axis + LCM = math.lcm(block_size, m1, m2, m3, blockx, blocky) + orig_sz, orig_sy, orig_sx = shape + round_up = False + if orig_sx % LCM != 0: + # round up size to a multiple of the band size + round_up = True + sx = LCM * math.ceil(orig_sx / LCM) + else: + sx = orig_sx + if orig_sy % LCM != 0: + # round up size to a multiple of the band size + round_up = True + sy = LCM * math.ceil(orig_sy / LCM) + else: + sy = orig_sy + if orig_sz % LCM != 0: + # round up size to a multiple of the band size + round_up = True + sz = LCM * math.ceil(orig_sz / LCM) + else: + sz = orig_sz + + aniso = not (sx == sy == sz) + if aniso or round_up: + smax = max(sz, sy, sx) + padding_width = ( + (0, smax - orig_sz), (0, smax - orig_sy), (0, smax - orig_sx) + ) + else: + padding_width = None + return padding_width + + +def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, + block_params=None, check_warp_size=False, *, + float64_distances=False): + if arr.ndim != 3: + raise ValueError(f"expected a 3D array, got {arr.ndim}D") + + if block_params is None: + m1 = 1 + m2 = 1 + m3 = 2 + else: + m1, m2, m3 = block_params + + # reduce blockx for small inputs + s_min = min(arr.shape) + if s_min <= 4: + blockx = 4 + elif s_min <= 8: + blockx = 8 + elif s_min <= 16: + blockx = 16 + else: + blockx = 32 + blocky = 4 + + block_size = _get_block_size(check_warp_size) + + orig_sz, orig_sy, orig_sx = arr.shape + padding_width = _determine_padding( + arr.shape, block_size, m1, m2, m3, blockx, blocky + ) + if padding_width is not None: + arr = cupy.pad(arr, padding_width, mode='constant', constant_values=1) + size = arr.shape[0] + + # pba algorithm was implemented to use 32-bit integer to store compressed + # coordinates. input_arr will be C-contiguous, int32 + input_arr = encode3d(arr) + buffer_idx = 0 + output = cupy.zeros_like(input_arr) + pba_images = [input_arr, output] + + block = (blockx, blocky, 1) + grid = (size // block[0], size // block[1], 1) + + # MAKE SURE NO GRID/BLOCK SIZES WILL CAUSE OUT-OF-RANGE ACCESS + # block_size = min(32, min_pow2) + marker = -2147483648 + + + pba3d = cupy.RawModule( + code=get_pba3d_src(block_size_3d=block_size, marker=marker) + ) + + kernelFloodZ = pba3d.get_function('kernelFloodZ') + kernelMaurerAxis = pba3d.get_function('kernelMaurerAxis') + kernelColorAxis = pba3d.get_function('kernelColorAxis') + + kernelFloodZ( + grid, + block, + (pba_images[buffer_idx], pba_images[1 - buffer_idx], size) + ) + buffer_idx = 1 - buffer_idx + + block = (blockx, blocky, 1) + grid = (size // block[0], size // block[1], 1) + kernelMaurerAxis( + grid, + block, + (pba_images[buffer_idx], pba_images[1 - buffer_idx], size), + ) + + block = (block_size, m3, 1) + grid = (size // block[0], size, 1) + kernelColorAxis( + grid, + block, + (pba_images[1 - buffer_idx], pba_images[buffer_idx], size), + ) + + block = (blockx, blocky, 1) + grid = (size // block[0], size // block[1], 1) + kernelMaurerAxis( + grid, + block, + (pba_images[buffer_idx], pba_images[1 - buffer_idx], size), + ) + + block = (block_size, m3, 1) + grid = (size // block[0], size, 1) + kernelColorAxis( + grid, + block, + (pba_images[1 - buffer_idx], pba_images[buffer_idx], size), + ) + + output = pba_images[buffer_idx] + + if return_distances or return_indices: + x, y, z = decode3d(output[:orig_sz, :orig_sy, :orig_sx]) + + vals = () + if return_distances: + # TODO: kernel for more efficient distance computation + orig_shape = (orig_sz, orig_sy, orig_sx) + z0, y0, x0 = cupy.meshgrid( + *(cupy.arange(s, dtype=cupy.int32) for s in orig_shape), + indexing='ij', + sparse=True + ) + tmp = (x - x0) + dist = tmp * tmp + tmp = (y - y0) + dist += tmp * tmp + tmp = (z - z0) + dist += tmp * tmp + if float64_distances: + dist = cupy.sqrt(dist) + else: + dist = dist.astype(cupy.float32) + cupy.sqrt(dist, out=dist) + vals = vals + (dist,) + if return_indices: + indices = cupy.stack((z, y, x), axis=0) + vals = vals + (indices,) + return vals + diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h new file mode 100644 index 000000000..3ea84e3b2 --- /dev/null +++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h @@ -0,0 +1,425 @@ + +#ifndef MARKER + +#define MARKER -32768 +#define BLOCKSIZE 32 +#define pixel_int2_t short2 +#define make_pixel(x, y) make_short2(x, y) + +#endif // MARKER + + +#define TOID(x, y, size) (__mul24((y), (size)) + (x)) + +#define LL long long +__device__ bool dominate(LL x1, LL y1, LL x2, LL y2, LL x3, LL y3, LL x0) +{ + LL k1 = y2 - y1, k2 = y3 - y2; + return (k1 * (y1 + y2) + (x2 - x1) * ((x1 + x2) - (x0 << 1))) * k2 > \ + (k2 * (y2 + y3) + (x3 - x2) * ((x2 + x3) - (x0 << 1))) * k1; +} +#undef LL + + +extern "C"{ + +__global__ void kernelFloodDown(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * bandSize; + int id = TOID(tx, ty, size); + + pixel_int2_t pixel1, pixel2; + + pixel1 = make_pixel(MARKER, MARKER); + + for (int i = 0; i < bandSize; i++, id += size) { + pixel2 = input[id]; + + if (pixel2.x != MARKER) + pixel1 = pixel2; + + output[id] = pixel1; + } +} + +__global__ void kernelFloodUp(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = (blockIdx.y+1) * bandSize - 1; + int id = TOID(tx, ty, size); + + pixel_int2_t pixel1, pixel2; + int dist1, dist2; + + pixel1 = make_pixel(MARKER, MARKER); + + for (int i = 0; i < bandSize; i++, id -= size) { + dist1 = abs(pixel1.y - ty + i); + + pixel2 = input[id]; + dist2 = abs(pixel2.y - ty + i); + + if (dist2 < dist1) + pixel1 = pixel2; + + output[id] = pixel1; + } +} + +__global__ void kernelPropagateInterband(pixel_int2_t *input, pixel_int2_t *margin_out, int size, int bandSize) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int inc = bandSize * size; + int ny, nid, nDist; + pixel_int2_t pixel; + + // Top row, look backward + int ty = blockIdx.y * bandSize; + int topId = TOID(tx, ty, size); + int bottomId = TOID(tx, ty + bandSize - 1, size); + int tid = blockIdx.y * size + tx; + int bid = tid + (size * size / bandSize); + + pixel = input[topId]; + int myDist = abs(pixel.y - ty); + margin_out[tid] = pixel; + + for (nid = bottomId - inc; nid >= 0; nid -= inc) { + pixel = input[nid]; + + if (pixel.x != MARKER) { + nDist = abs(pixel.y - ty); + + if (nDist < myDist) + margin_out[tid] = pixel; + + break; + } + } + + // Last row, look downward + ty = ty + bandSize - 1; + pixel = input[bottomId]; + myDist = abs(pixel.y - ty); + margin_out[bid] = pixel; + + for (ny = ty + 1, nid = topId + inc; ny < size; ny += bandSize, nid += inc) { + pixel = input[nid]; + + if (pixel.x != MARKER) { + nDist = abs(pixel.y - ty); + + if (nDist < myDist) + margin_out[bid] = pixel; + + break; + } + } +} + +__global__ void kernelUpdateVertical(pixel_int2_t *color, pixel_int2_t *margin, pixel_int2_t *output, int size, int bandSize) +{ + __shared__ pixel_int2_t block[BLOCKSIZE][BLOCKSIZE]; + + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * bandSize; + + pixel_int2_t top = margin[blockIdx.y * size + tx]; + pixel_int2_t bottom = margin[(blockIdx.y + size / bandSize) * size + tx]; + pixel_int2_t pixel; + + int dist, myDist; + + int id = TOID(tx, ty, size); + + int n_step = bandSize / blockDim.x; + for(int step = 0; step < n_step; ++step) { + int y_start = blockIdx.y * bandSize + step * blockDim.x; + int y_end = y_start + blockDim.x; + + for (ty = y_start; ty < y_end; ++ty, id += size) { + pixel = color[id]; + myDist = abs(pixel.y - ty); + + dist = abs(top.y - ty); + if (dist < myDist) { myDist = dist; pixel = top; } + + dist = abs(bottom.y - ty); + if (dist < myDist) pixel = bottom; + + // temporary result is stored in block + block[threadIdx.x][ty - y_start] = make_pixel(pixel.y, pixel.x); + } + + __syncthreads(); + + // block is written to a transposed location in the output + + int tid = TOID(blockIdx.y * bandSize + step * blockDim.x + threadIdx.x, \ + blockIdx.x * blockDim.x, size); + + for(int i = 0; i < blockDim.x; ++i, tid += size) { + output[tid] = block[i][threadIdx.x]; + } + + __syncthreads(); + } +} + +__global__ void kernelProximatePoints(pixel_int2_t *input, pixel_int2_t *stack, int size, int bandSize) +{ + int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int ty = __mul24(blockIdx.y, bandSize); + int id = TOID(tx, ty, size); + int lasty = -1; + pixel_int2_t last1, last2, current; + + last1.y = -1; last2.y = -1; + + for (int i = 0; i < bandSize; i++, id += size) { + current = input[id]; + + if (current.x != MARKER) { + while (last2.y >= 0) { + if (!dominate(last1.x, last2.y, last2.x, \ + lasty, current.x, current.y, tx)) + break; + + lasty = last2.y; last2 = last1; + + if (last1.y >= 0) + last1 = stack[TOID(tx, last1.y, size)]; + } + + last1 = last2; last2 = make_pixel(current.x, lasty); lasty = current.y; + + stack[id] = last2; + } + } + + // Store the pointer to the tail at the last pixel of this band + if (lasty != ty + bandSize - 1) + stack[TOID(tx, ty + bandSize - 1, size)] = make_pixel(MARKER, lasty); +} + +__global__ void kernelCreateForwardPointers(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize) +{ + int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int ty = __mul24(blockIdx.y+1, bandSize) - 1; + int id = TOID(tx, ty, size); + int lasty = -1, nexty; + pixel_int2_t current; + + // Get the tail pointer + current = input[id]; + + if (current.x == MARKER) + nexty = current.y; + else + nexty = ty; + + for (int i = 0; i < bandSize; i++, id -= size) + if (ty - i == nexty) { + current = make_pixel(lasty, input[id].y); + output[id] = current; + + lasty = nexty; + nexty = current.y; + } + + // Store the pointer to the head at the first pixel of this band + if (lasty != ty - bandSize + 1) + output[id + size] = make_pixel(lasty, MARKER); +} + +__global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size, int bandSize) +{ + int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int band1 = blockIdx.y * 2; + int band2 = band1 + 1; + int firsty, lasty; + pixel_int2_t last1, last2, current; + // last1 and last2: x component store the x coordinate of the site, + // y component store the backward pointer + // current: y component store the x coordinate of the site, + // x component store the forward pointer + + // Get the two last items of the first list + lasty = __mul24(band2, bandSize) - 1; + last2 = make_pixel(color[TOID(tx, lasty, size)].x, + link[TOID(tx, lasty, size)].y); + + if (last2.x == MARKER) { + lasty = last2.y; + + if (lasty >= 0) + last2 = make_pixel(color[TOID(tx, lasty, size)].x, + link[TOID(tx, lasty, size)].y); + else + last2 = make_pixel(MARKER, MARKER); + } + + if (last2.y >= 0) { + // Second item at the top of the stack + last1 = make_pixel(color[TOID(tx, last2.y, size)].x, + link[TOID(tx, last2.y, size)].y); + } + + // Get the first item of the second band + firsty = __mul24(band2, bandSize); + current = make_pixel(link[TOID(tx, firsty, size)].x, + color[TOID(tx, firsty, size)].x); + + if (current.y == MARKER) { + firsty = current.x; + + if (firsty >= 0) + current = make_pixel(link[TOID(tx, firsty, size)].x, + color[TOID(tx, firsty, size)].x); + else + current = make_pixel(MARKER, MARKER); + } + + // Count the number of item in the second band that survive so far. + // Once it reaches 2, we can stop. + int top = 0; + + while (top < 2 && current.y >= 0) { + // While there's still something on the left + while (last2.y >= 0) { + + if (!dominate(last1.x, last2.y, last2.x, \ + lasty, current.y, firsty, tx)) + break; + + lasty = last2.y; last2 = last1; + top--; + + if (last1.y >= 0) + last1 = make_pixel(color[TOID(tx, last1.y, size)].x, + link[TOID(tx, last1.y, size)].y); + } + + // Update the current pointer + output[TOID(tx, firsty, size)] = make_pixel(current.x, lasty); + + if (lasty >= 0) + output[TOID(tx, lasty, size)] = make_pixel(firsty, last2.y); + + last1 = last2; last2 = make_pixel(current.y, lasty); lasty = firsty; + firsty = current.x; + + top = max(1, top + 1); + + // Advance the current pointer to the next one + if (firsty >= 0) + current = make_pixel(link[TOID(tx, firsty, size)].x, + color[TOID(tx, firsty, size)].x); + else + current = make_pixel(MARKER, MARKER); + } + + // Update the head and tail pointer. + firsty = __mul24(band1, bandSize); + lasty = __mul24(band2, bandSize); + current = link[TOID(tx, firsty, size)]; + + if (current.y == MARKER && current.x < 0) { // No head? + last1 = link[TOID(tx, lasty, size)]; + + if (last1.y == MARKER) + current.x = last1.x; + else + current.x = lasty; + + output[TOID(tx, firsty, size)] = current; + } + + firsty = __mul24(band1, bandSize) + bandSize - 1; + lasty = __mul24(band2, bandSize) + bandSize - 1; + current = link[TOID(tx, lasty, size)]; + + if (current.x == MARKER && current.y < 0) { // No tail? + last1 = link[TOID(tx, firsty, size)]; + + if (last1.x == MARKER) + current.y = last1.y; + else + current.y = firsty; + + output[TOID(tx, lasty, size)] = current; + } +} + +__global__ void kernelDoubleToSingleList(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size) +{ + int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int ty = blockIdx.y; + int id = TOID(tx, ty, size); + + output[id] = make_pixel(color[id].x, link[id].y); +} + +__global__ void kernelColor(pixel_int2_t *input, pixel_int2_t *output, int size) +{ + __shared__ pixel_int2_t block[BLOCKSIZE][BLOCKSIZE]; + + int col = threadIdx.x; + int tid = threadIdx.y; + int tx = __mul24(blockIdx.x, blockDim.x) + col; + int dx, dy, lasty; + unsigned int best, dist; + pixel_int2_t last1, last2; + + lasty = size - 1; + + last2 = input[TOID(tx, lasty, size)]; + + if (last2.x == MARKER) { + lasty = last2.y; + last2 = input[TOID(tx, lasty, size)]; + } + + if (last2.y >= 0) + last1 = input[TOID(tx, last2.y, size)]; + + int y_start, y_end, n_step = size / blockDim.x; + for(int step = 0; step < n_step; ++step) { + y_start = size - step * blockDim.x - 1; + y_end = size - (step + 1) * blockDim.x; + + for (int ty = y_start - tid; ty >= y_end; ty -= blockDim.y) { + dx = last2.x - tx; dy = lasty - ty; + best = dist = __mul24(dx, dx) + __mul24(dy, dy); + + while (last2.y >= 0) { + dx = last1.x - tx; dy = last2.y - ty; + dist = __mul24(dx, dx) + __mul24(dy, dy); + + if (dist > best) + break; + + best = dist; lasty = last2.y; last2 = last1; + + if (last2.y >= 0) + last1 = input[TOID(tx, last2.y, size)]; + } + + block[threadIdx.x][ty - y_end] = make_pixel(lasty, last2.x); + } + + __syncthreads(); + + // note: transposes back to original shape here + if(!threadIdx.y) { + int id = TOID(y_end + threadIdx.x, blockIdx.x * blockDim.x, size); + for(int i = 0; i < blockDim.x; ++i, id+=size) { + output[id] = block[i][threadIdx.x]; + } + } + + __syncthreads(); + } +} +} // extern C diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h new file mode 100644 index 000000000..bcb748fe6 --- /dev/null +++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h @@ -0,0 +1,209 @@ + +#ifndef MARKER + +#define MARKER -2147483648 +#define MAX_INT 2147483647 +#define BLOCKSIZE 32 + +#endif // MARKER + +#define LL long long +__device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3, LL y_3, LL z_3, LL x_0, LL z_0) +{ + LL k_1 = y_2 - y_1, k_2 = y_3 - y_2; + + return (((y_1 + y_2) * k_1 + ((x_2 - x_1) * (x_1 + x_2 - (x_0 << 1)) + (z_2 - z_1) * (z_1 + z_2 - (z_0 << 1)))) * k_2 > \ + ((y_2 + y_3) * k_2 + ((x_3 - x_2) * (x_2 + x_3 - (x_0 << 1)) + (z_3 - z_2) * (z_2 + z_3 - (z_0 << 1)))) * k_1); +} +#undef LL + +#define TOID(x, y, z, size) ((((z) * (size)) + (y)) * (size) + (x)) + + +#ifndef ENCODE + +// Sites : ENCODE(x, y, z, 0, 0) +// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER +#define ENCODE(x, y, z, a, b) (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30)) +#define DECODE(value, x, y, z) \ + x = ((value) >> 20) & 0x3ff; \ + y = ((value) >> 10) & 0x3ff; \ + z = (value) & 0x3ff + +#define NOTSITE(value) (((value) >> 31) & 1) +#define HASNEXT(value) (((value) >> 30) & 1) + +#define GET_X(value) (((value) >> 20) & 0x3ff) +#define GET_Y(value) (((value) >> 10) & 0x3ff) +#define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff)) + +#endif // ENCODE + + +extern "C"{ + +__global__ void kernelFloodZ(int *input, int *output, int size) +{ + + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + int tz = 0; + + int plane = size * size; + int id = TOID(tx, ty, tz, size); + int pixel1, pixel2; + + pixel1 = ENCODE(0,0,0,1,0); + + // Sweep down + for (int i = 0; i < size; i++, id += plane) { + pixel2 = input[id]; + + if (!NOTSITE(pixel2)) + pixel1 = pixel2; + + output[id] = pixel1; + } + + int dist1, dist2, nz; + + id -= plane + plane; + + // Sweep up + for (int i = size - 2; i >= 0; i--, id -= plane) { + nz = GET_Z(pixel1); + dist1 = abs(nz - (tz + i)); + + pixel2 = output[id]; + nz = GET_Z(pixel2); + dist2 = abs(nz - (tz + i)); + + if (dist2 < dist1) + pixel1 = pixel2; + + output[id] = pixel1; + } +} + + +__global__ void kernelMaurerAxis(int *input, int *stack, int size) +{ + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int tz = blockIdx.y * blockDim.y + threadIdx.y; + int ty = 0; + + int id = TOID(tx, ty, tz, size); + + int lasty = 0; + int x1, y1, z1, x2, y2, z2, nx, ny, nz; + int p = ENCODE(0,0,0,1,0), s1 = ENCODE(0,0,0,1,0), s2 = ENCODE(0,0,0,1,0); + int flag = 0; + + for (ty = 0; ty < size; ++ty, id += size) { + p = input[id]; + + if (!NOTSITE(p)) { + + while (HASNEXT(s2)) { + DECODE(s1, x1, y1, z1); + DECODE(s2, x2, y2, z2); + DECODE(p, nx, ny, nz); + + if (!dominate(x1, y2, z1, x2, lasty, z2, nx, ty, nz, tx, tz)) + break; + + lasty = y2; s2 = s1; y2 = y1; + + if (HASNEXT(s2)) + s1 = stack[TOID(tx, y2, tz, size)]; + } + + DECODE(p, nx, ny, nz); + s1 = s2; + s2 = ENCODE(nx, lasty, nz, 0, flag); + y2 = lasty; + lasty = ty; + + stack[id] = s2; + + flag = 1; + } + } + + if (NOTSITE(p)) + stack[TOID(tx, ty - 1, tz, size)] = ENCODE(0, lasty, 0, 1, flag); +} + +__global__ void kernelColorAxis(int *input, int *output, int size) +{ + __shared__ int block[BLOCKSIZE][BLOCKSIZE]; + + int col = threadIdx.x; + int tid = threadIdx.y; + int tx = blockIdx.x * blockDim.x + col; + int tz = blockIdx.y; + + int x1, y1, z1, x2, y2, z2; + int last1 = ENCODE(0,0,0,1,0), last2 = ENCODE(0,0,0,1,0), lasty; + long long dx, dy, dz, best, dist; + + lasty = size - 1; + + last2 = input[TOID(tx, lasty, tz, size)]; + DECODE(last2, x2, y2, z2); + + if (NOTSITE(last2)) { + lasty = y2; + if(HASNEXT(last2)) { + last2 = input[TOID(tx, lasty, tz, size)]; + DECODE(last2, x2, y2, z2); + } + } + + if (HASNEXT(last2)) { + last1 = input[TOID(tx, y2, tz, size)]; + DECODE(last1, x1, y1, z1); + } + + int y_start, y_end, n_step = size / blockDim.x; + for(int step = 0; step < n_step; ++step) { + y_start = size - step * blockDim.x - 1; + y_end = size - (step + 1) * blockDim.x; + + for (int ty = y_start - tid; ty >= y_end; ty -= blockDim.y) { + dx = x2 - tx; dy = lasty - ty; dz = z2 - tz; + best = dx * dx + dy * dy + dz * dz; + + while (HASNEXT(last2)) { + dx = x1 - tx; dy = y2 - ty; dz = z1 - tz; + dist = dx * dx + dy * dy + dz * dz; + + if(dist > best) break; + + best = dist; lasty = y2; last2 = last1; + DECODE(last2, x2, y2, z2); + + if (HASNEXT(last2)) { + last1 = input[TOID(tx, y2, tz, size)]; + DECODE(last1, x1, y1, z1); + } + } + + block[threadIdx.x][ty - y_end] = ENCODE(lasty, x2, z2, NOTSITE(last2), 0); + } + + __syncthreads(); + + if(!threadIdx.y) { + int id = TOID(y_end + threadIdx.x, blockIdx.x * blockDim.x, tz, size); + for(int i = 0; i < blockDim.x; i++, id+=size) { + output[id] = block[i][threadIdx.x]; + } + } + + __syncthreads(); + } +} + + +} // extern C \ No newline at end of file From 6a9997ff9805fc31112793697a075cb6e7867fe9 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Mon, 27 Jun 2022 22:01:52 -0400 Subject: [PATCH 03/12] add tests for 2d distance transform fix automated setting of m1, m2, m3 parameters for 2D PBA+ --- .../morphology/_distance_transform.py | 10 +- .../core/operations/morphology/_pba_2d.py | 37 +++----- .../tests/test_distance_transform.py | 91 +++++++++++++++++++ 3 files changed, 114 insertions(+), 24 deletions(-) create mode 100644 python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py index 372227e85..d3acf5ec7 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py +++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py @@ -79,6 +79,10 @@ def distance_transform_edt(image, sampling=None, return_distances=True, where b[i] is the background point (value 0) with the smallest Euclidean distance to input points x[i], and n is the number of dimensions. + Note that the `indices` output may differ from the one given by + `scipy.ndimage.distance_transform_edt` in the case of input pixels that are + equidistant from multiple background points. + The parallel banding algorithm implemented here was originally described in [1]_. The kernels used here correspond to the revised PBA+ implementation that is described on the author's website [2]_. The source code of the @@ -150,6 +154,7 @@ def distance_transform_edt(image, sampling=None, return_distances=True, raise NotImplementedError( "non-uniform values in sampling is not currently supported" ) + sampling = float(sampling) if image.ndim == 3: pba_func = _pba_3d @@ -168,6 +173,9 @@ def distance_transform_edt(image, sampling=None, return_distances=True, ) if return_distances and sampling is not None: - vals[0] = vals[0] * sampling + vals= (vals[0] * sampling,) + vals[1:] + + if len(vals) == 1: + vals = vals[0] return vals diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py index 01007072c..f26ff3760 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py @@ -85,26 +85,12 @@ def _unpack_int2(img, make_copy=False, int_dtype=cupy.int16): return temp -def _determine_padding(shape, block_size): +def _determine_padding(shape, padded_size, block_size): # all kernels assume equal size along both axes, so pad up to equal size if # shape is not isotropic - round_up = False orig_sy, orig_sx = shape - if orig_sx % block_size != 0: - # round up size to a multiple of the band size - round_up = True - sx = block_size * math.ceil(orig_sx / block_size) - else: - sx = orig_sx - if orig_sy % block_size != 0: - # round up size to a multiple of the band size - round_up = True - sy = block_size * math.ceil(orig_sy / block_size) - else: - sy = orig_sy - if sx != sy or round_up: - smax = max(sx, sy) - padding_width = ((0, smax - orig_sy), (0, smax - orig_sx)) + if orig_sx != padded_size or orig_sy != padded_size: + padding_width = ((0, padded_size - orig_sy), (0, padded_size - orig_sx)) else: padding_width = None return padding_width @@ -126,12 +112,18 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, # overhead of querying properties block_size = _get_block_size(check_warp_size) - padded_size = max(arr.shape) + padded_size = math.ceil(max(arr.shape) / block_size) * block_size if block_params is None: # should be <= size / 64. sy must be a multiple of m1 m1 = max(1, min(padded_size // block_size, 32)) + # size must be a multiple of m2 m2 = max(1, min(padded_size // block_size, 32)) + # m2 must also be a power of two + m2 = 2**math.floor(math.log2(m2)) + if padded_size % m2 != 0: + raise RuntimeError("error in setting default m2") + # should be <= 64. texture size must be a multiple of m3 m3 = min(min(m1, m2), 2) else: @@ -142,10 +134,10 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, if m2 > padded_size // block_size: raise ValueError("m2 too large. must be <= arr.shape[1] // 32") for m in (m1, m2, m3): - if padded_size % m1 != 0: + if padded_size % m != 0: raise ValueError( f"Largest dimension of image ({padded_size}) must be evenly " - f"disivible by each element of block_params: {block_params}." + f"disivible by each element of block_params: {(m1, m2, m3)}." ) shape_max = max(arr.shape) @@ -166,11 +158,10 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, marker = _init_marker(int_dtype) orig_sy, orig_sx = arr.shape - padding_width = _determine_padding(arr.shape, block_size) + padding_width = _determine_padding(arr.shape, padded_size, block_size) if padding_width is not None: arr = cupy.pad(arr, padding_width, mode='constant', constant_values=1) size = arr.shape[0] - print(f"{orig_sy=}, {orig_sx=}, {padding_width=}, {arr.shape=}") input_arr = _pack_int2(arr, marker=marker, int_dtype=int_dtype) output = cupy.zeros_like(input_arr) @@ -296,6 +287,6 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, cupy.sqrt(dist, out=dist) vals = vals + (dist,) if return_indices: - indices = cupy.stack((x, y), axis=0) + indices = cupy.stack((y, x), axis=0) vals = vals + (indices,) return vals diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py new file mode 100644 index 000000000..59e95469d --- /dev/null +++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py @@ -0,0 +1,91 @@ +from copy import copy + +import cupy as cp +import numpy as np +import pytest +from cupy import testing +import scipy.ndimage as ndi_cpu + +from cucim.core.operations.morphology import distance_transform_edt + + + +def binary_image(shape, pct_true=50): + rng = cp.random.default_rng(123) + x = rng.integers(0, 100, size=shape, dtype=cp.uint8) + return x >=pct_true + + +def assert_percentile_equal(arr1, arr2, pct=95): + """Assert that at least pct% of the entries in arr1 and arr2 are equal.""" + pct_mismatch = (100 - pct) / 100 + arr1 = cp.asnumpy(arr1) + arr2 = cp.asnumpy(arr2) + mismatch = np.sum(arr1 != arr2) / arr1.size + assert mismatch < pct_mismatch + + +@pytest.mark.parametrize('block_params', [(1, 1, 1), None]) +@pytest.mark.parametrize('sampling', [None, (1.5, 1.5)]) +@pytest.mark.parametrize('shape', [(256, 256), (537, 236)]) +@pytest.mark.parametrize('density', [20, 50, 80]) +def test_distance_transform_edt_2d(shape, sampling, block_params, density): + + kwargs_scipy = dict(sampling=sampling) + kwargs_cucim = dict(sampling=sampling, block_params=block_params) + img = binary_image(shape, pct_true=density) + out = distance_transform_edt(img, **kwargs_cucim) + expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img), **kwargs_scipy) + cp.testing.assert_allclose(out, expected) + + +@pytest.mark.parametrize('sx', list(range(32))) +@pytest.mark.parametrize('sy', list(range(16))) +def test_distance_transform_edt_2d_aniso_block_params(sx, sy): + """ensure default block_params is robust to anisotropic shape.""" + shape = (128 + sx, 128 + sy) + img = binary_image(shape, pct_true=80) + out = distance_transform_edt(img) + expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img)) + cp.testing.assert_allclose(out, expected) + + +@pytest.mark.parametrize('value', [0, 1, 3]) +def test_distance_transform_edt_2d_uniform_valued(value): + """ensure default block_params is robust to anisotropic shape.""" + img = cp.full((64, 64), value, dtype=cp.uint8) + # ensure there is at least 1 pixel at background intensity + img[13, 13] = 0 + out = distance_transform_edt(img) + expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img)) + cp.testing.assert_allclose(out, expected) + + +@pytest.mark.parametrize('return_indices', [False, True]) +@pytest.mark.parametrize('return_distances', [False, True]) +@pytest.mark.parametrize('sampling', [None, (1.5, 1.5)]) +@pytest.mark.parametrize('shape', [(256, 256), (65, 128)]) +def test_distance_transform_edt_2d_returns(shape, sampling, return_distances, + return_indices): + + if not (return_indices or return_distances): + return + + kwargs = dict( + sampling=sampling, + return_distances=return_distances, + return_indices=return_indices, + ) + img = binary_image(shape, pct_true=50) + out = distance_transform_edt(img, **kwargs) + expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img), **kwargs) + if return_indices and return_distances: + assert len(out) == 2 + cp.testing.assert_allclose(out[0], expected[0]) + # May differ at a small % of coordinates where multiple points were + # equidistant. + assert_percentile_equal(out[1], expected[1], pct=95) + elif return_distances: + cp.testing.assert_allclose(out, expected) + elif return_indices: + assert_percentile_equal(out, expected, pct=95) From 3d44899eb11d1c1878b5c1a855b6a2a5cb6c50ae Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Mon, 27 Jun 2022 23:10:41 -0400 Subject: [PATCH 04/12] Update 3D kernels to allow use of size > 1024 per axis When switching to 64-bit int can go up to size 2**20 on each axis. --- .../core/operations/morphology/_pba_3d.py | 75 +++++++++------ .../morphology/cuda/pba_kernels_3d.h | 91 +++++++++++-------- 2 files changed, 100 insertions(+), 66 deletions(-) diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py index 2ba7851dc..597df93f9 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py @@ -22,6 +22,9 @@ pba3d_defines_encode_32bit = """ // Sites : ENCODE(x, y, z, 0, 0) // Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER +#define ENCODED_INT_TYPE int +#define ZERO 0 +#define ONE 1 #define ENCODE(x, y, z, a, b) (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30)) #define DECODE(value, x, y, z) \ x = ((value) >> 20) & 0x3ff; \ @@ -42,28 +45,35 @@ pba3d_defines_encode_64bit = """ // Sites : ENCODE(x, y, z, 0, 0) // Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER -#define ENCODE(x, y, z, a, b) (((x) << 40) | ((y) << 20) | (z) | ((a) << 63) | ((b) << 62)) +#define ENCODED_INT_TYPE long long +#define ZERO 0L +#define ONE 1L +#define ENCODE(x, y, z, a, b) (((x) << 40) | ((y) << 20) | (z) | ((a) << 61) | ((b) << 60)) #define DECODE(value, x, y, z) \ - x = ((value) >> 40) & 0x3ff; \ - y = ((value) >> 20) & 0x3ff; \ - z = (value) & 0x3ff + x = ((value) >> 40) & 0xfffff; \ + y = ((value) >> 20) & 0xfffff; \ + z = (value) & 0xfffff -#define NOTSITE(value) (((value) >> 63) & 1) -#define HASNEXT(value) (((value) >> 62) & 1) +#define NOTSITE(value) (((value) >> 61) & 1) +#define HASNEXT(value) (((value) >> 60) & 1) -#define GET_X(value) (((value) >> 40) & 0x3ff) -#define GET_Y(value) (((value) >> 20) & 0x3ff) -#define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff)) +#define GET_X(value) (((value) >> 40) & 0xfffff) +#define GET_Y(value) (((value) >> 20) & 0xfffff) +#define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0xfffff)) """ # noqa @cupy.memoize(True) -def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647): +def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647, + size_max=1024): pba3d_code = pba3d_defines_template.format( block_size_3d=block_size_3d, marker=marker, max_int=max_int ) - pba3d_code += pba3d_defines_encode_32bit + if size_max > 1024: + pba3d_code += pba3d_defines_encode_64bit + else: + pba3d_code += pba3d_defines_encode_32bit kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda') with open(os.path.join(kernel_directory, 'pba_kernels_3d.h'), 'rt') as f: pba3d_kernels = '\n'.join(f.readlines()) @@ -72,29 +82,38 @@ def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647): # TODO: custom kernel for encode3d -def encode3d(arr, marker=-2147483648, bit_depth=32): +def encode3d(arr, marker=-2147483648, bit_depth=32, size_max=1024): if arr.ndim != 3: raise ValueError("only 3d arr suppported") if bit_depth not in [32, 64]: raise ValueError("only bit_depth of 32 or 64 is supported") - if any(s > 1024 for s in arr.shape): - raise ValueError( - "encoding is for maximum of 1024 along each dimension") - input = cupy.zeros(arr.shape, dtype=np.int32, order='C') + if size_max > 1024: + dtype = np.int64 + else: + dtype = np.int32 + image = cupy.zeros(arr.shape, dtype=dtype, order='C') cond = arr == 0 z, y, x = cupy.where(cond) # z, y, x so that x is the contiguous axis (must match the TOID macro in the C++ code!) - input[cond] = (((x) << 20) | ((y) << 10) | (z)) - input[arr != 0] = marker # 1 << 32 - return input + if size_max > 1024: + image[cond] = (((x) << 40) | ((y) << 20) | (z)) + else: + image[cond] = (((x) << 20) | ((y) << 10) | (z)) + image[arr != 0] = marker # 1 << 32 + return image # TODO: custom kernel for decode3d -def decode3d(output): +def decode3d(output, size_max=1024): # Note: z, y, x reversed vs. DECODE define in C above due to C-ordered memory layout - x = (output >> 20) & 0x3ff - y = (output >> 10) & 0x3ff - z = output & 0x3ff + if size_max > 1024: + x = (output >> 40) & 0xfffff + y = (output >> 20) & 0xfffff + z = output & 0xfffff + else: + x = (output >> 20) & 0x3ff + y = (output >> 10) & 0x3ff + z = output & 0x3ff return (x, y, z) @@ -171,7 +190,8 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, # pba algorithm was implemented to use 32-bit integer to store compressed # coordinates. input_arr will be C-contiguous, int32 - input_arr = encode3d(arr) + size_max = max(arr.shape) # TODO: update + input_arr = encode3d(arr, size_max=size_max) buffer_idx = 0 output = cupy.zeros_like(input_arr) pba_images = [input_arr, output] @@ -185,7 +205,8 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, pba3d = cupy.RawModule( - code=get_pba3d_src(block_size_3d=block_size, marker=marker) + code=get_pba3d_src(block_size_3d=block_size, marker=marker, + size_max=size_max) ) kernelFloodZ = pba3d.get_function('kernelFloodZ') @@ -232,9 +253,9 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, ) output = pba_images[buffer_idx] - if return_distances or return_indices: - x, y, z = decode3d(output[:orig_sz, :orig_sy, :orig_sx]) + x, y, z = decode3d(output[:orig_sz, :orig_sy, :orig_sx], + size_max=size_max) vals = () if return_distances: diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h index bcb748fe6..f6e90d2fb 100644 --- a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h +++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h @@ -1,11 +1,44 @@ +// Euclidean Distance Transform +// +// Kernels for the 3D version of the Parallel Banding Algorithm (PBA+). +// +// MIT license: see 3rdparty/LICENSE.pba+ -#ifndef MARKER +// START OF DEFINITIONS LIKELY OVERRIDDEN BY THE PYTHON SCRIPT + +#ifndef MARKER #define MARKER -2147483648 +#endif // MARKER + +#ifndef MAX_INT #define MAX_INT 2147483647 +#endif + +#ifndef BLOCKSIZE #define BLOCKSIZE 32 +#endif -#endif // MARKER +// #ifndef ENCODE +// +// // Sites : ENCODE(x, y, z, 0, 0) +// // Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER +// #define ENCODE(x, y, z, a, b) (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30)) +// #define DECODE(value, x, y, z) \ +// x = ((value) >> 20) & 0x3ff; \ +// y = ((value) >> 10) & 0x3ff; \ +// z = (value) & 0x3ff + +// #define NOTSITE(value) (((value) >> 31) & 1) +// #define HASNEXT(value) (((value) >> 30) & 1) + +// #define GET_X(value) (((value) >> 20) & 0x3ff) +// #define GET_Y(value) (((value) >> 10) & 0x3ff) +// #define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff)) + +// #endif // ENCODE + +// END OF DEFINITIONS POTENTIALLY DEFINED IN THE PYTHON SCRIPT #define LL long long __device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3, LL y_3, LL z_3, LL x_0, LL z_0) @@ -20,29 +53,9 @@ __device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3, #define TOID(x, y, z, size) ((((z) * (size)) + (y)) * (size) + (x)) -#ifndef ENCODE - -// Sites : ENCODE(x, y, z, 0, 0) -// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER -#define ENCODE(x, y, z, a, b) (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30)) -#define DECODE(value, x, y, z) \ - x = ((value) >> 20) & 0x3ff; \ - y = ((value) >> 10) & 0x3ff; \ - z = (value) & 0x3ff - -#define NOTSITE(value) (((value) >> 31) & 1) -#define HASNEXT(value) (((value) >> 30) & 1) - -#define GET_X(value) (((value) >> 20) & 0x3ff) -#define GET_Y(value) (((value) >> 10) & 0x3ff) -#define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff)) - -#endif // ENCODE - - extern "C"{ -__global__ void kernelFloodZ(int *input, int *output, int size) +__global__ void kernelFloodZ(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *output, int size) { int tx = blockIdx.x * blockDim.x + threadIdx.x; @@ -51,9 +64,9 @@ __global__ void kernelFloodZ(int *input, int *output, int size) int plane = size * size; int id = TOID(tx, ty, tz, size); - int pixel1, pixel2; + ENCODED_INT_TYPE pixel1, pixel2; - pixel1 = ENCODE(0,0,0,1,0); + pixel1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO); // Sweep down for (int i = 0; i < size; i++, id += plane) { @@ -65,7 +78,7 @@ __global__ void kernelFloodZ(int *input, int *output, int size) output[id] = pixel1; } - int dist1, dist2, nz; + ENCODED_INT_TYPE dist1, dist2, nz; id -= plane + plane; @@ -86,7 +99,7 @@ __global__ void kernelFloodZ(int *input, int *output, int size) } -__global__ void kernelMaurerAxis(int *input, int *stack, int size) +__global__ void kernelMaurerAxis(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *stack, int size) { int tx = blockIdx.x * blockDim.x + threadIdx.x; int tz = blockIdx.y * blockDim.y + threadIdx.y; @@ -94,10 +107,10 @@ __global__ void kernelMaurerAxis(int *input, int *stack, int size) int id = TOID(tx, ty, tz, size); - int lasty = 0; - int x1, y1, z1, x2, y2, z2, nx, ny, nz; - int p = ENCODE(0,0,0,1,0), s1 = ENCODE(0,0,0,1,0), s2 = ENCODE(0,0,0,1,0); - int flag = 0; + ENCODED_INT_TYPE lasty = 0; + ENCODED_INT_TYPE x1, y1, z1, x2, y2, z2, nx, ny, nz; + ENCODED_INT_TYPE p = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), s1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), s2 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO); + ENCODED_INT_TYPE flag = 0; for (ty = 0; ty < size; ++ty, id += size) { p = input[id]; @@ -120,31 +133,31 @@ __global__ void kernelMaurerAxis(int *input, int *stack, int size) DECODE(p, nx, ny, nz); s1 = s2; - s2 = ENCODE(nx, lasty, nz, 0, flag); + s2 = ENCODE(nx, lasty, nz, ZERO, flag); y2 = lasty; lasty = ty; stack[id] = s2; - flag = 1; + flag = ONE; } } if (NOTSITE(p)) - stack[TOID(tx, ty - 1, tz, size)] = ENCODE(0, lasty, 0, 1, flag); + stack[TOID(tx, ty - 1, tz, size)] = ENCODE(ZERO, lasty, ZERO, ONE, flag); } -__global__ void kernelColorAxis(int *input, int *output, int size) +__global__ void kernelColorAxis(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *output, int size) { - __shared__ int block[BLOCKSIZE][BLOCKSIZE]; + __shared__ ENCODED_INT_TYPE block[BLOCKSIZE][BLOCKSIZE]; int col = threadIdx.x; int tid = threadIdx.y; int tx = blockIdx.x * blockDim.x + col; int tz = blockIdx.y; - int x1, y1, z1, x2, y2, z2; - int last1 = ENCODE(0,0,0,1,0), last2 = ENCODE(0,0,0,1,0), lasty; + ENCODED_INT_TYPE x1, y1, z1, x2, y2, z2; + ENCODED_INT_TYPE last1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), last2 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), lasty; long long dx, dy, dz, best, dist; lasty = size - 1; @@ -189,7 +202,7 @@ __global__ void kernelColorAxis(int *input, int *output, int size) } } - block[threadIdx.x][ty - y_end] = ENCODE(lasty, x2, z2, NOTSITE(last2), 0); + block[threadIdx.x][ty - y_end] = ENCODE(lasty, x2, z2, NOTSITE(last2), ZERO); } __syncthreads(); From 8509bfff884ba1968879e7b13f01d39f1abbb94d Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 28 Jun 2022 10:00:30 -0400 Subject: [PATCH 05/12] expand tests and extend to 3D cases --- .../tests/test_distance_transform.py | 111 +++++++++++------- 1 file changed, 69 insertions(+), 42 deletions(-) diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py index 59e95469d..60a692f48 100644 --- a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py +++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py @@ -25,67 +25,94 @@ def assert_percentile_equal(arr1, arr2, pct=95): assert mismatch < pct_mismatch -@pytest.mark.parametrize('block_params', [(1, 1, 1), None]) -@pytest.mark.parametrize('sampling', [None, (1.5, 1.5)]) -@pytest.mark.parametrize('shape', [(256, 256), (537, 236)]) -@pytest.mark.parametrize('density', [20, 50, 80]) -def test_distance_transform_edt_2d(shape, sampling, block_params, density): - - kwargs_scipy = dict(sampling=sampling) - kwargs_cucim = dict(sampling=sampling, block_params=block_params) +@pytest.mark.parametrize('return_indices', [False, True]) +@pytest.mark.parametrize('return_distances', [False, True]) +@pytest.mark.parametrize('shape, sampling', + [ + ((256, 128), None), + ((384, 256), (1.5, 1.5)), + ((14, 32, 50), None), + ((50, 32, 24), (2, 2, 2)), + ] +) +@pytest.mark.parametrize('density', [5, 50, 95]) +@pytest.mark.parametrize('block_params', [None, (1, 1, 1)]) +def test_distance_transform_edt(shape, sampling, return_distances, + return_indices, density, block_params): + + if not (return_indices or return_distances): + return + + kwargs_scipy = dict( + sampling=sampling, + return_distances=return_distances, + return_indices=return_indices, + ) + kwargs_cucim = copy(kwargs_scipy) + kwargs_cucim['block_params'] = block_params img = binary_image(shape, pct_true=density) out = distance_transform_edt(img, **kwargs_cucim) expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img), **kwargs_scipy) + if return_indices and return_distances: + assert len(out) == 2 + cp.testing.assert_allclose(out[0], expected[0]) + # May differ at a small % of coordinates where multiple points were + # equidistant. + assert_percentile_equal(out[1], expected[1], pct=95) + elif return_distances: + cp.testing.assert_allclose(out, expected) + elif return_indices: + assert_percentile_equal(out, expected, pct=95) + + +@pytest.mark.parametrize('value', [0, 1, 3]) +@pytest.mark.parametrize('ndim', [2, 3]) +def test_distance_transform_edt_uniform_valued(value, ndim): + """ensure default block_params is robust to anisotropic shape.""" + img = cp.full((48, ) * ndim, value, dtype=cp.uint8) + # ensure there is at least 1 pixel at background intensity + img[(slice(24, 25),) * ndim] = 0 + out = distance_transform_edt(img) + expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img)) cp.testing.assert_allclose(out, expected) -@pytest.mark.parametrize('sx', list(range(32))) +@pytest.mark.parametrize('sx', list(range(16))) @pytest.mark.parametrize('sy', list(range(16))) -def test_distance_transform_edt_2d_aniso_block_params(sx, sy): +def test_distance_transform_edt_2d_aniso(sx, sy): """ensure default block_params is robust to anisotropic shape.""" - shape = (128 + sx, 128 + sy) + shape = (128 + sy, 128 + sx) img = binary_image(shape, pct_true=80) out = distance_transform_edt(img) expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img)) cp.testing.assert_allclose(out, expected) -@pytest.mark.parametrize('value', [0, 1, 3]) -def test_distance_transform_edt_2d_uniform_valued(value): +@pytest.mark.parametrize('sx', list(range(4))) +@pytest.mark.parametrize('sy', list(range(4))) +@pytest.mark.parametrize('sz', list(range(4))) +def test_distance_transform_edt_3d_aniso(sx, sy, sz): """ensure default block_params is robust to anisotropic shape.""" - img = cp.full((64, 64), value, dtype=cp.uint8) - # ensure there is at least 1 pixel at background intensity - img[13, 13] = 0 + shape = (16 + sz, 32 + sy, 48 + sx) + img = binary_image(shape, pct_true=80) out = distance_transform_edt(img) expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img)) cp.testing.assert_allclose(out, expected) -@pytest.mark.parametrize('return_indices', [False, True]) -@pytest.mark.parametrize('return_distances', [False, True]) -@pytest.mark.parametrize('sampling', [None, (1.5, 1.5)]) -@pytest.mark.parametrize('shape', [(256, 256), (65, 128)]) -def test_distance_transform_edt_2d_returns(shape, sampling, return_distances, - return_indices): +@pytest.mark.parametrize('ndim', [1, 4, 5]) +def test_distance_transform_edt_unsupported_ndim(ndim): + """ensure default block_params is robust to anisotropic shape.""" + with pytest.raises(NotImplementedError): + distance_transform_edt(cp.zeros((8,) * ndim)) - if not (return_indices or return_distances): - return - kwargs = dict( - sampling=sampling, - return_distances=return_distances, - return_indices=return_indices, - ) - img = binary_image(shape, pct_true=50) - out = distance_transform_edt(img, **kwargs) - expected = ndi_cpu.distance_transform_edt(cp.asnumpy(img), **kwargs) - if return_indices and return_distances: - assert len(out) == 2 - cp.testing.assert_allclose(out[0], expected[0]) - # May differ at a small % of coordinates where multiple points were - # equidistant. - assert_percentile_equal(out[1], expected[1], pct=95) - elif return_distances: - cp.testing.assert_allclose(out, expected) - elif return_indices: - assert_percentile_equal(out, expected, pct=95) +@pytest.mark.skip(reason="excessive memory requirement") +def test_distance_transform_edt_3d_int64(): + """ensure default block_params is robust to anisotropic shape.""" + shape = (1280, 1280, 1280) + img = binary_image(shape, pct_true=80) + distance_transform_edt(img) + # Note: no validation vs. scipy.ndimage due to excessive run time + return + From 0933d739d0717a3c47e940efd7b25cf1a63a2f34 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 28 Jun 2022 10:52:54 -0400 Subject: [PATCH 06/12] test error on non-uniform sampling --- .../morphology/_distance_transform.py | 11 +++--- .../core/operations/morphology/_pba_2d.py | 5 +++ .../core/operations/morphology/_pba_3d.py | 5 +++ .../tests/test_distance_transform.py | 34 +++++++++++++++++-- 4 files changed, 49 insertions(+), 6 deletions(-) diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py index d3acf5ec7..7e51a22bc 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py +++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py @@ -148,13 +148,16 @@ def distance_transform_edt(image, sampling=None, return_distances=True, raise NotImplementedError( "preallocated indices image is not supported" ) + scalar_sampling = None if sampling is not None: sampling = np.unique(np.atleast_1d(sampling)) - if len(sampling) != 1: + if len(sampling) == 1: + scalar_sampling = float(sampling) + sampling = None + else: raise NotImplementedError( "non-uniform values in sampling is not currently supported" ) - sampling = float(sampling) if image.ndim == 3: pba_func = _pba_3d @@ -172,8 +175,8 @@ def distance_transform_edt(image, sampling=None, return_distances=True, block_params=block_params ) - if return_distances and sampling is not None: - vals= (vals[0] * sampling,) + vals[1:] + if return_distances and scalar_sampling is not None: + vals= (vals[0] * scalar_sampling,) + vals[1:] if len(vals) == 1: vals = vals[0] diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py index f26ff3760..57fac41c0 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py @@ -112,6 +112,11 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, # overhead of querying properties block_size = _get_block_size(check_warp_size) + if sampling is not None: + raise NotImplementedError("sampling not yet supported") + # if len(sampling) != 2: + # raise ValueError("sampling must be a sequence of two values.") + padded_size = math.ceil(max(arr.shape) / block_size) * block_size if block_params is None: # should be <= size / 64. sy must be a multiple of m1 diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py index 597df93f9..832817459 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py @@ -159,6 +159,11 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, if arr.ndim != 3: raise ValueError(f"expected a 3D array, got {arr.ndim}D") + if sampling is not None: + raise NotImplementedError("sampling not yet supported") + # if len(sampling) != 3: + # raise ValueError("sampling must be a sequence of three values.") + if block_params is None: m1 = 1 m2 = 1 diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py index 60a692f48..a607f492f 100644 --- a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py +++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py @@ -37,8 +37,9 @@ def assert_percentile_equal(arr1, arr2, pct=95): ) @pytest.mark.parametrize('density', [5, 50, 95]) @pytest.mark.parametrize('block_params', [None, (1, 1, 1)]) -def test_distance_transform_edt(shape, sampling, return_distances, - return_indices, density, block_params): +def test_distance_transform_edt( + shape, sampling, return_distances, return_indices, density, block_params +): if not (return_indices or return_distances): return @@ -65,6 +66,35 @@ def test_distance_transform_edt(shape, sampling, return_distances, assert_percentile_equal(out, expected, pct=95) +@pytest.mark.parametrize('return_indices', [False, True]) +@pytest.mark.parametrize('return_distances', [False, True]) +@pytest.mark.parametrize('shape, sampling', + [ + ((384, 256), (1, 3)), + ((50, 32, 24), (1, 2, 4)), + ] +) +@pytest.mark.parametrize('density', [5, 50, 95]) +def test_distance_transform_edt_nonuniform_sampling( + shape, sampling, return_distances, return_indices, density +): + + if not (return_indices or return_distances): + return + + kwargs_scipy = dict( + sampling=sampling, + return_distances=return_distances, + return_indices=return_indices, + ) + kwargs_cucim = copy(kwargs_scipy) + img = binary_image(shape, pct_true=density) + if sampling is not None and len(np.unique(sampling)) != 1: + with pytest.raises(NotImplementedError): + distance_transform_edt(img, **kwargs_cucim) + return + + @pytest.mark.parametrize('value', [0, 1, 3]) @pytest.mark.parametrize('ndim', [2, 3]) def test_distance_transform_edt_uniform_valued(value, ndim): From e98b611fe9422cf8229eafa80ecd8809cabfcfb1 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 28 Jun 2022 11:47:37 -0400 Subject: [PATCH 07/12] flake8/isort fixes --- .../morphology/_distance_transform.py | 3 +-- .../core/operations/morphology/_pba_2d.py | 23 +++++++++++-------- .../core/operations/morphology/_pba_3d.py | 18 ++++----------- .../tests/test_distance_transform.py | 13 +++++------ 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py index 7e51a22bc..24c34fff6 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py +++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py @@ -3,7 +3,6 @@ from ._pba_2d import _pba_2d from ._pba_3d import _pba_3d - # TODO: make a 64-bit packed coordinates variant in 3D for large shapes # support sampling distances # support chamfer, chessboard and l1/manhattan distances too? @@ -176,7 +175,7 @@ def distance_transform_edt(image, sampling=None, return_distances=True, ) if return_distances and scalar_sampling is not None: - vals= (vals[0] * scalar_sampling,) + vals[1:] + vals = (vals[0] * scalar_sampling,) + vals[1:] if len(vals) == 1: vals = vals[0] diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py index 57fac41c0..edb10a983 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py @@ -1,6 +1,5 @@ import math import os -from functools import reduce import cupy @@ -23,10 +22,13 @@ def _init_marker(int_dtype): # marker = cupy.iinfo(int_dtype).min marker = -32768 elif int_dtype == cupy.int32: - # divide by two so we don't have to promote other intermediate int variables to 64-bit int + # divide by two so we don't have to promote other intermediate int + # variables to 64-bit int marker = -2147483648 // 2 else: - raise ValueError("expected int_dtype to be either cupy.int16 or cupy.int32") + raise ValueError( + "expected int_dtype to be either cupy.int16 or cupy.int32" + ) return marker @@ -62,6 +64,7 @@ def _pack_int2(arr, marker=-32768, int_dtype=cupy.int16): raise ValueError("only 2d arr suppported") input_x = cupy.zeros(arr.shape, dtype=int_dtype) input_y = cupy.zeros(arr.shape, dtype=int_dtype) + # TODO: create custom kernel for setting values in input_x, input_y cond = arr == 0 y, x = cupy.where(cond) input_x[cond] = x @@ -69,7 +72,7 @@ def _pack_int2(arr, marker=-32768, int_dtype=cupy.int16): input_x[mask] = marker # 1 << 32 input_y[cond] = y input_y[mask] = marker # 1 << 32 - int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype]*2}) + int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype] * 2}) # in C++ code x is the contiguous axis and corresponds to width # y is the non-contiguous axis and corresponds to height # given that, store input_x as the last axis here @@ -90,7 +93,9 @@ def _determine_padding(shape, padded_size, block_size): # shape is not isotropic orig_sy, orig_sx = shape if orig_sx != padded_size or orig_sy != padded_size: - padding_width = ((0, padded_size - orig_sy), (0, padded_size - orig_sx)) + padding_width = ( + (0, padded_size - orig_sy), (0, padded_size - orig_sx) + ) else: padding_width = None return padding_width @@ -105,9 +110,6 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, # the pair (x, y). Pixels that are not sites should contain the pair # (MARKER, MARKER) - # See the following on float2, etc. use in cupy: - # https://github.com/cupy/cupy/issues/1535 - # Note: could query warp size here, but for now just assume 32 to avoid # overhead of querying properties block_size = _get_block_size(check_warp_size) @@ -171,7 +173,7 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, input_arr = _pack_int2(arr, marker=marker, int_dtype=int_dtype) output = cupy.zeros_like(input_arr) - int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype]*2}) + int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype] * 2}) margin = cupy.empty((2 * m1 * size,), dtype=int2_dtype) # phase 1 of PBA. m1 must divide texture size and be <= 64 @@ -249,7 +251,7 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, noBand //= 2 # Replace the forward link with the X coordinate of the seed to remove # the need of looking at the other texture. We need it for coloring. - grid = (math.ceil(size / block[0]), size); + grid = (math.ceil(size / block[0]), size) kernelDoubleToSingleList( grid, block, @@ -273,6 +275,7 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False, # raise NotImplementedError("TODO") vals = () if return_distances: + # TODO: custom kernel for more efficient distance computation y0, x0 = cupy.meshgrid( *( cupy.arange(s, dtype=cupy.int32) diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py index 832817459..80415ab3e 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py @@ -1,13 +1,11 @@ import math import os -from functools import reduce import cupy import numpy as np from ._pba_2d import _get_block_size - pba3d_defines_template = """ #define MARKER {marker} @@ -94,7 +92,8 @@ def encode3d(arr, marker=-2147483648, bit_depth=32, size_max=1024): image = cupy.zeros(arr.shape, dtype=dtype, order='C') cond = arr == 0 z, y, x = cupy.where(cond) - # z, y, x so that x is the contiguous axis (must match the TOID macro in the C++ code!) + # z, y, x so that x is the contiguous axis + # (must match TOID macro in the C++ code!) if size_max > 1024: image[cond] = (((x) << 40) | ((y) << 20) | (z)) else: @@ -105,7 +104,6 @@ def encode3d(arr, marker=-2147483648, bit_depth=32, size_max=1024): # TODO: custom kernel for decode3d def decode3d(output, size_max=1024): - # Note: z, y, x reversed vs. DECODE define in C above due to C-ordered memory layout if size_max > 1024: x = (output >> 40) & 0xfffff y = (output >> 20) & 0xfffff @@ -203,15 +201,8 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, block = (blockx, blocky, 1) grid = (size // block[0], size // block[1], 1) - - # MAKE SURE NO GRID/BLOCK SIZES WILL CAUSE OUT-OF-RANGE ACCESS - # block_size = min(32, min_pow2) - marker = -2147483648 - - pba3d = cupy.RawModule( - code=get_pba3d_src(block_size_3d=block_size, marker=marker, - size_max=size_max) + code=get_pba3d_src(block_size_3d=block_size, size_max=size_max) ) kernelFloodZ = pba3d.get_function('kernelFloodZ') @@ -264,7 +255,7 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, vals = () if return_distances: - # TODO: kernel for more efficient distance computation + # TODO: custom kernel for more efficient distance computation orig_shape = (orig_sz, orig_sy, orig_sx) z0, y0, x0 = cupy.meshgrid( *(cupy.arange(s, dtype=cupy.int32) for s in orig_shape), @@ -287,4 +278,3 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, indices = cupy.stack((z, y, x), axis=0) vals = vals + (indices,) return vals - diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py index a607f492f..f88d8f2e1 100644 --- a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py +++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py @@ -3,17 +3,15 @@ import cupy as cp import numpy as np import pytest -from cupy import testing import scipy.ndimage as ndi_cpu from cucim.core.operations.morphology import distance_transform_edt - def binary_image(shape, pct_true=50): rng = cp.random.default_rng(123) - x = rng.integers(0, 100, size=shape, dtype=cp.uint8) - return x >=pct_true + x = rng.integers(0, 100, size=shape, dtype=cp.uint8) + return x >= pct_true def assert_percentile_equal(arr1, arr2, pct=95): @@ -27,7 +25,8 @@ def assert_percentile_equal(arr1, arr2, pct=95): @pytest.mark.parametrize('return_indices', [False, True]) @pytest.mark.parametrize('return_distances', [False, True]) -@pytest.mark.parametrize('shape, sampling', +@pytest.mark.parametrize( + 'shape, sampling', [ ((256, 128), None), ((384, 256), (1.5, 1.5)), @@ -68,7 +67,8 @@ def test_distance_transform_edt( @pytest.mark.parametrize('return_indices', [False, True]) @pytest.mark.parametrize('return_distances', [False, True]) -@pytest.mark.parametrize('shape, sampling', +@pytest.mark.parametrize( + 'shape, sampling', [ ((384, 256), (1, 3)), ((50, 32, 24), (1, 2, 4)), @@ -145,4 +145,3 @@ def test_distance_transform_edt_3d_int64(): distance_transform_edt(img) # Note: no validation vs. scipy.ndimage due to excessive run time return - From e5a37a9593813b678edee97c4a6445f5f95bb3a2 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 28 Jun 2022 12:02:04 -0400 Subject: [PATCH 08/12] minor update to kernel headers --- .../morphology/cuda/pba_kernels_2d.h | 62 +++++++++++++------ .../morphology/cuda/pba_kernels_3d.h | 51 +++++++++------ 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h index 3ea84e3b2..61677c682 100644 --- a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h +++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h @@ -1,15 +1,41 @@ +// Euclidean Distance Transform +// +// Kernels for the 2D version of the Parallel Banding Algorithm (PBA+). +// +// MIT license: see 3rdparty/LICENSE.pba+ +// Copyright: (c) 2019 School of Computing, National University of Singapore +// +// Modifications by Gregory Lee (2022) (NVIDIA) +// - add user-defined pixel_int2_t to enable +// - replace __mul24 operations with standard multiplication operator + + +// START OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT + +// The values included in this header file are those defined in the original +// PBA+ implementation + +// However, the Python code generation can potentially generate a different +// ENCODE/DECODE that use 20 bits per coordinates instead of 10 bits per +// coordinate with ENCODED_INT_TYPE as `long long`. #ifndef MARKER - #define MARKER -32768 +#endif + +#ifndef BLOCKSIZE #define BLOCKSIZE 32 +#endif + +#ifndef pixel_int2_t #define pixel_int2_t short2 #define make_pixel(x, y) make_short2(x, y) +#endif -#endif // MARKER +// END OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT -#define TOID(x, y, size) (__mul24((y), (size)) + (x)) +#define TOID(x, y, size) ((y) * (size) + (x)) #define LL long long __device__ bool dominate(LL x1, LL y1, LL x2, LL y2, LL x3, LL y3, LL x0) @@ -169,8 +195,8 @@ __global__ void kernelUpdateVertical(pixel_int2_t *color, pixel_int2_t *margin, __global__ void kernelProximatePoints(pixel_int2_t *input, pixel_int2_t *stack, int size, int bandSize) { - int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; - int ty = __mul24(blockIdx.y, bandSize); + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * bandSize; int id = TOID(tx, ty, size); int lasty = -1; pixel_int2_t last1, last2, current; @@ -205,8 +231,8 @@ __global__ void kernelProximatePoints(pixel_int2_t *input, pixel_int2_t *stack, __global__ void kernelCreateForwardPointers(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize) { - int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; - int ty = __mul24(blockIdx.y+1, bandSize) - 1; + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = (blockIdx.y+1) * bandSize - 1; int id = TOID(tx, ty, size); int lasty = -1, nexty; pixel_int2_t current; @@ -235,7 +261,7 @@ __global__ void kernelCreateForwardPointers(pixel_int2_t *input, pixel_int2_t *o __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size, int bandSize) { - int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int tx = blockIdx.x * blockDim.x + threadIdx.x; int band1 = blockIdx.y * 2; int band2 = band1 + 1; int firsty, lasty; @@ -246,7 +272,7 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_ // x component store the forward pointer // Get the two last items of the first list - lasty = __mul24(band2, bandSize) - 1; + lasty = band2 * bandSize - 1; last2 = make_pixel(color[TOID(tx, lasty, size)].x, link[TOID(tx, lasty, size)].y); @@ -267,7 +293,7 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_ } // Get the first item of the second band - firsty = __mul24(band2, bandSize); + firsty = band2 * bandSize; current = make_pixel(link[TOID(tx, firsty, size)].x, color[TOID(tx, firsty, size)].x); @@ -321,8 +347,8 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_ } // Update the head and tail pointer. - firsty = __mul24(band1, bandSize); - lasty = __mul24(band2, bandSize); + firsty = band1 * bandSize; + lasty = band2 * bandSize; current = link[TOID(tx, firsty, size)]; if (current.y == MARKER && current.x < 0) { // No head? @@ -336,8 +362,8 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_ output[TOID(tx, firsty, size)] = current; } - firsty = __mul24(band1, bandSize) + bandSize - 1; - lasty = __mul24(band2, bandSize) + bandSize - 1; + firsty = band1 * bandSize + bandSize - 1; + lasty = band2 * bandSize + bandSize - 1; current = link[TOID(tx, lasty, size)]; if (current.x == MARKER && current.y < 0) { // No tail? @@ -354,7 +380,7 @@ __global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_ __global__ void kernelDoubleToSingleList(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size) { - int tx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; + int tx = blockIdx.x * blockDim.x + threadIdx.x; int ty = blockIdx.y; int id = TOID(tx, ty, size); @@ -367,7 +393,7 @@ __global__ void kernelColor(pixel_int2_t *input, pixel_int2_t *output, int size) int col = threadIdx.x; int tid = threadIdx.y; - int tx = __mul24(blockIdx.x, blockDim.x) + col; + int tx = blockIdx.x * blockDim.x + col; int dx, dy, lasty; unsigned int best, dist; pixel_int2_t last1, last2; @@ -391,11 +417,11 @@ __global__ void kernelColor(pixel_int2_t *input, pixel_int2_t *output, int size) for (int ty = y_start - tid; ty >= y_end; ty -= blockDim.y) { dx = last2.x - tx; dy = lasty - ty; - best = dist = __mul24(dx, dx) + __mul24(dy, dy); + best = dist = dx * dx + dy * dy; while (last2.y >= 0) { dx = last1.x - tx; dy = last2.y - ty; - dist = __mul24(dx, dx) + __mul24(dy, dy); + dist = dx * dx + dy * dy; if (dist > best) break; diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h index f6e90d2fb..c09f4b51f 100644 --- a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h +++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h @@ -3,9 +3,20 @@ // Kernels for the 3D version of the Parallel Banding Algorithm (PBA+). // // MIT license: see 3rdparty/LICENSE.pba+ +// +// Modifications by Gregory Lee (2022) (NVIDIA) +// - allow user-defined ENCODED_INT_TYPE, ENCODE, DECODE + + +// START OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT + +// The values included in this header file are those defined in the original +// PBA+ implementation +// However, the Python code generation can potentially generate a different +// ENCODE/DECODE that use 20 bits per coordinates instead of 10 bits per +// coordinate with ENCODED_INT_TYPE as `long long`. -// START OF DEFINITIONS LIKELY OVERRIDDEN BY THE PYTHON SCRIPT #ifndef MARKER #define MARKER -2147483648 @@ -19,26 +30,30 @@ #define BLOCKSIZE 32 #endif -// #ifndef ENCODE -// -// // Sites : ENCODE(x, y, z, 0, 0) -// // Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER -// #define ENCODE(x, y, z, a, b) (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30)) -// #define DECODE(value, x, y, z) \ -// x = ((value) >> 20) & 0x3ff; \ -// y = ((value) >> 10) & 0x3ff; \ -// z = (value) & 0x3ff +#ifndef ENCODE + +// Sites : ENCODE(x, y, z, 0, 0) +// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER +#define ENCODED_INT_TYPE int +#define ZERO 0 +#define ONE 1 +#define ENCODE(x, y, z, a, b) (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30)) +#define DECODE(value, x, y, z) \ + x = ((value) >> 20) & 0x3ff; \ + y = ((value) >> 10) & 0x3ff; \ + z = (value) & 0x3ff + +#define NOTSITE(value) (((value) >> 31) & 1) +#define HASNEXT(value) (((value) >> 30) & 1) -// #define NOTSITE(value) (((value) >> 31) & 1) -// #define HASNEXT(value) (((value) >> 30) & 1) +#define GET_X(value) (((value) >> 20) & 0x3ff) +#define GET_Y(value) (((value) >> 10) & 0x3ff) +#define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff)) -// #define GET_X(value) (((value) >> 20) & 0x3ff) -// #define GET_Y(value) (((value) >> 10) & 0x3ff) -// #define GET_Z(value) ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff)) +#endif // ENCODE -// #endif // ENCODE +// END OF DEFINITIONS DEFINED IN THE PYTHON SCRIPT -// END OF DEFINITIONS POTENTIALLY DEFINED IN THE PYTHON SCRIPT #define LL long long __device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3, LL y_3, LL z_3, LL x_0, LL z_0) @@ -219,4 +234,4 @@ __global__ void kernelColorAxis(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *outpu } -} // extern C \ No newline at end of file +} // extern C From e83d9a7cef28d20873f4617c785ca92846c6c1bb Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 28 Jun 2022 12:45:42 -0400 Subject: [PATCH 09/12] update outdated comment --- .../cucim/core/operations/morphology/_distance_transform.py | 5 ++--- python/cucim/src/cucim/core/operations/morphology/_pba_3d.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py index 24c34fff6..e781938b2 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py +++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py @@ -3,10 +3,9 @@ from ._pba_2d import _pba_2d from ._pba_3d import _pba_3d -# TODO: make a 64-bit packed coordinates variant in 3D for large shapes -# support sampling distances +# TODO: support sampling distances +# support the distances and indices output arguments # support chamfer, chessboard and l1/manhattan distances too? -# remove __mul24 from the pba2d code? (is there any benefit on recent GPUs?) # noqa def distance_transform_edt(image, sampling=None, return_distances=True, diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py index 80415ab3e..bc85d0a79 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py @@ -193,7 +193,7 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False, # pba algorithm was implemented to use 32-bit integer to store compressed # coordinates. input_arr will be C-contiguous, int32 - size_max = max(arr.shape) # TODO: update + size_max = max(arr.shape) input_arr = encode3d(arr, size_max=size_max) buffer_idx = 0 output = cupy.zeros_like(input_arr) From c67b7c4fc0d3e6aa15911e5f2fcaf599bde9f03d Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 29 Jun 2022 16:41:41 -0400 Subject: [PATCH 10/12] add fallback implementation of math.lcm for Python < 3.9 --- .../core/operations/morphology/_pba_3d.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py index bc85d0a79..2429a2f59 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py @@ -1,4 +1,6 @@ +import functools import math +import numbers import os import cupy @@ -6,6 +8,35 @@ from ._pba_2d import _get_block_size +try: + # math.lcm was introduced in Python 3.9 + from math import lcm +except ImportError: + + """Fallback implementation of least common multiple (lcm) + + TODO: remove once minimum Python requirement is >= 3.9 + """ + + def _lcm(a, b): + return abs(b * (a // math.gcd(a, b))) + + @functools.lru_cache() + def lcm(*args): + nargs = len(args) + if not all(isinstance(a, numbers.Integral) for a in args): + raise TypeError("all arguments must be integers") + if nargs == 0: + return 1 + res = int(args[0]) + if nargs == 1: + return abs(res) + for i in range(1, nargs): + x = int(args[i]) + res = _lcm(res, x) + return res + + pba3d_defines_template = """ #define MARKER {marker} From cb06d787aa3acc630283af07c4410af0720ff86b Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 29 Jun 2022 18:00:49 -0400 Subject: [PATCH 11/12] make sure fallback lcm function is actually used if needed --- python/cucim/src/cucim/core/operations/morphology/_pba_3d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py index 2429a2f59..dab484e90 100644 --- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py +++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py @@ -149,7 +149,7 @@ def decode3d(output, size_max=1024): def _determine_padding(shape, block_size, m1, m2, m3, blockx, blocky): # TODO: can possibly revise to consider only particular factors for LCM on # a given axis - LCM = math.lcm(block_size, m1, m2, m3, blockx, blocky) + LCM = lcm(block_size, m1, m2, m3, blockx, blocky) orig_sz, orig_sy, orig_sx = shape round_up = False if orig_sx % LCM != 0: From 2a190b34829ef7b912cefc5499b47eeda2cc6645 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 27 Jul 2022 12:33:11 -0700 Subject: [PATCH 12/12] Apply suggestions from code review remove irrelevant text (copy/paste error) --- .../core/operations/morphology/tests/test_distance_transform.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py index f88d8f2e1..ab46ae1fb 100644 --- a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py +++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py @@ -132,14 +132,12 @@ def test_distance_transform_edt_3d_aniso(sx, sy, sz): @pytest.mark.parametrize('ndim', [1, 4, 5]) def test_distance_transform_edt_unsupported_ndim(ndim): - """ensure default block_params is robust to anisotropic shape.""" with pytest.raises(NotImplementedError): distance_transform_edt(cp.zeros((8,) * ndim)) @pytest.mark.skip(reason="excessive memory requirement") def test_distance_transform_edt_3d_int64(): - """ensure default block_params is robust to anisotropic shape.""" shape = (1280, 1280, 1280) img = binary_image(shape, pct_true=80) distance_transform_edt(img)