From 95a275bf47c2ac453393628304504ffbabbe4c9b Mon Sep 17 00:00:00 2001 From: Bruno Saraiva Date: Thu, 12 Oct 2023 16:49:29 +0100 Subject: [PATCH] changed shift to be a single value and added new memory management --- src/nanopyx/core/transform/_le_esrrf.pyx | 4 +- .../transform/_le_interpolation_bicubic.pyx | 4 +- .../_le_interpolation_catmull_rom.pyx | 79 +++++++------ .../_le_interpolation_catmull_rom_.cl | 12 +- .../transform/_le_interpolation_lanczos.pyx | 77 +++++++------ .../transform/_le_interpolation_lanczos_.cl | 12 +- .../_le_radial_gradient_convergence.pyx | 16 +-- src/nanopyx/core/transform/_le_radiality.pyx | 104 ++++++++++++------ .../transform/_le_roberts_cross_gradients.pyx | 12 +- 9 files changed, 188 insertions(+), 132 deletions(-) diff --git a/src/nanopyx/core/transform/_le_esrrf.pyx b/src/nanopyx/core/transform/_le_esrrf.pyx index ff04e189..70cbe978 100644 --- a/src/nanopyx/core/transform/_le_esrrf.pyx +++ b/src/nanopyx/core/transform/_le_esrrf.pyx @@ -50,7 +50,7 @@ class eSRRF(LiquidEngine): max_slices = int((dc.global_mem_size // total_memory)/mem_div) - self._check_min_slices(max_slices) + max_slices = self._check_max_slices(image, max_slices) mf = cl.mem_flags @@ -75,7 +75,7 @@ class eSRRF(LiquidEngine): rgc_prg = cl.Program(cl_ctx, rgc_code).build(options=["-cl-mad-enable -cl-fast-relaxed-math"]) rgc_knl = rgc_prg.calculate_rgc - for i in range(0, image.shape[0]-1, max_slices): + for i in range(0, image.shape[0], max_slices): if image.shape[0] - i >= max_slices: n_slices = max_slices else: diff --git a/src/nanopyx/core/transform/_le_interpolation_bicubic.pyx b/src/nanopyx/core/transform/_le_interpolation_bicubic.pyx index 132f7d5d..382afc6c 100644 --- a/src/nanopyx/core/transform/_le_interpolation_bicubic.pyx +++ b/src/nanopyx/core/transform/_le_interpolation_bicubic.pyx @@ -87,7 +87,7 @@ class ShiftAndMagnify(LiquidEngine): image_out = np.zeros(output_shape, dtype=np.float32) max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div) - self._check_max_slices(max_slices) + max_slices = self._check_max_slices(image, max_slices) mf = cl.mem_flags input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes) @@ -329,7 +329,7 @@ class ShiftScaleRotate(LiquidEngine): image_out = np.zeros(output_shape, dtype=np.float32) max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div) - self._check_max_slices(max_slices) + max_slices = self._check_max_slices(image, max_slices) mf = cl.mem_flags input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes) diff --git a/src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx b/src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx index 030e5fbd..0eab7c04 100644 --- a/src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx +++ b/src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx @@ -80,7 +80,7 @@ class ShiftAndMagnify(LiquidEngine): # tag-end # tag-copy: _le_interpolation_nearest_neighbor.ShiftAndMagnify._run_opencl; replace("nearest_neighbor", "catmull_rom") - def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device) -> np.ndarray: + def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device, int mem_div=1) -> np.ndarray: # QUEUE AND CONTEXT cl_ctx = cl.Context([device['device']]) @@ -90,14 +90,13 @@ class ShiftAndMagnify(LiquidEngine): output_shape = (image.shape[0], int(image.shape[1]*magnification_row), int(image.shape[2]*magnification_col)) image_out = np.zeros(output_shape, dtype=np.float32) - # TODO 3 is a magic number - max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/3) - # TODO add exception if max_slices < 1 - + max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div) + max_slices = self._check_max_slices(image, max_slices) + mf = cl.mem_flags input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes) - cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait() output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes) + cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait() code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP']) prg = cl.Program(cl_ctx, code).build() @@ -331,44 +330,56 @@ class ShiftScaleRotate(LiquidEngine): # tag-end # tag-copy: _le_interpolation_nearest_neighbor.ShiftScaleRotate._run_opencl; replace("nearest_neighbor", "catmull_rom") - def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device) -> np.ndarray: + def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device, int mem_div=1) -> np.ndarray: # QUEUE AND CONTEXT cl_ctx = cl.Context([device['device']]) + dc = device["device"] cl_queue = cl.CommandQueue(cl_ctx) - code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP']) + output_shape = (image.shape[0], int(image.shape[1]), int(image.shape[2])) + image_out = np.zeros(output_shape, dtype=np.float32) - cdef int nFrames = image.shape[0] - cdef int rowsM = image.shape[1] - cdef int colsM = image.shape[2] + max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div) + max_slices = self._check_max_slices(image, max_slices) - image_in = cl_array.to_device(cl_queue, image) - shift_col_in = cl_array.to_device(cl_queue, shift_col) - shift_row_in = cl_array.to_device(cl_queue, shift_row) - image_out = cl_array.zeros(cl_queue, (nFrames, rowsM, colsM), dtype=np.float32) + mf = cl.mem_flags + input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes) + output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes) + cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait() - # Create the program + code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP']) prg = cl.Program(cl_ctx, code).build() + knl = prg.shiftScaleRotate - # Run the kernel - prg.shiftScaleRotate( - cl_queue, - image_out.shape, - None, - image_in.data, - image_out.data, - shift_row_in.data, - shift_col_in.data, - np.float32(scale_row), - np.float32(scale_col), - np.float32(angle) - ) - - # Wait for queue to finish - cl_queue.finish() - - return np.asarray(image_out.get(),dtype=np.float32) + for i in range(0, image.shape[0], max_slices): + if image.shape[0] - i >= max_slices: + n_slices = max_slices + else: + n_slices = image.shape[0] - i + knl( + cl_queue, + (n_slices, int(image.shape[1]), int(image.shape[2])), + self.get_work_group(dc, (n_slices, image.shape[1], image.shape[2])), + input_opencl, + output_opencl, + np.float32(shift_row), + np.float32(shift_col), + np.float32(scale_row), + np.float32(scale_col), + np.float32(angle) + ).wait() + + cl.enqueue_copy(cl_queue, image_out[i:i+n_slices,:,:], output_opencl).wait() + if i<=image.shape[0]-max_slices: + cl.enqueue_copy(cl_queue, input_opencl, image[i+n_slices:i+2*n_slices,:,:]).wait() + + cl_queue.finish() + + input_opencl.release() + output_opencl.release() + + return image_out # tag-end diff --git a/src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl b/src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl index 4b89461c..250891a8 100644 --- a/src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl +++ b/src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl @@ -75,8 +75,8 @@ shiftAndMagnify(__global float *image_in, __global float *image_out, __kernel void shiftScaleRotate(__global float *image_in, __global float *image_out, - __global float *shift_row, - __global float *shift_col, float scale_row, + float shift_row, + float shift_col, float scale_row, float scale_col, float angle) { // these are the indexes of the loop int f = get_global_id(0); @@ -98,11 +98,11 @@ __kernel void shiftScaleRotate(__global float *image_in, int nPixels = rows * cols; - float col = (a * (cM - center_col - shift_col[f]) + - b * (rM - center_row - shift_row[f])) + + float col = (a * (cM - center_col - shift_col) + + b * (rM - center_row - shift_row)) + center_col; - float row = (c * (cM - center_col - shift_col[f]) + - d * (rM - center_row - shift_row[f])) + + float row = (c * (cM - center_col - shift_col) + + d * (rM - center_row - shift_row)) + center_row; image_out[f * nPixels + rM * cols + cM] = diff --git a/src/nanopyx/core/transform/_le_interpolation_lanczos.pyx b/src/nanopyx/core/transform/_le_interpolation_lanczos.pyx index 91b8fd5f..2de150ef 100644 --- a/src/nanopyx/core/transform/_le_interpolation_lanczos.pyx +++ b/src/nanopyx/core/transform/_le_interpolation_lanczos.pyx @@ -75,7 +75,7 @@ class ShiftAndMagnify(LiquidEngine): # tag-end # tag-copy: _le_interpolation_nearest_neighbor.ShiftAndMagnify._run_opencl; replace("nearest_neighbor", "lanczos") - def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device) -> np.ndarray: + def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device, int mem_div=1) -> np.ndarray: # QUEUE AND CONTEXT cl_ctx = cl.Context([device['device']]) @@ -85,14 +85,13 @@ class ShiftAndMagnify(LiquidEngine): output_shape = (image.shape[0], int(image.shape[1]*magnification_row), int(image.shape[2]*magnification_col)) image_out = np.zeros(output_shape, dtype=np.float32) - # TODO 3 is a magic number - max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/3) - # TODO add exception if max_slices < 1 + max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div) + max_slices = self._check_max_slices(image, max_slices) mf = cl.mem_flags input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes) - cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait() output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes) + cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait() code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP']) prg = cl.Program(cl_ctx, code).build() @@ -318,44 +317,56 @@ class ShiftScaleRotate(LiquidEngine): # tag-end # tag-copy: _le_interpolation_nearest_neighbor.ShiftScaleRotate._run_opencl; replace("nearest_neighbor", "lanczos") - def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device) -> np.ndarray: + def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device, int mem_div=1) -> np.ndarray: # QUEUE AND CONTEXT cl_ctx = cl.Context([device['device']]) + dc = device["device"] cl_queue = cl.CommandQueue(cl_ctx) - code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP']) + output_shape = (image.shape[0], int(image.shape[1]), int(image.shape[2])) + image_out = np.zeros(output_shape, dtype=np.float32) - cdef int nFrames = image.shape[0] - cdef int rowsM = image.shape[1] - cdef int colsM = image.shape[2] + max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div) + max_slices = self._check_max_slices(image, max_slices) - image_in = cl_array.to_device(cl_queue, image) - shift_col_in = cl_array.to_device(cl_queue, shift_col) - shift_row_in = cl_array.to_device(cl_queue, shift_row) - image_out = cl_array.zeros(cl_queue, (nFrames, rowsM, colsM), dtype=np.float32) + mf = cl.mem_flags + input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes) + output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes) + cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait() - # Create the program + code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP']) prg = cl.Program(cl_ctx, code).build() + knl = prg.shiftScaleRotate - # Run the kernel - prg.shiftScaleRotate( - cl_queue, - image_out.shape, - None, - image_in.data, - image_out.data, - shift_row_in.data, - shift_col_in.data, - np.float32(scale_row), - np.float32(scale_col), - np.float32(angle) - ) - - # Wait for queue to finish - cl_queue.finish() - - return np.asarray(image_out.get(),dtype=np.float32) + for i in range(0, image.shape[0], max_slices): + if image.shape[0] - i >= max_slices: + n_slices = max_slices + else: + n_slices = image.shape[0] - i + knl( + cl_queue, + (n_slices, int(image.shape[1]), int(image.shape[2])), + self.get_work_group(dc, (n_slices, image.shape[1], image.shape[2])), + input_opencl, + output_opencl, + np.float32(shift_row), + np.float32(shift_col), + np.float32(scale_row), + np.float32(scale_col), + np.float32(angle) + ).wait() + + cl.enqueue_copy(cl_queue, image_out[i:i+n_slices,:,:], output_opencl).wait() + if i<=image.shape[0]-max_slices: + cl.enqueue_copy(cl_queue, input_opencl, image[i+n_slices:i+2*n_slices,:,:]).wait() + + cl_queue.finish() + + input_opencl.release() + output_opencl.release() + + return image_out # tag-end diff --git a/src/nanopyx/core/transform/_le_interpolation_lanczos_.cl b/src/nanopyx/core/transform/_le_interpolation_lanczos_.cl index fb2b3eac..bde6549b 100644 --- a/src/nanopyx/core/transform/_le_interpolation_lanczos_.cl +++ b/src/nanopyx/core/transform/_le_interpolation_lanczos_.cl @@ -81,8 +81,8 @@ shiftAndMagnify(__global float *image_in, __global float *image_out, __kernel void shiftScaleRotate(__global float *image_in, __global float *image_out, - __global float *shift_row, - __global float *shift_col, float scale_row, + float shift_row, + float shift_col, float scale_row, float scale_col, float angle) { // these are the indexes of the loop int f = get_global_id(0); @@ -104,11 +104,11 @@ __kernel void shiftScaleRotate(__global float *image_in, int nPixels = rows * cols; - float col = (a * (cM - center_col - shift_col[f]) + - b * (rM - center_row - shift_row[f])) + + float col = (a * (cM - center_col - shift_col) + + b * (rM - center_row - shift_row)) + center_col; - float row = (c * (cM - center_col - shift_col[f]) + - d * (rM - center_row - shift_row[f])) + + float row = (c * (cM - center_col - shift_col) + + d * (rM - center_row - shift_row)) + center_row; image_out[f * nPixels + rM * cols + cM] = diff --git a/src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx b/src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx index 276b6040..9c14d790 100644 --- a/src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx +++ b/src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx @@ -194,7 +194,7 @@ class RadialGradientConvergence(LiquidEngine): # tag-end - def _run_opencl(self, gradient_col_interp, gradient_row_interp, image_interp, magnification=5, radius=1.5, sensitivity=1, doIntensityWeighting=True, device=None): + def _run_opencl(self, gradient_col_interp, gradient_row_interp, image_interp, magnification=5, radius=1.5, sensitivity=1, doIntensityWeighting=True, device=None, int mem_div=1): # gradient gxgymag*mag*size # image_interp = mag*size @@ -230,21 +230,21 @@ class RadialGradientConvergence(LiquidEngine): # Calculating max slices size_per_slice = gradient_col_interp[0,:,:].nbytes + gradient_row_interp[0,:,:].nbytes + image_interp[0,:,:].nbytes + rgc_map[0,:,:].nbytes - max_slices = int((device['device'].global_mem_size // (size_per_slice))/4) # TODO 3 is a magic number + max_slices = int((device['device'].global_mem_size // (size_per_slice))/mem_div) + max_slices = self._check_max_slices(image_interp, max_slices) # Initial buffers mf = cl.mem_flags grad_col_int_in = cl.Buffer(cl_ctx, mf.READ_ONLY, gradient_col_interp[0:max_slices,:,:].nbytes) - cl.enqueue_copy(cl_queue, grad_col_int_in, gradient_col_interp[0:max_slices,:,:]).wait() - grad_row_int_in = cl.Buffer(cl_ctx, mf.READ_ONLY, gradient_row_interp[0:max_slices,:,:].nbytes) - cl.enqueue_copy(cl_queue, grad_row_int_in, gradient_row_interp[0:max_slices,:,:]).wait() - image_interp_in = cl.Buffer(cl_ctx, mf.READ_ONLY, image_interp[0:max_slices,:,:].nbytes) - cl.enqueue_copy(cl_queue, image_interp_in, image_interp[0:max_slices,:,:]).wait() - rgc_map_out = cl.Buffer(cl_ctx, mf.WRITE_ONLY, rgc_map[0:max_slices,:,:].nbytes) + + cl.enqueue_copy(cl_queue, grad_col_int_in, gradient_col_interp[0:max_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, grad_row_int_in, gradient_row_interp[0:max_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, image_interp_in, image_interp[0:max_slices,:,:]).wait() + # Code and building the kernel code = self._get_cl_code("_le_radial_gradient_convergence.cl", device['DP']) prg = cl.Program(cl_ctx, code).build() diff --git a/src/nanopyx/core/transform/_le_radiality.pyx b/src/nanopyx/core/transform/_le_radiality.pyx index 5e33be13..a1f65c47 100644 --- a/src/nanopyx/core/transform/_le_radiality.pyx +++ b/src/nanopyx/core/transform/_le_radiality.pyx @@ -238,13 +238,11 @@ class Radiality(LiquidEngine): return np.asarray(imRad) - def _run_opencl(self, image, image_interp, magnification=5, ringRadius=0.5, border=0, radialityPositivityConstraint=True, doIntensityWeighting=True, device=None): + def _run_opencl(self, image, image_interp, magnification=5, ringRadius=0.5, border=0, radialityPositivityConstraint=True, doIntensityWeighting=True, device=None, int mem_div=1): cl_ctx = cl.Context([device['device']]) cl_queue = cl.CommandQueue(cl_ctx) - code = self._get_cl_code("_le_radiality.cl", device['DP']) - cdef float _ringRadius = ringRadius * magnification cdef int nRingCoordinates = 12 @@ -261,21 +259,41 @@ class Radiality(LiquidEngine): cdef int h = image.shape[1] cdef int w = image.shape[2] - cdef float [:,:,:] imGx = np.zeros_like(image) - cdef float [:,:,:] imGy = np.zeros_like(image) + cdef float[:,:,:] imGx = np.zeros_like(image) + cdef float[:,:,:] imGy = np.zeros_like(image) cdef float[:,:,:] image_MV = image with nogil: for f in range(nFrames): _c_gradient_radiality(&image_MV[f,0,0], &imGx[f,0,0], &imGy[f,0,0], h, w) - image_in = cl_array.to_device(cl_queue, image) - imageinter_in = cl_array.to_device(cl_queue, image_interp) - imGx_in = cl_array.to_device(cl_queue, np.array(imGx, dtype=np.float32)) - imGy_in = cl_array.to_device(cl_queue, np.array(imGy, dtype=np.float32)) - imRad_out = cl_array.zeros(cl_queue, (nFrames, h*magnification, w*magnification), dtype=np.float32) + image_out = np.zeros(image.shape) + + x_ring_coords = np.asarray(xRingCoordinates) + y_ring_coords = np.asarray(yRingCoordinates) - xRingCoordinates_in = cl_array.to_device(cl_queue, np.array(xRingCoordinates, dtype=np.float32)) - yRingCoordinates_in = cl_array.to_device(cl_queue, np.array(yRingCoordinates, dtype=np.float32)) + print(x_ring_coords.nbytes, y_ring_coords.nbytes) + + # Calculate maximum number of slices that can fit in the GPU + size_per_slice = 2*image[0,:,:].nbytes + image_interp[0,:,:].nbytes + imGx[0,:,:].nbytes + imGy[0,:,:].nbytes + x_ring_coords.nbytes + y_ring_coords.nbytes + max_slices = int((device["device"].global_mem_size // (size_per_slice))/mem_div) + max_slices = self._check_max_slices(image, max_slices) + + # Initialize Buffers + mf = cl.mem_flags + image_in = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes) + imageinter_in = cl.Buffer(cl_ctx, mf.READ_ONLY, image_interp[0:max_slices,:,:].nbytes) + imGx_in = cl.Buffer(cl_ctx, mf.READ_ONLY, imGx[0:max_slices,:,:].nbytes) + imGy_in = cl.Buffer(cl_ctx, mf.READ_ONLY, imGy[0:max_slices,:,:].nbytes) + xRingCoordinates_in = cl.Buffer(cl_ctx, mf.READ_ONLY, x_ring_coords.nbytes) + yRingCoordinates_in = cl.Buffer(cl_ctx, mf.READ_ONLY, y_ring_coords.nbytes) + imRad_out = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes) + + cl.enqueue_copy(cl_queue, image_in, image[0:max_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, imageinter_in, image_interp[0:max_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, imGx_in, imGx[0:max_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, imGy_in, imGy[0:max_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, xRingCoordinates_in, x_ring_coords).wait() + cl.enqueue_copy(cl_queue, yRingCoordinates_in, y_ring_coords).wait() # Grid size lowest_row = (1 + border) * magnification @@ -284,29 +302,45 @@ class Radiality(LiquidEngine): lowest_col = (1 + border) * magnification highest_col = (w - 1 - border) * magnification - prg = cl.Program(cl_ctx, code).build() - - prg.radiality( - cl_queue, - (nFrames, highest_row - lowest_row, highest_col - lowest_col), - None, - image_in.data, - imageinter_in.data, - imGx_in.data, - imGy_in.data, - imRad_out.data, - xRingCoordinates_in.data, - yRingCoordinates_in.data, - np.int32(magnification), - np.float32(_ringRadius), - np.int32(nRingCoordinates), - np.int32(radialityPositivityConstraint), - np.int32(border), - np.int32(h), - np.int32(w) - ) + code = self._get_cl_code("_le_radiality.cl", device['DP']) + prg = cl.Program(cl_ctx, code).build() + knl = prg.radiality + + for i in range(0, nFrames-1, max_slices): + if nFrames - i >= max_slices: + n_slices = max_slices + else: + n_slices = nFrames - i + + knl( + cl_queue, + (n_slices, highest_row - lowest_row, highest_col - lowest_col), + self.get_work_group(device['device'],(n_slices, highest_row - lowest_row, highest_col - lowest_col)), + image_in, + imageinter_in, + imGx_in, + imGy_in, + imRad_out, + xRingCoordinates_in, + yRingCoordinates_in, + np.int32(magnification), + np.float32(_ringRadius), + np.int32(nRingCoordinates), + np.int32(radialityPositivityConstraint), + np.int32(border), + np.int32(h), + np.int32(w) + ) + + cl.enqueue_copy(cl_queue, image_out[i:i+n_slices,:,:], imRad_out).wait() + + if i<=nFrames-max_slices: + cl.enqueue_copy(cl_queue, image_in, image[i+n_slices:i+2*n_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, imageinter_in, image_interp[i+n_slices:i+2*n_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, imGx_in, imGx[i+n_slices:i+2*n_slices,:,:]).wait() + cl.enqueue_copy(cl_queue, imGy_in, imGy[i+n_slices:i+2*n_slices,:,:]).wait() - cl_queue.finish() + cl_queue.finish() - return np.asarray(imRad_out.get(),dtype=np.float32) + return image_out \ No newline at end of file diff --git a/src/nanopyx/core/transform/_le_roberts_cross_gradients.pyx b/src/nanopyx/core/transform/_le_roberts_cross_gradients.pyx index 53554530..8969ec9e 100644 --- a/src/nanopyx/core/transform/_le_roberts_cross_gradients.pyx +++ b/src/nanopyx/core/transform/_le_roberts_cross_gradients.pyx @@ -109,7 +109,7 @@ class GradientRobertsCross(LiquidEngine): return gradient_col, gradient_row # tag-end - def _run_opencl(self, float[:,:,:] image, dict device): + def _run_opencl(self, float[:,:,:] image, dict device, int mem_div=1): # QUEUE AND CONTEXT cl_ctx = cl.Context([device['device']]) @@ -127,16 +127,17 @@ class GradientRobertsCross(LiquidEngine): cdef float [:,:,:] gradient_col = np.zeros_like(image) cdef float [:,:,:] gradient_row = np.zeros_like(image) - max_slices = int((dc.global_mem_size // (image[0,:,:].nbytes + gradient_col[0,:,:].nbytes + gradient_row[0,:,:].nbytes))/3) - # TODO add exception if max_slices < 1 + max_slices = int((dc.global_mem_size // (image[0,:,:].nbytes + gradient_col[0,:,:].nbytes + gradient_row[0,:,:].nbytes))/mem_div) + max_slices = self._check_max_slices(image, max_slices) mf = cl.mem_flags input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes) - cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait() output_opencl_col = cl.Buffer(cl_ctx, mf.WRITE_ONLY, gradient_col[0:max_slices,:,:].nbytes) output_opencl_row = cl.Buffer(cl_ctx, mf.WRITE_ONLY, gradient_row[0:max_slices, :, :].nbytes) + cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait() + code = self._get_cl_code("_le_roberts_cross_gradients.cl", device['DP']) prg = cl.Program(cl_ctx, code).build() knl = prg.gradient_roberts_cross @@ -165,6 +166,5 @@ class GradientRobertsCross(LiquidEngine): input_opencl.release() output_opencl_col.release() output_opencl_row.release() - # Swap rows and columns back - #return np.ascontiguousarray(np.swapaxes(gradient_col, 1, 2), dtype=np.float32), np.ascontiguousarray(np.swapaxes(gradient_row, 1, 2), dtype=np.float32) + return gradient_col, gradient_row \ No newline at end of file