From 95a275bf47c2ac453393628304504ffbabbe4c9b Mon Sep 17 00:00:00 2001
From: Bruno Saraiva <bruno.msaraiva2@gmail.com>
Date: Thu, 12 Oct 2023 16:49:29 +0100
Subject: [PATCH] changed shift to be a single value and added new memory
 management

---
 src/nanopyx/core/transform/_le_esrrf.pyx      |   4 +-
 .../transform/_le_interpolation_bicubic.pyx   |   4 +-
 .../_le_interpolation_catmull_rom.pyx         |  79 +++++++------
 .../_le_interpolation_catmull_rom_.cl         |  12 +-
 .../transform/_le_interpolation_lanczos.pyx   |  77 +++++++------
 .../transform/_le_interpolation_lanczos_.cl   |  12 +-
 .../_le_radial_gradient_convergence.pyx       |  16 +--
 src/nanopyx/core/transform/_le_radiality.pyx  | 104 ++++++++++++------
 .../transform/_le_roberts_cross_gradients.pyx |  12 +-
 9 files changed, 188 insertions(+), 132 deletions(-)

diff --git a/src/nanopyx/core/transform/_le_esrrf.pyx b/src/nanopyx/core/transform/_le_esrrf.pyx
index ff04e189..70cbe978 100644
--- a/src/nanopyx/core/transform/_le_esrrf.pyx
+++ b/src/nanopyx/core/transform/_le_esrrf.pyx
@@ -50,7 +50,7 @@ class eSRRF(LiquidEngine):
 
         max_slices = int((dc.global_mem_size // total_memory)/mem_div)
 
-        self._check_min_slices(max_slices)
+        max_slices = self._check_max_slices(image, max_slices)
         
         mf = cl.mem_flags
 
@@ -75,7 +75,7 @@ class eSRRF(LiquidEngine):
         rgc_prg = cl.Program(cl_ctx, rgc_code).build(options=["-cl-mad-enable -cl-fast-relaxed-math"])
         rgc_knl = rgc_prg.calculate_rgc
 
-        for i in range(0, image.shape[0]-1, max_slices):
+        for i in range(0, image.shape[0], max_slices):
             if image.shape[0] - i >= max_slices:
                 n_slices = max_slices
             else:
diff --git a/src/nanopyx/core/transform/_le_interpolation_bicubic.pyx b/src/nanopyx/core/transform/_le_interpolation_bicubic.pyx
index 132f7d5d..382afc6c 100644
--- a/src/nanopyx/core/transform/_le_interpolation_bicubic.pyx
+++ b/src/nanopyx/core/transform/_le_interpolation_bicubic.pyx
@@ -87,7 +87,7 @@ class ShiftAndMagnify(LiquidEngine):
         image_out = np.zeros(output_shape, dtype=np.float32)
 
         max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
-        self._check_max_slices(max_slices)
+        max_slices = self._check_max_slices(image, max_slices)
 
         mf = cl.mem_flags
         input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
@@ -329,7 +329,7 @@ class ShiftScaleRotate(LiquidEngine):
         image_out = np.zeros(output_shape, dtype=np.float32)
 
         max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
-        self._check_max_slices(max_slices)
+        max_slices = self._check_max_slices(image, max_slices)
 
         mf = cl.mem_flags
         input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
diff --git a/src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx b/src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx
index 030e5fbd..0eab7c04 100644
--- a/src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx
+++ b/src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx
@@ -80,7 +80,7 @@ class ShiftAndMagnify(LiquidEngine):
     # tag-end
 
     # tag-copy: _le_interpolation_nearest_neighbor.ShiftAndMagnify._run_opencl; replace("nearest_neighbor", "catmull_rom")
-    def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device) -> np.ndarray:
+    def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device, int mem_div=1) -> np.ndarray:
 
         # QUEUE AND CONTEXT
         cl_ctx = cl.Context([device['device']])
@@ -90,14 +90,13 @@ class ShiftAndMagnify(LiquidEngine):
         output_shape = (image.shape[0], int(image.shape[1]*magnification_row), int(image.shape[2]*magnification_col))
         image_out = np.zeros(output_shape, dtype=np.float32)
 
-        # TODO 3 is a magic number 
-        max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/3)
-        # TODO add exception if max_slices < 1 
-        
+        max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
+        max_slices = self._check_max_slices(image, max_slices)
+
         mf = cl.mem_flags
         input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
-        cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
         output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
+        cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
 
         code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP'])
         prg = cl.Program(cl_ctx, code).build()
@@ -331,44 +330,56 @@ class ShiftScaleRotate(LiquidEngine):
     # tag-end
 
     # tag-copy: _le_interpolation_nearest_neighbor.ShiftScaleRotate._run_opencl; replace("nearest_neighbor", "catmull_rom")
-    def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device) -> np.ndarray:
+    def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device, int mem_div=1) -> np.ndarray:
 
         # QUEUE AND CONTEXT
         cl_ctx = cl.Context([device['device']])
+        dc = device["device"]
         cl_queue = cl.CommandQueue(cl_ctx)
 
-        code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP'])
+        output_shape = (image.shape[0], int(image.shape[1]), int(image.shape[2]))
+        image_out = np.zeros(output_shape, dtype=np.float32)
 
-        cdef int nFrames = image.shape[0]
-        cdef int rowsM = image.shape[1]
-        cdef int colsM = image.shape[2]
+        max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
+        max_slices = self._check_max_slices(image, max_slices)
 
-        image_in = cl_array.to_device(cl_queue, image)
-        shift_col_in = cl_array.to_device(cl_queue, shift_col)
-        shift_row_in = cl_array.to_device(cl_queue, shift_row)
-        image_out = cl_array.zeros(cl_queue, (nFrames, rowsM, colsM), dtype=np.float32)
+        mf = cl.mem_flags
+        input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
+        output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
+        cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
 
-        # Create the program
+        code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP'])
         prg = cl.Program(cl_ctx, code).build()
+        knl = prg.shiftScaleRotate
 
-        # Run the kernel
-        prg.shiftScaleRotate(
-            cl_queue,
-            image_out.shape,
-            None,
-            image_in.data,
-            image_out.data,
-            shift_row_in.data,
-            shift_col_in.data,
-            np.float32(scale_row),
-            np.float32(scale_col),
-            np.float32(angle)
-        )
-
-        # Wait for queue to finish
-        cl_queue.finish()
-
-        return np.asarray(image_out.get(),dtype=np.float32)
+        for i in range(0, image.shape[0], max_slices):
+            if image.shape[0] - i >= max_slices:
+                n_slices = max_slices
+            else:
+                n_slices = image.shape[0] - i
+            knl(
+                cl_queue,
+                (n_slices, int(image.shape[1]), int(image.shape[2])),
+                self.get_work_group(dc, (n_slices, image.shape[1], image.shape[2])),
+                input_opencl,
+                output_opencl,
+                np.float32(shift_row), 
+                np.float32(shift_col),
+                np.float32(scale_row),
+                np.float32(scale_col),
+                np.float32(angle)
+            ).wait()
+
+            cl.enqueue_copy(cl_queue, image_out[i:i+n_slices,:,:], output_opencl).wait()
+            if i<=image.shape[0]-max_slices:
+                cl.enqueue_copy(cl_queue, input_opencl, image[i+n_slices:i+2*n_slices,:,:]).wait()
+
+            cl_queue.finish()
+
+        input_opencl.release()
+        output_opencl.release()
+
+        return image_out
 
     # tag-end
 
diff --git a/src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl b/src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl
index 4b89461c..250891a8 100644
--- a/src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl
+++ b/src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl
@@ -75,8 +75,8 @@ shiftAndMagnify(__global float *image_in, __global float *image_out,
 
 __kernel void shiftScaleRotate(__global float *image_in,
                                __global float *image_out,
-                               __global float *shift_row,
-                               __global float *shift_col, float scale_row,
+                               float shift_row,
+                               float shift_col, float scale_row,
                                float scale_col, float angle) {
   // these are the indexes of the loop
   int f = get_global_id(0);
@@ -98,11 +98,11 @@ __kernel void shiftScaleRotate(__global float *image_in,
 
   int nPixels = rows * cols;
 
-  float col = (a * (cM - center_col - shift_col[f]) +
-               b * (rM - center_row - shift_row[f])) +
+  float col = (a * (cM - center_col - shift_col) +
+               b * (rM - center_row - shift_row)) +
               center_col;
-  float row = (c * (cM - center_col - shift_col[f]) +
-               d * (rM - center_row - shift_row[f])) +
+  float row = (c * (cM - center_col - shift_col) +
+               d * (rM - center_row - shift_row)) +
               center_row;
 
   image_out[f * nPixels + rM * cols + cM] =
diff --git a/src/nanopyx/core/transform/_le_interpolation_lanczos.pyx b/src/nanopyx/core/transform/_le_interpolation_lanczos.pyx
index 91b8fd5f..2de150ef 100644
--- a/src/nanopyx/core/transform/_le_interpolation_lanczos.pyx
+++ b/src/nanopyx/core/transform/_le_interpolation_lanczos.pyx
@@ -75,7 +75,7 @@ class ShiftAndMagnify(LiquidEngine):
     # tag-end
 
     # tag-copy: _le_interpolation_nearest_neighbor.ShiftAndMagnify._run_opencl; replace("nearest_neighbor", "lanczos")
-    def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device) -> np.ndarray:
+    def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device, int mem_div=1) -> np.ndarray:
 
         # QUEUE AND CONTEXT
         cl_ctx = cl.Context([device['device']])
@@ -85,14 +85,13 @@ class ShiftAndMagnify(LiquidEngine):
         output_shape = (image.shape[0], int(image.shape[1]*magnification_row), int(image.shape[2]*magnification_col))
         image_out = np.zeros(output_shape, dtype=np.float32)
 
-        # TODO 3 is a magic number 
-        max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/3)
-        # TODO add exception if max_slices < 1 
+        max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
+        max_slices = self._check_max_slices(image, max_slices)
 
         mf = cl.mem_flags
         input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
-        cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
         output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
+        cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
 
         code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP'])
         prg = cl.Program(cl_ctx, code).build()
@@ -318,44 +317,56 @@ class ShiftScaleRotate(LiquidEngine):
     # tag-end
 
     # tag-copy: _le_interpolation_nearest_neighbor.ShiftScaleRotate._run_opencl; replace("nearest_neighbor", "lanczos")
-    def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device) -> np.ndarray:
+    def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device, int mem_div=1) -> np.ndarray:
 
         # QUEUE AND CONTEXT
         cl_ctx = cl.Context([device['device']])
+        dc = device["device"]
         cl_queue = cl.CommandQueue(cl_ctx)
 
-        code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP'])
+        output_shape = (image.shape[0], int(image.shape[1]), int(image.shape[2]))
+        image_out = np.zeros(output_shape, dtype=np.float32)
 
-        cdef int nFrames = image.shape[0]
-        cdef int rowsM = image.shape[1]
-        cdef int colsM = image.shape[2]
+        max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
+        max_slices = self._check_max_slices(image, max_slices)
 
-        image_in = cl_array.to_device(cl_queue, image)
-        shift_col_in = cl_array.to_device(cl_queue, shift_col)
-        shift_row_in = cl_array.to_device(cl_queue, shift_row)
-        image_out = cl_array.zeros(cl_queue, (nFrames, rowsM, colsM), dtype=np.float32)
+        mf = cl.mem_flags
+        input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
+        output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
+        cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
 
-        # Create the program
+        code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP'])
         prg = cl.Program(cl_ctx, code).build()
+        knl = prg.shiftScaleRotate
 
-        # Run the kernel
-        prg.shiftScaleRotate(
-            cl_queue,
-            image_out.shape,
-            None,
-            image_in.data,
-            image_out.data,
-            shift_row_in.data,
-            shift_col_in.data,
-            np.float32(scale_row),
-            np.float32(scale_col),
-            np.float32(angle)
-        )
-
-        # Wait for queue to finish
-        cl_queue.finish()
-
-        return np.asarray(image_out.get(),dtype=np.float32)
+        for i in range(0, image.shape[0], max_slices):
+            if image.shape[0] - i >= max_slices:
+                n_slices = max_slices
+            else:
+                n_slices = image.shape[0] - i
+            knl(
+                cl_queue,
+                (n_slices, int(image.shape[1]), int(image.shape[2])),
+                self.get_work_group(dc, (n_slices, image.shape[1], image.shape[2])),
+                input_opencl,
+                output_opencl,
+                np.float32(shift_row), 
+                np.float32(shift_col),
+                np.float32(scale_row),
+                np.float32(scale_col),
+                np.float32(angle)
+            ).wait()
+
+            cl.enqueue_copy(cl_queue, image_out[i:i+n_slices,:,:], output_opencl).wait()
+            if i<=image.shape[0]-max_slices:
+                cl.enqueue_copy(cl_queue, input_opencl, image[i+n_slices:i+2*n_slices,:,:]).wait()
+
+            cl_queue.finish()
+
+        input_opencl.release()
+        output_opencl.release()
+
+        return image_out
 
     # tag-end
 
diff --git a/src/nanopyx/core/transform/_le_interpolation_lanczos_.cl b/src/nanopyx/core/transform/_le_interpolation_lanczos_.cl
index fb2b3eac..bde6549b 100644
--- a/src/nanopyx/core/transform/_le_interpolation_lanczos_.cl
+++ b/src/nanopyx/core/transform/_le_interpolation_lanczos_.cl
@@ -81,8 +81,8 @@ shiftAndMagnify(__global float *image_in, __global float *image_out,
 
 __kernel void shiftScaleRotate(__global float *image_in,
                                __global float *image_out,
-                               __global float *shift_row,
-                               __global float *shift_col, float scale_row,
+                               float shift_row,
+                               float shift_col, float scale_row,
                                float scale_col, float angle) {
   // these are the indexes of the loop
   int f = get_global_id(0);
@@ -104,11 +104,11 @@ __kernel void shiftScaleRotate(__global float *image_in,
 
   int nPixels = rows * cols;
 
-  float col = (a * (cM - center_col - shift_col[f]) +
-               b * (rM - center_row - shift_row[f])) +
+  float col = (a * (cM - center_col - shift_col) +
+               b * (rM - center_row - shift_row)) +
               center_col;
-  float row = (c * (cM - center_col - shift_col[f]) +
-               d * (rM - center_row - shift_row[f])) +
+  float row = (c * (cM - center_col - shift_col) +
+               d * (rM - center_row - shift_row)) +
               center_row;
 
   image_out[f * nPixels + rM * cols + cM] =
diff --git a/src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx b/src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx
index 276b6040..9c14d790 100644
--- a/src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx
+++ b/src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx
@@ -194,7 +194,7 @@ class RadialGradientConvergence(LiquidEngine):
         # tag-end
 
     
-    def _run_opencl(self, gradient_col_interp, gradient_row_interp, image_interp, magnification=5, radius=1.5, sensitivity=1, doIntensityWeighting=True, device=None):
+    def _run_opencl(self, gradient_col_interp, gradient_row_interp, image_interp, magnification=5, radius=1.5, sensitivity=1, doIntensityWeighting=True, device=None, int mem_div=1):
 
         # gradient gxgymag*mag*size
         # image_interp = mag*size
@@ -230,21 +230,21 @@ class RadialGradientConvergence(LiquidEngine):
 
         # Calculating max slices
         size_per_slice = gradient_col_interp[0,:,:].nbytes + gradient_row_interp[0,:,:].nbytes + image_interp[0,:,:].nbytes + rgc_map[0,:,:].nbytes
-        max_slices = int((device['device'].global_mem_size // (size_per_slice))/4)   # TODO 3 is a magic number 
+        max_slices = int((device['device'].global_mem_size // (size_per_slice))/mem_div)
+        max_slices = self._check_max_slices(image_interp, max_slices)
 
         # Initial buffers
         mf = cl.mem_flags
         grad_col_int_in = cl.Buffer(cl_ctx, mf.READ_ONLY, gradient_col_interp[0:max_slices,:,:].nbytes)
-        cl.enqueue_copy(cl_queue, grad_col_int_in, gradient_col_interp[0:max_slices,:,:]).wait()
-
         grad_row_int_in = cl.Buffer(cl_ctx, mf.READ_ONLY, gradient_row_interp[0:max_slices,:,:].nbytes)
-        cl.enqueue_copy(cl_queue, grad_row_int_in, gradient_row_interp[0:max_slices,:,:]).wait()
-
         image_interp_in = cl.Buffer(cl_ctx, mf.READ_ONLY, image_interp[0:max_slices,:,:].nbytes)
-        cl.enqueue_copy(cl_queue, image_interp_in, image_interp[0:max_slices,:,:]).wait()
-
         rgc_map_out = cl.Buffer(cl_ctx, mf.WRITE_ONLY, rgc_map[0:max_slices,:,:].nbytes)
 
+
+        cl.enqueue_copy(cl_queue, grad_col_int_in, gradient_col_interp[0:max_slices,:,:]).wait()
+        cl.enqueue_copy(cl_queue, grad_row_int_in, gradient_row_interp[0:max_slices,:,:]).wait()
+        cl.enqueue_copy(cl_queue, image_interp_in, image_interp[0:max_slices,:,:]).wait()
+
         # Code and building the kernel
         code = self._get_cl_code("_le_radial_gradient_convergence.cl", device['DP'])
         prg = cl.Program(cl_ctx, code).build() 
diff --git a/src/nanopyx/core/transform/_le_radiality.pyx b/src/nanopyx/core/transform/_le_radiality.pyx
index 5e33be13..a1f65c47 100644
--- a/src/nanopyx/core/transform/_le_radiality.pyx
+++ b/src/nanopyx/core/transform/_le_radiality.pyx
@@ -238,13 +238,11 @@ class Radiality(LiquidEngine):
         return np.asarray(imRad)
 
     
-    def _run_opencl(self, image, image_interp, magnification=5, ringRadius=0.5, border=0, radialityPositivityConstraint=True, doIntensityWeighting=True, device=None):
+    def _run_opencl(self, image, image_interp, magnification=5, ringRadius=0.5, border=0, radialityPositivityConstraint=True, doIntensityWeighting=True, device=None, int mem_div=1):
 
         cl_ctx = cl.Context([device['device']])
         cl_queue = cl.CommandQueue(cl_ctx)
 
-        code = self._get_cl_code("_le_radiality.cl", device['DP'])
-
         cdef float _ringRadius = ringRadius * magnification
         
         cdef int nRingCoordinates = 12
@@ -261,21 +259,41 @@ class Radiality(LiquidEngine):
         cdef int h = image.shape[1]
         cdef int w = image.shape[2]
 
-        cdef float [:,:,:] imGx = np.zeros_like(image) 
-        cdef float [:,:,:] imGy = np.zeros_like(image)
+        cdef float[:,:,:] imGx = np.zeros_like(image) 
+        cdef float[:,:,:] imGy = np.zeros_like(image)
         cdef float[:,:,:] image_MV = image
         with nogil:
             for f in range(nFrames):
                 _c_gradient_radiality(&image_MV[f,0,0], &imGx[f,0,0], &imGy[f,0,0], h, w)
 
-        image_in = cl_array.to_device(cl_queue, image)
-        imageinter_in = cl_array.to_device(cl_queue, image_interp)
-        imGx_in = cl_array.to_device(cl_queue, np.array(imGx, dtype=np.float32))
-        imGy_in = cl_array.to_device(cl_queue, np.array(imGy, dtype=np.float32))
-        imRad_out = cl_array.zeros(cl_queue, (nFrames, h*magnification, w*magnification), dtype=np.float32)
+        image_out = np.zeros(image.shape)
+
+        x_ring_coords = np.asarray(xRingCoordinates)
+        y_ring_coords = np.asarray(yRingCoordinates)
 
-        xRingCoordinates_in = cl_array.to_device(cl_queue, np.array(xRingCoordinates, dtype=np.float32))
-        yRingCoordinates_in = cl_array.to_device(cl_queue, np.array(yRingCoordinates, dtype=np.float32))
+        print(x_ring_coords.nbytes, y_ring_coords.nbytes)
+
+        # Calculate maximum number of slices that can fit in the GPU
+        size_per_slice = 2*image[0,:,:].nbytes + image_interp[0,:,:].nbytes + imGx[0,:,:].nbytes + imGy[0,:,:].nbytes + x_ring_coords.nbytes + y_ring_coords.nbytes
+        max_slices = int((device["device"].global_mem_size // (size_per_slice))/mem_div)
+        max_slices = self._check_max_slices(image, max_slices)
+
+        # Initialize Buffers
+        mf = cl.mem_flags
+        image_in = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
+        imageinter_in = cl.Buffer(cl_ctx, mf.READ_ONLY, image_interp[0:max_slices,:,:].nbytes)
+        imGx_in = cl.Buffer(cl_ctx, mf.READ_ONLY, imGx[0:max_slices,:,:].nbytes)
+        imGy_in = cl.Buffer(cl_ctx, mf.READ_ONLY, imGy[0:max_slices,:,:].nbytes)
+        xRingCoordinates_in = cl.Buffer(cl_ctx, mf.READ_ONLY, x_ring_coords.nbytes)
+        yRingCoordinates_in = cl.Buffer(cl_ctx, mf.READ_ONLY, y_ring_coords.nbytes)
+        imRad_out = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
+
+        cl.enqueue_copy(cl_queue, image_in, image[0:max_slices,:,:]).wait()
+        cl.enqueue_copy(cl_queue, imageinter_in, image_interp[0:max_slices,:,:]).wait()
+        cl.enqueue_copy(cl_queue, imGx_in, imGx[0:max_slices,:,:]).wait()
+        cl.enqueue_copy(cl_queue, imGy_in, imGy[0:max_slices,:,:]).wait()
+        cl.enqueue_copy(cl_queue, xRingCoordinates_in, x_ring_coords).wait()
+        cl.enqueue_copy(cl_queue, yRingCoordinates_in, y_ring_coords).wait()
 
         # Grid size
         lowest_row = (1 + border) * magnification 
@@ -284,29 +302,45 @@ class Radiality(LiquidEngine):
         lowest_col = (1 + border) * magnification
         highest_col = (w - 1 - border) * magnification
 
-        prg = cl.Program(cl_ctx, code).build()  
- 
-        prg.radiality(
-            cl_queue,
-            (nFrames, highest_row - lowest_row, highest_col - lowest_col),
-            None,
-            image_in.data,
-            imageinter_in.data,
-            imGx_in.data,
-            imGy_in.data,
-            imRad_out.data,
-            xRingCoordinates_in.data,
-            yRingCoordinates_in.data,
-            np.int32(magnification),
-            np.float32(_ringRadius),
-            np.int32(nRingCoordinates),
-            np.int32(radialityPositivityConstraint),
-            np.int32(border),
-            np.int32(h),
-            np.int32(w)
-        )
+        code = self._get_cl_code("_le_radiality.cl", device['DP'])
+        prg = cl.Program(cl_ctx, code).build()
+        knl = prg.radiality
+
+        for i in range(0, nFrames-1, max_slices):
+            if nFrames - i >= max_slices:
+                n_slices = max_slices
+            else:
+                n_slices = nFrames - i
+
+            knl(
+                cl_queue,
+                (n_slices, highest_row - lowest_row, highest_col - lowest_col),
+                self.get_work_group(device['device'],(n_slices, highest_row - lowest_row, highest_col - lowest_col)),
+                image_in,
+                imageinter_in,
+                imGx_in,
+                imGy_in,
+                imRad_out,
+                xRingCoordinates_in,
+                yRingCoordinates_in,
+                np.int32(magnification),
+                np.float32(_ringRadius),
+                np.int32(nRingCoordinates),
+                np.int32(radialityPositivityConstraint),
+                np.int32(border),
+                np.int32(h),
+                np.int32(w)
+            )
+
+            cl.enqueue_copy(cl_queue, image_out[i:i+n_slices,:,:], imRad_out).wait()
+
+            if i<=nFrames-max_slices:
+                cl.enqueue_copy(cl_queue, image_in, image[i+n_slices:i+2*n_slices,:,:]).wait()
+                cl.enqueue_copy(cl_queue, imageinter_in, image_interp[i+n_slices:i+2*n_slices,:,:]).wait()
+                cl.enqueue_copy(cl_queue, imGx_in, imGx[i+n_slices:i+2*n_slices,:,:]).wait()
+                cl.enqueue_copy(cl_queue, imGy_in, imGy[i+n_slices:i+2*n_slices,:,:]).wait()
     
-        cl_queue.finish()        
+            cl_queue.finish()        
         
-        return np.asarray(imRad_out.get(),dtype=np.float32)
+        return image_out
     
\ No newline at end of file
diff --git a/src/nanopyx/core/transform/_le_roberts_cross_gradients.pyx b/src/nanopyx/core/transform/_le_roberts_cross_gradients.pyx
index 53554530..8969ec9e 100644
--- a/src/nanopyx/core/transform/_le_roberts_cross_gradients.pyx
+++ b/src/nanopyx/core/transform/_le_roberts_cross_gradients.pyx
@@ -109,7 +109,7 @@ class GradientRobertsCross(LiquidEngine):
         return gradient_col, gradient_row
     # tag-end
 
-    def _run_opencl(self, float[:,:,:] image, dict device):
+    def _run_opencl(self, float[:,:,:] image, dict device, int mem_div=1):
 
         # QUEUE AND CONTEXT
         cl_ctx = cl.Context([device['device']])
@@ -127,16 +127,17 @@ class GradientRobertsCross(LiquidEngine):
         cdef float [:,:,:] gradient_col = np.zeros_like(image) 
         cdef float [:,:,:] gradient_row = np.zeros_like(image)
 
-        max_slices = int((dc.global_mem_size // (image[0,:,:].nbytes + gradient_col[0,:,:].nbytes + gradient_row[0,:,:].nbytes))/3)
-        # TODO add exception if max_slices < 1 
+        max_slices = int((dc.global_mem_size // (image[0,:,:].nbytes + gradient_col[0,:,:].nbytes + gradient_row[0,:,:].nbytes))/mem_div)
+        max_slices = self._check_max_slices(image, max_slices) 
 
         mf = cl.mem_flags
 
         input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
-        cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
         output_opencl_col = cl.Buffer(cl_ctx, mf.WRITE_ONLY, gradient_col[0:max_slices,:,:].nbytes)
         output_opencl_row = cl.Buffer(cl_ctx, mf.WRITE_ONLY, gradient_row[0:max_slices, :, :].nbytes)
 
+        cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
+
         code = self._get_cl_code("_le_roberts_cross_gradients.cl", device['DP'])
         prg = cl.Program(cl_ctx, code).build()
         knl = prg.gradient_roberts_cross
@@ -165,6 +166,5 @@ class GradientRobertsCross(LiquidEngine):
         input_opencl.release()
         output_opencl_col.release()
         output_opencl_row.release()
-        # Swap rows and columns back
-        #return np.ascontiguousarray(np.swapaxes(gradient_col, 1, 2), dtype=np.float32), np.ascontiguousarray(np.swapaxes(gradient_row, 1, 2), dtype=np.float32)
+
         return gradient_col, gradient_row
\ No newline at end of file