Skip to content

Commit

Permalink
changed shift to be a single value and added new memory management
Browse files (browse the repository at this point in the history)
  • Loading branch information
brunomsaraiva committed Oct 12, 2023
1 parent 043b89b commit 95a275b
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 132 deletions.
4 changes: 2 additions & 2 deletions src/nanopyx/core/transform/_le_esrrf.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class eSRRF(LiquidEngine):

max_slices = int((dc.global_mem_size // total_memory)/mem_div)

self._check_min_slices(max_slices)
max_slices = self._check_max_slices(image, max_slices)

mf = cl.mem_flags

Expand All @@ -75,7 +75,7 @@ class eSRRF(LiquidEngine):
rgc_prg = cl.Program(cl_ctx, rgc_code).build(options=["-cl-mad-enable -cl-fast-relaxed-math"])
rgc_knl = rgc_prg.calculate_rgc

for i in range(0, image.shape[0]-1, max_slices):
for i in range(0, image.shape[0], max_slices):
if image.shape[0] - i >= max_slices:
n_slices = max_slices
else:
Expand Down
4 changes: 2 additions & 2 deletions src/nanopyx/core/transform/_le_interpolation_bicubic.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class ShiftAndMagnify(LiquidEngine):
image_out = np.zeros(output_shape, dtype=np.float32)

max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
self._check_max_slices(max_slices)
max_slices = self._check_max_slices(image, max_slices)

mf = cl.mem_flags
input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
Expand Down Expand Up @@ -329,7 +329,7 @@ class ShiftScaleRotate(LiquidEngine):
image_out = np.zeros(output_shape, dtype=np.float32)

max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
self._check_max_slices(max_slices)
max_slices = self._check_max_slices(image, max_slices)

mf = cl.mem_flags
input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
Expand Down
79 changes: 45 additions & 34 deletions src/nanopyx/core/transform/_le_interpolation_catmull_rom.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class ShiftAndMagnify(LiquidEngine):
# tag-end

# tag-copy: _le_interpolation_nearest_neighbor.ShiftAndMagnify._run_opencl; replace("nearest_neighbor", "catmull_rom")
def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device) -> np.ndarray:
def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device, int mem_div=1) -> np.ndarray:

# QUEUE AND CONTEXT
cl_ctx = cl.Context([device['device']])
Expand All @@ -90,14 +90,13 @@ class ShiftAndMagnify(LiquidEngine):
output_shape = (image.shape[0], int(image.shape[1]*magnification_row), int(image.shape[2]*magnification_col))
image_out = np.zeros(output_shape, dtype=np.float32)

# TODO 3 is a magic number
max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/3)
# TODO add exception if max_slices < 1

max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
max_slices = self._check_max_slices(image, max_slices)

mf = cl.mem_flags
input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()

code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP'])
prg = cl.Program(cl_ctx, code).build()
Expand Down Expand Up @@ -331,44 +330,56 @@ class ShiftScaleRotate(LiquidEngine):
# tag-end

# tag-copy: _le_interpolation_nearest_neighbor.ShiftScaleRotate._run_opencl; replace("nearest_neighbor", "catmull_rom")
def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device) -> np.ndarray:
def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device, int mem_div=1) -> np.ndarray:

# QUEUE AND CONTEXT
cl_ctx = cl.Context([device['device']])
dc = device["device"]
cl_queue = cl.CommandQueue(cl_ctx)

code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP'])
output_shape = (image.shape[0], int(image.shape[1]), int(image.shape[2]))
image_out = np.zeros(output_shape, dtype=np.float32)

cdef int nFrames = image.shape[0]
cdef int rowsM = image.shape[1]
cdef int colsM = image.shape[2]
max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
max_slices = self._check_max_slices(image, max_slices)

image_in = cl_array.to_device(cl_queue, image)
shift_col_in = cl_array.to_device(cl_queue, shift_col)
shift_row_in = cl_array.to_device(cl_queue, shift_row)
image_out = cl_array.zeros(cl_queue, (nFrames, rowsM, colsM), dtype=np.float32)
mf = cl.mem_flags
input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()

# Create the program
code = self._get_cl_code("_le_interpolation_catmull_rom_.cl", device['DP'])
prg = cl.Program(cl_ctx, code).build()
knl = prg.shiftScaleRotate

# Run the kernel
prg.shiftScaleRotate(
cl_queue,
image_out.shape,
None,
image_in.data,
image_out.data,
shift_row_in.data,
shift_col_in.data,
np.float32(scale_row),
np.float32(scale_col),
np.float32(angle)
)

# Wait for queue to finish
cl_queue.finish()

return np.asarray(image_out.get(),dtype=np.float32)
for i in range(0, image.shape[0], max_slices):
if image.shape[0] - i >= max_slices:
n_slices = max_slices
else:
n_slices = image.shape[0] - i
knl(
cl_queue,
(n_slices, int(image.shape[1]), int(image.shape[2])),
self.get_work_group(dc, (n_slices, image.shape[1], image.shape[2])),
input_opencl,
output_opencl,
np.float32(shift_row),
np.float32(shift_col),
np.float32(scale_row),
np.float32(scale_col),
np.float32(angle)
).wait()

cl.enqueue_copy(cl_queue, image_out[i:i+n_slices,:,:], output_opencl).wait()
if i<=image.shape[0]-max_slices:
cl.enqueue_copy(cl_queue, input_opencl, image[i+n_slices:i+2*n_slices,:,:]).wait()

cl_queue.finish()

input_opencl.release()
output_opencl.release()

return image_out

# tag-end

Expand Down
12 changes: 6 additions & 6 deletions src/nanopyx/core/transform/_le_interpolation_catmull_rom_.cl
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ shiftAndMagnify(__global float *image_in, __global float *image_out,

__kernel void shiftScaleRotate(__global float *image_in,
__global float *image_out,
__global float *shift_row,
__global float *shift_col, float scale_row,
float shift_row,
float shift_col, float scale_row,
float scale_col, float angle) {
// these are the indexes of the loop
int f = get_global_id(0);
Expand All @@ -98,11 +98,11 @@ __kernel void shiftScaleRotate(__global float *image_in,

int nPixels = rows * cols;

float col = (a * (cM - center_col - shift_col[f]) +
b * (rM - center_row - shift_row[f])) +
float col = (a * (cM - center_col - shift_col) +
b * (rM - center_row - shift_row)) +
center_col;
float row = (c * (cM - center_col - shift_col[f]) +
d * (rM - center_row - shift_row[f])) +
float row = (c * (cM - center_col - shift_col) +
d * (rM - center_row - shift_row)) +
center_row;

image_out[f * nPixels + rM * cols + cM] =
Expand Down
77 changes: 44 additions & 33 deletions src/nanopyx/core/transform/_le_interpolation_lanczos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class ShiftAndMagnify(LiquidEngine):
# tag-end

# tag-copy: _le_interpolation_nearest_neighbor.ShiftAndMagnify._run_opencl; replace("nearest_neighbor", "lanczos")
def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device) -> np.ndarray:
def _run_opencl(self, image, shift_row, shift_col, float magnification_row, float magnification_col, dict device, int mem_div=1) -> np.ndarray:

# QUEUE AND CONTEXT
cl_ctx = cl.Context([device['device']])
Expand All @@ -85,14 +85,13 @@ class ShiftAndMagnify(LiquidEngine):
output_shape = (image.shape[0], int(image.shape[1]*magnification_row), int(image.shape[2]*magnification_col))
image_out = np.zeros(output_shape, dtype=np.float32)

# TODO 3 is a magic number
max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/3)
# TODO add exception if max_slices < 1
max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
max_slices = self._check_max_slices(image, max_slices)

mf = cl.mem_flags
input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()
output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()

code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP'])
prg = cl.Program(cl_ctx, code).build()
Expand Down Expand Up @@ -318,44 +317,56 @@ class ShiftScaleRotate(LiquidEngine):
# tag-end

# tag-copy: _le_interpolation_nearest_neighbor.ShiftScaleRotate._run_opencl; replace("nearest_neighbor", "lanczos")
def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device) -> np.ndarray:
def _run_opencl(self, image, shift_row, shift_col, float scale_row, float scale_col, float angle, dict device, int mem_div=1) -> np.ndarray:

# QUEUE AND CONTEXT
cl_ctx = cl.Context([device['device']])
dc = device["device"]
cl_queue = cl.CommandQueue(cl_ctx)

code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP'])
output_shape = (image.shape[0], int(image.shape[1]), int(image.shape[2]))
image_out = np.zeros(output_shape, dtype=np.float32)

cdef int nFrames = image.shape[0]
cdef int rowsM = image.shape[1]
cdef int colsM = image.shape[2]
max_slices = int((dc.global_mem_size // (image_out[0,:,:].nbytes + image[0,:,:].nbytes))/mem_div)
max_slices = self._check_max_slices(image, max_slices)

image_in = cl_array.to_device(cl_queue, image)
shift_col_in = cl_array.to_device(cl_queue, shift_col)
shift_row_in = cl_array.to_device(cl_queue, shift_row)
image_out = cl_array.zeros(cl_queue, (nFrames, rowsM, colsM), dtype=np.float32)
mf = cl.mem_flags
input_opencl = cl.Buffer(cl_ctx, mf.READ_ONLY, image[0:max_slices,:,:].nbytes)
output_opencl = cl.Buffer(cl_ctx, mf.WRITE_ONLY, image_out[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, input_opencl, image[0:max_slices,:,:]).wait()

# Create the program
code = self._get_cl_code("_le_interpolation_lanczos_.cl", device['DP'])
prg = cl.Program(cl_ctx, code).build()
knl = prg.shiftScaleRotate

# Run the kernel
prg.shiftScaleRotate(
cl_queue,
image_out.shape,
None,
image_in.data,
image_out.data,
shift_row_in.data,
shift_col_in.data,
np.float32(scale_row),
np.float32(scale_col),
np.float32(angle)
)

# Wait for queue to finish
cl_queue.finish()

return np.asarray(image_out.get(),dtype=np.float32)
for i in range(0, image.shape[0], max_slices):
if image.shape[0] - i >= max_slices:
n_slices = max_slices
else:
n_slices = image.shape[0] - i
knl(
cl_queue,
(n_slices, int(image.shape[1]), int(image.shape[2])),
self.get_work_group(dc, (n_slices, image.shape[1], image.shape[2])),
input_opencl,
output_opencl,
np.float32(shift_row),
np.float32(shift_col),
np.float32(scale_row),
np.float32(scale_col),
np.float32(angle)
).wait()

cl.enqueue_copy(cl_queue, image_out[i:i+n_slices,:,:], output_opencl).wait()
if i<=image.shape[0]-max_slices:
cl.enqueue_copy(cl_queue, input_opencl, image[i+n_slices:i+2*n_slices,:,:]).wait()

cl_queue.finish()

input_opencl.release()
output_opencl.release()

return image_out

# tag-end

Expand Down
12 changes: 6 additions & 6 deletions src/nanopyx/core/transform/_le_interpolation_lanczos_.cl
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ shiftAndMagnify(__global float *image_in, __global float *image_out,

__kernel void shiftScaleRotate(__global float *image_in,
__global float *image_out,
__global float *shift_row,
__global float *shift_col, float scale_row,
float shift_row,
float shift_col, float scale_row,
float scale_col, float angle) {
// these are the indexes of the loop
int f = get_global_id(0);
Expand All @@ -104,11 +104,11 @@ __kernel void shiftScaleRotate(__global float *image_in,

int nPixels = rows * cols;

float col = (a * (cM - center_col - shift_col[f]) +
b * (rM - center_row - shift_row[f])) +
float col = (a * (cM - center_col - shift_col) +
b * (rM - center_row - shift_row)) +
center_col;
float row = (c * (cM - center_col - shift_col[f]) +
d * (rM - center_row - shift_row[f])) +
float row = (c * (cM - center_col - shift_col) +
d * (rM - center_row - shift_row)) +
center_row;

image_out[f * nPixels + rM * cols + cM] =
Expand Down
16 changes: 8 additions & 8 deletions src/nanopyx/core/transform/_le_radial_gradient_convergence.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ class RadialGradientConvergence(LiquidEngine):
# tag-end


def _run_opencl(self, gradient_col_interp, gradient_row_interp, image_interp, magnification=5, radius=1.5, sensitivity=1, doIntensityWeighting=True, device=None):
def _run_opencl(self, gradient_col_interp, gradient_row_interp, image_interp, magnification=5, radius=1.5, sensitivity=1, doIntensityWeighting=True, device=None, int mem_div=1):

# gradient gxgymag*mag*size
# image_interp = mag*size
Expand Down Expand Up @@ -230,21 +230,21 @@ class RadialGradientConvergence(LiquidEngine):

# Calculating max slices
size_per_slice = gradient_col_interp[0,:,:].nbytes + gradient_row_interp[0,:,:].nbytes + image_interp[0,:,:].nbytes + rgc_map[0,:,:].nbytes
max_slices = int((device['device'].global_mem_size // (size_per_slice))/4) # TODO 3 is a magic number
max_slices = int((device['device'].global_mem_size // (size_per_slice))/mem_div)
max_slices = self._check_max_slices(image_interp, max_slices)

# Initial buffers
mf = cl.mem_flags
grad_col_int_in = cl.Buffer(cl_ctx, mf.READ_ONLY, gradient_col_interp[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, grad_col_int_in, gradient_col_interp[0:max_slices,:,:]).wait()

grad_row_int_in = cl.Buffer(cl_ctx, mf.READ_ONLY, gradient_row_interp[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, grad_row_int_in, gradient_row_interp[0:max_slices,:,:]).wait()

image_interp_in = cl.Buffer(cl_ctx, mf.READ_ONLY, image_interp[0:max_slices,:,:].nbytes)
cl.enqueue_copy(cl_queue, image_interp_in, image_interp[0:max_slices,:,:]).wait()

rgc_map_out = cl.Buffer(cl_ctx, mf.WRITE_ONLY, rgc_map[0:max_slices,:,:].nbytes)


cl.enqueue_copy(cl_queue, grad_col_int_in, gradient_col_interp[0:max_slices,:,:]).wait()
cl.enqueue_copy(cl_queue, grad_row_int_in, gradient_row_interp[0:max_slices,:,:]).wait()
cl.enqueue_copy(cl_queue, image_interp_in, image_interp[0:max_slices,:,:]).wait()

# Code and building the kernel
code = self._get_cl_code("_le_radial_gradient_convergence.cl", device['DP'])
prg = cl.Program(cl_ctx, code).build()
Expand Down
Loading

0 comments on commit 95a275b

Please sign in to comment.