From b3b1b0881c4b62249317c5c3c09b5fe4cbcb2c24 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 31 Oct 2022 15:22:26 -0700 Subject: [PATCH] Rewrite python_bindings/apps (#7133) * apps * wip * WIP 2 * Fix comments * _GPU_SCHEDULE_ENUM_MAP * Update blur_generator.py * Add hl.funcs, hl.vars, plus formatting tweaks --- python_bindings/src/halide/__init__.py | 21 +- .../src/halide/_generator_helpers.py | 9 + python_bindings/test/apps/CMakeLists.txt | 59 +++-- ...al_grid_shell.py => bilateral_grid_app.py} | 19 +- .../test/apps/bilateral_grid_generator.py | 190 ++++++++++++++ python_bindings/test/apps/blur.py | 85 ------ python_bindings/test/apps/blur_app.py | 51 ++++ python_bindings/test/apps/blur_generator.py | 128 +++++++++ python_bindings/test/apps/erode.py | 93 ------- python_bindings/test/apps/interpolate.py | 212 --------------- python_bindings/test/apps/interpolate_app.py | 54 ++++ .../test/apps/interpolate_generator.py | 235 +++++++++++++++++ python_bindings/test/apps/local_laplacian.py | 245 ------------------ .../test/apps/local_laplacian_app.py | 58 +++++ .../test/apps/local_laplacian_generator.py | 226 ++++++++++++++++ .../test/generators/CMakeLists.txt | 5 - .../generators/bilateral_grid_generator.py | 136 ---------- 17 files changed, 1021 insertions(+), 805 deletions(-) rename python_bindings/test/apps/{bilateral_grid_shell.py => bilateral_grid_app.py} (80%) create mode 100644 python_bindings/test/apps/bilateral_grid_generator.py delete mode 100644 python_bindings/test/apps/blur.py create mode 100644 python_bindings/test/apps/blur_app.py create mode 100644 python_bindings/test/apps/blur_generator.py delete mode 100644 python_bindings/test/apps/erode.py delete mode 100644 python_bindings/test/apps/interpolate.py create mode 100644 python_bindings/test/apps/interpolate_app.py create mode 100644 python_bindings/test/apps/interpolate_generator.py delete mode 100644 python_bindings/test/apps/local_laplacian.py create mode 100644 
def funcs(names: str) -> "tuple[Func, ...]":
    """Create one Func per whitespace-separated name in `names`.

    Args:
        names: Names separated by whitespace, e.g. "blurx blury blurz".

    Returns:
        A tuple with one Func per name, in the order the names appear.
        It is a real tuple (not a generator), so it can be unpacked,
        indexed, passed to len() and iterated more than once.
    """
    # The annotation is quoted so it is never evaluated at definition time.
    # str.split() with no separator collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no Func is created for an empty name.
    return tuple(Func(n) for n in names.split())


# NOTE: the name shadows the builtin vars() inside this module; it is kept
# because it is exported as the public helper hl.vars().
def vars(names: str) -> "tuple[Var, ...]":
    """Create one Var per whitespace-separated name in `names`.

    Args:
        names: Names separated by whitespace, e.g. "x y c".

    Returns:
        A tuple with one Var per name, in the order the names appear.
        It is a real tuple (not a generator), so it can be unpacked,
        indexed, passed to len() and iterated more than once.
    """
    # Same whitespace handling as funcs(): no Var is created for an empty name.
    return tuple(Var(n) for n in names.split())
+++ b/python_bindings/test/apps/CMakeLists.txt @@ -1,25 +1,52 @@ -set(tests - bilateral_grid_shell.py - blur.py - erode.py - interpolate.py - local_laplacian.py) - set(TEST_TMPDIR "$") set(TEST_IMAGES_DIR "$") -set(DEPS_bilateral_grid_shell py_aot_bilateral_grid) -set(PYPATH_bilateral_grid_shell "$") -set(ARGS_bilateral_grid_shell ${TEST_IMAGES_DIR}/gray.png ${TEST_TMPDIR}/out.png 0.1 10) +set(APPS + bilateral_grid + blur + interpolate + local_laplacian) + +set(GENERATORS_bilateral_grid bilateral_grid bilateral_grid_Adams2019 bilateral_grid_Li2018 bilateral_grid_Mullapudi2016) +set(GENERATORS_interpolate interpolate interpolate_Mullapudi2016) +set(GENERATORS_local_laplacian local_laplacian local_laplacian_Mullapudi2016) +set(GENERATORS_blur blur) + +set(ARGS_bilateral_grid ${TEST_IMAGES_DIR}/gray.png 0.1 ${TEST_TMPDIR}/out.png) +set(ARGS_blur ${TEST_IMAGES_DIR}/gray.png ${TEST_TMPDIR}/out.png) +set(ARGS_interpolate ${TEST_IMAGES_DIR}/rgba.png ${TEST_TMPDIR}/out.png) +set(ARGS_local_laplacian ${TEST_IMAGES_DIR}/rgba.png 8 1 1 ${TEST_TMPDIR}/out.png) + +foreach (app IN LISTS APPS) + set(app_generator_src "${app}_generator.py") + add_halide_generator(app_gen_${app} + SOURCES ${app_generator_src}) + + set(DEPS "") + foreach (G IN ITEMS ${GENERATORS_${app}}) + add_halide_library(app_aot_${G} + FROM app_gen_${app} + GENERATOR ${G} + FUNCTION_NAME ${G} + USE_RUNTIME ${RUNTIME_${G}} + PYTHON_EXTENSION _ignored_result + # We don't really need all the plugins at once here -- + # it's just easier to specify them all + PLUGINS Halide::Adams2019 Halide::Li2018 Halide::Mullapudi2016) + + add_halide_python_extension_library(app_ext_${G} + MODULE_NAME ${G} + HALIDE_LIBRARIES app_aot_${G}) + list(APPEND DEPS app_ext_${G}) + endforeach() -foreach (test IN LISTS tests) - cmake_path(GET test STEM test_name) + set(app_src "${app}_app.py") add_python_test( - FILE "${test}" - TEST_ARGS ${ARGS_${test_name}} + FILE "${app_src}" + TEST_ARGS ${ARGS_${app}} LABEL python_apps - DEPENDS 
${DEPS_${test_name}} - PYTHONPATH ${PYPATH_${test_name}} + DEPENDS ${DEPS} + PYTHONPATH "$" ENVIRONMENT "TEST_TMPDIR=${TEST_TMPDIR}" "TEST_IMAGES_DIR=${TEST_IMAGES_DIR}" diff --git a/python_bindings/test/apps/bilateral_grid_shell.py b/python_bindings/test/apps/bilateral_grid_app.py similarity index 80% rename from python_bindings/test/apps/bilateral_grid_shell.py rename to python_bindings/test/apps/bilateral_grid_app.py index 880ea670e31d..e04a24594bde 100644 --- a/python_bindings/test/apps/bilateral_grid_shell.py +++ b/python_bindings/test/apps/bilateral_grid_app.py @@ -8,22 +8,22 @@ from bilateral_grid_Mullapudi2016 import bilateral_grid_Mullapudi2016 import halide.imageio import numpy as np -import os import sys from timeit import Timer def main(): - if len(sys.argv) < 5: - print("Usage: %s input.png output.png range_sigma timing_iterations" % sys.argv[0]) + if len(sys.argv) < 4: + print("Usage: %s input.png output.png range_sigma" % sys.argv[0]) print("e.g. %s input.png output.png 0.1 10" % sys.argv[0]) - sys.exit(0) + sys.exit(1) input_path = sys.argv[1] - output_path = sys.argv[2] - r_sigma = float(sys.argv[3]) - timing_iterations = int(sys.argv[4]) + r_sigma = float(sys.argv[2]) + output_path = sys.argv[3] + timing_iterations = 10 + print("Reading from %s ..." % input_path) input_buf_u8 = halide.imageio.imread(input_path) assert input_buf_u8.dtype == np.uint8 # Convert to float32 @@ -45,18 +45,19 @@ def main(): } for name, fn in tests.items(): - print("Running %s... " % name, end = "") + print("Running %s... " % name, end="") t = Timer(lambda: fn(input_buf, r_sigma, output_buf)) avg_time_sec = t.timeit(number=timing_iterations) / timing_iterations print("time: %fms" % (avg_time_sec * 1e3)) output_buf *= 255.0 output_buf_u8 = output_buf.astype(np.uint8) + print("Saving to %s ..." 
# Bilateral-grid generator. The three aliases build the same pipeline with a
# named autoscheduler selected instead of the manual schedule below.
@hl.alias(
    bilateral_grid_Adams2019={"autoscheduler": "Adams2019"},
    bilateral_grid_Mullapudi2016={"autoscheduler": "Mullapudi2016"},
    bilateral_grid_Li2018={"autoscheduler": "Li2018"},
)
@hl.generator()
class bilateral_grid:
    # Side length, in pixels, of one grid cell (a generate-time constant).
    s_sigma = hl.GeneratorParam(8)

    # Declaration order is kept exactly as in the original: the app calls the
    # compiled function as fn(input_buf, r_sigma, output_buf).
    input_buf = hl.InputBuffer(hl.Float(32), 2)
    r_sigma = hl.InputScalar(hl.Float(32))
    bilateral_grid = hl.OutputBuffer(hl.Float(32), 2)

    def generate(self):
        """Define the algorithm, the size estimates and the schedule."""
        cell = self.s_sigma
        half_cell = cell // 2
        range_sigma = self.r_sigma
        out = self.bilateral_grid

        x, y, z, c = hl.vars("x y z c")

        # Add a boundary condition.
        clamped = hl.BoundaryConditions.repeat_edge(self.input_buf)

        # Construct the bilateral grid: every pixel of a cell-sized window is
        # accumulated into the bin selected by its (clamped) intensity.
        r = hl.RDom([(0, cell), (0, cell)])
        sample = hl.clamp(
            clamped[x * cell + r.x - half_cell, y * cell + r.y - half_cell],
            0.0,
            1.0,
        )
        bin_index = hl.i32(sample / range_sigma + 0.5)

        histogram = hl.Func("histogram")
        histogram[x, y, z, c] = 0.0
        # Channel 0 accumulates the values, channel 1 accumulates the count.
        histogram[x, y, bin_index, c] += hl.mux(c, [sample, 1.0])

        # Blur the histogram with a five-tap (1 4 6 4 1) filter along z,
        # then x, then y.
        blurx, blury, blurz = hl.funcs("blurx blury blurz")
        blurz[x, y, z, c] = (
            histogram[x, y, z - 2, c]
            + histogram[x, y, z - 1, c] * 4
            + histogram[x, y, z, c] * 6
            + histogram[x, y, z + 1, c] * 4
            + histogram[x, y, z + 2, c]
        )
        blurx[x, y, z, c] = (
            blurz[x - 2, y, z, c]
            + blurz[x - 1, y, z, c] * 4
            + blurz[x, y, z, c] * 6
            + blurz[x + 1, y, z, c] * 4
            + blurz[x + 2, y, z, c]
        )
        blury[x, y, z, c] = (
            blurx[x, y - 2, z, c]
            + blurx[x, y - 1, z, c] * 4
            + blurx[x, y, z, c] * 6
            + blurx[x, y + 1, z, c] * 4
            + blurx[x, y + 2, z, c]
        )

        # Take trilinear samples of the blurred grid to compute the output.
        pixel = hl.clamp(clamped[x, y], 0.0, 1.0)
        depth = pixel / range_sigma
        cell_z = hl.i32(depth)
        frac_z = depth - cell_z
        frac_x = hl.f32(x % cell) / cell
        frac_y = hl.f32(y % cell) / cell
        cell_x = x / cell
        cell_y = y / cell

        def bilinear(zz):
            # Bilinear sample of blury in x/y at grid depth zz.
            near = hl.lerp(
                blury[cell_x, cell_y, zz, c], blury[cell_x + 1, cell_y, zz, c], frac_x
            )
            far = hl.lerp(
                blury[cell_x, cell_y + 1, zz, c],
                blury[cell_x + 1, cell_y + 1, zz, c],
                frac_x,
            )
            return hl.lerp(near, far, frac_y)

        interpolated = hl.Func("interpolated")
        interpolated[x, y, c] = hl.lerp(
            bilinear(cell_z), bilinear(cell_z + 1), frac_z
        )

        # Normalize: accumulated value divided by accumulated count.
        out[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]

        # ESTIMATES
        # (Useful with RunGen and benchmarks as well as with autoschedulers,
        # so they are provided in all cases.)
        image_extent = [(0, 1536), (0, 2560)]
        self.input_buf.set_estimates(image_extent)
        range_sigma.set_estimate(0.1)
        # TODO: Compute estimates from the parameter values
        histogram.set_estimate(z, -2, 16)
        for blurred in (blurz, blurx, blury):
            blurred.set_estimate(z, 0, 12)
        out.set_estimates(image_extent)

        if self.using_autoscheduler():
            # Nothing to schedule by hand.
            return

        if self.target().has_gpu_feature():
            # 0.50ms on an RTX 2060
            txi, tyi, tzi = hl.vars("xi yi zi")

            # Schedule blurz in 8x8 tiles. This is a tile in grid-space,
            # which means it represents something like 64x64 pixels in the
            # input (if s_sigma is 8).
            blurz.compute_root()
            blurz.reorder(c, z, x, y)
            blurz.gpu_tile(x, y, txi, tyi, 8, 8)

            # Schedule histogram to happen per-tile of blurz, with
            # intermediate results in shared memory. This means histogram
            # and blurz make a three-stage kernel:
            # 1) Zero out the 8x8 set of histograms
            # 2) Compute those histograms by iterating over lots of the
            #    input image
            # 3) Blur the set of histograms in z
            histogram.reorder(c, z, x, y)
            histogram.compute_at(blurz, x)
            histogram.gpu_threads(x, y)
            accumulate = histogram.update()
            accumulate.reorder(c, r.x, r.y, x, y)
            accumulate.gpu_threads(x, y)
            accumulate.unroll(c)

            # Schedule the remaining blurs and the sampling at the end
            # similarly.
            for blurred in (blurx, blury):
                blurred.compute_root()
                blurred.reorder(c, x, y, z)
                blurred.reorder_storage(c, x, y, z)
                blurred.vectorize(c)
                blurred.unroll(y, 2, hl.TailStrategy.RoundUp)
                blurred.gpu_tile(
                    x, y, z, txi, tyi, tzi, 32, 8, 1, hl.TailStrategy.RoundUp
                )
            out.compute_root()
            out.gpu_tile(x, y, txi, tyi, 32, 8)
            interpolated.compute_at(out, txi)
            interpolated.vectorize(c)
            return

        # CPU schedule.
        #
        # 3.98ms on an Intel i9-9960X using 32 threads at 3.7 GHz using
        # target x86-64-avx2. This is a little less SIMD-friendly than some
        # of the other apps, so we benefit from hyperthreading, and don't
        # benefit from AVX-512, which on my machine reduces the clock to
        # 3.0 GHz.
        blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 8).unroll(c)
        histogram.compute_at(blurz, y)
        histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
        for blurred in (blurx, blury):
            blurred.compute_root().reorder(c, x, y, z).parallel(z).vectorize(
                x, 8
            ).unroll(c)
        out.compute_root().parallel(y).vectorize(x, 8)
def main():
    """Run the AOT-compiled blur on one image and report its average time.

    Command line: <input.png> <output.png>

    Reads the input as uint8, widens it to uint16 for the generator, times
    the pipeline over a fixed number of iterations, then narrows the result
    back to uint8 and writes it to the output path. Exits with status 1 on
    bad usage or a non-uint8 input, 0 on success.
    """
    if len(sys.argv) < 3:
        print("Usage: %s input.png output.png" % sys.argv[0])
        # The number of timing iterations is fixed below and is no longer a
        # command-line argument, so the example must not show a trailing one.
        print("e.g. %s gray.png blurred.png" % sys.argv[0])
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    timing_iterations = 10

    print("Reading from %s ..." % input_path)
    input_buf_u8 = halide.imageio.imread(input_path)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if input_buf_u8.dtype != np.uint8:
        sys.exit("Expected a uint8 image, got dtype %s" % input_buf_u8.dtype)
    # Convert to uint16... but remember that the blur() generator
    # is documented as only working on <= 14 bits of image; if
    # we use the upper two bits we'll get incorrect results.
    # We'll just leave it with 8 bits of useful data.
    input_buf = input_buf_u8.astype(np.uint16)
    output_buf = np.empty(input_buf.shape, dtype=input_buf.dtype)

    tests = {
        "Manual": blur,
    }

    for name, fn in tests.items():
        # Flush so the label is visible while the timed run is in progress.
        print("Running %s... " % name, end="", flush=True)
        # The lambda is invoked within this iteration, so capturing `fn`
        # by reference is safe here.
        t = Timer(lambda: fn(input_buf, output_buf))
        avg_time_sec = t.timeit(number=timing_iterations) / timing_iterations
        print("time: %fms" % (avg_time_sec * 1e3))

    output_buf_u8 = output_buf.astype(np.uint8)
    print("Saving to %s ..." % output_path)
    halide.imageio.imwrite(output_path, output_buf_u8)

    print("Success!")
    sys.exit(0)
class BlurGPUSchedule(Enum):
    # Everything inlined into the output.
    Inline = 0
    # The intermediate result of blur_x is cached.
    Cache = 1
    # Sliding-window reuse within each work-item (OpenCL) or thread (CUDA).
    Slide = 2
    # Same as Slide, plus vectorization per work-item.
    SlideVectorize = 3


# Accepted values of the `gpu_schedule` GeneratorParam.
_GPU_SCHEDULE_ENUM_MAP = dict(
    inline=BlurGPUSchedule.Inline,
    cache=BlurGPUSchedule.Cache,
    slide=BlurGPUSchedule.Slide,
    slide_vector=BlurGPUSchedule.SlideVectorize,
)


@hl.generator()
class blur:
    gpu_schedule = hl.GeneratorParam("slide_vector")
    gpu_tile_x = hl.GeneratorParam(32)
    gpu_tile_y = hl.GeneratorParam(8)

    # Note: although this is declared as operating on uint16 images,
    # it will produce incorrect results if more than 14-bit images are used.
    input_buf = hl.InputBuffer(hl.UInt(16), 2)
    blur_y = hl.OutputBuffer(hl.UInt(16), 2)

    def generate(self):
        """Define the 3x3 box blur and pick a schedule for the target."""
        out = self.blur_y
        tile_x = self.gpu_tile_x
        tile_y = self.gpu_tile_y

        x, y, xi, yi = hl.vars("x y xi yi")

        # The algorithm: a 3-wide horizontal pass, then a 3-tall vertical one.
        clamped = hl.BoundaryConditions.repeat_edge(self.input_buf)

        blur_x = hl.Func("blur_x")
        blur_x[x, y] = (clamped[x, y] + clamped[x + 1, y] + clamped[x + 2, y]) // 3
        out[x, y] = (blur_x[x, y] + blur_x[x, y + 1] + blur_x[x, y + 2]) // 3

        # How to schedule it
        target = self.target()
        if target.has_gpu_feature():
            # GPU schedule.

            # An unknown string raises KeyError here, which is the intent.
            choice = _GPU_SCHEDULE_ENUM_MAP[self.gpu_schedule]

            if choice is BlurGPUSchedule.Inline:
                # Fully inlined.
                out.gpu_tile(x, y, xi, yi, tile_x, tile_y)

            elif choice is BlurGPUSchedule.Cache:
                # Cache the blur_x calculation per tile.
                out.gpu_tile(x, y, xi, yi, tile_x, tile_y)
                blur_x.compute_at(out, x)
                blur_x.gpu_threads(x, y)

            elif choice in (BlurGPUSchedule.Slide, BlurGPUSchedule.SlideVectorize):
                # Instead of caching blur_x explicitly, let each work-item
                # in OpenCL or thread in CUDA calculate more rows of blur_y
                # so that the temporary blur_x calculation is re-used
                # implicitly. This achieves a schedule similar to a sliding
                # window.
                y_inner = hl.Var("y_inner")
                if choice is BlurGPUSchedule.SlideVectorize:
                    # Vectorization factor of 2, applied before the split.
                    out.vectorize(x, 2)
                out.split(y, y, y_inner, tile_y)
                out.reorder(y_inner, x)
                out.unroll(y_inner)
                out.gpu_tile(x, y, xi, yi, tile_x, 1)

        elif target.has_feature(hl.TargetFeature.HVX):
            # Hexagon schedule.
            # TODO: Try using a schedule like the CPU one below.
            vector_size = 128

            out.compute_root()
            out.hexagon()
            out.prefetch(self.input_buf, y, y, 2)
            out.split(y, y, yi, 128)
            out.parallel(y)
            out.vectorize(x, vector_size * 2)

            blur_x.store_at(out, y)
            blur_x.compute_at(out, yi)
            blur_x.vectorize(x, vector_size)

        else:
            # CPU schedule.
            # Compute blur_x as needed at each vector of the output.
            # Halide will store blur_x in a circular buffer so its
            # results can be re-used.
            vector_size = self.natural_vector_size(self.input_buf.type())
            out.split(y, y, yi, 32).parallel(y).vectorize(x, vector_size)
            blur_x.store_at(out, y).compute_at(out, x).vectorize(x, vector_size)
- """ - - x = hl.Var("x") - y = hl.Var("y") - c = hl.Var("c") - input_clamped = hl.Func("input_clamped") - erode_x = hl.Func("erode_x") - erode_y = hl.Func("erode_y") - - input_clamped[x,y,c] = input[hl.clamp(x,hl.cast(hl.Int(32),0),hl.cast(hl.Int(32),input.width()-1)), - hl.clamp(y,hl.cast(hl.Int(32),0),hl.cast(hl.Int(32),input.height()-1)), c] - erode_x[x,y,c] = hl.min(hl.min(hl.min(hl.min(input_clamped[x-2,y,c],input_clamped[x-1,y,c]),input_clamped[x,y,c]),input_clamped[x+1,y,c]),input_clamped[x+2,y,c]) - erode_y[x,y,c] = hl.min(hl.min(hl.min(hl.min(erode_x[x,y-2,c],erode_x[x,y-1,c]),erode_x[x,y,c]),erode_x[x,y+1,c]),erode_x[x,y+2,c]) - - yi = hl.Var("yi") - - # CPU Schedule - erode_x.compute_root().split(y, y, yi, 8).parallel(y) - erode_y.compute_root().split(y, y, yi, 8).parallel(y) - - return erode_y - - -def get_input_data(): - image_path = os.path.join(apps_images_dir(), "rgb.png") - rgb_data = halide.imageio.imread(image_path) - print("rgb_data", type(rgb_data), rgb_data.shape, rgb_data.dtype) - - input_data = np.copy(rgb_data) - - return input_data - - -def main(): - - # define and compile the function - input = hl.ImageParam(hl.UInt(8), 3, "input") - erode = get_erode(input) - erode.compile_jit() - - # preparing input and output memory buffers (numpy ndarrays) - input_data = get_input_data() - input_image = hl.Buffer(input_data) - input.set(input_image) - - output_data = np.empty(input_data.shape, dtype=input_data.dtype) - output_image = hl.Buffer(output_data) - - print("input_image", input_image) - print("output_image", output_image) - - # do the actual computation - erode.realize(output_image) - - # save results - input_path = os.path.join(apps_output_dir(), "erode_input.png") - output_path = os.path.join(apps_output_dir(), "erode_result.png") - halide.imageio.imwrite(input_path, input_data) - halide.imageio.imwrite(output_path, output_data) - print("\nerode realized on output image.", - "Result saved at", output_path, - "( input data copy at", 
input_path, ")") - - print("\nEnd of game. Have a nice day!") - return - -if __name__ == "__main__": - main() diff --git a/python_bindings/test/apps/interpolate.py b/python_bindings/test/apps/interpolate.py deleted file mode 100644 index 5e23aa3020b4..000000000000 --- a/python_bindings/test/apps/interpolate.py +++ /dev/null @@ -1,212 +0,0 @@ -""" -Fast image interpolation using a pyramid. -""" - -import halide as hl - -from datetime import datetime -import halide.imageio -import numpy as np -import os.path - -# Return the directory to look in for test images: -# - If TEST_IMAGES_DIR is defined, use that -# - Otherwise, create a relative path to the C++ apps/images dir -def apps_images_dir(): - return os.environ.get("TEST_IMAGES_DIR", os.path.join(os.path.dirname(__file__), "../../apps/images")) - -# Return the directory to use when writing output files: -# - If TEST_TMPDIR is defined, use that -# - Otherwise, return an empty string (i.e., relative to whatever the current directory is) -def apps_output_dir(): - return os.environ.get("TEST_TMPDIR", "") - -int_t = hl.Int(32) -float_t = hl.Float(32) - - -def get_interpolate(input, levels): - """ - Build function, schedules it, and invokes jit compiler - :return: halide.hl.Func - """ - - # THE ALGORITHM - - downsampled = [hl.Func('downsampled%d' % i) for i in range(levels)] - downx = [hl.Func('downx%d' % l) for l in range(levels)] - interpolated = [hl.Func('interpolated%d' % i) for i in range(levels)] - - upsampled = [hl.Func('upsampled%d' % l) for l in range(levels)] - upsampledx = [hl.Func('upsampledx%d' % l) for l in range(levels)] - x = hl.Var('x') - y = hl.Var('y') - c = hl.Var('c') - - clamped = hl.Func('clamped') - clamped[x, y, c] = input[hl.clamp(x, 0, input.width() - 1), hl.clamp(y, 0, input.height() - 1), c] - - # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we - # rewrite it in a way that doesn't trigger the bug. The rewritten - # form assumes the input alpha is zero or one. 
- # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3]) - downsampled[0][x, y, c] = clamped[x, y, c] * clamped[x, y, 3] - - for l in range(1, levels): - prev = downsampled[l - 1] - - if l == 4: - # Also add a boundary condition at a middle pyramid level - # to prevent the footprint of the downsamplings to extend - # too far off the base image. Otherwise we look 512 - # pixels off each edge. - w = input.width() / (1 << l) - h = input.height() / (1 << l) - prev = hl.lambda_func(x, y, c, prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c]) - - downx[l][x, y, c] = (prev[x * 2 - 1, y, c] + 2.0 * prev[x * 2, y, c] + prev[x * 2 + 1, y, c]) * 0.25 - downsampled[l][x, y, c] = (downx[l][x, y * 2 - 1, c] + 2.0 * downx[l][x, y * 2, c] + downx[l][ - x, y * 2 + 1, c]) * 0.25 - - interpolated[levels - 1][x, y, c] = downsampled[levels - 1][x, y, c] - for l in range(levels - 1)[::-1]: - upsampledx[l][x, y, c] = (interpolated[l + 1][x / 2, y, c] + interpolated[l + 1][(x + 1) / 2, y, c]) / 2.0 - upsampled[l][x, y, c] = (upsampledx[l][x, y / 2, c] + upsampledx[l][x, (y + 1) / 2, c]) / 2.0 - interpolated[l][x, y, c] = downsampled[l][x, y, c] + (1.0 - downsampled[l][x, y, 3]) * upsampled[l][x, y, c] - - normalize = hl.Func('normalize') - normalize[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3] - - final = hl.Func('final') - final[x, y, c] = normalize[x, y, c] - - print("Finished function setup.") - - # THE SCHEDULE - target = hl.get_target_from_environment() - if target.has_gpu_feature(): - sched = 4 - else: - sched = 2 - - if sched == 0: - print("Flat schedule.") - for l in range(levels): - downsampled[l].compute_root() - interpolated[l].compute_root() - - final.compute_root() - - elif sched == 1: - print("Flat schedule with vectorization.") - for l in range(levels): - downsampled[l].compute_root().vectorize(x, 4) - interpolated[l].compute_root().vectorize(x, 4) - - final.compute_root() - - elif sched == 2: - print("Flat schedule 
with parallelization + vectorization") - xi, yi = hl.Var('xi'), hl.Var('yi') - clamped.compute_root().parallel(y).bound(c, 0, 4).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4) - for l in range(1, levels - 1): - if l > 0: - downsampled[l].compute_root().parallel(y).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4) - interpolated[l].compute_root().parallel(y).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4) - interpolated[l].unroll(x, 2).unroll(y, 2) - - final.reorder(c, x, y).bound(c, 0, 3).parallel(y) - final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi) - final.bound(x, 0, input.width()) - final.bound(y, 0, input.height()) - - elif sched == 3: - print("Flat schedule with vectorization sometimes.") - for l in range(levels): - if l + 4 < levels: - downsampled[l].compute_root().vectorize(x, 4) - interpolated[l].compute_root().vectorize(x, 4) - else: - downsampled[l].compute_root() - interpolated[l].compute_root() - - final.compute_root() - - elif sched == 4: - print("GPU schedule.") - - # Some gpus don't have enough memory to process the entire - # image, so we process the image in tiles. 
- yo, yi, xo, xi, ci = hl.Var('yo'), hl.Var('yi'), hl.Var('xo'), hl.Var("xi"), hl.Var("ci") - final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4) - final.tile(x, y, xo, yo, xi, yi, input.width() / 4, input.height() / 4) - normalize.compute_at(final, xo).reorder(c, x, y).gpu_tile(x, y, xi, yi, 16, 16).unroll(c) - - # Start from level 1 to save memory - level zero will be computed on demand - for l in range(1, levels): - tile_size = 32 >> l - if tile_size < 1: tile_size = 1 - if tile_size > 16: tile_size = 16 - downsampled[l].compute_root().gpu_tile(x, y, c, xi, yi, ci, tile_size, tile_size, 4) - interpolated[l].compute_at(final, xo).gpu_tile(x, y, c, xi, yi, ci, tile_size, tile_size, 4) - - else: - print("No schedule with this number.") - exit(1) - - # JIT compile the pipeline eagerly, so we don't interfere with timing - final.compile_jit(target) - - return final - - -def get_input_data(): - image_path = os.path.join(apps_images_dir(), "rgba.png") - rgba_data = halide.imageio.imread(image_path) - - # input data is in range [0, 1] - input_data = np.copy(rgba_data).astype(np.float32) / 255.0 - return input_data - - -def main(): - input = hl.ImageParam(float_t, 3, "input") - levels = 10 - - interpolate = get_interpolate(input, levels) - - # preparing input and output memory buffers (numpy ndarrays) - input_data = get_input_data() - input_image = hl.Buffer(input_data) - assert input_image.channels() == 4 - input.set(input_image) - - input_width, input_height = input_image.width(), input_image.height() - - t0 = datetime.now() - output_image = interpolate.realize([input_width, input_height, 3]) - t1 = datetime.now() - - elapsed = (t1 - t0).total_seconds() - print('Interpolated in {:.5f} secs'.format(elapsed)) - - output_data = np.asanyarray(output_image) - - # convert output - input_data = (input_data * 255).astype(np.uint8) - output_data = (output_data * 255).astype(np.uint8) - - # save results - input_path = os.path.join(apps_output_dir(), "interpolate_input.png") - 
def main():
    """Run the AOT-compiled interpolate pipelines on one image and time them.

    Command line: <input.png> <output.png>

    Reads the input as uint8, scales it to float32 in [0, 1], times each
    pipeline variant over a fixed number of iterations, then rescales the
    result of the last variant to uint8 and writes it to the output path.
    Exits with status 1 on bad usage or an unsuitable input, 0 on success.
    """
    if len(sys.argv) < 3:
        print("Usage: %s input.png output.png" % sys.argv[0])
        # The number of timing iterations is fixed below and is no longer a
        # command-line argument, so the example must not show a trailing one.
        print("e.g. %s rgba.png interpolated.png" % sys.argv[0])
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    timing_iterations = 10

    print("Reading from %s ..." % input_path)
    input_buf_u8 = halide.imageio.imread(input_path)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if input_buf_u8.dtype != np.uint8:
        sys.exit("Expected a uint8 image, got dtype %s" % input_buf_u8.dtype)
    # The code below indexes shape[1] and shape[2] as height and width, so a
    # 3-D array is required. The generator bounds its channel dimension to
    # exactly 4 (RGBA).
    # NOTE(review): assumes imread returns channel-first (c, h, w) data, as
    # the shape indexing below implies -- confirm against halide.imageio.
    if input_buf_u8.ndim != 3 or input_buf_u8.shape[0] != 4:
        sys.exit(
            "Expected a 4-channel (RGBA) image, got shape %s" % (input_buf_u8.shape,)
        )

    # Convert to float32 in range [0..1]
    input_buf = input_buf_u8.astype(np.float32) / 255.0
    h = input_buf.shape[1]
    w = input_buf.shape[2]
    # The output has only three channels.
    output_buf = np.empty([3, h, w], dtype=input_buf.dtype)

    tests = {
        "Manual": interpolate,
        "Mullapudi2016": interpolate_Mullapudi2016,
    }

    for name, fn in tests.items():
        # Flush so the label is visible while the timed run is in progress.
        print("Running %s... " % name, end="", flush=True)
        # The lambda is invoked within this iteration, so capturing `fn`
        # by reference is safe here.
        t = Timer(lambda: fn(input_buf, output_buf))
        avg_time_sec = t.timeit(number=timing_iterations) / timing_iterations
        print("time: %fms" % (avg_time_sec * 1e3))

    # Back to 8-bit for saving.
    output_buf *= 255.0
    output_buf_u8 = output_buf.astype(np.uint8)

    print("Saving to %s ..." % output_path)
    halide.imageio.imwrite(output_path, output_buf_u8)

    print("Success!")
    sys.exit(0)
+ w = g.input_buf.width() / (1 << (l - 1)) + h = g.input_buf.height() / (1 << (l - 1)) + prev = hl.lambda_func( + x, y, c, prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c] + ) + + downx[l][x, y, c] = ( + prev[x * 2 - 1, y, c] + 2 * prev[x * 2, y, c] + prev[x * 2 + 1, y, c] + ) * 0.25 + + downsampled[l][x, y, c] = ( + downx[l][x, y * 2 - 1, c] + + 2 * downx[l][x, y * 2, c] + + downx[l][x, y * 2 + 1, c] + ) * 0.25 + + interpolated[g.levels - 1][x, y, c] = downsampled[g.levels - 1][x, y, c] + + for l in range(g.levels - 2, -1, -1): + upsampledx[l][x, y, c] = ( + interpolated[l + 1][x / 2, y, c] + + interpolated[l + 1][(x + 1) / 2, y, c] + ) / 2 + upsampled[l][x, y, c] = ( + upsampledx[l][x, y / 2, c] + upsampledx[l][x, (y + 1) / 2, c] + ) / 2 + alpha = 1.0 - downsampled[l][x, y, 3] + interpolated[l][x, y, c] = ( + downsampled[l][x, y, c] + alpha * upsampled[l][x, y, c] + ) + + g.output_buf[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3] + + # Schedule + if g.using_autoscheduler(): + # nothing + pass + elif g.target().has_gpu_feature(): + # 0.86ms on a 2060 RTX + yo, yi, xo, xi, ci, xii, yii = hl.vars("yo yi xo xi ci xii yii") + + ( + g.output_buf.bound(x, 0, g.input_buf.width()) + .bound(y, 0, g.input_buf.height()) + .bound(c, 0, 3) + .reorder(c, x, y) + .tile(x, y, xi, yi, 32, 32, hl.TailStrategy.RoundUp) + .tile(xi, yi, xii, yii, 2, 2) + .gpu_blocks(x, y) + .gpu_threads(xi, yi) + .unroll(xii) + .unroll(yii) + .unroll(c) + ) + + for l in range(1, g.levels): + ( + downsampled[l] + .compute_root() + .reorder(c, x, y) + .unroll(c) + .gpu_tile(x, y, xi, yi, 16, 16) + ) + + for l in range(3, g.levels, 2): + ( + interpolated[l] + .compute_root() + .reorder(c, x, y) + .tile(x, y, xi, yi, 32, 32, hl.TailStrategy.RoundUp) + .tile(xi, yi, xii, yii, 2, 2) + .gpu_blocks(x, y) + .gpu_threads(xi, yi) + .unroll(xii) + .unroll(yii) + .unroll(c) + ) + + ( + upsampledx[1] + .compute_at(g.output_buf, x) + .reorder(c, x, y) + .tile(x, y, xi, yi, 2, 1) + .unroll(xi) + 
.unroll(yi) + .unroll(c) + .gpu_threads(x, y) + ) + + ( + interpolated[1] + .compute_at(g.output_buf, x) + .reorder(c, x, y) + .tile(x, y, xi, yi, 2, 2) + .unroll(xi) + .unroll(yi) + .unroll(c) + .gpu_threads(x, y) + ) + + ( + interpolated[2] + .compute_at(g.output_buf, x) + .reorder(c, x, y) + .unroll(c) + .gpu_threads(x, y) + ) + + else: + # 4.54ms on an Intel i9-9960X using 16 threads + xo, xi, yo, yi = hl.vars("xo xi yo yi") + vec = g.natural_vector_size(hl.Float(32)) + for l in range(1, g.levels - 1): + # We must refer to the downsampled stages in the + # upsampling later, so they must all be + # compute_root or redundantly recomputed, as in + # the local_laplacian app. + ( + downsampled[l] + .compute_root() + .reorder(x, c, y) + .split(y, yo, yi, 8) + .parallel(yo) + .vectorize(x, vec) + ) + + # downsampled[0] takes too long to compute_root, so + # we'll redundantly recompute it instead. Make a + # separate clone of it in the first downsampled stage + # so that we can schedule the two versions + # separately. 
+ ( + downsampled[0] + .clone_in(downx[1]) + .store_at(downsampled[1], yo) + .compute_at(downsampled[1], yi) + .reorder(c, x, y) + .unroll(c) + .vectorize(x, vec) + ) + + ( + g.output_buf.bound(x, 0, g.input_buf.width()) + .bound(y, 0, g.input_buf.height()) + .bound(c, 0, 3) + .split(x, xo, xi, vec) + .split(y, yo, yi, 32) + .reorder(xi, c, xo, yi, yo) + .unroll(c) + .vectorize(xi) + .parallel(yo) + ) + + for l in range(1, g.levels): + ( + interpolated[l] + .store_at(g.output_buf, yo) + .compute_at(g.output_buf, yi) + .vectorize(x, vec) + ) + + # Estimates (for autoscheduler; ignored otherwise) + ( + g.input_buf.dim(0) + .set_estimate(0, 1536) + .dim(1) + .set_estimate(0, 2560) + .dim(2) + .set_estimate(0, 4) + ) + ( + g.output_buf.output_buffer() + .dim(0) + .set_estimate(0, 1536) + .dim(1) + .set_estimate(0, 2560) + .dim(2) + .set_estimate(0, 3) + ) + + +if __name__ == "__main__": + hl.main() diff --git a/python_bindings/test/apps/local_laplacian.py b/python_bindings/test/apps/local_laplacian.py deleted file mode 100644 index 6a0a215f8059..000000000000 --- a/python_bindings/test/apps/local_laplacian.py +++ /dev/null @@ -1,245 +0,0 @@ -""" -Local Laplacian, see e.g. Aubry et al 2011, "Fast and Robust Pyramid-based Image Processing". 
-""" - -import halide as hl - -import numpy as np -import halide.imageio -import os.path - -# Return the directory to look in for test images: -# - If TEST_IMAGES_DIR is defined, use that -# - Otherwise, create a relative path to the C++ apps/images dir -def apps_images_dir(): - return os.environ.get("TEST_IMAGES_DIR", os.path.join(os.path.dirname(__file__), "../../apps/images")) - -# Return the directory to use when writing output files: -# - If TEST_TMPDIR is defined, use that -# - Otherwise, return an empty string (i.e., relative to whatever the current directory is) -def apps_output_dir(): - return os.environ.get("TEST_TMPDIR", "") - -int_t = hl.Int(32) -float_t = hl.Float(32) - - -def get_local_laplacian(input, levels, alpha, beta, J=8): - n_downsamples = 0 - n_upsamples = 0 - - x = hl.Var('x') - y = hl.Var('y') - - def downsample(f): - nonlocal n_downsamples - downx, downy = hl.Func('downx%i' % n_downsamples), hl.Func('downy%i' % n_downsamples) - n_downsamples += 1 - - downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 * (f[2 * x, y, c] + f[2 * x + 1, y, c]) + f[2 * x + 2, y, c]) / 8.0 - downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 * (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c]) - + downx[x, 2 * y + 2, c]) / 8.0 - - return downy - - def upsample(f): - nonlocal n_upsamples - upx, upy = hl.Func('upx%i' % n_upsamples), hl.Func('upy%i' % n_upsamples) - n_upsamples += 1 - - upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y, c] + 0.75 * f[x // 2, y, c] - upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2), c] + 0.75 * upx[x, y // 2, c] - - return upy - - def downsample2D(f): - nonlocal n_downsamples - downx, downy = hl.Func('downx%i' % n_downsamples), hl.Func('downy%i' % n_downsamples) - n_downsamples += 1 - - downx[x, y] = (f[2 * x - 1, y] + 3.0 * (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0 - downy[x, y] = (downx[x, 2 * y - 1] + 3.0 * (downx[x, 2 * y] + downx[x, 2 * y + 1]) + downx[x, 2 * y + 2]) / 8.0 - - return downy - - def upsample2D(f): - 
nonlocal n_upsamples - upx, upy = hl.Func('upx%i' % n_upsamples), hl.Func('upy%i' % n_upsamples) - n_upsamples += 1 - - upx[x, y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x // 2, y] - upy[x, y] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y // 2] - - return upy - - # THE ALGORITHM - - # loop variables - c = hl.Var('c') - k = hl.Var('k') - - # Make the remapping function as a lookup table. - remap = hl.Func('remap') - fx = hl.cast(float_t, x / 256.0) - # remap[x] = alpha*fx*exp(-fx*fx/2.0) - remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0) - - # Convert to floating point - floating = hl.Func('floating') - floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0 - - # Set a boundary condition - clamped = hl.Func('clamped') - clamped[x, y, c] = floating[hl.clamp(x, 0, input.width() - 1), hl.clamp(y, 0, input.height() - 1), c] - - # Get the luminance channel - gray = hl.Func('gray') - kR = hl.f32(0.299) - kG = hl.f32(0.587) - kB = hl.f32(0.114) - gray[x, y] = kR * clamped[x, y, 0] + kG * clamped[x, y, 1] + kB * clamped[x, y, 2] - - # Make the processed Gaussian pyramid. 
- gPyramid = [hl.Func('gPyramid%i' % i) for i in range(J)] - # Do a lookup into a lut with 256 entires per intensity level - level = k / (levels - 1) - idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0 - idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256) - gPyramid[0][x, y, k] = beta * (gray[x, y] - level) + level + remap[idx - 256 * k] - for j in range(1, J): - gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k] - - # Get its laplacian pyramid - lPyramid = [hl.Func('lPyramid%i' % i) for i in range(J)] - lPyramid[J - 1] = gPyramid[J - 1] - for j in range(J - 1)[::-1]: - lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(gPyramid[j + 1])[x, y, k] - - # Make the Gaussian pyramid of the input - inGPyramid = [hl.Func('inGPyramid%i' % i) for i in range(J)] - inGPyramid[0] = gray - for j in range(1, J): - inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y] - - # Make the laplacian pyramid of the output - outLPyramid = [hl.Func('outLPyramid%i' % i) for i in range(J)] - for j in range(J): - # Split input pyramid value into integer and floating parts - level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1) - li = hl.clamp(hl.cast(int_t, level), 0, levels - 2) - lf = level - hl.cast(float_t, li) - # Linearly interpolate between the nearest processed pyramid levels - outLPyramid[j][x, y] = (1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1] - - # Make the Gaussian pyramid of the output - outGPyramid = [hl.Func('outGPyramid%i' % i) for i in range(J)] - outGPyramid[J - 1] = outLPyramid[J - 1] - for j in range(J - 1)[::-1]: - outGPyramid[j][x, y] = upsample2D(outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y] - - # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input) - color = hl.Func('color') - eps = hl.f32(0.01) - color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + eps) / (gray[x, y] + eps) - - output = hl.Func('local_laplacian') - # Convert back to 16-bit - output[x, y, c] = 
hl.cast(hl.UInt(16), hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0) - - # THE SCHEDULE - target = hl.get_target_from_environment() - if target.has_gpu_feature(): - # GPU Schedule - print("Compiling for GPU") - xi, yi = hl.Var("xi"), hl.Var("yi") - - remap.compute_root() - output.compute_root().gpu_tile(x, y, xi, yi, 16, 8) - for j in range(J): - blockw = 16 - blockh = 8 - if j > 3: - blockw = 2 - blockh = 2 - if j > 0: - inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh) - gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh) - outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh) - else: - # CPU schedule - print("Compiling for CPU") - - remap.compute_root() - output.parallel(y, 4).vectorize(x, 4) - gray.compute_root().parallel(y, 4).vectorize(x, 4) - for j in range(4): - if j > 0: - inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4) - if j > 0: - gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4) - outGPyramid[j].compute_root().parallel(y).vectorize(x, 4) - for j in range(4, J): - inGPyramid[j].compute_root().parallel(y) - gPyramid[j].compute_root().parallel(k) - outGPyramid[j].compute_root().parallel(y) - - return output - - -def get_input_data(): - image_path = os.path.join(apps_images_dir(), "rgb.png") - rgb_data = halide.imageio.imread(image_path) - - # input data is in range [0, 256*256] - input_data = rgb_data.astype(np.uint16) << 8 - return input_data - - -def filter_test_image(local_laplacian, input): - local_laplacian.compile_jit(hl.get_target_from_environment()) - - # preparing input and output memory buffers (numpy ndarrays) - input_data = get_input_data() - input_image = hl.Buffer(input_data) - input.set(input_image) - - output_data = np.empty_like(input_data) - - # do the actual computation - input_width, input_height = input_image.width(), input_image.height() - output_image = local_laplacian.realize([input_width, input_height, 3]) - output_data = np.asanyarray(output_image) 
- - # convert back to uint8 - input_data = (input_data >> 8).astype(np.uint8) - output_data = (output_data >> 8).astype(np.uint8) - - # save results - input_path = os.path.join(apps_output_dir(), "local_laplacian_input.png") - output_path = os.path.join(apps_output_dir(), "local_laplacian.png") - - halide.imageio.imwrite(input_path, input_data) - halide.imageio.imwrite(output_path, output_data) - - print() - print("local_laplacian realized on output_image.") - print('Result saved at {} (input data copy at {}).'.format(output_path, input_path)) - - -def main(): - input_img = hl.ImageParam(hl.UInt(16), 3, 'input') - - # number of intensity levels - levels = hl.Param(int_t, 'levels', 8) - - # Parameters controlling the filter - alpha = hl.Param(float_t, 'alpha', 1.0 / 7.0) - beta = hl.Param(float_t, 'beta', 1.0) - - local_laplacian = get_local_laplacian(input_img, levels, alpha, beta) - - filter_test_image(local_laplacian, input_img) - - -if __name__ == '__main__': - main() diff --git a/python_bindings/test/apps/local_laplacian_app.py b/python_bindings/test/apps/local_laplacian_app.py new file mode 100644 index 000000000000..666799be467f --- /dev/null +++ b/python_bindings/test/apps/local_laplacian_app.py @@ -0,0 +1,58 @@ +""" +Shell for running Local Laplacian. +""" + +from local_laplacian import local_laplacian +from local_laplacian_Mullapudi2016 import local_laplacian_Mullapudi2016 +import halide.imageio +import numpy as np +import sys +from timeit import Timer + + +def main(): + if len(sys.argv) < 6: + print( + "Usage: %s input.png input.png levels alpha beta output.png" % sys.argv[0] + ) + print("e.g. %s input.png 8 1 1 output.png 10" % sys.argv[0]) + sys.exit(1) + + input_path = sys.argv[1] + levels = int(sys.argv[2]) + alpha = float(sys.argv[3]) + beta = float(sys.argv[4]) + output_path = sys.argv[5] + timing_iterations = 10 + + print("Reading from %s ..." 
% input_path) + input_buf_u8 = halide.imageio.imread(input_path) + assert input_buf_u8.dtype == np.uint8 + # Convert to uint16 in range [0..65535] + input_buf = input_buf_u8.astype(np.uint16) * 257 + h = input_buf.shape[1] + w = input_buf.shape[2] + output_buf = np.empty([3, h, w], dtype=input_buf.dtype) + + tests = { + "Manual": local_laplacian, + "Mullapudi2016": local_laplacian_Mullapudi2016, + } + + for name, fn in tests.items(): + print("Running %s... " % name, end="") + t = Timer(lambda: fn(input_buf, levels, alpha / (levels - 1), beta, output_buf)) + avg_time_sec = t.timeit(number=timing_iterations) / timing_iterations + print("time: %fms" % (avg_time_sec * 1e3)) + + output_buf_u8 = (output_buf // 257).astype(np.uint8) + + print("Saving to %s ..." % output_path) + halide.imageio.imwrite(output_path, output_buf_u8) + + print("Success!") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/python_bindings/test/apps/local_laplacian_generator.py b/python_bindings/test/apps/local_laplacian_generator.py new file mode 100644 index 000000000000..b53dde494465 --- /dev/null +++ b/python_bindings/test/apps/local_laplacian_generator.py @@ -0,0 +1,226 @@ +""" +Local Laplacian. 
+""" + +import halide as hl + +# Just declare these at global scope, for simplicity +x, y, c, k = hl.vars("x y c k") + + +def _func_list(name, size): + """Return a list containing `size` Funcs, named `name_n` for n in 0..size-1.""" + return [hl.Func("%s_%d" % (name, i)) for i in range(size)] + + +def _downsample(f): + """Downsample with a 1 3 3 1 filter""" + downx, downy = hl.funcs("downx downy") + downx[x, y, hl._] = ( + f[2 * x - 1, y, hl._] + + 3.0 * (f[2 * x, y, hl._] + f[2 * x + 1, y, hl._]) + + f[2 * x + 2, y, hl._] + ) / 8.0 + downy[x, y, hl._] = ( + downx[x, 2 * y - 1, hl._] + + 3.0 * (downx[x, 2 * y, hl._] + downx[x, 2 * y + 1, hl._]) + + downx[x, 2 * y + 2, hl._] + ) / 8.0 + return downy + + +def _upsample(f): + """Upsample using bilinear interpolation""" + upx, upy = hl.funcs("upx upy") + upx[x, y, hl._] = hl.lerp( + f[(x + 1) // 2, y, hl._], + f[(x - 1) // 2, y, hl._], + ((x % 2) * 2 + 1) / 4.0, + ) + upy[x, y, hl._] = hl.lerp( + upx[x, (y + 1) // 2, hl._], + upx[x, (y - 1) // 2, hl._], + ((y % 2) * 2 + 1) / 4.0, + ) + return upy + + +@hl.alias(local_laplacian_Mullapudi2016={"autoscheduler": "Mullapudi2016"}) +@hl.generator() +class local_laplacian: + pyramid_levels = hl.GeneratorParam(8) + + input_buf = hl.InputBuffer(hl.UInt(16), 3) + levels = hl.InputScalar(hl.Int(32)) + alpha = hl.InputScalar(hl.Float(32)) + beta = hl.InputScalar(hl.Float(32)) + output_buf = hl.OutputBuffer(hl.UInt(16), 3) + + def generate(self): + g = self + + # THE ALGORITHM + J = g.pyramid_levels + + # Make the remapping function as a lookup table. 
+ fx = hl.f32(x) / 256.0 + remap = hl.Func("remap") + remap[x] = g.alpha * fx * hl.exp(-fx * fx / 2.0) + + # Set a boundary condition + clamped = hl.BoundaryConditions.repeat_edge(g.input_buf) + + # Convert to floating point + floating = hl.Func("floating") + floating[x, y, c] = clamped[x, y, c] / 65535.0 + + # Get the luminance channel + gray = hl.Func("gray") + gray[x, y] = ( + hl.f32(0.299) * floating[x, y, 0] + + hl.f32(0.587) * floating[x, y, 1] + + hl.f32(0.114) * floating[x, y, 2] + ) + + # Make the processed Gaussian pyramid. + gPyramid = _func_list("gPyramid", J) + # Do a lookup into a lut with 256 entries per intensity level + level = k * (1.0 / (g.levels - 1)) + idx = gray[x, y] * hl.f32(g.levels - 1) * 256.0 + idx = hl.clamp(hl.i32(idx), 0, (g.levels - 1) * 256) + gPyramid[0][x, y, k] = ( + g.beta * (gray[x, y] - level) + level + remap[idx - 256 * k] + ) + for j in range(1, J): + gPyramid[j][x, y, k] = _downsample(gPyramid[j - 1])[x, y, k] + + # Get its laplacian pyramid + lPyramid = _func_list("lPyramid", J) + lPyramid[J - 1][x, y, k] = gPyramid[J - 1][x, y, k] + for j in range(J - 2, -1, -1): + lPyramid[j][x, y, k] = ( + gPyramid[j][x, y, k] - _upsample(gPyramid[j + 1])[x, y, k] + ) + + # Make the Gaussian pyramid of the input + inGPyramid = _func_list("inGPyramid", J) + inGPyramid[0][x, y] = gray[x, y] + for j in range(1, J): + inGPyramid[j][x, y] = _downsample(inGPyramid[j - 1])[x, y] + + # Make the laplacian pyramid of the output + outLPyramid = _func_list("outLPyramid", J) + for j in range(0, J): + # Split input pyramid value into integer and floating parts + level = inGPyramid[j][x, y] * hl.f32(g.levels - 1) + li = hl.clamp(hl.i32(level), 0, g.levels - 2) + lf = level - hl.f32(li) + # Linearly interpolate between the nearest processed pyramid levels + outLPyramid[j][x, y] = (1.0 - lf) * lPyramid[j][x, y, li] + ( + lf * lPyramid[j][x, y, li + 1] + ) + + # Make the Gaussian pyramid of the output + outGPyramid = _func_list("outGPyramid", J) + 
outGPyramid[J - 1][x, y] = outLPyramid[J - 1][x, y] + for j in range(J - 2, -1, -1): + outGPyramid[j][x, y] = ( + _upsample(outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y] + ) + + # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ + # apollo3.png input) + color = hl.Func("color") + eps = hl.f32(0.01) + color[x, y, c] = ( + outGPyramid[0][x, y] * (floating[x, y, c] + eps) / (gray[x, y] + eps) + ) + + # Convert back to 16-bit + g.output_buf[x, y, c] = hl.u16(hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0) + + # ESTIMATES + # (This can be useful in conjunction with RunGen and benchmarks as well + # as autoschedulers, so we do it in all cases.) + g.input_buf.set_estimates([(0, 1536), (0, 2560), (0, 3)]) + # Provide estimates on the parameters + g.levels.set_estimate(8) + g.alpha.set_estimate(1) + g.beta.set_estimate(1) + g.output_buf.set_estimates([(0, 1536), (0, 2560), (0, 3)]) + + # THE SCHEDULE + if g.using_autoscheduler(): + # nothing + pass + elif g.target().has_gpu_feature(): + # GPU schedule. + # 3.19ms on an RTX 2060. + remap.compute_root() + xi, yi = hl.vars("xi yi") + g.output_buf.compute_root().gpu_tile(x, y, xi, yi, 16, 8) + for j in range(0, J): + blockw = 16 + blockh = 8 + if j > 3: + blockw = 2 + blockh = 2 + + if j > 0: + inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh) + ( + gPyramid[j] + .compute_root() + .reorder(k, x, y) + .gpu_tile(x, y, xi, yi, blockw, blockh) + ) + + outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh) + + else: + # CPU schedule. + + # 21.4ms on an Intel i9-9960X using 32 threads at 3.7 + # GHz, using the target x86-64-avx2. + + # This app is dominated by data-dependent loads from + # memory, so we're better off leaving the AVX-512 units + # off in exchange for a higher clock, and we benefit from + # hyperthreading. 
+ + remap.compute_root() + yo = hl.Var("yo") + ( + g.output_buf.reorder(c, x, y) + .split(y, yo, y, 64) + .parallel(yo) + .vectorize(x, 8) + ) + gray.compute_root().parallel(y, 32).vectorize(x, 8) + for j in range(1, 5): + inGPyramid[j].compute_root().parallel(y, 32).vectorize(x, 8) + ( + gPyramid[j] + .compute_root() + .reorder_storage(x, k, y) + .reorder(k, y) + .parallel(y, 8) + .vectorize(x, 8) + ) + ( + outGPyramid[j] + .store_at(g.output_buf, yo) + .compute_at(g.output_buf, y) + .fold_storage(y, 4) + .vectorize(x, 8) + ) + + outGPyramid[0].compute_at(g.output_buf, y).vectorize(x, 8) + for j in range(5, J): + inGPyramid[j].compute_root() + gPyramid[j].compute_root().parallel(k) + outGPyramid[j].compute_root() + + +if __name__ == "__main__": + hl.main() diff --git a/python_bindings/test/generators/CMakeLists.txt b/python_bindings/test/generators/CMakeLists.txt index 27f0ac2e18e6..24253b9e333e 100644 --- a/python_bindings/test/generators/CMakeLists.txt +++ b/python_bindings/test/generators/CMakeLists.txt @@ -75,11 +75,6 @@ _add_python_aot_and_stub_extension(SOURCES addconstantpy_generator.py GENERATORS addconstantpy addconstantpy_with_offset_42 addconstantpy_with_negative_offset) -_add_python_aot_and_stub_extension(SOURCES bilateral_grid_generator.py - GENERATORS bilateral_grid - bilateral_grid_Adams2019 - bilateral_grid_Li2018 - bilateral_grid_Mullapudi2016) _add_python_aot_and_stub_extension(SOURCES bitpy_generator.py GENERATORS bitpy) _add_python_aot_and_stub_extension(SOURCES complexpy_generator.py GENERATORS complexpy) _add_python_aot_and_stub_extension(SOURCES simplepy_generator.py GENERATORS simplepy) diff --git a/python_bindings/test/generators/bilateral_grid_generator.py b/python_bindings/test/generators/bilateral_grid_generator.py deleted file mode 100644 index d6addeeb7dc5..000000000000 --- a/python_bindings/test/generators/bilateral_grid_generator.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -Bilateral histogram. 
-""" - -import halide as hl - -@hl.alias( - bilateral_grid_Adams2019={'autoscheduler':'Adams2019'}, - bilateral_grid_Mullapudi2016={'autoscheduler':'Mullapudi2016'}, - bilateral_grid_Li2018={'autoscheduler':'Li2018'}, -) -@hl.generator() -class bilateral_grid: - s_sigma = hl.GeneratorParam(8) - - input_buf = hl.InputBuffer(hl.Float(32), 2) - r_sigma = hl.InputScalar(hl.Float(32)) - bilateral_grid = hl.OutputBuffer(hl.Float(32), 2) - - def generate(self): - g = self - - x = hl.Var('x') - y = hl.Var('y') - z = hl.Var('z') - c = hl.Var('c') - - # Add a boundary condition - clamped = hl.BoundaryConditions.repeat_edge(g.input_buf) - - # Construct the bilateral grid - r = hl.RDom([(0, g.s_sigma), (0, g.s_sigma)]) - val = clamped[x * g.s_sigma + r.x - g.s_sigma // 2, y * g.s_sigma + r.y - g.s_sigma // 2] - val = hl.clamp(val, 0.0, 1.0) - - zi = hl.i32(val / g.r_sigma + 0.5) - - histogram = hl.Func('histogram') - histogram[x, y, z, c] = 0.0 - histogram[x, y, zi, c] += hl.mux(c, [val, 1.0]) - - # Blur the histogram using a five-tap filter - blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz') - blurz[x, y, z, c] = (histogram[x, y, z - 2, c] + histogram[x, y, z - 1, c] * 4 + histogram[x, y, z, c] * 6 + - histogram[x, y, z + 1, c] * 4 + histogram[x, y, z + 2, c]) - blurx[x, y, z, c] = (blurz[x - 2, y, z, c] + blurz[x - 1, y, z, c] * 4 + blurz[x, y, z, c] * 6 + - blurz[x + 1, y, z, c] * 4 + blurz[x + 2, y, z, c]) - blury[x, y, z, c] = (blurx[x, y - 2, z, c] + blurx[x, y - 1, z, c] * 4 + blurx[x, y, z, c] * 6 + - blurx[x, y + 1, z, c] * 4 + blurx[x, y + 2, z, c]) - - # Take trilinear samples to compute the output - val = hl.clamp(clamped[x, y], 0.0, 1.0) - zv = val / g.r_sigma - zi = hl.i32(zv) - zf = zv - zi - xf = hl.f32(x % g.s_sigma) / g.s_sigma - yf = hl.f32(y % g.s_sigma) / g.s_sigma - xi = x / g.s_sigma - yi = y / g.s_sigma - - interpolated = hl.Func('interpolated') - interpolated[x, y, c] = hl.lerp( - hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi 
+ 1, yi, zi, c], xf), - hl.lerp(blury[xi, yi + 1, zi, c], blury[xi + 1, yi + 1, zi, c], xf), yf), - hl.lerp(hl.lerp(blury[xi, yi, zi + 1, c], blury[xi + 1, yi, zi + 1, c], xf), - hl.lerp(blury[xi, yi + 1, zi + 1, c], blury[xi + 1, yi + 1, zi + 1, c], xf), yf), zf) - - # Normalize - g.bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1] - - # ESTIMATES - # (This can be useful in conjunction with RunGen and benchmarks as well - # as auto-schedule, so we do it in all cases.) - # Provide estimates on the input image - g.input_buf.set_estimates([(0, 1536), (0, 2560)]) - # Provide estimates on the parameters - g.r_sigma.set_estimate(0.1) - # TODO: Compute estimates from the parameter values - histogram.set_estimate(z, -2, 16) - blurz.set_estimate(z, 0, 12) - blurx.set_estimate(z, 0, 12) - blury.set_estimate(z, 0, 12) - g.bilateral_grid.set_estimates([(0, 1536), (0, 2560)]) - - if g.using_autoscheduler(): - # nothing - pass - else: - if g.target().has_gpu_feature(): - # 0.50ms on an RTX 2060 - - xi = hl.Var('xi') - yi = hl.Var('yi') - zi = hl.Var('zi') - - # Schedule blurz in 8x8 tiles. This is a tile in - # grid-space, which means it represents something like - # 64x64 pixels in the input (if s_sigma is 8). - blurz.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8) - - # Schedule histogram to happen per-tile of blurz, with - # intermediate results in shared memory. This means histogram - # and blurz makes a three-stage kernel: - # 1) Zero out the 8x8 set of histograms - # 2) Compute those histogram by iterating over lots of the input image - # 3) Blur the set of histograms in z - histogram.reorder(c, z, x, y).compute_at(blurz, x).gpu_threads(x, y) - histogram.update().reorder(c, r.x, r.y, x, y).gpu_threads(x, y).unroll(c) - - # Schedule the remaining blurs and the sampling at the end similarly. 
- blurx.compute_root().reorder(c, x, y, z).reorder_storage(c, x, y, z).vectorize(c).unroll( - y, 2, hl.TailStrategy.RoundUp).gpu_tile(x, y, z, xi, yi, zi, 32, 8, 1, hl.TailStrategy.RoundUp) - blury.compute_root().reorder(c, x, y, z).reorder_storage(c, x, y, z).vectorize(c).unroll( - y, 2, hl.TailStrategy.RoundUp).gpu_tile(x, y, z, xi, yi, zi, 32, 8, 1, hl.TailStrategy.RoundUp) - g.bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, 32, 8) - interpolated.compute_at(g.bilateral_grid, xi).vectorize(c) - else: - # CPU schedule. - - # 3.98ms on an Intel i9-9960X using 32 threads at 3.7 GHz - # using target x86-64-avx2. This is a little less - # SIMD-friendly than some of the other apps, so we - # benefit from hyperthreading, and don't benefit from - # AVX-512, which on my machine reduces the clock to 3.0 - # GHz. - - blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 8).unroll(c) - histogram.compute_at(blurz, y) - histogram.update().reorder(c, r.x, r.y, x, y).unroll(c) - blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 8).unroll(c) - blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 8).unroll(c) - g.bilateral_grid.compute_root().parallel(y).vectorize(x, 8) - - -if __name__ == '__main__': - hl.main()