From b3b1b0881c4b62249317c5c3c09b5fe4cbcb2c24 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Mon, 31 Oct 2022 15:22:26 -0700
Subject: [PATCH] Rewrite python_bindings/apps (#7133)

* apps

* wip

* WIP 2

* Fix comments

* _GPU_SCHEDULE_ENUM_MAP

* Update blur_generator.py

* Add hl.funcs, hl.vars, plus formatting tweaks
---
 python_bindings/src/halide/__init__.py        |  21 +-
 .../src/halide/_generator_helpers.py          |   9 +
 python_bindings/test/apps/CMakeLists.txt      |  59 +++--
 ...al_grid_shell.py => bilateral_grid_app.py} |  19 +-
 .../test/apps/bilateral_grid_generator.py     | 190 ++++++++++++++
 python_bindings/test/apps/blur.py             |  85 ------
 python_bindings/test/apps/blur_app.py         |  51 ++++
 python_bindings/test/apps/blur_generator.py   | 128 +++++++++
 python_bindings/test/apps/erode.py            |  93 -------
 python_bindings/test/apps/interpolate.py      | 212 ---------------
 python_bindings/test/apps/interpolate_app.py  |  54 ++++
 .../test/apps/interpolate_generator.py        | 235 +++++++++++++++++
 python_bindings/test/apps/local_laplacian.py  | 245 ------------------
 .../test/apps/local_laplacian_app.py          |  58 +++++
 .../test/apps/local_laplacian_generator.py    | 226 ++++++++++++++++
 .../test/generators/CMakeLists.txt            |   5 -
 .../generators/bilateral_grid_generator.py    | 136 ----------
 17 files changed, 1021 insertions(+), 805 deletions(-)
 rename python_bindings/test/apps/{bilateral_grid_shell.py => bilateral_grid_app.py} (80%)
 create mode 100644 python_bindings/test/apps/bilateral_grid_generator.py
 delete mode 100644 python_bindings/test/apps/blur.py
 create mode 100644 python_bindings/test/apps/blur_app.py
 create mode 100644 python_bindings/test/apps/blur_generator.py
 delete mode 100644 python_bindings/test/apps/erode.py
 delete mode 100644 python_bindings/test/apps/interpolate.py
 create mode 100644 python_bindings/test/apps/interpolate_app.py
 create mode 100644 python_bindings/test/apps/interpolate_generator.py
 delete mode 100644 python_bindings/test/apps/local_laplacian.py
 create mode 100644 python_bindings/test/apps/local_laplacian_app.py
 create mode 100644 python_bindings/test/apps/local_laplacian_generator.py
 delete mode 100644 python_bindings/test/generators/bilateral_grid_generator.py

diff --git a/python_bindings/src/halide/__init__.py b/python_bindings/src/halide/__init__.py
index bc3e85f06db4..8a6167d3d5f0 100644
--- a/python_bindings/src/halide/__init__.py
+++ b/python_bindings/src/halide/__init__.py
@@ -1,6 +1,19 @@
 from .halide_ import *
 from .halide_ import _, _1, _2, _3, _4, _5, _6, _7, _8, _9
-from ._generator_helpers import GeneratorParam, InputBuffer, InputScalar, OutputBuffer, \
-     OutputScalar, Generator, alias, generator, active_generator_context, \
-     _get_python_generator_names, _create_python_generator, \
-     _generatorcontext_enter, _generatorcontext_exit
+from ._generator_helpers import (
+    _create_python_generator,
+    _generatorcontext_enter,
+    _generatorcontext_exit,
+    _get_python_generator_names,
+    active_generator_context,
+    alias,
+    funcs,
+    Generator,
+    generator,
+    GeneratorParam,
+    InputBuffer,
+    InputScalar,
+    OutputBuffer,
+    OutputScalar,
+    vars,
+)
diff --git a/python_bindings/src/halide/_generator_helpers.py b/python_bindings/src/halide/_generator_helpers.py
index 49e04117a1b8..0358d741314f 100644
--- a/python_bindings/src/halide/_generator_helpers.py
+++ b/python_bindings/src/halide/_generator_helpers.py
@@ -809,3 +809,12 @@ def generator_impl(cls):
         return new_cls
 
     return generator_impl
+
+def funcs(names: str) -> "tuple[Func, ...]":
+    """Create one Func per whitespace-separated name in `names` and return them as a tuple, e.g. `a, b = funcs("a b")`."""
+    return tuple(Func(n) for n in names.split())
+
+
+def vars(names: str) -> "tuple[Var, ...]":
+    """Create one Var per whitespace-separated name in `names` and return them as a tuple, e.g. `x, y = vars("x y")`."""
+    return tuple(Var(n) for n in names.split())
diff --git a/python_bindings/test/apps/CMakeLists.txt b/python_bindings/test/apps/CMakeLists.txt
index 212d566b9279..9fbf0bf8eb11 100644
--- a/python_bindings/test/apps/CMakeLists.txt
+++ b/python_bindings/test/apps/CMakeLists.txt
@@ -1,25 +1,52 @@
-set(tests
-    bilateral_grid_shell.py
-    blur.py
-    erode.py
-    interpolate.py
-    local_laplacian.py)
-
 set(TEST_TMPDIR "$<SHELL_PATH:${CMAKE_CURRENT_BINARY_DIR}>")
 set(TEST_IMAGES_DIR "$<SHELL_PATH:${CMAKE_CURRENT_SOURCE_DIR}/../../../apps/images>")
 
-set(DEPS_bilateral_grid_shell   py_aot_bilateral_grid)
-set(PYPATH_bilateral_grid_shell "$<TARGET_FILE_DIR:py_aot_bilateral_grid>")
-set(ARGS_bilateral_grid_shell   ${TEST_IMAGES_DIR}/gray.png ${TEST_TMPDIR}/out.png 0.1 10)
+set(APPS
+    bilateral_grid
+    blur
+    interpolate
+    local_laplacian)
+
+set(GENERATORS_bilateral_grid   bilateral_grid bilateral_grid_Adams2019 bilateral_grid_Li2018 bilateral_grid_Mullapudi2016)
+set(GENERATORS_interpolate      interpolate interpolate_Mullapudi2016)
+set(GENERATORS_local_laplacian  local_laplacian local_laplacian_Mullapudi2016)
+set(GENERATORS_blur             blur)
+
+set(ARGS_bilateral_grid   ${TEST_IMAGES_DIR}/gray.png 0.1 ${TEST_TMPDIR}/out.png)
+set(ARGS_blur             ${TEST_IMAGES_DIR}/gray.png ${TEST_TMPDIR}/out.png)
+set(ARGS_interpolate      ${TEST_IMAGES_DIR}/rgba.png ${TEST_TMPDIR}/out.png)
+set(ARGS_local_laplacian  ${TEST_IMAGES_DIR}/rgba.png 8 1 1 ${TEST_TMPDIR}/out.png)
+
+foreach (app IN LISTS APPS)
+    set(app_generator_src "${app}_generator.py")
+    add_halide_generator(app_gen_${app}
+                         SOURCES ${app_generator_src})
+
+    set(DEPS "")
+    foreach (G IN ITEMS ${GENERATORS_${app}})
+        add_halide_library(app_aot_${G}
+                           FROM app_gen_${app}
+                           GENERATOR ${G}
+                           FUNCTION_NAME ${G}
+                           USE_RUNTIME ${RUNTIME_${G}}
+                           PYTHON_EXTENSION _ignored_result
+                           # We don't really need all the plugins at once here --
+                           # it's just easier to specify them all
+                           PLUGINS Halide::Adams2019 Halide::Li2018 Halide::Mullapudi2016)
+
+        add_halide_python_extension_library(app_ext_${G}
+                                            MODULE_NAME ${G}
+                                            HALIDE_LIBRARIES app_aot_${G})
+        list(APPEND DEPS app_ext_${G})
+    endforeach()
 
-foreach (test IN LISTS tests)
-    cmake_path(GET test STEM test_name)
+    set(app_src "${app}_app.py")
     add_python_test(
-        FILE "${test}"
-        TEST_ARGS ${ARGS_${test_name}}
+        FILE "${app_src}"
+        TEST_ARGS ${ARGS_${app}}
         LABEL python_apps
-        DEPENDS ${DEPS_${test_name}}
-        PYTHONPATH ${PYPATH_${test_name}}
+        DEPENDS ${DEPS}
+        PYTHONPATH "$<TARGET_FILE_DIR:app_ext_${app}>"
         ENVIRONMENT
         "TEST_TMPDIR=${TEST_TMPDIR}"
         "TEST_IMAGES_DIR=${TEST_IMAGES_DIR}"
diff --git a/python_bindings/test/apps/bilateral_grid_shell.py b/python_bindings/test/apps/bilateral_grid_app.py
similarity index 80%
rename from python_bindings/test/apps/bilateral_grid_shell.py
rename to python_bindings/test/apps/bilateral_grid_app.py
index 880ea670e31d..e04a24594bde 100644
--- a/python_bindings/test/apps/bilateral_grid_shell.py
+++ b/python_bindings/test/apps/bilateral_grid_app.py
@@ -8,22 +8,22 @@
 from bilateral_grid_Mullapudi2016 import bilateral_grid_Mullapudi2016
 import halide.imageio
 import numpy as np
-import os
 import sys
 from timeit import Timer
 
 
 def main():
-    if len(sys.argv) < 5:
-        print("Usage: %s input.png output.png range_sigma timing_iterations" % sys.argv[0])
+    if len(sys.argv) < 4:
+        print("Usage: %s input.png output.png range_sigma" % sys.argv[0])
         print("e.g. %s input.png output.png 0.1 10" % sys.argv[0])
-        sys.exit(0)
+        sys.exit(1)
 
     input_path = sys.argv[1]
-    output_path = sys.argv[2]
-    r_sigma = float(sys.argv[3])
-    timing_iterations = int(sys.argv[4])
+    r_sigma = float(sys.argv[2])
+    output_path = sys.argv[3]
+    timing_iterations = 10
 
+    print("Reading from %s ..." % input_path)
     input_buf_u8 = halide.imageio.imread(input_path)
     assert input_buf_u8.dtype == np.uint8
     # Convert to float32
@@ -45,18 +45,19 @@ def main():
     }
 
     for name, fn in tests.items():
-        print("Running %s... " % name, end = "")
+        print("Running %s... " % name, end="")
         t = Timer(lambda: fn(input_buf, r_sigma, output_buf))
         avg_time_sec = t.timeit(number=timing_iterations) / timing_iterations
         print("time: %fms" % (avg_time_sec * 1e3))
 
     output_buf *= 255.0
     output_buf_u8 = output_buf.astype(np.uint8)
+    print("Saving to %s ..." % output_path)
     halide.imageio.imwrite(output_path, output_buf_u8)
 
     print("Success!")
     sys.exit(0)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/python_bindings/test/apps/bilateral_grid_generator.py b/python_bindings/test/apps/bilateral_grid_generator.py
new file mode 100644
index 000000000000..833ac0aa5fd2
--- /dev/null
+++ b/python_bindings/test/apps/bilateral_grid_generator.py
@@ -0,0 +1,190 @@
+"""
+Bilateral histogram.
+"""
+
+import halide as hl
+
+
+@hl.alias(
+    bilateral_grid_Adams2019={"autoscheduler": "Adams2019"},
+    bilateral_grid_Mullapudi2016={"autoscheduler": "Mullapudi2016"},
+    bilateral_grid_Li2018={"autoscheduler": "Li2018"},
+)
+@hl.generator()
+class bilateral_grid:
+    """Bilateral-grid filter generator with manual GPU/CPU schedules plus autoscheduled aliases."""
+    s_sigma = hl.GeneratorParam(8)
+
+    input_buf = hl.InputBuffer(hl.Float(32), 2)
+    r_sigma = hl.InputScalar(hl.Float(32))
+    bilateral_grid = hl.OutputBuffer(hl.Float(32), 2)
+
+    def generate(self):
+        """Build the grid, blur it, sample the output, set estimates, then schedule."""
+        g = self
+
+        x, y, z, c = hl.vars("x y z c")
+
+        # Add a boundary condition
+        clamped = hl.BoundaryConditions.repeat_edge(g.input_buf)
+
+        # Construct the bilateral grid
+        r = hl.RDom([(0, g.s_sigma), (0, g.s_sigma)])
+        val = clamped[
+            x * g.s_sigma + r.x - g.s_sigma // 2,
+            y * g.s_sigma + r.y - g.s_sigma // 2,
+        ]
+        val = hl.clamp(val, 0.0, 1.0)
+
+        zi = hl.i32(val / g.r_sigma + 0.5)
+
+        histogram = hl.Func("histogram")
+        histogram[x, y, z, c] = 0.0
+        histogram[x, y, zi, c] += hl.mux(c, [val, 1.0])
+
+        # Blur the histogram using a five-tap (1 4 6 4 1) filter along z, x, then y
+        blurx, blury, blurz = hl.funcs("blurx blury blurz")
+        blurz[x, y, z, c] = (
+            histogram[x, y, z - 2, c]
+            + histogram[x, y, z - 1, c] * 4
+            + histogram[x, y, z, c] * 6
+            + histogram[x, y, z + 1, c] * 4
+            + histogram[x, y, z + 2, c]
+        )
+        blurx[x, y, z, c] = (
+            blurz[x - 2, y, z, c]
+            + blurz[x - 1, y, z, c] * 4
+            + blurz[x, y, z, c] * 6
+            + blurz[x + 1, y, z, c] * 4
+            + blurz[x + 2, y, z, c]
+        )
+        blury[x, y, z, c] = (
+            blurx[x, y - 2, z, c]
+            + blurx[x, y - 1, z, c] * 4
+            + blurx[x, y, z, c] * 6
+            + blurx[x, y + 1, z, c] * 4
+            + blurx[x, y + 2, z, c]
+        )
+
+        # Take trilinear samples to compute the output
+        val = hl.clamp(clamped[x, y], 0.0, 1.0)
+        zv = val / g.r_sigma
+        zi = hl.i32(zv)
+        zf = zv - zi
+        xf = hl.f32(x % g.s_sigma) / g.s_sigma
+        yf = hl.f32(y % g.s_sigma) / g.s_sigma
+        xi = x / g.s_sigma
+        yi = y / g.s_sigma
+
+        interpolated = hl.Func("interpolated")
+        interpolated[x, y, c] = hl.lerp(
+            hl.lerp(
+                hl.lerp(blury[xi, yi, zi, c], blury[xi + 1, yi, zi, c], xf),
+                hl.lerp(blury[xi, yi + 1, zi, c], blury[xi + 1, yi + 1, zi, c], xf),
+                yf,
+            ),
+            hl.lerp(
+                hl.lerp(blury[xi, yi, zi + 1, c], blury[xi + 1, yi, zi + 1, c], xf),
+                hl.lerp(
+                    blury[xi, yi + 1, zi + 1, c], blury[xi + 1, yi + 1, zi + 1, c], xf
+                ),
+                yf,
+            ),
+            zf,
+        )
+
+        # Normalize: divide the accumulated values (c == 0) by the accumulated weights (c == 1)
+        g.bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]
+
+        # ESTIMATES (useful in conjunction with RunGen and benchmarks as well
+        # as auto-schedule, so we do it in all cases).
+        # Provide estimates on the input image
+        g.input_buf.set_estimates([(0, 1536), (0, 2560)])
+        # Provide estimates on the parameters
+        g.r_sigma.set_estimate(0.1)
+        # TODO: Compute estimates from the parameter values
+        histogram.set_estimate(z, -2, 16)
+        blurz.set_estimate(z, 0, 12)
+        blurx.set_estimate(z, 0, 12)
+        blury.set_estimate(z, 0, 12)
+        g.bilateral_grid.set_estimates([(0, 1536), (0, 2560)])
+
+        if g.using_autoscheduler():
+            # Nothing to do here: no manual schedule is applied in this case.
+            pass
+        elif g.target().has_gpu_feature():
+            # 0.50ms on an RTX 2060
+
+            xi, yi, zi = hl.vars("xi yi zi")
+
+            # Schedule blurz in 8x8 tiles. This is a tile in
+            # grid-space, which means it represents something like
+            # 64x64 pixels in the input (if s_sigma is 8).
+            blurz.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
+
+            # Schedule histogram to happen per-tile of blurz, with
+            # intermediate results in shared memory. This means histogram
+            # and blurz make a three-stage kernel:
+            # 1) Zero out the 8x8 set of histograms
+            # 2) Compute those histograms by iterating over lots of the input image
+            # 3) Blur the set of histograms in z
+            histogram.reorder(c, z, x, y).compute_at(blurz, x).gpu_threads(x, y)
+            histogram.update().reorder(c, r.x, r.y, x, y).gpu_threads(x, y).unroll(c)
+
+            # Schedule the remaining blurs and the sampling at the end similarly.
+            (
+                blurx.compute_root()
+                .reorder(c, x, y, z)
+                .reorder_storage(c, x, y, z)
+                .vectorize(c)
+                .unroll(y, 2, hl.TailStrategy.RoundUp)
+                .gpu_tile(x, y, z, xi, yi, zi, 32, 8, 1, hl.TailStrategy.RoundUp)
+            )
+            (
+                blury.compute_root()
+                .reorder(c, x, y, z)
+                .reorder_storage(c, x, y, z)
+                .vectorize(c)
+                .unroll(y, 2, hl.TailStrategy.RoundUp)
+                .gpu_tile(x, y, z, xi, yi, zi, 32, 8, 1, hl.TailStrategy.RoundUp)
+            )
+            g.bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, 32, 8)
+            interpolated.compute_at(g.bilateral_grid, xi).vectorize(c)
+        else:
+            # CPU schedule.
+
+            # 3.98ms on an Intel i9-9960X using 32 threads at 3.7 GHz
+            # using target x86-64-avx2. This is a little less
+            # SIMD-friendly than some of the other apps, so we
+            # benefit from hyperthreading, and don't benefit from
+            # AVX-512, which on my machine reduces the clock to 3.0
+            # GHz.
+
+            (
+                blurz.compute_root()
+                .reorder(c, z, x, y)
+                .parallel(y)
+                .vectorize(x, 8)
+                .unroll(c)
+            )
+            histogram.compute_at(blurz, y)
+            histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
+            (
+                blurx.compute_root()  #
+                .reorder(c, x, y, z)  #
+                .parallel(z)  #
+                .vectorize(x, 8)  #
+                .unroll(c)
+            )
+            (
+                blury.compute_root()
+                .reorder(c, x, y, z)
+                .parallel(z)
+                .vectorize(x, 8)
+                .unroll(c)
+            )
+            g.bilateral_grid.compute_root().parallel(y).vectorize(x, 8)
+
+
+if __name__ == "__main__":
+    hl.main()
diff --git a/python_bindings/test/apps/blur.py b/python_bindings/test/apps/blur.py
deleted file mode 100644
index 58700dc5f167..000000000000
--- a/python_bindings/test/apps/blur.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import halide as hl
-
-import numpy as np
-import halide.imageio
-import os.path
-
-# Return the directory to look in for test images:
-# - If TEST_IMAGES_DIR is defined, use that
-# - Otherwise, create a relative path to the C++ apps/images dir
-def apps_images_dir():
-    return os.environ.get("TEST_IMAGES_DIR", os.path.join(os.path.dirname(__file__), "../../apps/images"))
-
-# Return the directory to use when writing output files:
-# - If TEST_TMPDIR is defined, use that
-# - Otherwise, return an empty string (i.e., relative to whatever the current directory is)
-def apps_output_dir():
-    return os.environ.get("TEST_TMPDIR", "")
-
-def get_blur(input):
-    assert type(input) == hl.ImageParam
-    assert input.dimensions() == 2
-
-    x, y = hl.Var("x"), hl.Var("y")
-
-    clamped_input = hl.BoundaryConditions.repeat_edge(input)
-
-    input_uint16 = hl.Func("input_uint16")
-    input_uint16[x,y] = hl.u16(clamped_input[x,y])
-    ci = input_uint16
-
-    blur_x = hl.Func("blur_x")
-    blur_y = hl.Func("blur_y")
-
-    blur_x[x,y] = (ci[x,y]+ci[x+1,y]+ci[x+2,y])/3
-    blur_y[x,y] = hl.cast(hl.UInt(8), (blur_x[x,y]+blur_x[x,y+1]+blur_x[x,y+2])/3)
-
-    # schedule
-    xi, yi = hl.Var("xi"), hl.Var("yi")
-    blur_y.tile(x, y, xi, yi, 8, 4).parallel(y).vectorize(xi, 8)
-    blur_x.compute_at(blur_y, x).vectorize(x, 8)
-
-    return blur_y
-
-
-def get_input_data():
-    image_path = os.path.join(apps_images_dir(), "rgb.png")
-    rgb_data = halide.imageio.imread(image_path)
-
-    grey_data = np.mean(rgb_data, axis=0, dtype=np.float32).astype(rgb_data.dtype)
-    input_data = np.copy(grey_data)
-
-    return input_data
-
-def main():
-    # define and compile the function
-    input = hl.ImageParam(hl.UInt(8), 2, "input_param")
-    blur = get_blur(input)
-    blur.compile_jit()
-
-    # preparing input and output memory buffers (numpy ndarrays)
-    input_data = get_input_data()
-    input_image = hl.Buffer(input_data)
-    input.set(input_image)
-
-    output_data = np.empty(input_data.shape, dtype=input_data.dtype)
-    output_image = hl.Buffer(output_data)
-
-    # do the actual computation
-    blur.realize(output_image)
-
-    # save results
-    input_path = os.path.join(apps_output_dir(), "blur_input.png")
-    output_path = os.path.join(apps_output_dir(), "blur_result.png")
-    halide.imageio.imwrite(input_path, input_data)
-    halide.imageio.imwrite(output_path, output_data)
-    print("\nblur realized on output image.",
-          "Result saved at", output_path,
-          "( input data copy at", input_path, ")")
-
-    print("\nEnd of game. Have a nice day!")
-    return
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python_bindings/test/apps/blur_app.py b/python_bindings/test/apps/blur_app.py
new file mode 100644
index 000000000000..f759e344886d
--- /dev/null
+++ b/python_bindings/test/apps/blur_app.py
@@ -0,0 +1,51 @@
+"""
+Simple blur.
+"""
+
+from blur import blur
+import halide.imageio
+import numpy as np
+import sys
+from timeit import Timer
+
+
+def main():
+    """Run the imported blur() on the input image, print its average time, and save the result."""
+    if len(sys.argv) < 3:
+        print("Usage: %s input.png output.png" % sys.argv[0])
+        print("e.g. %s gray.png blurred.png" % sys.argv[0])
+        sys.exit(1)
+
+    input_path, output_path = sys.argv[1:3]
+    timing_iterations = 10
+
+    print("Reading from %s ..." % input_path)
+    input_buf_u8 = halide.imageio.imread(input_path)
+    assert input_buf_u8.dtype == np.uint8
+    # Convert to uint16... but remember that the blur() generator
+    # is documented as only working on <= 14 bits of image; if
+    # we use the upper two bits we'll get incorrect results.
+    # We'll just leave it with 8 bits of useful data.
+    input_buf = input_buf_u8.astype(np.uint16)
+    output_buf = np.empty(input_buf.shape, dtype=input_buf.dtype)
+
+    tests = {
+        "Manual": blur,
+    }
+
+    for name, fn in tests.items():
+        print("Running %s... " % name, end="")
+        t = Timer(lambda: fn(input_buf, output_buf))
+        avg_time_sec = t.timeit(number=timing_iterations) / timing_iterations
+        print("time: %fms" % (avg_time_sec * 1e3))
+
+    output_buf_u8 = output_buf.astype(np.uint8)
+    print("Saving to %s ..." % output_path)
+    halide.imageio.imwrite(output_path, output_buf_u8)
+
+    print("Success!")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python_bindings/test/apps/blur_generator.py b/python_bindings/test/apps/blur_generator.py
new file mode 100644
index 000000000000..5a02a4a54947
--- /dev/null
+++ b/python_bindings/test/apps/blur_generator.py
@@ -0,0 +1,128 @@
+"""
+Simple blur.
+"""
+
+import halide as hl
+from enum import Enum
+
+
+class BlurGPUSchedule(Enum):
+    """GPU scheduling strategies that the blur generator's gpu_schedule string maps to."""
+    # Fully inlined schedule.
+    Inline = 0
+    # Schedule caching the intermediate result of blur_x.
+    Cache = 1
+    # Sliding-window schedule within each work-item (OpenCL) or thread (CUDA).
+    Slide = 2
+    # The same as Slide, plus vectorization per work-item.
+    SlideVectorize = 3
+
+
+_GPU_SCHEDULE_ENUM_MAP = {
+    "inline": BlurGPUSchedule.Inline,
+    "cache": BlurGPUSchedule.Cache,
+    "slide": BlurGPUSchedule.Slide,
+    "slide_vector": BlurGPUSchedule.SlideVectorize,
+}
+
+
+@hl.generator()
+class blur:
+    """Separable 3x3 box blur (blur_x, then blur_y) with GPU, Hexagon HVX and CPU schedules."""
+    gpu_schedule = hl.GeneratorParam("slide_vector")
+    gpu_tile_x = hl.GeneratorParam(32)
+    gpu_tile_y = hl.GeneratorParam(8)
+
+    # Note: although this is declared as operating on uint16 images,
+    # it will produce incorrect results if more than 14-bit images are used.
+    input_buf = hl.InputBuffer(hl.UInt(16), 2)
+    blur_y = hl.OutputBuffer(hl.UInt(16), 2)
+
+    def generate(self):
+        """Define the blur algorithm and apply the schedule matching the target."""
+        g = self
+
+        x, y, xi, yi = hl.vars("x y xi yi")
+
+        # The algorithm: 3-tap average along x, then 3-tap average along y.
+        clamped = hl.BoundaryConditions.repeat_edge(g.input_buf)
+
+        blur_x = hl.Func("blur_x")
+        blur_x[x, y] = (clamped[x, y] + clamped[x + 1, y] + clamped[x + 2, y]) // 3
+        g.blur_y[x, y] = (blur_x[x, y] + blur_x[x, y + 1] + blur_x[x, y + 2]) // 3
+
+        # How to schedule it
+        if g.target().has_gpu_feature():
+            # GPU schedule.
+
+            # Unknown gpu_schedule strings raise KeyError here, which is what we want.
+            schedule_enum = _GPU_SCHEDULE_ENUM_MAP[g.gpu_schedule]
+
+            if schedule_enum == BlurGPUSchedule.Inline:
+                # - Fully inlining.
+                g.blur_y.gpu_tile(x, y, xi, yi, g.gpu_tile_x, g.gpu_tile_y)
+
+            elif schedule_enum == BlurGPUSchedule.Cache:
+                # - Cache blur_x calculation.
+                g.blur_y.gpu_tile(x, y, xi, yi, g.gpu_tile_x, g.gpu_tile_y)
+                blur_x.compute_at(g.blur_y, x).gpu_threads(x, y)
+
+            elif schedule_enum == BlurGPUSchedule.Slide:
+                # - Instead of caching blur_x explicitly, let each OpenCL
+                #   work-item / CUDA thread compute several rows of blur_y so
+                #   that the temporary blur_x values are re-used implicitly.
+                #   This achieves a schedule similar to a sliding window.
+                y_inner = hl.Var("y_inner")
+                (
+                    g.blur_y.split(y, y, y_inner, g.gpu_tile_y)
+                    .reorder(y_inner, x)
+                    .unroll(y_inner)
+                    .gpu_tile(x, y, xi, yi, g.gpu_tile_x, 1)
+                )
+
+            elif schedule_enum == BlurGPUSchedule.SlideVectorize:
+                # Vectorization factor.
+                factor = 2
+                y_inner = hl.Var("y_inner")
+                (
+                    g.blur_y.vectorize(x, factor)
+                    .split(y, y, y_inner, g.gpu_tile_y)
+                    .reorder(y_inner, x)
+                    .unroll(y_inner)
+                    .gpu_tile(x, y, xi, yi, g.gpu_tile_x, 1)
+                )
+
+        elif g.target().has_feature(hl.TargetFeature.HVX):
+            # Hexagon schedule.
+            # TODO: Try using a schedule like the CPU one below.
+            vector_size = 128
+
+            (
+                g.blur_y.compute_root()
+                .hexagon()
+                .prefetch(g.input_buf, y, y, 2)
+                .split(y, y, yi, 128)
+                .parallel(y)
+                .vectorize(x, vector_size * 2)
+            )
+            (
+                blur_x.store_at(g.blur_y, y)
+                .compute_at(g.blur_y, yi)
+                .vectorize(x, vector_size)
+            )
+        else:
+            # CPU schedule.
+            # Compute blur_x as needed at each vector of the output.
+            # Halide will store blur_x in a circular buffer so its
+            # results can be re-used.
+            vector_size = g.natural_vector_size(g.input_buf.type())
+            g.blur_y.split(y, y, yi, 32).parallel(y).vectorize(x, vector_size)
+            (
+                blur_x.store_at(g.blur_y, y)
+                .compute_at(g.blur_y, x)
+                .vectorize(x, vector_size)
+            )
+
+
+if __name__ == "__main__":
+    hl.main()
diff --git a/python_bindings/test/apps/erode.py b/python_bindings/test/apps/erode.py
deleted file mode 100644
index 44f4815b5190..000000000000
--- a/python_bindings/test/apps/erode.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""
-Erode application using Python Halide bindings
-"""
-
-import halide as hl
-
-import numpy as np
-import halide.imageio
-import os.path
-
-# Return the directory to look in for test images:
-# - If TEST_IMAGES_DIR is defined, use that
-# - Otherwise, create a relative path to the C++ apps/images dir
-def apps_images_dir():
-    return os.environ.get("TEST_IMAGES_DIR", os.path.join(os.path.dirname(__file__), "../../apps/images"))
-
-# Return the directory to use when writing output files:
-# - If TEST_TMPDIR is defined, use that
-# - Otherwise, return an empty string (i.e., relative to whatever the current directory is)
-def apps_output_dir():
-    return os.environ.get("TEST_TMPDIR", "")
-
-def get_erode(input):
-    """
-    Erode on 5x5 stencil, first erode x then erode y.
-    """
-
-    x = hl.Var("x")
-    y = hl.Var("y")
-    c = hl.Var("c")
-    input_clamped = hl.Func("input_clamped")
-    erode_x = hl.Func("erode_x")
-    erode_y = hl.Func("erode_y")
-
-    input_clamped[x,y,c] = input[hl.clamp(x,hl.cast(hl.Int(32),0),hl.cast(hl.Int(32),input.width()-1)),
-                                 hl.clamp(y,hl.cast(hl.Int(32),0),hl.cast(hl.Int(32),input.height()-1)), c]
-    erode_x[x,y,c] = hl.min(hl.min(hl.min(hl.min(input_clamped[x-2,y,c],input_clamped[x-1,y,c]),input_clamped[x,y,c]),input_clamped[x+1,y,c]),input_clamped[x+2,y,c])
-    erode_y[x,y,c] = hl.min(hl.min(hl.min(hl.min(erode_x[x,y-2,c],erode_x[x,y-1,c]),erode_x[x,y,c]),erode_x[x,y+1,c]),erode_x[x,y+2,c])
-
-    yi = hl.Var("yi")
-
-    # CPU Schedule
-    erode_x.compute_root().split(y, y, yi, 8).parallel(y)
-    erode_y.compute_root().split(y, y, yi, 8).parallel(y)
-
-    return erode_y
-
-
-def get_input_data():
-    image_path = os.path.join(apps_images_dir(), "rgb.png")
-    rgb_data = halide.imageio.imread(image_path)
-    print("rgb_data", type(rgb_data), rgb_data.shape, rgb_data.dtype)
-
-    input_data = np.copy(rgb_data)
-
-    return input_data
-
-
-def main():
-
-    # define and compile the function
-    input = hl.ImageParam(hl.UInt(8), 3, "input")
-    erode = get_erode(input)
-    erode.compile_jit()
-
-    # preparing input and output memory buffers (numpy ndarrays)
-    input_data = get_input_data()
-    input_image = hl.Buffer(input_data)
-    input.set(input_image)
-
-    output_data = np.empty(input_data.shape, dtype=input_data.dtype)
-    output_image = hl.Buffer(output_data)
-
-    print("input_image", input_image)
-    print("output_image", output_image)
-
-    # do the actual computation
-    erode.realize(output_image)
-
-    # save results
-    input_path = os.path.join(apps_output_dir(), "erode_input.png")
-    output_path = os.path.join(apps_output_dir(), "erode_result.png")
-    halide.imageio.imwrite(input_path, input_data)
-    halide.imageio.imwrite(output_path, output_data)
-    print("\nerode realized on output image.",
-          "Result saved at", output_path,
-          "( input data copy at", input_path, ")")
-
-    print("\nEnd of game. Have a nice day!")
-    return
-
-if __name__ == "__main__":
-    main()
diff --git a/python_bindings/test/apps/interpolate.py b/python_bindings/test/apps/interpolate.py
deleted file mode 100644
index 5e23aa3020b4..000000000000
--- a/python_bindings/test/apps/interpolate.py
+++ /dev/null
@@ -1,212 +0,0 @@
-"""
-Fast image interpolation using a pyramid.
-"""
-
-import halide as hl
-
-from datetime import datetime
-import halide.imageio
-import numpy as np
-import os.path
-
-# Return the directory to look in for test images:
-# - If TEST_IMAGES_DIR is defined, use that
-# - Otherwise, create a relative path to the C++ apps/images dir
-def apps_images_dir():
-    return os.environ.get("TEST_IMAGES_DIR", os.path.join(os.path.dirname(__file__), "../../apps/images"))
-
-# Return the directory to use when writing output files:
-# - If TEST_TMPDIR is defined, use that
-# - Otherwise, return an empty string (i.e., relative to whatever the current directory is)
-def apps_output_dir():
-    return os.environ.get("TEST_TMPDIR", "")
-
-int_t = hl.Int(32)
-float_t = hl.Float(32)
-
-
-def get_interpolate(input, levels):
-    """
-    Build function, schedules it, and invokes jit compiler
-    :return: halide.hl.Func
-    """
-
-    # THE ALGORITHM
-
-    downsampled = [hl.Func('downsampled%d' % i) for i in range(levels)]
-    downx = [hl.Func('downx%d' % l) for l in range(levels)]
-    interpolated = [hl.Func('interpolated%d' % i) for i in range(levels)]
-
-    upsampled = [hl.Func('upsampled%d' % l) for l in range(levels)]
-    upsampledx = [hl.Func('upsampledx%d' % l) for l in range(levels)]
-    x = hl.Var('x')
-    y = hl.Var('y')
-    c = hl.Var('c')
-
-    clamped = hl.Func('clamped')
-    clamped[x, y, c] = input[hl.clamp(x, 0, input.width() - 1), hl.clamp(y, 0, input.height() - 1), c]
-
-    # This triggers a bug in llvm 3.3 (3.2 and trunk are fine), so we
-    # rewrite it in a way that doesn't trigger the bug. The rewritten
-    # form assumes the input alpha is zero or one.
-    # downsampled[0][x, y, c] = hl.select(c < 3, clamped[x, y, c] * clamped[x, y, 3], clamped[x, y, 3])
-    downsampled[0][x, y, c] = clamped[x, y, c] * clamped[x, y, 3]
-
-    for l in range(1, levels):
-        prev = downsampled[l - 1]
-
-        if l == 4:
-            # Also add a boundary condition at a middle pyramid level
-            # to prevent the footprint of the downsamplings to extend
-            # too far off the base image. Otherwise we look 512
-            # pixels off each edge.
-            w = input.width() / (1 << l)
-            h = input.height() / (1 << l)
-            prev = hl.lambda_func(x, y, c, prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c])
-
-        downx[l][x, y, c] = (prev[x * 2 - 1, y, c] + 2.0 * prev[x * 2, y, c] + prev[x * 2 + 1, y, c]) * 0.25
-        downsampled[l][x, y, c] = (downx[l][x, y * 2 - 1, c] + 2.0 * downx[l][x, y * 2, c] + downx[l][
-            x, y * 2 + 1, c]) * 0.25
-
-    interpolated[levels - 1][x, y, c] = downsampled[levels - 1][x, y, c]
-    for l in range(levels - 1)[::-1]:
-        upsampledx[l][x, y, c] = (interpolated[l + 1][x / 2, y, c] + interpolated[l + 1][(x + 1) / 2, y, c]) / 2.0
-        upsampled[l][x, y, c] = (upsampledx[l][x, y / 2, c] + upsampledx[l][x, (y + 1) / 2, c]) / 2.0
-        interpolated[l][x, y, c] = downsampled[l][x, y, c] + (1.0 - downsampled[l][x, y, 3]) * upsampled[l][x, y, c]
-
-    normalize = hl.Func('normalize')
-    normalize[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]
-
-    final = hl.Func('final')
-    final[x, y, c] = normalize[x, y, c]
-
-    print("Finished function setup.")
-
-    # THE SCHEDULE
-    target = hl.get_target_from_environment()
-    if target.has_gpu_feature():
-        sched = 4
-    else:
-        sched = 2
-
-    if sched == 0:
-        print("Flat schedule.")
-        for l in range(levels):
-            downsampled[l].compute_root()
-            interpolated[l].compute_root()
-
-        final.compute_root()
-
-    elif sched == 1:
-        print("Flat schedule with vectorization.")
-        for l in range(levels):
-            downsampled[l].compute_root().vectorize(x, 4)
-            interpolated[l].compute_root().vectorize(x, 4)
-
-        final.compute_root()
-
-    elif sched == 2:
-        print("Flat schedule with parallelization + vectorization")
-        xi, yi = hl.Var('xi'), hl.Var('yi')
-        clamped.compute_root().parallel(y).bound(c, 0, 4).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
-        for l in range(1, levels - 1):
-            if l > 0:
-                downsampled[l].compute_root().parallel(y).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
-            interpolated[l].compute_root().parallel(y).reorder(c, x, y).reorder_storage(c, x, y).vectorize(c, 4)
-            interpolated[l].unroll(x, 2).unroll(y, 2)
-
-        final.reorder(c, x, y).bound(c, 0, 3).parallel(y)
-        final.tile(x, y, xi, yi, 2, 2).unroll(xi).unroll(yi)
-        final.bound(x, 0, input.width())
-        final.bound(y, 0, input.height())
-
-    elif sched == 3:
-        print("Flat schedule with vectorization sometimes.")
-        for l in range(levels):
-            if l + 4 < levels:
-                downsampled[l].compute_root().vectorize(x, 4)
-                interpolated[l].compute_root().vectorize(x, 4)
-            else:
-                downsampled[l].compute_root()
-                interpolated[l].compute_root()
-
-        final.compute_root()
-
-    elif sched == 4:
-        print("GPU schedule.")
-
-        # Some gpus don't have enough memory to process the entire
-        # image, so we process the image in tiles.
-        yo, yi, xo, xi, ci = hl.Var('yo'), hl.Var('yi'), hl.Var('xo'), hl.Var("xi"), hl.Var("ci")
-        final.reorder(c, x, y).bound(c, 0, 3).vectorize(x, 4)
-        final.tile(x, y, xo, yo, xi, yi, input.width() / 4, input.height() / 4)
-        normalize.compute_at(final, xo).reorder(c, x, y).gpu_tile(x, y, xi, yi, 16, 16).unroll(c)
-
-        # Start from level 1 to save memory - level zero will be computed on demand
-        for l in range(1, levels):
-            tile_size = 32 >> l
-            if tile_size < 1: tile_size = 1
-            if tile_size > 16: tile_size = 16
-            downsampled[l].compute_root().gpu_tile(x, y, c, xi, yi, ci, tile_size, tile_size, 4)
-            interpolated[l].compute_at(final, xo).gpu_tile(x, y, c, xi, yi, ci, tile_size, tile_size, 4)
-
-    else:
-        print("No schedule with this number.")
-        exit(1)
-
-    # JIT compile the pipeline eagerly, so we don't interfere with timing
-    final.compile_jit(target)
-
-    return final
-
-
-def get_input_data():
-    image_path = os.path.join(apps_images_dir(), "rgba.png")
-    rgba_data = halide.imageio.imread(image_path)
-
-    # input data is in range [0, 1]
-    input_data = np.copy(rgba_data).astype(np.float32) / 255.0
-    return input_data
-
-
-def main():
-    input = hl.ImageParam(float_t, 3, "input")
-    levels = 10
-
-    interpolate = get_interpolate(input, levels)
-
-    # preparing input and output memory buffers (numpy ndarrays)
-    input_data = get_input_data()
-    input_image = hl.Buffer(input_data)
-    assert input_image.channels() == 4
-    input.set(input_image)
-
-    input_width, input_height = input_image.width(), input_image.height()
-
-    t0 = datetime.now()
-    output_image = interpolate.realize([input_width, input_height, 3])
-    t1 = datetime.now()
-
-    elapsed = (t1 - t0).total_seconds()
-    print('Interpolated in {:.5f} secs'.format(elapsed))
-
-    output_data = np.asanyarray(output_image)
-
-    # convert output
-    input_data = (input_data * 255).astype(np.uint8)
-    output_data = (output_data * 255).astype(np.uint8)
-
-    # save results
-    input_path = os.path.join(apps_output_dir(), "interpolate_input.png")
-    output_path = os.path.join(apps_output_dir(), "interpolate_result.png")
-    halide.imageio.imwrite(input_path, input_data)
-    halide.imageio.imwrite(output_path, output_data)
-
-    print()
-    print('blur realized on output image. Result saved at {} (input data copy at {})'.format(output_path, input_path))
-    print()
-    print("End of game. Have a nice day!")
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python_bindings/test/apps/interpolate_app.py b/python_bindings/test/apps/interpolate_app.py
new file mode 100644
index 000000000000..a226a11a681b
--- /dev/null
+++ b/python_bindings/test/apps/interpolate_app.py
@@ -0,0 +1,54 @@
+"""
+Shell for running Fast image interpolation using a pyramid.
+"""
+
+from interpolate import interpolate
+from interpolate_Mullapudi2016 import interpolate_Mullapudi2016
+import halide.imageio
+import numpy as np
+import sys
+from timeit import Timer
+
+
+def main():
+    """Time both interpolate pipelines on argv[1] and save the result to argv[2]."""
+    if len(sys.argv) < 3:
+        print("Usage: %s input.png output.png" % sys.argv[0])
+        print("e.g. %s rgba.png output.png" % sys.argv[0])
+        sys.exit(1)
+
+    input_path = sys.argv[1]
+    output_path = sys.argv[2]
+    timing_iterations = 10
+
+    print("Reading from %s ..." % input_path)
+    input_buf_u8 = halide.imageio.imread(input_path)
+    assert input_buf_u8.dtype == np.uint8
+    # Convert to float32 in range [0..1]
+    input_buf = input_buf_u8.astype(np.float32) / 255.0
+    h = input_buf.shape[1]
+    w = input_buf.shape[2]
+    output_buf = np.empty([3, h, w], dtype=input_buf.dtype)
+
+    tests = {
+        "Manual": interpolate,
+        "Mullapudi2016": interpolate_Mullapudi2016,
+    }
+
+    for name, fn in tests.items():
+        print("Running %s... " % name, end="")
+        t = Timer(lambda: fn(input_buf, output_buf))
+        avg_time_sec = t.timeit(number=timing_iterations) / timing_iterations
+        print("time: %fms" % (avg_time_sec * 1e3))
+
+    output_buf_u8 = (output_buf * 255.0).astype(np.uint8)
+
+    print("Saving to %s ..." % output_path)
+    halide.imageio.imwrite(output_path, output_buf_u8)
+
+    print("Success!")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python_bindings/test/apps/interpolate_generator.py b/python_bindings/test/apps/interpolate_generator.py
new file mode 100644
index 000000000000..a1ba365931f0
--- /dev/null
+++ b/python_bindings/test/apps/interpolate_generator.py
@@ -0,0 +1,235 @@
+"""
+Fast image interpolation using a pyramid.
+"""
+
+import halide as hl
+
+
+def _func_list(name, size):
+    """Return a list of `size` new hl.Func objects named `name_0` .. `name_<size-1>`."""
+    return [hl.Func("%s_%d" % (name, i)) for i in range(size)]
+
+
+@hl.alias(
+    interpolate_Mullapudi2016={"autoscheduler": "Mullapudi2016"},
+)
+@hl.generator()
+class interpolate:
+    levels = hl.GeneratorParam(10)
+
+    input_buf = hl.InputBuffer(hl.Float(32), 3)
+    output_buf = hl.OutputBuffer(hl.Float(32), 3)
+
+    def generate(self):
+        g = self
+
+        x, y, c = hl.vars("x y c")
+
+        # Input must have exactly four channels (RGBA); channel 3 is read as alpha
+        g.input_buf.dim(2).set_bounds(0, 4)
+
+        downsampled = _func_list("downsampled", g.levels)
+        downx = _func_list("downx", g.levels)
+        interpolated = _func_list("interpolated", g.levels)
+        upsampled = _func_list("upsampled", g.levels)
+        upsampledx = _func_list("upsampledx", g.levels)
+
+        clamped = hl.BoundaryConditions.repeat_edge(g.input_buf)
+
+        downsampled[0][x, y, c] = hl.select(
+            c < 3,
+            clamped[x, y, c] * clamped[x, y, 3],
+            clamped[x, y, 3],
+        )
+
+        for l in range(1, g.levels):
+            prev = downsampled[l - 1]
+
+            if l == 4:
+                # Also add a boundary condition at a middle pyramid level
+                # to prevent the footprint of the downsamplings from
+                # extending too far off the base image. Otherwise we look
+                # 512 pixels off each edge.
+                w = g.input_buf.width() / (1 << (l - 1))
+                h = g.input_buf.height() / (1 << (l - 1))
+                prev = hl.lambda_func(
+                    x, y, c, prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c]
+                )
+
+            downx[l][x, y, c] = (
+                prev[x * 2 - 1, y, c] + 2 * prev[x * 2, y, c] + prev[x * 2 + 1, y, c]
+            ) * 0.25
+
+            downsampled[l][x, y, c] = (
+                downx[l][x, y * 2 - 1, c]
+                + 2 * downx[l][x, y * 2, c]
+                + downx[l][x, y * 2 + 1, c]
+            ) * 0.25
+
+        interpolated[g.levels - 1][x, y, c] = downsampled[g.levels - 1][x, y, c]
+
+        for l in range(g.levels - 2, -1, -1):
+            upsampledx[l][x, y, c] = (
+                interpolated[l + 1][x / 2, y, c]
+                + interpolated[l + 1][(x + 1) / 2, y, c]
+            ) / 2
+            upsampled[l][x, y, c] = (
+                upsampledx[l][x, y / 2, c] + upsampledx[l][x, (y + 1) / 2, c]
+            ) / 2
+            alpha = 1.0 - downsampled[l][x, y, 3]
+            interpolated[l][x, y, c] = (
+                downsampled[l][x, y, c] + alpha * upsampled[l][x, y, c]
+            )
+
+        g.output_buf[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]
+
+        # Schedule: autoscheduler, manual GPU, or manual CPU
+        if g.using_autoscheduler():
+            # The autoscheduler picks the schedule; nothing to do here
+            pass
+        elif g.target().has_gpu_feature():
+            # Manual GPU schedule: 0.86ms on a 2060 RTX
+            yo, yi, xo, xi, ci, xii, yii = hl.vars("yo yi xo xi ci xii yii")
+
+            (
+                g.output_buf.bound(x, 0, g.input_buf.width())
+                .bound(y, 0, g.input_buf.height())
+                .bound(c, 0, 3)
+                .reorder(c, x, y)
+                .tile(x, y, xi, yi, 32, 32, hl.TailStrategy.RoundUp)
+                .tile(xi, yi, xii, yii, 2, 2)
+                .gpu_blocks(x, y)
+                .gpu_threads(xi, yi)
+                .unroll(xii)
+                .unroll(yii)
+                .unroll(c)
+            )
+
+            for l in range(1, g.levels):
+                (
+                    downsampled[l]
+                    .compute_root()
+                    .reorder(c, x, y)
+                    .unroll(c)
+                    .gpu_tile(x, y, xi, yi, 16, 16)
+                )
+
+            for l in range(3, g.levels, 2):
+                (
+                    interpolated[l]
+                    .compute_root()
+                    .reorder(c, x, y)
+                    .tile(x, y, xi, yi, 32, 32, hl.TailStrategy.RoundUp)
+                    .tile(xi, yi, xii, yii, 2, 2)
+                    .gpu_blocks(x, y)
+                    .gpu_threads(xi, yi)
+                    .unroll(xii)
+                    .unroll(yii)
+                    .unroll(c)
+                )
+
+            (
+                upsampledx[1]
+                .compute_at(g.output_buf, x)
+                .reorder(c, x, y)
+                .tile(x, y, xi, yi, 2, 1)
+                .unroll(xi)
+                .unroll(yi)
+                .unroll(c)
+                .gpu_threads(x, y)
+            )
+
+            (
+                interpolated[1]
+                .compute_at(g.output_buf, x)
+                .reorder(c, x, y)
+                .tile(x, y, xi, yi, 2, 2)
+                .unroll(xi)
+                .unroll(yi)
+                .unroll(c)
+                .gpu_threads(x, y)
+            )
+
+            (
+                interpolated[2]
+                .compute_at(g.output_buf, x)
+                .reorder(c, x, y)
+                .unroll(c)
+                .gpu_threads(x, y)
+            )
+
+        else:
+            # Manual CPU schedule: 4.54ms on an Intel i9-9960X using 16 threads
+            xo, xi, yo, yi = hl.vars("xo xi yo yi")
+            vec = g.natural_vector_size(hl.Float(32))
+            for l in range(1, g.levels - 1):
+                # We must refer to the downsampled stages in the
+                # upsampling later, so they must all be
+                # compute_root or redundantly recomputed, as in
+                # the local_laplacian app.
+                (
+                    downsampled[l]
+                    .compute_root()
+                    .reorder(x, c, y)
+                    .split(y, yo, yi, 8)
+                    .parallel(yo)
+                    .vectorize(x, vec)
+                )
+
+            # downsampled[0] takes too long to compute_root, so
+            # we'll redundantly recompute it instead.  Make a
+            # separate clone of it in the first downsampled stage
+            # so that we can schedule the two versions
+            # separately.
+            (
+                downsampled[0]
+                .clone_in(downx[1])
+                .store_at(downsampled[1], yo)
+                .compute_at(downsampled[1], yi)
+                .reorder(c, x, y)
+                .unroll(c)
+                .vectorize(x, vec)
+            )
+
+            (
+                g.output_buf.bound(x, 0, g.input_buf.width())
+                .bound(y, 0, g.input_buf.height())
+                .bound(c, 0, 3)
+                .split(x, xo, xi, vec)
+                .split(y, yo, yi, 32)
+                .reorder(xi, c, xo, yi, yo)
+                .unroll(c)
+                .vectorize(xi)
+                .parallel(yo)
+            )
+
+            for l in range(1, g.levels):
+                (
+                    interpolated[l]
+                    .store_at(g.output_buf, yo)
+                    .compute_at(g.output_buf, yi)
+                    .vectorize(x, vec)
+                )
+
+        # Estimates (for autoscheduler; ignored otherwise)
+        (
+            g.input_buf.dim(0)
+            .set_estimate(0, 1536)
+            .dim(1)
+            .set_estimate(0, 2560)
+            .dim(2)
+            .set_estimate(0, 4)
+        )
+        (
+            g.output_buf.output_buffer()
+            .dim(0)
+            .set_estimate(0, 1536)
+            .dim(1)
+            .set_estimate(0, 2560)
+            .dim(2)
+            .set_estimate(0, 3)
+        )
+
+
+if __name__ == "__main__":
+    hl.main()
diff --git a/python_bindings/test/apps/local_laplacian.py b/python_bindings/test/apps/local_laplacian.py
deleted file mode 100644
index 6a0a215f8059..000000000000
--- a/python_bindings/test/apps/local_laplacian.py
+++ /dev/null
@@ -1,245 +0,0 @@
-"""
-Local Laplacian, see e.g. Aubry et al 2011, "Fast and Robust Pyramid-based Image Processing".
-"""
-
-import halide as hl
-
-import numpy as np
-import halide.imageio
-import os.path
-
-# Return the directory to look in for test images:
-# - If TEST_IMAGES_DIR is defined, use that
-# - Otherwise, create a relative path to the C++ apps/images dir
-def apps_images_dir():
-    return os.environ.get("TEST_IMAGES_DIR", os.path.join(os.path.dirname(__file__), "../../apps/images"))
-
-# Return the directory to use when writing output files:
-# - If TEST_TMPDIR is defined, use that
-# - Otherwise, return an empty string (i.e., relative to whatever the current directory is)
-def apps_output_dir():
-    return os.environ.get("TEST_TMPDIR", "")
-
-int_t = hl.Int(32)
-float_t = hl.Float(32)
-
-
-def get_local_laplacian(input, levels, alpha, beta, J=8):
-    n_downsamples = 0
-    n_upsamples = 0
-
-    x = hl.Var('x')
-    y = hl.Var('y')
-
-    def downsample(f):
-        nonlocal n_downsamples
-        downx, downy = hl.Func('downx%i' % n_downsamples), hl.Func('downy%i' % n_downsamples)
-        n_downsamples += 1
-
-        downx[x, y, c] = (f[2 * x - 1, y, c] + 3.0 * (f[2 * x, y, c] + f[2 * x + 1, y, c]) + f[2 * x + 2, y, c]) / 8.0
-        downy[x, y, c] = (downx[x, 2 * y - 1, c] + 3.0 * (downx[x, 2 * y, c] + downx[x, 2 * y + 1, c])
-                          + downx[x, 2 * y + 2, c]) / 8.0
-
-        return downy
-
-    def upsample(f):
-        nonlocal n_upsamples
-        upx, upy = hl.Func('upx%i' % n_upsamples), hl.Func('upy%i' % n_upsamples)
-        n_upsamples += 1
-
-        upx[x, y, c] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y, c] + 0.75 * f[x // 2, y, c]
-        upy[x, y, c] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2), c] + 0.75 * upx[x, y // 2, c]
-
-        return upy
-
-    def downsample2D(f):
-        nonlocal n_downsamples
-        downx, downy = hl.Func('downx%i' % n_downsamples), hl.Func('downy%i' % n_downsamples)
-        n_downsamples += 1
-
-        downx[x, y] = (f[2 * x - 1, y] + 3.0 * (f[2 * x, y] + f[2 * x + 1, y]) + f[2 * x + 2, y]) / 8.0
-        downy[x, y] = (downx[x, 2 * y - 1] + 3.0 * (downx[x, 2 * y] + downx[x, 2 * y + 1]) + downx[x, 2 * y + 2]) / 8.0
-
-        return downy
-
-    def upsample2D(f):
-        nonlocal n_upsamples
-        upx, upy = hl.Func('upx%i' % n_upsamples), hl.Func('upy%i' % n_upsamples)
-        n_upsamples += 1
-
-        upx[x, y] = 0.25 * f[(x // 2) - 1 + 2 * (x % 2), y] + 0.75 * f[x // 2, y]
-        upy[x, y] = 0.25 * upx[x, (y // 2) - 1 + 2 * (y % 2)] + 0.75 * upx[x, y // 2]
-
-        return upy
-
-    # THE ALGORITHM
-
-    # loop variables
-    c = hl.Var('c')
-    k = hl.Var('k')
-
-    # Make the remapping function as a lookup table.
-    remap = hl.Func('remap')
-    fx = hl.cast(float_t, x / 256.0)
-    # remap[x] = alpha*fx*exp(-fx*fx/2.0)
-    remap[x] = alpha * fx * hl.exp(-fx * fx / 2.0)
-
-    # Convert to floating point
-    floating = hl.Func('floating')
-    floating[x, y, c] = hl.cast(float_t, input[x, y, c]) / 65535.0
-
-    # Set a boundary condition
-    clamped = hl.Func('clamped')
-    clamped[x, y, c] = floating[hl.clamp(x, 0, input.width() - 1), hl.clamp(y, 0, input.height() - 1), c]
-
-    # Get the luminance channel
-    gray = hl.Func('gray')
-    kR = hl.f32(0.299)
-    kG = hl.f32(0.587)
-    kB = hl.f32(0.114)
-    gray[x, y] = kR * clamped[x, y, 0] + kG * clamped[x, y, 1] + kB * clamped[x, y, 2]
-
-    # Make the processed Gaussian pyramid.
-    gPyramid = [hl.Func('gPyramid%i' % i) for i in range(J)]
-    # Do a lookup into a lut with 256 entires per intensity level
-    level = k / (levels - 1)
-    idx = gray[x, y] * hl.cast(float_t, levels - 1) * 256.0
-    idx = hl.clamp(hl.cast(int_t, idx), 0, (levels - 1) * 256)
-    gPyramid[0][x, y, k] = beta * (gray[x, y] - level) + level + remap[idx - 256 * k]
-    for j in range(1, J):
-        gPyramid[j][x, y, k] = downsample(gPyramid[j - 1])[x, y, k]
-
-    # Get its laplacian pyramid
-    lPyramid = [hl.Func('lPyramid%i' % i) for i in range(J)]
-    lPyramid[J - 1] = gPyramid[J - 1]
-    for j in range(J - 1)[::-1]:
-        lPyramid[j][x, y, k] = gPyramid[j][x, y, k] - upsample(gPyramid[j + 1])[x, y, k]
-
-    # Make the Gaussian pyramid of the input
-    inGPyramid = [hl.Func('inGPyramid%i' % i) for i in range(J)]
-    inGPyramid[0] = gray
-    for j in range(1, J):
-        inGPyramid[j][x, y] = downsample2D(inGPyramid[j - 1])[x, y]
-
-    # Make the laplacian pyramid of the output
-    outLPyramid = [hl.Func('outLPyramid%i' % i) for i in range(J)]
-    for j in range(J):
-        # Split input pyramid value into integer and floating parts
-        level = inGPyramid[j][x, y] * hl.cast(float_t, levels - 1)
-        li = hl.clamp(hl.cast(int_t, level), 0, levels - 2)
-        lf = level - hl.cast(float_t, li)
-        # Linearly interpolate between the nearest processed pyramid levels
-        outLPyramid[j][x, y] = (1.0 - lf) * lPyramid[j][x, y, li] + lf * lPyramid[j][x, y, li + 1]
-
-    # Make the Gaussian pyramid of the output
-    outGPyramid = [hl.Func('outGPyramid%i' % i) for i in range(J)]
-    outGPyramid[J - 1] = outLPyramid[J - 1]
-    for j in range(J - 1)[::-1]:
-        outGPyramid[j][x, y] = upsample2D(outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]
-
-    # Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
-    color = hl.Func('color')
-    eps = hl.f32(0.01)
-    color[x, y, c] = outGPyramid[0][x, y] * (clamped[x, y, c] + eps) / (gray[x, y] + eps)
-
-    output = hl.Func('local_laplacian')
-    # Convert back to 16-bit
-    output[x, y, c] = hl.cast(hl.UInt(16), hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)
-
-    # THE SCHEDULE
-    target = hl.get_target_from_environment()
-    if target.has_gpu_feature():
-        # GPU Schedule
-        print("Compiling for GPU")
-        xi, yi = hl.Var("xi"), hl.Var("yi")
-
-        remap.compute_root()
-        output.compute_root().gpu_tile(x, y, xi, yi, 16, 8)
-        for j in range(J):
-            blockw = 16
-            blockh = 8
-            if j > 3:
-                blockw = 2
-                blockh = 2
-            if j > 0:
-                inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh)
-                gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh)
-            outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh)
-    else:
-        # CPU schedule
-        print("Compiling for CPU")
-
-        remap.compute_root()
-        output.parallel(y, 4).vectorize(x, 4)
-        gray.compute_root().parallel(y, 4).vectorize(x, 4)
-        for j in range(4):
-            if j > 0:
-                inGPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
-            if j > 0:
-                gPyramid[j].compute_root().parallel(y, 4).vectorize(x, 4)
-            outGPyramid[j].compute_root().parallel(y).vectorize(x, 4)
-        for j in range(4, J):
-            inGPyramid[j].compute_root().parallel(y)
-            gPyramid[j].compute_root().parallel(k)
-            outGPyramid[j].compute_root().parallel(y)
-
-    return output
-
-
-def get_input_data():
-    image_path = os.path.join(apps_images_dir(), "rgb.png")
-    rgb_data = halide.imageio.imread(image_path)
-
-    # input data is in range [0, 256*256]
-    input_data = rgb_data.astype(np.uint16) << 8
-    return input_data
-
-
-def filter_test_image(local_laplacian, input):
-    local_laplacian.compile_jit(hl.get_target_from_environment())
-
-    # preparing input and output memory buffers (numpy ndarrays)
-    input_data = get_input_data()
-    input_image = hl.Buffer(input_data)
-    input.set(input_image)
-
-    output_data = np.empty_like(input_data)
-
-    # do the actual computation
-    input_width, input_height = input_image.width(), input_image.height()
-    output_image = local_laplacian.realize([input_width, input_height, 3])
-    output_data = np.asanyarray(output_image)
-
-    # convert back to uint8
-    input_data = (input_data >> 8).astype(np.uint8)
-    output_data = (output_data >> 8).astype(np.uint8)
-
-    # save results
-    input_path = os.path.join(apps_output_dir(), "local_laplacian_input.png")
-    output_path = os.path.join(apps_output_dir(), "local_laplacian.png")
-
-    halide.imageio.imwrite(input_path, input_data)
-    halide.imageio.imwrite(output_path, output_data)
-
-    print()
-    print("local_laplacian realized on output_image.")
-    print('Result saved at {} (input data copy at {}).'.format(output_path, input_path))
-
-
-def main():
-    input_img = hl.ImageParam(hl.UInt(16), 3, 'input')
-
-    # number of intensity levels
-    levels = hl.Param(int_t, 'levels', 8)
-
-    # Parameters controlling the filter
-    alpha = hl.Param(float_t, 'alpha', 1.0 / 7.0)
-    beta = hl.Param(float_t, 'beta', 1.0)
-
-    local_laplacian = get_local_laplacian(input_img, levels, alpha, beta)
-
-    filter_test_image(local_laplacian, input_img)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python_bindings/test/apps/local_laplacian_app.py b/python_bindings/test/apps/local_laplacian_app.py
new file mode 100644
index 000000000000..666799be467f
--- /dev/null
+++ b/python_bindings/test/apps/local_laplacian_app.py
@@ -0,0 +1,58 @@
+"""
+Shell for running Local Laplacian.
+"""
+
+from local_laplacian import local_laplacian
+from local_laplacian_Mullapudi2016 import local_laplacian_Mullapudi2016
+import halide.imageio
+import numpy as np
+import sys
+from timeit import Timer
+
+
+def main():
+    """Time both local_laplacian pipelines on the input image and save the output."""
+    if len(sys.argv) < 6:
+        print("Usage: %s input.png levels alpha beta output.png" % sys.argv[0])
+        print("e.g. %s input.png 8 1 1 output.png" % sys.argv[0])
+        sys.exit(1)
+
+    input_path = sys.argv[1]
+    levels = int(sys.argv[2])
+    alpha = float(sys.argv[3])
+    beta = float(sys.argv[4])
+    output_path = sys.argv[5]
+    timing_iterations = 10
+
+    print("Reading from %s ..." % input_path)
+    input_buf_u8 = halide.imageio.imread(input_path)
+    assert input_buf_u8.dtype == np.uint8
+    # Widen to uint16 in range [0..65535] (255 * 257 == 65535)
+    input_buf = input_buf_u8.astype(np.uint16) * 257
+    h = input_buf.shape[1]
+    w = input_buf.shape[2]
+    output_buf = np.empty([3, h, w], dtype=input_buf.dtype)
+
+    tests = {
+        "Manual": local_laplacian,
+        "Mullapudi2016": local_laplacian_Mullapudi2016,
+    }
+
+    for name, fn in tests.items():
+        print("Running %s... " % name, end="")
+        t = Timer(lambda: fn(input_buf, levels, alpha / (levels - 1), beta, output_buf))
+        avg_time_sec = t.timeit(number=timing_iterations) / timing_iterations
+        print("time: %fms" % (avg_time_sec * 1e3))
+
+    # Narrow back to [0..255]; output_buf holds the result of the last pipeline run.
+    output_buf_u8 = (output_buf // 257).astype(np.uint8)
+
+    print("Saving to %s ..." % output_path)
+    halide.imageio.imwrite(output_path, output_buf_u8)
+
+    print("Success!")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python_bindings/test/apps/local_laplacian_generator.py b/python_bindings/test/apps/local_laplacian_generator.py
new file mode 100644
index 000000000000..b53dde494465
--- /dev/null
+++ b/python_bindings/test/apps/local_laplacian_generator.py
@@ -0,0 +1,226 @@
+"""
+Local Laplacian.
+"""
+
+import halide as hl
+
+# Index variables shared by the helpers and the generator below; global for simplicity
+x, y, c, k = hl.vars("x y c k")
+
+
+def _func_list(name, size):
+    """Build `size` Funcs named `name_0` .. `name_<size-1>`, returned in index order."""
+    return [hl.Func(f"{name}_{n:d}") for n in range(size)]
+
+
+def _downsample(f):
+    """Halve `f` in x and then in y using a [1 3 3 1] / 8 filter along each axis."""
+    dx, dy = hl.funcs("downx downy")
+
+    # Horizontal pass: the two centre taps are summed first, then weighted by 3.
+    mid = 3.0 * (f[2 * x, y, hl._] + f[2 * x + 1, y, hl._])
+    dx[x, y, hl._] = (f[2 * x - 1, y, hl._] + mid + f[2 * x + 2, y, hl._]) / 8.0
+
+    # Vertical pass: the same filter applied to the horizontally reduced Func.
+    mid = 3.0 * (dx[x, 2 * y, hl._] + dx[x, 2 * y + 1, hl._])
+    dy[x, y, hl._] = (dx[x, 2 * y - 1, hl._] + mid + dx[x, 2 * y + 2, hl._]) / 8.0
+
+    # Any dimensions past x and y are carried through unchanged via hl._.
+    return dy
+
+
+def _upsample(f):
+    """Double `f` in x and then in y by linear interpolation of neighbouring samples."""
+    ux, uy = hl.funcs("upx upy")
+
+    # Blend weight alternates with parity: 0.25 where x % 2 is 0, 0.75 where it is 1.
+    wx = ((x % 2) * 2 + 1) / 4.0
+    ux[x, y, hl._] = hl.lerp(f[(x + 1) // 2, y, hl._], f[(x - 1) // 2, y, hl._], wx)
+
+    # Same interpolation along y, applied to the x-upsampled Func.
+    wy = ((y % 2) * 2 + 1) / 4.0
+    uy[x, y, hl._] = hl.lerp(ux[x, (y + 1) // 2, hl._], ux[x, (y - 1) // 2, hl._], wy)
+
+    # Any dimensions past x and y are carried through unchanged via hl._.
+    return uy
+
+
+@hl.alias(local_laplacian_Mullapudi2016={"autoscheduler": "Mullapudi2016"})
+@hl.generator()
+class local_laplacian:
+    """Local Laplacian filter over a 16-bit color image (input_buf -> output_buf)."""
+
+    pyramid_levels = hl.GeneratorParam(8)
+
+    input_buf = hl.InputBuffer(hl.UInt(16), 3)
+    levels = hl.InputScalar(hl.Int(32))
+    alpha = hl.InputScalar(hl.Float(32))
+    beta = hl.InputScalar(hl.Float(32))
+    output_buf = hl.OutputBuffer(hl.UInt(16), 3)
+
+    def generate(self):
+        """Define the algorithm, its estimates, and (unless autoscheduled) a schedule."""
+        g = self
+
+        # THE ALGORITHM
+        J = g.pyramid_levels
+
+        # Make the remapping function as a lookup table.
+        fx = hl.f32(x) / 256.0
+        remap = hl.Func("remap")
+        remap[x] = g.alpha * fx * hl.exp(-fx * fx / 2.0)
+
+        # Set a boundary condition
+        clamped = hl.BoundaryConditions.repeat_edge(g.input_buf)
+
+        # Convert to floating point
+        floating = hl.Func("floating")
+        floating[x, y, c] = clamped[x, y, c] / 65535.0
+
+        # Get the luminance channel
+        gray = hl.Func("gray")
+        gray[x, y] = (
+            hl.f32(0.299) * floating[x, y, 0]
+            + hl.f32(0.587) * floating[x, y, 1]
+            + hl.f32(0.114) * floating[x, y, 2]
+        )
+
+        # Make the processed Gaussian pyramid.
+        gPyramid = _func_list("gPyramid", J)
+        # Do a lookup into a lut with 256 entries per intensity level
+        level = k * (1.0 / (g.levels - 1))
+        idx = gray[x, y] * hl.f32(g.levels - 1) * 256.0
+        idx = hl.clamp(hl.i32(idx), 0, (g.levels - 1) * 256)
+        gPyramid[0][x, y, k] = (
+            g.beta * (gray[x, y] - level) + level + remap[idx - 256 * k]
+        )
+        for j in range(1, J):
+            gPyramid[j][x, y, k] = _downsample(gPyramid[j - 1])[x, y, k]
+
+        # Get its laplacian pyramid
+        lPyramid = _func_list("lPyramid", J)
+        lPyramid[J - 1][x, y, k] = gPyramid[J - 1][x, y, k]
+        for j in range(J - 2, -1, -1):
+            lPyramid[j][x, y, k] = (
+                gPyramid[j][x, y, k] - _upsample(gPyramid[j + 1])[x, y, k]
+            )
+
+        # Make the Gaussian pyramid of the input
+        inGPyramid = _func_list("inGPyramid", J)
+        inGPyramid[0][x, y] = gray[x, y]
+        for j in range(1, J):
+            inGPyramid[j][x, y] = _downsample(inGPyramid[j - 1])[x, y]
+
+        # Make the laplacian pyramid of the output
+        outLPyramid = _func_list("outLPyramid", J)
+        for j in range(0, J):
+            # Split input pyramid value into integer and floating parts
+            level = inGPyramid[j][x, y] * hl.f32(g.levels - 1)
+            li = hl.clamp(hl.i32(level), 0, g.levels - 2)
+            lf = level - hl.f32(li)
+            # Linearly interpolate between the nearest processed pyramid levels
+            outLPyramid[j][x, y] = (1.0 - lf) * lPyramid[j][x, y, li] + (
+                lf * lPyramid[j][x, y, li + 1]
+            )
+
+        # Make the Gaussian pyramid of the output
+        outGPyramid = _func_list("outGPyramid", J)
+        outGPyramid[J - 1][x, y] = outLPyramid[J - 1][x, y]
+        for j in range(J - 2, -1, -1):
+            outGPyramid[j][x, y] = (
+                _upsample(outGPyramid[j + 1])[x, y] + outLPyramid[j][x, y]
+            )
+
+        # Reintroduce color (Connelly: use eps to avoid scaling up noise w/
+        # apollo3.png input)
+        color = hl.Func("color")
+        eps = hl.f32(0.01)
+        color[x, y, c] = (
+            outGPyramid[0][x, y] * (floating[x, y, c] + eps) / (gray[x, y] + eps)
+        )
+
+        # Convert back to 16-bit
+        g.output_buf[x, y, c] = hl.u16(hl.clamp(color[x, y, c], 0.0, 1.0) * 65535.0)
+
+        # ESTIMATES
+        # (This can be useful in conjunction with RunGen and benchmarks as well
+        # as autoschedulers, so we do it in all cases.)
+        g.input_buf.set_estimates([(0, 1536), (0, 2560), (0, 3)])
+        # Provide estimates on the parameters
+        g.levels.set_estimate(8)
+        g.alpha.set_estimate(1)
+        g.beta.set_estimate(1)
+        g.output_buf.set_estimates([(0, 1536), (0, 2560), (0, 3)])
+
+        # THE SCHEDULE
+        if g.using_autoscheduler():
+            pass  # nothing: the autoscheduler provides the schedule
+        elif g.target().has_gpu_feature():
+            # GPU schedule.
+            # 3.19ms on an RTX 2060.
+            remap.compute_root()
+            xi, yi = hl.vars("xi yi")
+            g.output_buf.compute_root().gpu_tile(x, y, xi, yi, 16, 8)
+            for j in range(0, J):
+                # Levels above 3 use 2x2 tiles, not 16x8 (presumably as they are small)
+                blockw, blockh = (2, 2) if j > 3 else (16, 8)
+
+                if j > 0:
+                    inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh)
+                    (
+                        gPyramid[j]
+                        .compute_root()
+                        .reorder(k, x, y)
+                        .gpu_tile(x, y, xi, yi, blockw, blockh)
+                    )
+
+                outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh)
+
+        else:
+            # CPU schedule.
+
+            # 21.4ms on an Intel i9-9960X using 32 threads at 3.7
+            # GHz, using the target x86-64-avx2.
+
+            # This app is dominated by data-dependent loads from
+            # memory, so we're better off leaving the AVX-512 units
+            # off in exchange for a higher clock, and we benefit from
+            # hyperthreading.
+
+            remap.compute_root()
+            yo = hl.Var("yo")
+            (
+                g.output_buf.reorder(c, x, y)
+                .split(y, yo, y, 64)
+                .parallel(yo)
+                .vectorize(x, 8)
+            )
+            gray.compute_root().parallel(y, 32).vectorize(x, 8)
+            # Clamp to J so pyramid_levels < 5 does not index past the Func lists.
+            for j in range(1, min(5, J)):
+                inGPyramid[j].compute_root().parallel(y, 32).vectorize(x, 8)
+                (
+                    gPyramid[j]
+                    .compute_root()
+                    .reorder_storage(x, k, y)
+                    .reorder(k, y)
+                    .parallel(y, 8)
+                    .vectorize(x, 8)
+                )
+                (
+                    outGPyramid[j]
+                    .store_at(g.output_buf, yo)
+                    .compute_at(g.output_buf, y)
+                    .fold_storage(y, 4)
+                    .vectorize(x, 8)
+                )
+
+            outGPyramid[0].compute_at(g.output_buf, y).vectorize(x, 8)
+            for j in range(5, J):
+                inGPyramid[j].compute_root()
+                gPyramid[j].compute_root().parallel(k)
+                outGPyramid[j].compute_root()
+
+
+if __name__ == "__main__":
+    hl.main()
diff --git a/python_bindings/test/generators/CMakeLists.txt b/python_bindings/test/generators/CMakeLists.txt
index 27f0ac2e18e6..24253b9e333e 100644
--- a/python_bindings/test/generators/CMakeLists.txt
+++ b/python_bindings/test/generators/CMakeLists.txt
@@ -75,11 +75,6 @@ _add_python_aot_and_stub_extension(SOURCES addconstantpy_generator.py
                                    GENERATORS addconstantpy
                                               addconstantpy_with_offset_42
                                               addconstantpy_with_negative_offset)
-_add_python_aot_and_stub_extension(SOURCES bilateral_grid_generator.py
-                                   GENERATORS bilateral_grid
-                                              bilateral_grid_Adams2019
-                                              bilateral_grid_Li2018
-                                              bilateral_grid_Mullapudi2016)
 _add_python_aot_and_stub_extension(SOURCES bitpy_generator.py GENERATORS bitpy)
 _add_python_aot_and_stub_extension(SOURCES complexpy_generator.py GENERATORS complexpy)
 _add_python_aot_and_stub_extension(SOURCES simplepy_generator.py GENERATORS simplepy)
diff --git a/python_bindings/test/generators/bilateral_grid_generator.py b/python_bindings/test/generators/bilateral_grid_generator.py
deleted file mode 100644
index d6addeeb7dc5..000000000000
--- a/python_bindings/test/generators/bilateral_grid_generator.py
+++ /dev/null
@@ -1,136 +0,0 @@
-"""
-Bilateral histogram.
-"""
-
-import halide as hl
-
-@hl.alias(
-    bilateral_grid_Adams2019={'autoscheduler':'Adams2019'},
-    bilateral_grid_Mullapudi2016={'autoscheduler':'Mullapudi2016'},
-    bilateral_grid_Li2018={'autoscheduler':'Li2018'},
-)
-@hl.generator()
-class bilateral_grid:
-    s_sigma = hl.GeneratorParam(8)
-
-    input_buf = hl.InputBuffer(hl.Float(32), 2)
-    r_sigma = hl.InputScalar(hl.Float(32))
-    bilateral_grid = hl.OutputBuffer(hl.Float(32), 2)
-
-    def generate(self):
-        g = self
-
-        x = hl.Var('x')
-        y = hl.Var('y')
-        z = hl.Var('z')
-        c = hl.Var('c')
-
-        # Add a boundary condition
-        clamped = hl.BoundaryConditions.repeat_edge(g.input_buf)
-
-        # Construct the bilateral grid
-        r = hl.RDom([(0, g.s_sigma), (0, g.s_sigma)])
-        val = clamped[x * g.s_sigma + r.x - g.s_sigma // 2, y * g.s_sigma + r.y - g.s_sigma // 2]
-        val = hl.clamp(val, 0.0, 1.0)
-
-        zi = hl.i32(val / g.r_sigma + 0.5)
-
-        histogram = hl.Func('histogram')
-        histogram[x, y, z, c] = 0.0
-        histogram[x, y, zi, c] += hl.mux(c, [val, 1.0])
-
-        # Blur the histogram using a five-tap filter
-        blurx, blury, blurz = hl.Func('blurx'), hl.Func('blury'), hl.Func('blurz')
-        blurz[x, y, z, c] = (histogram[x, y, z - 2, c] + histogram[x, y, z - 1, c] * 4 + histogram[x, y, z, c] * 6 +
-                             histogram[x, y, z + 1, c] * 4 + histogram[x, y, z + 2, c])
-        blurx[x, y, z, c] = (blurz[x - 2, y, z, c] + blurz[x - 1, y, z, c] * 4 + blurz[x, y, z, c] * 6 +
-                             blurz[x + 1, y, z, c] * 4 + blurz[x + 2, y, z, c])
-        blury[x, y, z, c] = (blurx[x, y - 2, z, c] + blurx[x, y - 1, z, c] * 4 + blurx[x, y, z, c] * 6 +
-                             blurx[x, y + 1, z, c] * 4 + blurx[x, y + 2, z, c])
-
-        # Take trilinear samples to compute the output
-        val = hl.clamp(clamped[x, y], 0.0, 1.0)
-        zv = val / g.r_sigma
-        zi = hl.i32(zv)
-        zf = zv - zi
-        xf = hl.f32(x % g.s_sigma) / g.s_sigma
-        yf = hl.f32(y % g.s_sigma) / g.s_sigma
-        xi = x / g.s_sigma
-        yi = y / g.s_sigma
-
-        interpolated = hl.Func('interpolated')
-        interpolated[x, y, c] = hl.lerp(
-            hl.lerp(hl.lerp(blury[xi, yi, zi, c], blury[xi + 1, yi, zi, c], xf),
-                    hl.lerp(blury[xi, yi + 1, zi, c], blury[xi + 1, yi + 1, zi, c], xf), yf),
-            hl.lerp(hl.lerp(blury[xi, yi, zi + 1, c], blury[xi + 1, yi, zi + 1, c], xf),
-                    hl.lerp(blury[xi, yi + 1, zi + 1, c], blury[xi + 1, yi + 1, zi + 1, c], xf), yf), zf)
-
-        # Normalize
-        g.bilateral_grid[x, y] = interpolated[x, y, 0] / interpolated[x, y, 1]
-
-        # ESTIMATES
-        # (This can be useful in conjunction with RunGen and benchmarks as well
-        # as auto-schedule, so we do it in all cases.)
-        # Provide estimates on the input image
-        g.input_buf.set_estimates([(0, 1536), (0, 2560)])
-        # Provide estimates on the parameters
-        g.r_sigma.set_estimate(0.1)
-        # TODO: Compute estimates from the parameter values
-        histogram.set_estimate(z, -2, 16)
-        blurz.set_estimate(z, 0, 12)
-        blurx.set_estimate(z, 0, 12)
-        blury.set_estimate(z, 0, 12)
-        g.bilateral_grid.set_estimates([(0, 1536), (0, 2560)])
-
-        if g.using_autoscheduler():
-            # nothing
-            pass
-        else:
-            if g.target().has_gpu_feature():
-                # 0.50ms on an RTX 2060
-
-                xi = hl.Var('xi')
-                yi = hl.Var('yi')
-                zi = hl.Var('zi')
-
-                # Schedule blurz in 8x8 tiles. This is a tile in
-                # grid-space, which means it represents something like
-                # 64x64 pixels in the input (if s_sigma is 8).
-                blurz.compute_root().reorder(c, z, x, y).gpu_tile(x, y, xi, yi, 8, 8)
-
-                # Schedule histogram to happen per-tile of blurz, with
-                # intermediate results in shared memory. This means histogram
-                # and blurz makes a three-stage kernel:
-                # 1) Zero out the 8x8 set of histograms
-                # 2) Compute those histogram by iterating over lots of the input image
-                # 3) Blur the set of histograms in z
-                histogram.reorder(c, z, x, y).compute_at(blurz, x).gpu_threads(x, y)
-                histogram.update().reorder(c, r.x, r.y, x, y).gpu_threads(x, y).unroll(c)
-
-                # Schedule the remaining blurs and the sampling at the end similarly.
-                blurx.compute_root().reorder(c, x, y, z).reorder_storage(c, x, y, z).vectorize(c).unroll(
-                    y, 2, hl.TailStrategy.RoundUp).gpu_tile(x, y, z, xi, yi, zi, 32, 8, 1, hl.TailStrategy.RoundUp)
-                blury.compute_root().reorder(c, x, y, z).reorder_storage(c, x, y, z).vectorize(c).unroll(
-                    y, 2, hl.TailStrategy.RoundUp).gpu_tile(x, y, z, xi, yi, zi, 32, 8, 1, hl.TailStrategy.RoundUp)
-                g.bilateral_grid.compute_root().gpu_tile(x, y, xi, yi, 32, 8)
-                interpolated.compute_at(g.bilateral_grid, xi).vectorize(c)
-            else:
-                # CPU schedule.
-
-                # 3.98ms on an Intel i9-9960X using 32 threads at 3.7 GHz
-                # using target x86-64-avx2. This is a little less
-                # SIMD-friendly than some of the other apps, so we
-                # benefit from hyperthreading, and don't benefit from
-                # AVX-512, which on my machine reduces the clock to 3.0
-                # GHz.
-
-                blurz.compute_root().reorder(c, z, x, y).parallel(y).vectorize(x, 8).unroll(c)
-                histogram.compute_at(blurz, y)
-                histogram.update().reorder(c, r.x, r.y, x, y).unroll(c)
-                blurx.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 8).unroll(c)
-                blury.compute_root().reorder(c, x, y, z).parallel(z).vectorize(x, 8).unroll(c)
-                g.bilateral_grid.compute_root().parallel(y).vectorize(x, 8)
-
-
-if __name__ == '__main__':
-    hl.main()