From b1ca3345b81b3a5ebee16278d1a603b4fe7eae31 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 15 Jul 2022 15:13:50 -0700 Subject: [PATCH 1/2] Rework autoscheduler API (#6788) (#6838) * Rework autoscheduler API (#6788) * Oops * Update test_function_dag.cpp * clang-tidy * trigger buildbots * Update Generator.h * Minor cleanups * Update README_cmake.md * Check for malformed autoscheduler_params dicts * Add alias-with-autoscheduler code, plus tweaks * Update stubtest_jittest.cpp * Update Makefile * trigger buildbots * fixes * Update AbstractGenerator.cpp * Update stubtest_generator.cpp * Update Makefile * Add deprecation warning for HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API * Make AutoschedulerParams a real struct * clang-tidy --- Makefile | 29 ++- README_cmake.md | 7 +- apps/HelloPyTorch/Makefile | 36 ++-- apps/HelloPyTorch/src/add_generator.cpp | 4 +- apps/bgu/Makefile | 4 +- apps/bgu/bgu_generator.cpp | 4 +- apps/bilateral_grid/Makefile | 4 +- .../bilateral_grid_generator.cpp | 2 +- apps/camera_pipe/Makefile | 4 +- apps/camera_pipe/camera_pipe_generator.cpp | 12 +- apps/conv_layer/Makefile | 4 +- apps/conv_layer/conv_layer_generator.cpp | 2 +- apps/depthwise_separable_conv/Makefile | 4 +- .../depthwise_separable_conv_generator.cpp | 4 +- apps/harris/Makefile | 4 +- apps/harris/harris_generator.cpp | 2 +- apps/hist/Makefile | 4 +- apps/hist/hist_generator.cpp | 2 +- apps/iir_blur/Makefile | 4 +- apps/iir_blur/iir_blur_generator.cpp | 4 +- apps/interpolate/Makefile | 4 +- apps/interpolate/interpolate_generator.cpp | 2 +- apps/lens_blur/Makefile | 4 +- apps/lens_blur/lens_blur_generator.cpp | 2 +- apps/linear_blur/linear_blur_generator.cpp | 2 +- apps/linear_blur/linear_to_srgb_generator.cpp | 2 +- apps/linear_blur/simple_blur_generator.cpp | 2 +- apps/linear_blur/srgb_to_linear_generator.cpp | 2 +- apps/local_laplacian/Makefile | 4 +- .../local_laplacian_generator.cpp | 2 +- apps/max_filter/Makefile | 4 +- apps/max_filter/max_filter_generator.cpp | 2 +- 
apps/nl_means/Makefile | 4 +- apps/nl_means/nl_means_generator.cpp | 2 +- apps/resnet_50/Makefile | 2 +- apps/stencil_chain/Makefile | 4 +- .../stencil_chain/stencil_chain_generator.cpp | 2 +- apps/support/autoscheduler.inc | 99 ----------- apps/unsharp/Makefile | 4 +- apps/unsharp/unsharp_generator.cpp | 2 +- cmake/HalideGeneratorHelpers.cmake | 5 +- python_bindings/src/PyHalide.cpp | 4 + python_bindings/src/PyMachineParams.cpp | 2 + python_bindings/src/PyMachineParams.h | 2 + python_bindings/src/PyModule.cpp | 6 +- python_bindings/src/PyPipeline.cpp | 32 +++- python_bindings/todo.txt | 1 - src/AbstractGenerator.cpp | 25 ++- src/AbstractGenerator.h | 2 +- src/Generator.cpp | 166 ++++++++++++++++-- src/Generator.h | 92 ++++++++-- src/Module.cpp | 30 +++- src/Pipeline.cpp | 35 ++++ src/Pipeline.h | 60 ++++++- src/autoschedulers/adams2019/AutoSchedule.cpp | 43 +++-- src/autoschedulers/adams2019/AutoSchedule.h | 2 +- src/autoschedulers/adams2019/Cache.cpp | 2 +- src/autoschedulers/adams2019/Cache.h | 2 +- src/autoschedulers/adams2019/CostModel.h | 10 +- .../adams2019/DefaultCostModel.cpp | 2 +- .../adams2019/DefaultCostModel.h | 8 +- src/autoschedulers/adams2019/FunctionDAG.cpp | 2 +- src/autoschedulers/adams2019/FunctionDAG.h | 4 +- src/autoschedulers/adams2019/LoopNest.cpp | 6 +- src/autoschedulers/adams2019/LoopNest.h | 6 +- src/autoschedulers/adams2019/Makefile | 11 +- src/autoschedulers/adams2019/State.cpp | 10 +- src/autoschedulers/adams2019/State.h | 10 +- src/autoschedulers/adams2019/autotune_loop.sh | 7 +- .../adams2019/cost_model_generator.cpp | 6 +- .../included_schedule_file_generator.cpp | 2 +- src/autoschedulers/adams2019/test.cpp | 70 +++++--- .../adams2019/test_function_dag.cpp | 18 +- .../li2018/GradientAutoscheduler.cpp | 43 ++++- src/autoschedulers/li2018/Makefile | 2 +- src/autoschedulers/li2018/test.cpp | 64 +++++-- src/autoschedulers/li2018/test.py | 5 +- .../mullapudi2016/AutoSchedule.cpp | 61 ++++++- test/auto_schedule/cost_function.cpp | 4 + 
test/auto_schedule/data_dependent.cpp | 4 + test/auto_schedule/extern.cpp | 12 ++ test/auto_schedule/fibonacci.cpp | 4 + test/auto_schedule/histogram.cpp | 4 + test/auto_schedule/large_window.cpp | 4 + test/auto_schedule/mat_mul.cpp | 4 + test/auto_schedule/max_filter.cpp | 4 + test/auto_schedule/multi_output.cpp | 10 +- test/auto_schedule/overlap.cpp | 4 + test/auto_schedule/param.cpp | 16 ++ test/auto_schedule/reorder.cpp | 12 ++ test/auto_schedule/small_pure_update.cpp | 7 +- test/auto_schedule/tile_vs_inline.cpp | 4 + test/auto_schedule/unused_func.cpp | 4 + .../auto_schedule/vectorize_var_in_update.cpp | 4 + test/correctness/custom_auto_scheduler.cpp | 21 ++- test/error/auto_schedule_no_parallel.cpp | 4 + test/error/auto_schedule_no_reorder.cpp | 4 + test/generator/CMakeLists.txt | 15 +- test/generator/alias_aottest.cpp | 36 ++++ test/generator/alias_generator.cpp | 18 ++ test/generator/example_generator.cpp | 2 +- test/generator/stubtest_generator.cpp | 2 +- test/generator/stubtest_jittest.cpp | 4 +- tutorial/CMakeLists.txt | 7 +- .../lesson_21_auto_scheduler_generate.cpp | 52 +++--- 105 files changed, 1016 insertions(+), 399 deletions(-) delete mode 100644 apps/support/autoscheduler.inc diff --git a/Makefile b/Makefile index 45440516c540..97d481012909 100644 --- a/Makefile +++ b/Makefile @@ -1443,6 +1443,18 @@ $(FILTERS_DIR)/alias_with_offset_42.a: $(BIN_DIR)/alias.generator @mkdir -p $(@D) $(CURDIR)/$< -g alias_with_offset_42 -f alias_with_offset_42 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime +$(FILTERS_DIR)/alias_Adams2019.a: $(BIN_DIR)/alias.generator autoschedulers + @mkdir -p $(@D) + $(CURDIR)/$< -g alias_Adams2019 -f alias_Adams2019 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_DIR)/libautoschedule_adams2019.$(SHARED_EXT) + +$(FILTERS_DIR)/alias_Li2018.a: $(BIN_DIR)/alias.generator autoschedulers + @mkdir -p $(@D) + $(CURDIR)/$< -g alias_Li2018 -f alias_Li2018 $(GEN_AOT_OUTPUTS) -o 
$(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_DIR)/libautoschedule_li2018.$(SHARED_EXT) + +$(FILTERS_DIR)/alias_Mullapudi2016.a: $(BIN_DIR)/alias.generator autoschedulers + @mkdir -p $(@D) + $(CURDIR)/$< -g alias_Mullapudi2016 -f alias_Mullapudi2016 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_DIR)/libautoschedule_mullapudi2016.$(SHARED_EXT) + METADATA_TESTER_GENERATOR_ARGS=\ input.type=uint8 input.dim=3 \ dim_only_input_buffer.type=uint8 \ @@ -1552,7 +1564,7 @@ $(FILTERS_DIR)/stubtest.a: $(BIN_DIR)/stubtest.generator $(FILTERS_DIR)/stubuser_auto.a: $(BIN_DIR)/stubuser.generator $(BIN_MULLAPUDI2016) @mkdir -p $(@D) - $(CURDIR)/$< -g stubuser $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f stubuser_auto target=$(TARGET)-no_runtime auto_schedule=true -s Mullapudi2016 -p $(BIN_MULLAPUDI2016) + $(CURDIR)/$< -g stubuser $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f stubuser_auto target=$(TARGET)-no_runtime autoscheduler=Mullapudi2016 -p $(BIN_MULLAPUDI2016) $(FILTERS_DIR)/external_code.a: $(BIN_DIR)/external_code.generator @mkdir -p $(@D) @@ -1564,7 +1576,7 @@ $(FILTERS_DIR)/external_code.halide_generated.cpp: $(BIN_DIR)/external_code.gene $(FILTERS_DIR)/autograd_grad.a: $(BIN_DIR)/autograd.generator $(BIN_MULLAPUDI2016) @mkdir -p $(@D) - $(CURDIR)/$< -g autograd $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f autograd_grad target=$(TARGET)-no_runtime auto_schedule=true -s Mullapudi2016 -d 1 -p $(BIN_MULLAPUDI2016) + $(CURDIR)/$< -g autograd $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f autograd_grad target=$(TARGET)-no_runtime autoscheduler=Mullapudi2016 -d 1 -p $(BIN_MULLAPUDI2016) # Usually, it's considered best practice to have one Generator per # .cpp file, with the generator-name and filename matching; @@ -1611,12 +1623,13 @@ $(BIN_DIR)/$(TARGET)/generator_aot_sanitizercoverage: $(ROOT_DIR)/test/generator @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter-out %.h,$^) $(GEN_AOT_INCLUDES) 
$(GEN_AOT_LD_FLAGS) -o $@ + # alias has additional deps to link in -$(BIN_DIR)/$(TARGET)/generator_aot_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.a $(FILTERS_DIR)/alias_with_offset_42.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a +$(BIN_DIR)/$(TARGET)/generator_aot_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.a $(FILTERS_DIR)/alias_with_offset_42.a $(FILTERS_DIR)/alias_Adams2019.a $(FILTERS_DIR)/alias_Li2018.a $(FILTERS_DIR)/alias_Mullapudi2016.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ -$(BIN_DIR)/$(TARGET)/generator_aotcpp_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.halide_generated.cpp $(FILTERS_DIR)/alias_with_offset_42.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a +$(BIN_DIR)/$(TARGET)/generator_aotcpp_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.halide_generated.cpp $(FILTERS_DIR)/alias_with_offset_42.halide_generated.cpp $(FILTERS_DIR)/alias_Adams2019.halide_generated.cpp $(FILTERS_DIR)/alias_Li2018.halide_generated.cpp $(FILTERS_DIR)/alias_Mullapudi2016.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ @@ -1841,13 +1854,17 @@ $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate: $(ROOT_DIR)/tutorial/less $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ -# The values in MachineParams are: +# The values are: # - the maximum level of parallelism available, # - the size of the last-level cache (in bytes), # - the ratio between the cost of a miss at the last level cache and the cost # of arithmetic on the 
target architecture # ...in that order. -LESSON_21_MACHINE_PARAMS = 32,16777216,40 +LESSON_21_AUTOSCHEDULER_PARAMS=\ + autoscheduler=Mullapudi2016 \ + autoscheduler.parallelism=32 \ + autoscheduler.last_level_cache_size=16777216 \ + autoscheduler.balance=40 $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_run: $(ROOT_DIR)/tutorial/lesson_21_auto_scheduler_run.cpp $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate $(BIN_MULLAPUDI2016) @-mkdir -p $(TMP_DIR) diff --git a/README_cmake.md b/README_cmake.md index c6d971e03c71..0ceb4b931dc9 100644 --- a/README_cmake.md +++ b/README_cmake.md @@ -677,8 +677,7 @@ autoscheduler: ```cmake add_halide_library(my_second_generator FROM my_generators - AUTOSCHEDULER Halide::Adams2019 - PARAMS auto_schedule=true) + AUTOSCHEDULER Halide::Adams2019) ``` ### RunGenMain @@ -858,9 +857,9 @@ being created. When `TARGETS` is empty and the `host` target would not cross-compile, then `host` will be used. Otherwise, `cmake` will be used and an author warning will be issued. -To set the default autoscheduler, set the `AUTOSCHEDULER` argument to a target +To use an autoscheduler, set the `AUTOSCHEDULER` argument to a target named like `Namespace::Scheduler`, for example `Halide::Adams19`. This will set -the `-s` flag on the generator command line to `Scheduler` and add the target to +the `autoscheduler` GeneratorParam on the generator command line to `Scheduler` and add the target to the list of plugins. Additional plugins can be loaded by setting the `PLUGINS` argument. If the argument to `AUTOSCHEDULER` does not contain `::` or it does not name a target, it will be passed to the `-s` flag verbatim. 
diff --git a/apps/HelloPyTorch/Makefile b/apps/HelloPyTorch/Makefile index 15dd231b99de..c05d2826a475 100644 --- a/apps/HelloPyTorch/Makefile +++ b/apps/HelloPyTorch/Makefile @@ -84,8 +84,7 @@ $(BIN)/%/add_float32.a: $(GENERATOR_BIN)/add.generator -f add_float32 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$* \ - auto_schedule=false + target=$* $(BIN)/%/add_halidegrad_float32.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -95,11 +94,10 @@ $(BIN)/%/add_halidegrad_float32.a: $(GENERATOR_BIN)/add.generator -f add_halidegrad_float32 \ -e static_library,c_header,pytorch_wrapper \ -p $(HALIDE_DISTRIB_PATH)/lib/libautoschedule_li2018.so \ - -s Li2018 \ -o $(@D) \ -d 1 \ target=$* \ - auto_schedule=true + autoscheduler=Li2018 $(BIN)/%/add_grad_float32.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -109,8 +107,7 @@ $(BIN)/%/add_grad_float32.a: $(GENERATOR_BIN)/add.generator -f add_grad_float32 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$* \ - auto_schedule=false + target=$* $(BIN)/%/add_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -120,8 +117,7 @@ $(BIN)/%/add_float64.a: $(GENERATOR_BIN)/add.generator -f add_float64 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$* \ - auto_schedule=false + target=$* $(BIN)/%/add_halidegrad_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -132,11 +128,10 @@ $(BIN)/%/add_halidegrad_float64.a: $(GENERATOR_BIN)/add.generator -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ -p $(HALIDE_DISTRIB_PATH)/lib/libautoschedule_li2018.so \ - -s Li2018 \ target=$* \ -d 1 \ target=$* \ - auto_schedule=true + autoscheduler=Li2018 $(BIN)/%/add_grad_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -146,8 +141,7 @@ $(BIN)/%/add_grad_float64.a: $(GENERATOR_BIN)/add.generator -f add_grad_float64 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$* \ - auto_schedule=false + target=$* # 
----------------------------------------------------------------------------- @@ -160,8 +154,7 @@ $(BIN)/%/add_cuda_float32.a: $(GENERATOR_BIN)/add.generator -f add_cuda_float32 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$(CUDA_TARGET) \ - auto_schedule=false + target=$(CUDA_TARGET) $(BIN)/%/add_halidegrad_cuda_float32.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -172,10 +165,9 @@ $(BIN)/%/add_halidegrad_cuda_float32.a: $(GENERATOR_BIN)/add.generator -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ -p $(HALIDE_DISTRIB_PATH)/lib/libautoschedule_li2018.so \ - -s Li2018 \ -d 1 \ target=$(CUDA_TARGET) \ - auto_schedule=true + autoscheduler=Li2018 $(BIN)/%/add_grad_cuda_float32.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -185,8 +177,7 @@ $(BIN)/%/add_grad_cuda_float32.a: $(GENERATOR_BIN)/add.generator -f add_grad_cuda_float32 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$(CUDA_TARGET) \ - auto_schedule=false + target=$(CUDA_TARGET) $(BIN)/%/add_cuda_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -196,8 +187,7 @@ $(BIN)/%/add_cuda_float64.a: $(GENERATOR_BIN)/add.generator -f add_cuda_float64 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$(CUDA_TARGET) \ - auto_schedule=false + target=$(CUDA_TARGET) $(BIN)/%/add_halidegrad_cuda_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -208,10 +198,9 @@ $(BIN)/%/add_halidegrad_cuda_float64.a: $(GENERATOR_BIN)/add.generator -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ -p $(HALIDE_DISTRIB_PATH)/lib/libautoschedule_li2018.so \ - -s Li2018 \ -d 1 \ target=$(CUDA_TARGET) \ - auto_schedule=true + autoscheduler=Li2018 $(BIN)/%/add_grad_cuda_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -221,8 +210,7 @@ $(BIN)/%/add_grad_cuda_float64.a: $(GENERATOR_BIN)/add.generator -f add_grad_cuda_float64 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$(CUDA_TARGET) \ - auto_schedule=false + 
target=$(CUDA_TARGET) # ----------------------------------------------------------------------------- diff --git a/apps/HelloPyTorch/src/add_generator.cpp b/apps/HelloPyTorch/src/add_generator.cpp index 8f2d8f4d6a81..ccfaa937d5e9 100644 --- a/apps/HelloPyTorch/src/add_generator.cpp +++ b/apps/HelloPyTorch/src/add_generator.cpp @@ -30,7 +30,7 @@ class AddGenerator : public Generator { output.set_estimates({{0, kEdge}, {0, kEdge}, {0, kEdge}, {0, kEdge}}); // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { Var tx("tx"), xy("xy"), cn("cn"), allvars("allvars"); if (get_target().has_gpu_feature()) { output @@ -84,7 +84,7 @@ class AddGradGenerator : public Generator { d_input_b.set_estimates({{0, kEdge}, {0, kEdge}, {0, kEdge}, {0, kEdge}}); // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { Var tx("tx"), xy("xy"), cn("cn"), allvars("allvars"); if (get_target().has_gpu_feature()) { diff --git a/apps/bgu/Makefile b/apps/bgu/Makefile index 297ceaee90b0..8eb687ec064a 100644 --- a/apps/bgu/Makefile +++ b/apps/bgu/Makefile @@ -16,11 +16,11 @@ $(GENERATOR_BIN)/bgu.generator: bgu_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/bgu.a: $(GENERATOR_BIN)/bgu.generator @mkdir -p $(@D) - $< -g bgu -f bgu -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g bgu -f bgu -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/bgu_auto_schedule.a: $(GENERATOR_BIN)/bgu.generator @mkdir -p $(@D) - $< -g bgu -f bgu_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g bgu -f bgu_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/bgu.generator @mkdir -p $(@D) diff --git a/apps/bgu/bgu_generator.cpp b/apps/bgu/bgu_generator.cpp index 054df3e52ba6..1b2cff5b1dc7 100644 --- a/apps/bgu/bgu_generator.cpp +++ b/apps/bgu/bgu_generator.cpp @@ -430,7 +430,7 @@ class BGU : public Generator { b(2, 2) += weighted_lambda * gain; // Now solve Ax = b - Matrix<3, 4> result = transpose(solve_symmetric(A, 
b, line, x, auto_schedule, get_target())); + Matrix<3, 4> result = transpose(solve_symmetric(A, b, line, x, using_autoscheduler(), get_target())); // Pack the resulting matrix into the output Func. line(x, y, z, c) = pack_channels(c, {result(0, 0), @@ -509,7 +509,7 @@ class BGU : public Generator { output = slice; // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { if (!get_target().has_gpu_feature()) { // 7.09 ms on an Intel i9-9960X using 16 threads // diff --git a/apps/bilateral_grid/Makefile b/apps/bilateral_grid/Makefile index 405d3e3c6782..11d79fbcd946 100644 --- a/apps/bilateral_grid/Makefile +++ b/apps/bilateral_grid/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/bilateral_grid.generator: bilateral_grid_generator.cpp $(GENERA $(BIN)/%/bilateral_grid.a: $(GENERATOR_BIN)/bilateral_grid.generator @mkdir -p $(@D) - $^ -g bilateral_grid -e $(GENERATOR_OUTPUTS) -o $(@D) -f bilateral_grid target=$* auto_schedule=false + $^ -g bilateral_grid -e $(GENERATOR_OUTPUTS) -o $(@D) -f bilateral_grid target=$* $(BIN)/%/bilateral_grid_auto_schedule.a: $(GENERATOR_BIN)/bilateral_grid.generator @mkdir -p $(@D) - $^ -g bilateral_grid -e $(GENERATOR_OUTPUTS) -o $(@D) -f bilateral_grid_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g bilateral_grid -e $(GENERATOR_OUTPUTS) -o $(@D) -f bilateral_grid_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/filter: filter.cpp $(BIN)/%/bilateral_grid.a $(BIN)/%/bilateral_grid_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/bilateral_grid/bilateral_grid_generator.cpp b/apps/bilateral_grid/bilateral_grid_generator.cpp index ede57459d5ab..b1e07fb15cdf 100644 --- a/apps/bilateral_grid/bilateral_grid_generator.cpp +++ b/apps/bilateral_grid/bilateral_grid_generator.cpp @@ -80,7 +80,7 @@ class BilateralGrid : public Halide::Generator { blury.set_estimate(z, 0, 12); bilateral_grid.set_estimates({{0, 1536}, {0, 2560}}); - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if 
(get_target().has_gpu_feature()) { // 0.50ms on an RTX 2060 diff --git a/apps/camera_pipe/Makefile b/apps/camera_pipe/Makefile index 38f984d2af3e..b86698cd36ed 100644 --- a/apps/camera_pipe/Makefile +++ b/apps/camera_pipe/Makefile @@ -12,11 +12,11 @@ $(GENERATOR_BIN)/camera_pipe.generator: camera_pipe_generator.cpp $(GENERATOR_DE $(BIN)/%/camera_pipe.a: $(GENERATOR_BIN)/camera_pipe.generator @mkdir -p $(@D) - $^ -g camera_pipe -e $(GENERATOR_OUTPUTS) -o $(@D) -f camera_pipe target=$* auto_schedule=false + $^ -g camera_pipe -e $(GENERATOR_OUTPUTS) -o $(@D) -f camera_pipe target=$* $(BIN)/%/camera_pipe_auto_schedule.a: $(GENERATOR_BIN)/camera_pipe.generator @mkdir -p $(@D) - $^ -g camera_pipe -e $(GENERATOR_OUTPUTS) -o $(@D) -f camera_pipe_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g camera_pipe -e $(GENERATOR_OUTPUTS) -o $(@D) -f camera_pipe_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/camera_pipe.a $(BIN)/%/camera_pipe_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/camera_pipe/camera_pipe_generator.cpp b/apps/camera_pipe/camera_pipe_generator.cpp index ec0323676cd4..06251f5691bb 100644 --- a/apps/camera_pipe/camera_pipe_generator.cpp +++ b/apps/camera_pipe/camera_pipe_generator.cpp @@ -154,7 +154,7 @@ class Demosaic : public Halide::Generator { void schedule() { Pipeline p(output); - if (auto_schedule) { + if (using_autoscheduler()) { // blank } else if (get_target().has_gpu_feature()) { Var xi, yi; @@ -270,7 +270,7 @@ Func CameraPipe::color_correct(Func input) { Expr val = (matrix_3200(x, y) * alpha + matrix_7000(x, y) * (1 - alpha)); matrix(x, y) = cast(val * 256.0f); // Q8.8 fixed point - if (!auto_schedule) { + if (!using_autoscheduler()) { matrix.compute_root(); if (get_target().has_gpu_feature()) { matrix.gpu_single_thread(); @@ -331,7 +331,7 @@ Func CameraPipe::apply_curve(Func input) { // makeLUT add guard band outside of (minRaw, maxRaw]: curve(x) = select(x <= minRaw, 0, select(x 
> maxRaw, 255, val)); - if (!auto_schedule) { + if (!using_autoscheduler()) { // It's a LUT, compute it once ahead of time. curve.compute_root(); if (get_target().has_gpu_feature()) { @@ -370,7 +370,7 @@ Func CameraPipe::sharpen(Func input) { // Convert the sharpening strength to 2.5 fixed point. This allows sharpening in the range [0, 4]. Func sharpen_strength_x32("sharpen_strength_x32"); sharpen_strength_x32() = u8_sat(sharpen_strength * 32); - if (!auto_schedule) { + if (!using_autoscheduler()) { sharpen_strength_x32.compute_root(); if (get_target().has_gpu_feature()) { sharpen_strength_x32.gpu_single_thread(); @@ -439,12 +439,12 @@ void CameraPipe::generate() { processed.set_estimates({{0, 2592}, {0, 1968}, {0, 3}}); // Schedule - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if (get_target().has_gpu_feature()) { // We can generate slightly better code if we know the output is even-sized - if (!auto_schedule) { + if (!using_autoscheduler()) { // TODO: The autoscheduler really ought to be able to // accommodate bounds on the output Func. 
Expr out_width = processed.width(); diff --git a/apps/conv_layer/Makefile b/apps/conv_layer/Makefile index 2ac64101691f..43db9f9ee70a 100644 --- a/apps/conv_layer/Makefile +++ b/apps/conv_layer/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/conv_layer.generator: conv_layer_generator.cpp $(GENERATOR_DEPS $(BIN)/%/conv_layer.a: $(GENERATOR_BIN)/conv_layer.generator @mkdir -p $(@D) - $^ -g conv_layer -e $(GENERATOR_OUTPUTS) -o $(@D) -f conv_layer target=$* auto_schedule=false + $^ -g conv_layer -e $(GENERATOR_OUTPUTS) -o $(@D) -f conv_layer target=$* $(BIN)/%/conv_layer_auto_schedule.a: $(GENERATOR_BIN)/conv_layer.generator @mkdir -p $(@D) - $^ -g conv_layer -e $(GENERATOR_OUTPUTS) -o $(@D) -f conv_layer_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g conv_layer -e $(GENERATOR_OUTPUTS) -o $(@D) -f conv_layer_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/conv_layer.a $(BIN)/%/conv_layer_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/conv_layer/conv_layer_generator.cpp b/apps/conv_layer/conv_layer_generator.cpp index 5b6ff1ee5e10..a27d367a076d 100644 --- a/apps/conv_layer/conv_layer_generator.cpp +++ b/apps/conv_layer/conv_layer_generator.cpp @@ -49,7 +49,7 @@ class ConvolutionLayer : public Halide::Generator { bias.dim(0).set_bounds(0, CO).set_stride(1); - if (auto_schedule) { + if (using_autoscheduler()) { input.dim(0).set_estimate(0, CI); input.dim(1).set_estimate(0, W + 2); input.dim(2).set_estimate(0, H + 2); diff --git a/apps/depthwise_separable_conv/Makefile b/apps/depthwise_separable_conv/Makefile index def2146eb3f6..001e12444809 100644 --- a/apps/depthwise_separable_conv/Makefile +++ b/apps/depthwise_separable_conv/Makefile @@ -8,11 +8,11 @@ $(GENERATOR_BIN)/depthwise_separable_conv.generator: depthwise_separable_conv_ge $(BIN)/%/depthwise_separable_conv.a: $(GENERATOR_BIN)/depthwise_separable_conv.generator @mkdir -p $(@D) - $^ -g depthwise_separable_conv -e $(GENERATOR_OUTPUTS) -o $(@D) 
-f depthwise_separable_conv target=$* auto_schedule=false + $^ -g depthwise_separable_conv -e $(GENERATOR_OUTPUTS) -o $(@D) -f depthwise_separable_conv target=$* $(BIN)/%/depthwise_separable_conv_auto_schedule.a: $(GENERATOR_BIN)/depthwise_separable_conv.generator @mkdir -p $(@D) - $^ -g depthwise_separable_conv -e $(GENERATOR_OUTPUTS) -o $(@D) -f depthwise_separable_conv_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g depthwise_separable_conv -e $(GENERATOR_OUTPUTS) -o $(@D) -f depthwise_separable_conv_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/depthwise_separable_conv.a $(BIN)/%/depthwise_separable_conv_auto_schedule.a @-mkdir -p $(BIN) diff --git a/apps/depthwise_separable_conv/depthwise_separable_conv_generator.cpp b/apps/depthwise_separable_conv/depthwise_separable_conv_generator.cpp index d560a8bea376..ba230ee03653 100644 --- a/apps/depthwise_separable_conv/depthwise_separable_conv_generator.cpp +++ b/apps/depthwise_separable_conv/depthwise_separable_conv_generator.cpp @@ -74,7 +74,7 @@ class DepthwiseSeparableConvolution : public Generator { } // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { Var xi("xi"), yi("yi"); if (get_target().has_gpu_feature()) { // 0.253ms on a 2060 RTX diff --git a/apps/hist/Makefile b/apps/hist/Makefile index 5f4faa1b835a..b0843bda1fb0 100644 --- a/apps/hist/Makefile +++ b/apps/hist/Makefile @@ -12,11 +12,11 @@ $(GENERATOR_BIN)/hist.generator: hist_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/hist.a: $(GENERATOR_BIN)/hist.generator @mkdir -p $(@D) - $< -g hist -f hist -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g hist -f hist -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/hist_auto_schedule.a: $(GENERATOR_BIN)/hist.generator @mkdir -p $(@D) - $< -g hist -f hist_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g hist -f hist_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 
$(BIN)/%/runtime.a: $(GENERATOR_BIN)/hist.generator @mkdir -p $(@D) diff --git a/apps/hist/hist_generator.cpp b/apps/hist/hist_generator.cpp index e3d5de7f5737..32d86d3d0186 100644 --- a/apps/hist/hist_generator.cpp +++ b/apps/hist/hist_generator.cpp @@ -64,7 +64,7 @@ class Hist : public Halide::Generator { } // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { cdf.bound(x, 0, 256); Var xi("xi"), yi("yi"); diff --git a/apps/iir_blur/Makefile b/apps/iir_blur/Makefile index 8c9983c8fa14..49104b3e5fa3 100644 --- a/apps/iir_blur/Makefile +++ b/apps/iir_blur/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/iir_blur.generator: iir_blur_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/iir_blur.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) - $< -g iir_blur -f iir_blur -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g iir_blur -f iir_blur -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/iir_blur_auto_schedule.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) - $< -g iir_blur -f iir_blur_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g iir_blur -f iir_blur_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 59ef065e79e6..1aeb3e0d1a5f 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -145,10 +145,10 @@ class IirBlur : public Generator { Expr height = input.height(); // First, blur the columns of the input. - Func blury_T = blur_cols_transpose(input, height, alpha, auto_schedule, get_target()); + Func blury_T = blur_cols_transpose(input, height, alpha, using_autoscheduler(), get_target()); // Blur the columns again (the rows of the original). 
- Func blur = blur_cols_transpose(blury_T, width, alpha, auto_schedule, get_target()); + Func blur = blur_cols_transpose(blury_T, width, alpha, using_autoscheduler(), get_target()); // Scheduling is done inside blur_cols_transpose. output = blur; diff --git a/apps/interpolate/Makefile b/apps/interpolate/Makefile index 8e55e16a1283..95c165b533ee 100644 --- a/apps/interpolate/Makefile +++ b/apps/interpolate/Makefile @@ -12,11 +12,11 @@ $(GENERATOR_BIN)/interpolate.generator: interpolate_generator.cpp $(GENERATOR_DE $(BIN)/%/interpolate.a: $(GENERATOR_BIN)/interpolate.generator @mkdir -p $(@D) - $< -g interpolate -e $(GENERATOR_OUTPUTS) -f interpolate -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g interpolate -e $(GENERATOR_OUTPUTS) -f interpolate -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/interpolate_auto_schedule.a: $(GENERATOR_BIN)/interpolate.generator @mkdir -p $(@D) - $< -g interpolate -e $(GENERATOR_OUTPUTS) -f interpolate_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g interpolate -e $(GENERATOR_OUTPUTS) -f interpolate_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/interpolate.generator @mkdir -p $(@D) diff --git a/apps/interpolate/interpolate_generator.cpp b/apps/interpolate/interpolate_generator.cpp index 58d6d65374eb..1e4026b9ef87 100644 --- a/apps/interpolate/interpolate_generator.cpp +++ b/apps/interpolate/interpolate_generator.cpp @@ -72,7 +72,7 @@ class Interpolate : public Halide::Generator { normalize(x, y, c) = interpolated[0](x, y, c) / interpolated[0](x, y, 3); // Schedule - if (auto_schedule) { + if (using_autoscheduler()) { output = normalize; } else { // 0.86ms on a 2060 RTX diff --git a/apps/lens_blur/Makefile b/apps/lens_blur/Makefile index 8ede6b797ffe..c5c424c82edf 100644 --- a/apps/lens_blur/Makefile +++ b/apps/lens_blur/Makefile @@ -11,11 +11,11 @@ $(GENERATOR_BIN)/lens_blur.generator: lens_blur_generator.cpp $(GENERATOR_DEPS) 
$(BIN)/%/lens_blur.a: $(GENERATOR_BIN)/lens_blur.generator @mkdir -p $(@D) - $^ -g lens_blur -e $(GENERATOR_OUTPUTS) -o $(@D) -f lens_blur target=$* auto_schedule=false + $^ -g lens_blur -e $(GENERATOR_OUTPUTS) -o $(@D) -f lens_blur target=$* $(BIN)/%/lens_blur_auto_schedule.a: $(GENERATOR_BIN)/lens_blur.generator @mkdir -p $(@D) - $^ -g lens_blur -e $(GENERATOR_OUTPUTS) -o $(@D) -f lens_blur_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g lens_blur -e $(GENERATOR_OUTPUTS) -o $(@D) -f lens_blur_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/lens_blur.a $(BIN)/%/lens_blur_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/lens_blur/lens_blur_generator.cpp b/apps/lens_blur/lens_blur_generator.cpp index 52fad46cb82b..14aa92c876f2 100644 --- a/apps/lens_blur/lens_blur_generator.cpp +++ b/apps/lens_blur/lens_blur_generator.cpp @@ -166,7 +166,7 @@ class LensBlur : public Halide::Generator { final.set_estimates({{0, 192}, {0, 320}, {0, 3}}); /* THE SCHEDULE */ - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if (get_target().has_gpu_feature()) { // Manual GPU schedule diff --git a/apps/linear_blur/linear_blur_generator.cpp b/apps/linear_blur/linear_blur_generator.cpp index 9b18e4b4bd3d..ec9db2e8097b 100644 --- a/apps/linear_blur/linear_blur_generator.cpp +++ b/apps/linear_blur/linear_blur_generator.cpp @@ -17,7 +17,7 @@ struct LinearBlur : public Halide::Generator { Func srgb = linear_to_srgb::generate(this, {blurred}); output(x, y, c) = srgb(x, y, c); - if (auto_schedule) { + if (using_autoscheduler()) { input.set_estimates({{0, 1536}, {0, 2560}, {0, 4}}); output.set_estimates({{0, 1536}, {0, 2560}, {0, 4}}); } else { diff --git a/apps/linear_blur/linear_to_srgb_generator.cpp b/apps/linear_blur/linear_to_srgb_generator.cpp index adf7b9426712..a45285e3b5a8 100644 --- a/apps/linear_blur/linear_to_srgb_generator.cpp +++ b/apps/linear_blur/linear_to_srgb_generator.cpp @@ -17,7 +17,7 @@ 
struct LinearTosRGB : public Halide::Generator { } void schedule() { - if (auto_schedule) { + if (using_autoscheduler()) { const int W = 1536, H = 2560, C = 4; // Wart: Input are defined with Vars we don't know. // Might be x,y but might be _0,_1. Use the args() to work around. diff --git a/apps/linear_blur/simple_blur_generator.cpp b/apps/linear_blur/simple_blur_generator.cpp index a53a3e26c426..78d23ae253cd 100644 --- a/apps/linear_blur/simple_blur_generator.cpp +++ b/apps/linear_blur/simple_blur_generator.cpp @@ -22,7 +22,7 @@ struct SimpleBlur : public Halide::Generator { } void schedule() { - if (auto_schedule) { + if (using_autoscheduler()) { const int W = 1536, H = 2560, C = 4; // Wart: Input are defined with Vars we don't know. // Might be x,y but might be _0,_1. Use the args() to work around. diff --git a/apps/linear_blur/srgb_to_linear_generator.cpp b/apps/linear_blur/srgb_to_linear_generator.cpp index b03907463c83..95cf203ada85 100644 --- a/apps/linear_blur/srgb_to_linear_generator.cpp +++ b/apps/linear_blur/srgb_to_linear_generator.cpp @@ -17,7 +17,7 @@ struct sRGBToLinear : public Halide::Generator { } void schedule() { - if (auto_schedule) { + if (using_autoscheduler()) { const int W = 1536, H = 2560, C = 4; // Wart: Input are defined with Vars we don't know. // Might be x,y but might be _0,_1. Use the args() to work around. 
diff --git a/apps/local_laplacian/Makefile b/apps/local_laplacian/Makefile index 21fa7bf74f6b..a9f57b4de81a 100644 --- a/apps/local_laplacian/Makefile +++ b/apps/local_laplacian/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/local_laplacian.generator: local_laplacian_generator.cpp $(GENE $(BIN)/%/local_laplacian.a: $(GENERATOR_BIN)/local_laplacian.generator @mkdir -p $(@D) - $^ -g local_laplacian -e $(GENERATOR_OUTPUTS) -o $(@D) -f local_laplacian target=$* auto_schedule=false + $^ -g local_laplacian -e $(GENERATOR_OUTPUTS) -o $(@D) -f local_laplacian target=$* $(BIN)/%/local_laplacian_auto_schedule.a: $(GENERATOR_BIN)/local_laplacian.generator @mkdir -p $(@D) - $^ -g local_laplacian -e $(GENERATOR_OUTPUTS) -o $(@D) -f local_laplacian_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g local_laplacian -e $(GENERATOR_OUTPUTS) -o $(@D) -f local_laplacian_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/local_laplacian.a $(BIN)/%/local_laplacian_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index b1c697a2a3b7..ee6e7dc09c57 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -98,7 +98,7 @@ class LocalLaplacian : public Halide::Generator { output.set_estimates({{0, 1536}, {0, 2560}, {0, 3}}); /* THE SCHEDULE */ - if (auto_schedule) { + if (using_autoscheduler()) { // Nothing. } else if (get_target().has_gpu_feature()) { // GPU schedule. 
diff --git a/apps/max_filter/Makefile b/apps/max_filter/Makefile index bd755774b2f5..ec7fdc7e0739 100644 --- a/apps/max_filter/Makefile +++ b/apps/max_filter/Makefile @@ -12,11 +12,11 @@ $(GENERATOR_BIN)/max_filter.generator: max_filter_generator.cpp $(GENERATOR_DEPS $(BIN)/%/max_filter.a: $(GENERATOR_BIN)/max_filter.generator @mkdir -p $(@D) - $< -g max_filter -f max_filter -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g max_filter -f max_filter -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/max_filter_auto_schedule.a: $(GENERATOR_BIN)/max_filter.generator @mkdir -p $(@D) - $< -g max_filter -f max_filter_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g max_filter -f max_filter_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/max_filter.generator @mkdir -p $(@D) diff --git a/apps/max_filter/max_filter_generator.cpp b/apps/max_filter/max_filter_generator.cpp index 02856a5e4604..bfe0c9457e23 100644 --- a/apps/max_filter/max_filter_generator.cpp +++ b/apps/max_filter/max_filter_generator.cpp @@ -64,7 +64,7 @@ class Max : public Halide::Generator { } // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { if (get_target().has_gpu_feature()) { // 11.8ms on a 2060 RTX diff --git a/apps/nl_means/Makefile b/apps/nl_means/Makefile index 2c7fecdccc47..109cb5af13f7 100644 --- a/apps/nl_means/Makefile +++ b/apps/nl_means/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/nl_means.generator: nl_means_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/nl_means.a: $(GENERATOR_BIN)/nl_means.generator @mkdir -p $(@D) - $^ -g nl_means -e $(GENERATOR_OUTPUTS) -o $(@D) -f nl_means target=$* auto_schedule=false + $^ -g nl_means -e $(GENERATOR_OUTPUTS) -o $(@D) -f nl_means target=$* $(BIN)/%/nl_means_auto_schedule.a: $(GENERATOR_BIN)/nl_means.generator @mkdir -p $(@D) - $^ -g nl_means -e $(GENERATOR_OUTPUTS) -o $(@D) -f nl_means_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g 
nl_means -e $(GENERATOR_OUTPUTS) -o $(@D) -f nl_means_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/nl_means.a $(BIN)/%/nl_means_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/nl_means/nl_means_generator.cpp b/apps/nl_means/nl_means_generator.cpp index ec51844119ed..5b3e136111ff 100644 --- a/apps/nl_means/nl_means_generator.cpp +++ b/apps/nl_means/nl_means_generator.cpp @@ -81,7 +81,7 @@ class NonLocalMeans : public Halide::Generator { // Provide estimates on the output pipeline non_local_means.set_estimates({{0, 1536}, {0, 2560}, {0, 3}}); - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if (get_target().has_gpu_feature()) { // 22 ms on a 2060 RTX diff --git a/apps/resnet_50/Makefile b/apps/resnet_50/Makefile index 3d1dd30c9ce8..5303bd06e449 100644 --- a/apps/resnet_50/Makefile +++ b/apps/resnet_50/Makefile @@ -17,7 +17,7 @@ $(GENERATOR_BIN)/resnet50.generator: Resnet50Generator.cpp $(GENERATOR_DEPS) $(BIN)/%/resnet50.a: $(GENERATOR_BIN)/resnet50.generator @mkdir -p $(@D) - $^ -g resnet50 -o $(@D) -f resnet50 target=$* auto_schedule=false + $^ -g resnet50 -o $(@D) -f resnet50 target=$* $(BIN)/%/process: process.cpp $(BIN)/%/resnet50.a @mkdir -p $(@D) diff --git a/apps/stencil_chain/Makefile b/apps/stencil_chain/Makefile index 116922d03095..4c2706e66cd5 100644 --- a/apps/stencil_chain/Makefile +++ b/apps/stencil_chain/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/stencil_chain.generator: stencil_chain_generator.cpp $(GENERATO $(BIN)/%/stencil_chain.a: $(GENERATOR_BIN)/stencil_chain.generator @mkdir -p $(@D) - $^ -g stencil_chain -e $(GENERATOR_OUTPUTS) -o $(@D) -f stencil_chain target=$* auto_schedule=false + $^ -g stencil_chain -e $(GENERATOR_OUTPUTS) -o $(@D) -f stencil_chain target=$* $(BIN)/%/stencil_chain_auto_schedule.a: $(GENERATOR_BIN)/stencil_chain.generator @mkdir -p $(@D) - $^ -g stencil_chain -e $(GENERATOR_OUTPUTS) -o $(@D) -f stencil_chain_auto_schedule 
target=$*-no_runtime auto_schedule=true + $^ -g stencil_chain -e $(GENERATOR_OUTPUTS) -o $(@D) -f stencil_chain_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/stencil_chain.a $(BIN)/%/stencil_chain_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/stencil_chain/stencil_chain_generator.cpp b/apps/stencil_chain/stencil_chain_generator.cpp index ebe07d51bdba..f62f269d6146 100644 --- a/apps/stencil_chain/stencil_chain_generator.cpp +++ b/apps/stencil_chain/stencil_chain_generator.cpp @@ -45,7 +45,7 @@ class StencilChain : public Halide::Generator { output.set_estimates({{0, width}, {0, height}}); } - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if (get_target().has_gpu_feature()) { // GPU schedule diff --git a/apps/support/autoscheduler.inc b/apps/support/autoscheduler.inc deleted file mode 100644 index fc3aeb8f1876..000000000000 --- a/apps/support/autoscheduler.inc +++ /dev/null @@ -1,99 +0,0 @@ -ifndef BIN -$(error BIN must be set prior to including autoscheduler.inc) -endif - -AUTOSCHED_SRC ?= $(realpath ../autoscheduler) - -# Default to $(BIN) so that the toplevel Makefile can put all build products -# into the build products directory (rather than into the source tree) -AUTOSCHED_BIN ?= $(BIN) -AUTOSCHED_SAMPLES_OUT ?= $(AUTOSCHED_SRC)/samples - -AUTOSCHED_WEIGHT_OBJECTS=$(AUTOSCHED_BIN)/baseline_weights.o - -# TODO(srj): depending on something not in the distrib folder isn't strictly -# kosher, but this is still experimental -$(AUTOSCHED_BIN)/binary2cpp: ../../tools/binary2cpp.cpp - @mkdir -p $(@D) - $(CXX) $< -o $@ - -$(AUTOSCHED_BIN)/baseline_weights.cpp: $(AUTOSCHED_BIN)/binary2cpp $(AUTOSCHED_SRC)/baseline.weights - @mkdir -p $(@D) - $(AUTOSCHED_BIN)/binary2cpp baseline_weights < $(AUTOSCHED_SRC)/baseline.weights > $@ - -$(AUTOSCHED_BIN)/baseline_weights.o: $(AUTOSCHED_BIN)/baseline_weights.cpp - $(CXX) -c $< -o $@ - -AUTOSCHED_COST_MODEL_LIBS=\ 
-$(AUTOSCHED_BIN)/cost_model/cost_model.a \ -$(AUTOSCHED_BIN)/cost_model/train_cost_model.a \ - -$(AUTOSCHED_BIN)/cost_model.generator: $(AUTOSCHED_SRC)/cost_model_generator.cpp \ - $(AUTOSCHED_SRC)/cost_model_schedule.h \ - $(AUTOSCHED_SRC)/NetworkSize.h \ - $(GENERATOR_DEPS) - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_EXPORT_DYNAMIC) - -$(AUTOSCHED_BIN)/auto_schedule_runtime.a: $(AUTOSCHED_BIN)/cost_model.generator - @mkdir -p $(@D) - $^ -r auto_schedule_runtime -o $(AUTOSCHED_BIN) target=$(HL_TARGET) - -$(AUTOSCHED_BIN)/cost_model/%.a: $(AUTOSCHED_BIN)/cost_model.generator - @mkdir -p $(@D) - $^ -g $* -o $(AUTOSCHED_BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime auto_schedule=false -e stmt,static_library,h,assembly - -# It's important to use dynamic lookups for undefined symbols here: all of libHalide -# is expected to be present (in the loading binary), so we explicitly make the symbols -# undefined rather than dependent on libHalide.so. 
-$(AUTOSCHED_BIN)/libauto_schedule.so: $(AUTOSCHED_SRC)/AutoSchedule.cpp \ - $(AUTOSCHED_SRC)/ASLog.cpp \ - $(AUTOSCHED_SRC)/DefaultCostModel.h \ - $(AUTOSCHED_SRC)/DefaultCostModel.cpp \ - $(AUTOSCHED_SRC)/Weights.h \ - $(AUTOSCHED_SRC)/Weights.cpp \ - $(AUTOSCHED_SRC)/FunctionDAG.h \ - $(AUTOSCHED_SRC)/FunctionDAG.cpp \ - $(AUTOSCHED_SRC)/LoopNest.h \ - $(AUTOSCHED_SRC)/LoopNest.cpp \ - $(AUTOSCHED_SRC)/Featurization.h \ - $(AUTOSCHED_SRC)/CostModel.h \ - $(AUTOSCHED_SRC)/PerfectHashMap.h \ - $(AUTOSCHED_WEIGHT_OBJECTS) \ - $(AUTOSCHED_COST_MODEL_LIBS) \ - $(GENERATOR_DEPS) \ - $(AUTOSCHED_BIN)/auto_schedule_runtime.a - @mkdir -p $(@D) - $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(AUTOSCHED_BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) - -$(AUTOSCHED_BIN)/retrain_cost_model: $(AUTOSCHED_SRC)/retrain_cost_model.cpp \ - $(AUTOSCHED_SRC)/ASLog.cpp \ - $(AUTOSCHED_SRC)/DefaultCostModel.h \ - $(AUTOSCHED_SRC)/DefaultCostModel.cpp \ - $(AUTOSCHED_SRC)/Weights.h \ - $(AUTOSCHED_SRC)/Weights.cpp \ - $(AUTOSCHED_SRC)/CostModel.h \ - $(AUTOSCHED_SRC)/NetworkSize.h \ - $(AUTOSCHED_COST_MODEL_LIBS) \ - $(AUTOSCHED_WEIGHT_OBJECTS) \ - $(AUTOSCHED_BIN)/auto_schedule_runtime.a - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(AUTOSCHED_BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP) - -$(AUTOSCHED_BIN)/featurization_to_sample: $(AUTOSCHED_SRC)/featurization_to_sample.cpp - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $< $(OPTIMIZE) -o $@ - -$(AUTOSCHED_BIN)/get_host_target: $(AUTOSCHED_SRC)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ - -$(AUTOSCHED_BIN)/weightsdir_to_weightsfile: $(AUTOSCHED_SRC)/weightsdir_to_weightsfile.cpp $(AUTOSCHED_SRC)/Weights.cpp - @mkdir -p $(@D) - $(CXX) 
$(CXXFLAGS) $^ $(OPTIMIZE) -o $@ - -# This is the value that machine_params defaults to if no custom value is specified; -# see MachineParams::generic() -HL_MACHINE_PARAMS ?= 32,25165824,160 - - diff --git a/apps/unsharp/Makefile b/apps/unsharp/Makefile index fa912ad172e1..047fc2854fb3 100644 --- a/apps/unsharp/Makefile +++ b/apps/unsharp/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/unsharp.generator: unsharp_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/unsharp.a: $(GENERATOR_BIN)/unsharp.generator @mkdir -p $(@D) - $< -g unsharp -f unsharp -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g unsharp -f unsharp -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/unsharp_auto_schedule.a: $(GENERATOR_BIN)/unsharp.generator @mkdir -p $(@D) - $< -g unsharp -f unsharp_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g unsharp -f unsharp_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/unsharp.generator @mkdir -p $(@D) diff --git a/apps/unsharp/unsharp_generator.cpp b/apps/unsharp/unsharp_generator.cpp index d68702bf1e20..c1070b2753fe 100644 --- a/apps/unsharp/unsharp_generator.cpp +++ b/apps/unsharp/unsharp_generator.cpp @@ -61,7 +61,7 @@ class Unsharp : public Halide::Generator { } // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { // Some Intel Mac Minis have GPUs that require tile sizes smaller than 32x32 // for this pipeline because they have too few registers. Drop to 16x16 to // avoid unexpected crashes in CI. 
diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake index fa724e73bc33..b4465bf74d9b 100644 --- a/cmake/HalideGeneratorHelpers.cmake +++ b/cmake/HalideGeneratorHelpers.cmake @@ -281,7 +281,6 @@ function(add_halide_library TARGET) # Attach an autoscheduler if the user requested it ## - set(autoscheduler "") if (ARG_AUTOSCHEDULER) if ("${ARG_AUTOSCHEDULER}" MATCHES "::") if (NOT TARGET "${ARG_AUTOSCHEDULER}") @@ -295,8 +294,7 @@ function(add_halide_library TARGET) elseif (NOT ARG_PLUGINS) message(AUTHOR_WARNING "AUTOSCHEDULER set to a scheduler name but no plugins were loaded") endif () - set(autoscheduler -s "${ARG_AUTOSCHEDULER}") - list(PREPEND ARG_PARAMS auto_schedule=true) + list(PREPEND ARG_PARAMS "autoscheduler=${ARG_AUTOSCHEDULER}") endif () ## @@ -334,7 +332,6 @@ function(add_halide_library TARGET) -f "${ARG_FUNCTION_NAME}" -e "$>" ${generator_plugins} - ${autoscheduler} -o . "target=$>" ${ARG_PARAMS} diff --git a/python_bindings/src/PyHalide.cpp b/python_bindings/src/PyHalide.cpp index da598b3b0bc8..a7348dcffc67 100644 --- a/python_bindings/src/PyHalide.cpp +++ b/python_bindings/src/PyHalide.cpp @@ -15,7 +15,9 @@ #include "PyImageParam.h" #include "PyInlineReductions.h" #include "PyLambda.h" +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #include "PyMachineParams.h" +#endif #include "PyModule.h" #include "PyParam.h" #include "PyPipeline.h" @@ -53,7 +55,9 @@ PYBIND11_MODULE(HALIDE_PYBIND_MODULE_NAME, m) { define_extern_func_argument(m); define_var(m); define_rdom(m); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API define_machine_params(m); +#endif define_module(m); define_callable(m); define_func(m); diff --git a/python_bindings/src/PyMachineParams.cpp b/python_bindings/src/PyMachineParams.cpp index e99dd594b11d..93c49d97fae6 100644 --- a/python_bindings/src/PyMachineParams.cpp +++ b/python_bindings/src/PyMachineParams.cpp @@ -1,3 +1,4 @@ +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #include "PyMachineParams.h" namespace Halide 
{ @@ -23,3 +24,4 @@ void define_machine_params(py::module &m) { } // namespace PythonBindings } // namespace Halide +#endif diff --git a/python_bindings/src/PyMachineParams.h b/python_bindings/src/PyMachineParams.h index aa15ee73c069..82b4ff3ac441 100644 --- a/python_bindings/src/PyMachineParams.h +++ b/python_bindings/src/PyMachineParams.h @@ -1,6 +1,7 @@ #ifndef HALIDE_PYTHON_BINDINGS_PYMACHINEPARAMS_H #define HALIDE_PYTHON_BINDINGS_PYMACHINEPARAMS_H +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #include "PyHalide.h" namespace Halide { @@ -11,4 +12,5 @@ void define_machine_params(py::module &m); } // namespace PythonBindings } // namespace Halide +#endif #endif // HALIDE_PYTHON_BINDINGS_PYMACHINEPARAMS_H diff --git a/python_bindings/src/PyModule.cpp b/python_bindings/src/PyModule.cpp index ac98de2d58e3..2527ae035121 100644 --- a/python_bindings/src/PyModule.cpp +++ b/python_bindings/src/PyModule.cpp @@ -12,9 +12,13 @@ void define_module(py::module &m) { auto auto_scheduler_results_class = py::class_(m, "AutoSchedulerResults") .def(py::init<>()) - .def_readwrite("scheduler_name", &AutoSchedulerResults::scheduler_name) .def_readwrite("target", &AutoSchedulerResults::target) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + .def_readwrite("scheduler_name", &AutoSchedulerResults::scheduler_name) .def_readwrite("machine_params_string", &AutoSchedulerResults::machine_params_string) +#else + .def_readwrite("autoscheduler_params", &AutoSchedulerResults::autoscheduler_params) +#endif .def_readwrite("schedule_source", &AutoSchedulerResults::schedule_source) .def_readwrite("featurization", &AutoSchedulerResults::featurization) .def("__repr__", [](const AutoSchedulerResults &o) -> std::string { diff --git a/python_bindings/src/PyPipeline.cpp b/python_bindings/src/PyPipeline.cpp index 68caf873e2f2..18d932f01649 100644 --- a/python_bindings/src/PyPipeline.cpp +++ b/python_bindings/src/PyPipeline.cpp @@ -41,6 +41,32 @@ void define_pipeline(py::module &m) { // - set_custom_trace() 
// - set_custom_print() +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +// nothing +#else + py::class_<AutoschedulerParams>(m, "AutoschedulerParams") + .def(py::init<>()) + .def(py::init<const std::string &>(), py::arg("name")) + .def(py::init([](const std::string &name, const py::dict &extra) -> AutoschedulerParams { + // Manually convert the dict: + // we want to allow Python to pass in dicts that have non-string values for some keys; + // PyBind will reject these as a type failure. We'll stringify them here explicitly. + AutoschedulerParams asp(name); + for (auto item : extra) { + const std::string name = py::str(item.first).cast<std::string>(); + const std::string value = py::str(item.second).cast<std::string>(); + asp.extra[name] = value; + } + return asp; + }), + py::arg("name"), py::arg("extra")) + .def_readwrite("name", &AutoschedulerParams::name) + .def_readwrite("extra", &AutoschedulerParams::extra) + .def("__repr__", [](const AutoschedulerParams &o) -> std::string { + return "<halide.AutoschedulerParams " + o.to_string() + ">"; + }); +#endif + auto pipeline_class = py::class_<Pipeline>(m, "Pipeline") .def(py::init<>()) @@ -49,6 +75,7 @@ void define_pipeline(py::module &m) { .def("outputs", &Pipeline::outputs) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API .def("auto_schedule", (AutoSchedulerResults(Pipeline::*)(const std::string &, const Target &, const MachineParams &) const) & Pipeline::auto_schedule, py::arg("autoscheduler_name"), py::arg("target"), py::arg("machine_params") = MachineParams::generic()) .def("auto_schedule", (AutoSchedulerResults(Pipeline::*)(const Target &, const MachineParams &) const) & Pipeline::auto_schedule, @@ -56,7 +83,10 @@ .def_static("set_default_autoscheduler_name", &Pipeline::set_default_autoscheduler_name, py::arg("autoscheduler_name")) - +#else + .def("apply_autoscheduler", (AutoSchedulerResults(Pipeline::*)(const Target &, const AutoschedulerParams &) const) & Pipeline::apply_autoscheduler, + py::arg("target"), py::arg("autoscheduler_params")) +#endif .def("get_func", 
py::arg("index")) .def("print_loop_nest", &Pipeline::print_loop_nest) diff --git a/python_bindings/todo.txt b/python_bindings/todo.txt index dfb2bdb780bb..c73685c38443 100644 --- a/python_bindings/todo.txt +++ b/python_bindings/todo.txt @@ -25,7 +25,6 @@ - InlineReductions - IROperator - LoopLevel - - MachineParams - Module - OutputImageParam - Pipeline diff --git a/src/AbstractGenerator.cpp b/src/AbstractGenerator.cpp index cdcc80c4800a..52bd89553e38 100644 --- a/src/AbstractGenerator.cpp +++ b/src/AbstractGenerator.cpp @@ -25,9 +25,19 @@ Module AbstractGenerator::build_module(const std::string &function_name) { AutoSchedulerResults auto_schedule_results; const auto context = this->context(); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (context.auto_schedule()) { auto_schedule_results = pipeline.auto_schedule(context.target(), context.machine_params()); } +#else + const auto &asp = context.autoscheduler_params(); + if (!asp.name.empty()) { + debug(1) << "Applying autoscheduler " << asp.name << " to Generator " << name() << " ...\n"; + auto_schedule_results = pipeline.apply_autoscheduler(context.target(), asp); + } else { + debug(1) << "Applying autoscheduler (NONE) to Generator " << name() << " ...\n"; + } +#endif std::vector filter_arguments; const auto arg_infos = arginfos(); @@ -215,9 +225,17 @@ Module AbstractGenerator::build_gradient_module(const std::string &function_name AutoSchedulerResults auto_schedule_results; const auto context = this->context(); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (context.auto_schedule()) { auto_schedule_results = grad_pipeline.auto_schedule(context.target(), context.machine_params()); - } else { + } +#else + const auto &asp = context.autoscheduler_params(); + if (!asp.name.empty()) { + auto_schedule_results = grad_pipeline.apply_autoscheduler(context.target(), asp); + } +#endif + else { user_warning << "Autoscheduling is not enabled in build_gradient_module(), so the resulting " "gradient module will be 
unscheduled; this is very unlikely to be what you want.\n"; } @@ -257,8 +275,13 @@ Callable AbstractGenerator::compile_to_callable(const JITHandlers *jit_handlers, void AbstractGenerator::set_generatorparam_values(const GeneratorParamsMap &m) { for (const auto &c : m) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API user_assert(c.first != "target" && c.first != "auto_schedule" && c.first != "machine_params") << "The GeneratorParam '" << c.first << "' cannot be specified via string here; use GeneratorContext instead."; +#else + user_assert(c.first != "target" && c.first != "autoscheduler") + << "The GeneratorParam '" << c.first << "' cannot be specified via string here; use GeneratorContext instead."; +#endif set_generatorparam_value(c.first, c.second); } } diff --git a/src/AbstractGenerator.h b/src/AbstractGenerator.h index 28dc9335b0f3..95e904dfd9aa 100644 --- a/src/AbstractGenerator.h +++ b/src/AbstractGenerator.h @@ -81,7 +81,7 @@ class AbstractGenerator { * used to register it.) */ virtual std::string name() = 0; - /** Return the Target, autoscheduler flag, and MachineParams that this Generator + /** Return the Target and autoscheduler info that this Generator * was created with. Always legal to call on any AbstractGenerator instance, * regardless of what other methods have been called. 
(All AbstractGenerator instances * are expected to be created with immutable values for these, which can't be diff --git a/src/Generator.cpp b/src/Generator.cpp index d0254c3206bf..4fe37a305417 100644 --- a/src/Generator.cpp +++ b/src/Generator.cpp @@ -21,35 +21,59 @@ namespace Halide { #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE GeneratorContext::GeneratorContext(const Target &target, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool auto_schedule, const MachineParams &machine_params, +#else + const AutoschedulerParams &autoscheduler_params, +#endif std::shared_ptr externs_map) : target_(target), +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_schedule_(auto_schedule), machine_params_(machine_params), +#else + autoscheduler_params_(autoscheduler_params), +#endif externs_map_(std::move(externs_map)) { } -#endif +#endif // HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API GeneratorContext::GeneratorContext(const Target &target, bool auto_schedule, const MachineParams &machine_params) : target_(target), auto_schedule_(auto_schedule), - machine_params_(machine_params) -#ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE - , - externs_map_(std::make_shared()) -#endif -{ + machine_params_(machine_params) { +} +#else +GeneratorContext::GeneratorContext(const Target &target) + : target_(target), + autoscheduler_params_() { } +GeneratorContext::GeneratorContext(const Target &target, + const AutoschedulerParams &autoscheduler_params) + : target_(target), + autoscheduler_params_(autoscheduler_params) { +} +#endif + GeneratorContext GeneratorContext::with_target(const Target &t) const { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE return GeneratorContext(t, auto_schedule_, machine_params_, externs_map_); #else return GeneratorContext(t, auto_schedule_, machine_params_); #endif +#else +#ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE + return GeneratorContext(t, autoscheduler_params_, externs_map_); +#else + 
return GeneratorContext(t, autoscheduler_params_); +#endif +#endif } namespace Internal { @@ -183,11 +207,18 @@ class StubEmitter { std::vector out; for (auto *p : in) { // These are always propagated specially. +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (p->name() == "target" || p->name() == "auto_schedule" || p->name() == "machine_params") { continue; } +#else + if (p->name() == "target" || + p->name() == "autoscheduler") { + continue; + } +#endif if (p->is_synthetic_param()) { continue; } @@ -225,7 +256,11 @@ void StubEmitter::emit_generator_params_struct() { indent_level++; std::string comma = ""; for (auto *p : v) { - stream << get_indent() << comma << p->get_c_type() << " " << p->name() << "\n"; + std::string c_type = p->get_c_type(); + if (c_type == "AutoschedulerParams") { + c_type = "const AutoschedulerParams&"; + } + stream << get_indent() << comma << c_type << " " << p->name() << "\n"; comma = ", "; } indent_level--; @@ -683,8 +718,6 @@ gengen find one. Flags across all of the targets that do not affect runtime code generation, such as `no_asserts` and `no_runtime`, are ignored. - -s The name of an autoscheduler to set as the default. - -t Timeout for the Generator to run, in seconds; mainly useful to ensure that bugs and/or degenerate cases don't stall build systems. Defaults to 900 (=15 minutes). Specify 0 to allow ~infinite time. 
@@ -700,7 +733,9 @@ gengen {"-o", ""}, {"-p", ""}, {"-r", ""}, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API {"-s", ""}, +#endif {"-t", "900"}, // 15 minutes }; @@ -717,6 +752,15 @@ gengen ++i; continue; } else { + if (!strcmp(argv[i], "-s")) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + user_warning << "HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API is deprecated in Halide 15 " + "(and will be removed in Halide 16).\n"; +#else + user_error << "-s is no longer supported for setting autoscheduler; specify autoscheduler=NAME instead.\n" + << kUsage; +#endif + } user_error << "Unknown flag: " << argv[i] << "\n" << kUsage; } @@ -730,10 +774,21 @@ gengen } } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API const auto autoscheduler_name = flags_info["-s"]; if (!autoscheduler_name.empty()) { Pipeline::set_default_autoscheduler_name(autoscheduler_name); } +#else + if (args.generator_params.count("auto_schedule")) { + user_error << "auto_schedule=true is no longer supported for enabling autoscheduling; specify autoscheduler=NAME instead.\n" + << kUsage; + } + if (args.generator_params.count("machine_params")) { + user_error << "machine_params is no longer supported as a GeneratorParam; specify autoscheduler.FIELD=VALUE instead.\n" + << kUsage; + } +#endif const auto &d_val = flags_info["-d"]; user_assert(d_val == "1" || d_val == "0") << "-d must be 0 or 1\n" @@ -855,14 +910,17 @@ gengen if (do_compiler_logging) { const bool obfuscate_compiler_logging = get_env_variable("HL_OBFUSCATE_COMPILER_LOGGER") == "1"; args.compiler_logger_factory = - [obfuscate_compiler_logging, &args, &autoscheduler_name](const std::string &function_name, const Target &target) -> std::unique_ptr<CompilerLogger> { + [obfuscate_compiler_logging, &args](const std::string &function_name, const Target &target) -> std::unique_ptr<CompilerLogger> { // rebuild generator_args from the map so that they are always canonical - std::string generator_args_string; + std::string generator_args_string, autoscheduler_name; std::string sep; for (const 
auto &it : args.generator_params) { std::string quote = it.second.find(' ') != std::string::npos ? "\\\"" : ""; generator_args_string += sep + it.first + "=" + quote + it.second + quote; sep = " "; + if (it.first == "autoscheduler") { + autoscheduler_name = it.second; + } } std::unique_ptr t(new JSONCompilerLogger( obfuscate_compiler_logging ? "" : args.generator_name, @@ -1091,6 +1149,7 @@ void execute_generator(const ExecuteGeneratorArgs &args_in) { // Don't bother with this if we're just emitting a cpp_stub. if (!cpp_stub_only) { auto output_files = compute_output_files(args.targets[0], base_path, args.output_types); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API const auto get_gp = [&](const std::string &key) { auto it = args.generator_params.find(key); return it != args.generator_params.end() ? it->second : ""; @@ -1099,8 +1158,10 @@ void execute_generator(const ExecuteGeneratorArgs &args_in) { const auto machine_params_string = get_gp("machine_params"); const bool auto_schedule = auto_schedule_string == "true" || auto_schedule_string == "True"; const MachineParams machine_params = !machine_params_string.empty() ? MachineParams(machine_params_string) : MachineParams::generic(); +#endif auto module_factory = [&](const std::string &function_name, const Target &target) -> Module { - // Must re-create each time since each instance will have a different Target. + // Must re-create each time since each instance will have a different Target. 
+#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto gen = args.create_generator(args.generator_name, GeneratorContext(target, auto_schedule, machine_params)); for (const auto &kv : args.generator_params) { if (kv.first == "target" || @@ -1110,6 +1171,15 @@ void execute_generator(const ExecuteGeneratorArgs &args_in) { } gen->set_generatorparam_value(kv.first, kv.second); } +#else + auto gen = args.create_generator(args.generator_name, GeneratorContext(target)); + for (const auto &kv : args.generator_params) { + if (kv.first == "target") { + continue; + } + gen->set_generatorparam_value(kv.first, kv.second); + } +#endif return args.build_mode == ExecuteGeneratorArgs::Gradient ? gen->build_gradient_module(function_name) : gen->build_module(function_name); @@ -1131,11 +1201,18 @@ GeneratorParamBase::~GeneratorParamBase() { void GeneratorParamBase::check_value_readable() const { // These are always readable. +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (name() == "target" || name() == "auto_schedule" || name() == "machine_params") { return; } +#else + if (name() == "target" || + name() == "autoscheduler") { + return; + } +#endif user_assert(generator && generator->phase >= GeneratorBase::ConfigureCalled) << "The GeneratorParam \"" << name() << "\" cannot be read before configure()/generate() is called.\n"; } @@ -1153,6 +1230,50 @@ void GeneratorParamBase::fail_wrong_type(const char *type) { user_error << "The GeneratorParam \"" << name() << "\" cannot be set with a value of type " << type << ".\n"; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +// nothing +#else +GeneratorParam_AutoSchedulerParams::GeneratorParam_AutoSchedulerParams() + : GeneratorParamImpl("autoscheduler", {}) { +} + +void GeneratorParam_AutoSchedulerParams::set_from_string(const std::string &new_value_string) { + internal_error << "This method should never be called."; +} + +std::string GeneratorParam_AutoSchedulerParams::get_default_value() const { + internal_error << "This method should never 
be called."; + return ""; +} + +std::string GeneratorParam_AutoSchedulerParams::call_to_string(const std::string &v) const { + internal_error << "This method should never be called."; + return ""; +} + +std::string GeneratorParam_AutoSchedulerParams::get_c_type() const { + internal_error << "This method should never be called."; + return ""; +} + +bool GeneratorParam_AutoSchedulerParams::try_set(const std::string &key, const std::string &value) { + const auto &n = this->name(); + if (key == n) { + user_assert(this->value_.name.empty()) << "The GeneratorParam " << key << " cannot be set more than once.\n"; + this->value_.name = value; + return true; + } else if (starts_with(key, n + ".")) { + const auto sub_key = key.substr(n.size() + 1); + user_assert(this->value_.extra.count(sub_key) == 0) << "The GeneratorParam " << key << " cannot be set more than once.\n"; + this->value_.extra[sub_key] = value; + return true; + } else { + return false; + } +} + +#endif + /* static */ GeneratorRegistry &GeneratorRegistry::get_registry() { static GeneratorRegistry *registry = new GeneratorRegistry; @@ -1302,17 +1423,29 @@ GeneratorOutputBase *GeneratorBase::find_output_by_name(const std::string &name) } GeneratorContext GeneratorBase::context() const { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE return GeneratorContext(target, auto_schedule, machine_params, externs_map); #else return GeneratorContext(target, auto_schedule, machine_params); #endif +#else +#ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE + return GeneratorContext(target, autoscheduler_.value(), externs_map); +#else + return GeneratorContext(target, autoscheduler_.value()); +#endif +#endif } void GeneratorBase::init_from_context(const Halide::GeneratorContext &context) { target.set(context.target_); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_schedule.set(context.auto_schedule_); machine_params.set(context.machine_params_); +#else + 
autoscheduler_.set(context.autoscheduler_params_); +#endif #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE externs_map = context.externs_map_; @@ -1465,12 +1598,19 @@ void GeneratorBase::check_input_kind(Internal::GeneratorInputBase *in, Internal: } void GeneratorBase::set_generatorparam_value(const std::string &name, const std::string &value) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (name == "target" || name == "auto_schedule" || name == "machine_params") { user_error << "The GeneratorParam named " << name << " cannot be set by set_generatorparam_value().\n"; } +#else + user_assert(name != "target") << "The GeneratorParam named " << name << " cannot be set by set_generatorparam_value().\n"; + if (autoscheduler_.try_set(name, value)) { + return; + } +#endif GeneratorParamInfo &pi = param_info(); diff --git a/src/Generator.h b/src/Generator.h index e5f254e82b36..d40fddc79141 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -208,27 +208,27 @@ * }; * \endcode * - * All Generators have three GeneratorParams that are implicitly provided + * All Generators have two GeneratorParams that are implicitly provided * by the base class: * * GeneratorParam target{"target", Target()}; - * GeneratorParam auto_schedule{"auto_schedule", false}; - * GeneratorParam machine_params{"machine_params", MachineParams::generic()}; + * GeneratorParam autoscheduler{"autoscheduler", {}} * * - 'target' is the Halide::Target for which the Generator is producing code. * It is read-only during the Generator's lifetime, and must not be modified; * its value should always be filled in by the calling code: either the Halide * build system (for ahead-of-time compilation), or ordinary C++ code * (for JIT compilation). - * - 'auto_schedule' indicates whether the auto-scheduler should be run for this - * Generator: - * - if 'false', the Generator should schedule its Funcs as it sees fit. 
- * - if 'true', the Generator should only provide estimate()s for its Funcs, - * and not call any other scheduling methods. - * - 'machine_params' is only used if auto_schedule is true; it is ignored - * if auto_schedule is false. It provides details about the machine architecture - * being targeted which may be used to enhance the automatically-generated - * schedule. + * - 'autoscheduler' is a string-to-string map that is used to indicate whether + * and how an auto-scheduler should be run for this Generator: + * - if empty, the Generator should schedule its Funcs as it sees fit; no autoscheduler will be run. + * - if the 'name' key is set, it should be one of the known autoschedulers + * provided with this release of Halide, which will be used to schedule + * the Funcs in the Generator. In this case, the Generator should only + * provide estimate()s for its Funcs, and not call any other scheduling methods. + * - Other keys may be specified in the params, on a per-autoscheduler + * basis, to optimize or enhance the automatically-generated schedule. + * See documentation for each autoscheduler for options.
* * Generators are added to a global registry to simplify AOT build mechanics; this * is done by simply using the HALIDE_REGISTER_GENERATOR macro at global scope: @@ -426,7 +426,11 @@ class GeneratorParamBase { HALIDE_GENERATOR_PARAM_TYPED_SETTER(float) HALIDE_GENERATOR_PARAM_TYPED_SETTER(double) HALIDE_GENERATOR_PARAM_TYPED_SETTER(Target) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API HALIDE_GENERATOR_PARAM_TYPED_SETTER(MachineParams) +#else + HALIDE_GENERATOR_PARAM_TYPED_SETTER(AutoschedulerParams) +#endif HALIDE_GENERATOR_PARAM_TYPED_SETTER(Type) HALIDE_GENERATOR_PARAM_TYPED_SETTER(LoopLevel) @@ -540,7 +544,11 @@ class GeneratorParamImpl : public GeneratorParamBase { HALIDE_GENERATOR_PARAM_TYPED_SETTER(float) HALIDE_GENERATOR_PARAM_TYPED_SETTER(double) HALIDE_GENERATOR_PARAM_TYPED_SETTER(Target) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API HALIDE_GENERATOR_PARAM_TYPED_SETTER(MachineParams) +#else + HALIDE_GENERATOR_PARAM_TYPED_SETTER(AutoschedulerParams) +#endif HALIDE_GENERATOR_PARAM_TYPED_SETTER(Type) HALIDE_GENERATOR_PARAM_TYPED_SETTER(LoopLevel) @@ -634,6 +642,7 @@ class GeneratorParam_Target : public GeneratorParamImpl { } }; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API template class GeneratorParam_MachineParams : public GeneratorParamImpl { public: @@ -659,6 +668,22 @@ class GeneratorParam_MachineParams : public GeneratorParamImpl { return "MachineParams"; } }; +#else +class GeneratorParam_AutoSchedulerParams : public GeneratorParamImpl { +public: + GeneratorParam_AutoSchedulerParams(); + + void set_from_string(const std::string &new_value_string) override; + std::string get_default_value() const override; + std::string call_to_string(const std::string &v) const override; + std::string get_c_type() const override; + +private: + friend class GeneratorBase; + + bool try_set(const std::string &key, const std::string &value); +}; +#endif class GeneratorParam_LoopLevel : public GeneratorParamImpl { public: @@ -954,7 +979,9 @@ template using 
GeneratorParamImplBase = typename select_type< cond::value, GeneratorParam_Target>, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API cond::value, GeneratorParam_MachineParams>, +#endif cond::value, GeneratorParam_LoopLevel>, cond::value, GeneratorParam_String>, cond::value, GeneratorParam_Type>, @@ -3009,7 +3036,7 @@ class GeneratorParam_Synthetic : public GeneratorParamImpl { * \endcode * * Note that all Generators embed a GeneratorContext, so if you are using a Stub - * from within a Generator, you can just pass 'contex()' for the GeneratorContext: + * from within a Generator, you can just pass 'context()' for the GeneratorContext: * \code * struct SomeGen : Generator { * void generate() { @@ -3034,9 +3061,15 @@ class GeneratorContext { using ExternsMap = std::map; #endif +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API explicit GeneratorContext(const Target &t, bool auto_schedule = false, const MachineParams &machine_params = MachineParams::generic()); +#else + explicit GeneratorContext(const Target &t); + explicit GeneratorContext(const Target &t, + const AutoschedulerParams &autoscheduler_params); +#endif GeneratorContext() = default; GeneratorContext(const GeneratorContext &) = default; @@ -3047,17 +3080,24 @@ class GeneratorContext { const Target &target() const { return target_; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool auto_schedule() const { return auto_schedule_; } const MachineParams &machine_params() const { return machine_params_; } +#else + const AutoschedulerParams &autoscheduler_params() const { + return autoscheduler_params_; + } +#endif HALIDE_ATTRIBUTE_DEPRECATED("Call GeneratorContext::target() instead of GeneratorContext::get_target().") const Target &get_target() const { return target_; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API HALIDE_ATTRIBUTE_DEPRECATED("Call GeneratorContext::auto_schedule() instead of GeneratorContext::get_auto_schedule().") bool get_auto_schedule() const { return auto_schedule_; @@ -3066,6 +3106,7 @@ class 
GeneratorContext { const MachineParams &get_machine_params() const { return machine_params_; } +#endif // Return a copy of this GeneratorContext that uses the given Target. // This method is rarely needed; it's really provided as a convenience @@ -3085,18 +3126,26 @@ class GeneratorContext { private: Target target_; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool auto_schedule_ = false; MachineParams machine_params_ = MachineParams::generic(); +#else + AutoschedulerParams autoscheduler_params_; +#endif #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE - std::shared_ptr externs_map_; + std::shared_ptr externs_map_ = std::make_shared(); #endif #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE GeneratorContext(const Target &target, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool auto_schedule, const MachineParams &machine_params, - std::shared_ptr externs_map); +#else + const AutoschedulerParams &autoscheduler_params, #endif + std::shared_ptr externs_map); +#endif // HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE }; class NamesInterface { @@ -3516,12 +3565,21 @@ class GeneratorBase : public NamesInterface, public AbstractGenerator { Target get_target() const { return target; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool get_auto_schedule() const { return auto_schedule; } MachineParams get_machine_params() const { return machine_params; } + bool using_autoscheduler() const { + return get_auto_schedule(); + } +#else + bool using_autoscheduler() const { + return !autoscheduler_.value().name.empty(); + } +#endif #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE /** Generators can register ExternalCode objects onto @@ -3550,8 +3608,12 @@ class GeneratorBase : public NamesInterface, public AbstractGenerator { // These must remain here for legacy code that access the fields directly. 
GeneratorParam target{"target", Target()}; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API GeneratorParam auto_schedule{"auto_schedule", false}; GeneratorParam machine_params{"machine_params", MachineParams::generic()}; +#else + GeneratorParam_AutoSchedulerParams autoscheduler_; +#endif private: friend void ::Halide::Internal::generator_test(); diff --git a/src/Module.cpp b/src/Module.cpp index aae1064bd65c..7eb9d02b703f 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -253,7 +253,11 @@ std::string indent_string(const std::string &src, const std::string &indent) { void emit_schedule_file(const std::string &name, const std::vector &targets, const std::string &scheduler_name, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API const std::string &machine_params_string, +#else + const std::string &autoscheduler_params_string, +#endif const std::string &body, std::ostream &stream) { std::string s = R"INLINE_CODE(#ifndef $CLEANNAME$_SCHEDULE_H @@ -262,7 +266,7 @@ void emit_schedule_file(const std::string &name, // MACHINE GENERATED -- DO NOT EDIT // This schedule was automatically generated by $SCHEDULER$ // for target=$TARGET$ // NOLINT -// with machine_params=$MACHINEPARAMS$ +// with $MPNAME$=$MACHINEPARAMS$ #include "Halide.h" @@ -316,7 +320,13 @@ inline void apply_schedule_$SHORTNAME$( s = replace_all(s, "$NAMESPACECLOSE$", nsclose); s = replace_all(s, "$TARGET$", target_string); s = replace_all(s, "$BODY$", body_text); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + s = replace_all(s, "$MPNAME$", "machine_params"); s = replace_all(s, "$MACHINEPARAMS$", machine_params_string); +#else + s = replace_all(s, "$MPNAME$", "autoscheduler_params"); + s = replace_all(s, "$MACHINEPARAMS$", autoscheduler_params_string); +#endif stream << s; } @@ -671,10 +681,16 @@ void Module::compile(const std::map &output_files) debug(1) << "Module.compile(): schedule " << output_files.at(OutputFileType::schedule) << "\n"; std::ofstream file(output_files.at(OutputFileType::schedule)); auto *r = 
contents->auto_scheduler_results.get(); + std::string body = r && !r->schedule_source.empty() ? r->schedule_source : "// No autoscheduler has been run for this Generator.\n"; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API std::string scheduler = r ? r->scheduler_name : "(None)"; std::string machine_params = r ? r->machine_params_string : "(None)"; - std::string body = r && !r->schedule_source.empty() ? r->schedule_source : "// No autoscheduler has been run for this Generator.\n"; emit_schedule_file(name(), {target()}, scheduler, machine_params, body, file); +#else + std::string scheduler = r ? r->autoscheduler_params.name : "(None)"; + std::string autoscheduler_params_string = r ? r->autoscheduler_params.to_string() : "(None)"; + emit_schedule_file(name(), {target()}, scheduler, autoscheduler_params_string, body, file); +#endif } if (contains(output_files, OutputFileType::featurization)) { debug(1) << "Module.compile(): featurization " << output_files.at(OutputFileType::featurization) << "\n"; @@ -1004,6 +1020,7 @@ void compile_multitarget(const std::string &fn_name, if (contains(output_files, OutputFileType::schedule)) { debug(1) << "compile_multitarget: schedule " << output_files.at(OutputFileType::schedule) << "\n"; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API std::string scheduler = auto_scheduler_results.front().scheduler_name; if (scheduler.empty()) { scheduler = "(None)"; @@ -1012,6 +1029,11 @@ void compile_multitarget(const std::string &fn_name, if (machine_params.empty()) { machine_params = "(None)"; } +#else + const auto &autoscheduler_params = auto_scheduler_results.front().autoscheduler_params; + std::string scheduler = autoscheduler_params.name.empty() ? "(None)" : autoscheduler_params.name; + std::string autoscheduler_params_string = autoscheduler_params.name.empty() ? "(None)" : autoscheduler_params.to_string(); +#endif // Find the features that are unique to each stage (vs the baseline case). 
const auto &baseline_target = auto_scheduler_results.back().target; @@ -1053,7 +1075,11 @@ void compile_multitarget(const std::string &fn_name, } std::ofstream file(output_files.at(OutputFileType::schedule)); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API emit_schedule_file(fn_name, targets, scheduler, machine_params, body.str(), file); +#else + emit_schedule_file(fn_name, targets, scheduler, autoscheduler_params_string, body.str(), file); +#endif } if (contains(output_files, OutputFileType::static_library)) { diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index 19e088d517ea..d2b669614470 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -220,6 +220,7 @@ std::map &Pipeline::get_autoscheduler_map() { return autoschedulers; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /* static */ std::string &Pipeline::get_default_autoscheduler_name() { static std::string autoscheduler_name = ""; @@ -228,6 +229,7 @@ std::string &Pipeline::get_default_autoscheduler_name() { } return autoscheduler_name; } +#endif /* static */ AutoSchedulerFn Pipeline::find_autoscheduler(const std::string &autoscheduler_name) { @@ -244,6 +246,7 @@ AutoSchedulerFn Pipeline::find_autoscheduler(const std::string &autoscheduler_na return it->second; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API AutoSchedulerResults Pipeline::auto_schedule(const std::string &autoscheduler_name, const Target &target, const MachineParams &arch_params) const { auto autoscheduler_fn = find_autoscheduler(autoscheduler_name); user_assert(autoscheduler_fn) @@ -261,6 +264,23 @@ AutoSchedulerResults Pipeline::auto_schedule(const std::string &autoscheduler_na AutoSchedulerResults Pipeline::auto_schedule(const Target &target, const MachineParams &arch_params) const { return auto_schedule(get_default_autoscheduler_name(), target, arch_params); } +#else +AutoSchedulerResults Pipeline::apply_autoscheduler(const Target &target, const AutoschedulerParams &autoscheduler_params) const { + 
user_assert(!autoscheduler_params.name.empty()) << "apply_autoscheduler was called with no Autoscheduler specified."; + + auto autoscheduler_fn = find_autoscheduler(autoscheduler_params.name); + user_assert(autoscheduler_fn) + << "Could not find autoscheduler named '" << autoscheduler_params.name << "'.\n" + << "Did you remember to load the plugin?"; + + AutoSchedulerResults results; + results.target = target; + results.autoscheduler_params = autoscheduler_params; + + autoscheduler_fn(*this, target, autoscheduler_params, &results); + return results; +} +#endif /* static */ void Pipeline::add_autoscheduler(const std::string &autoscheduler_name, const AutoSchedulerFn &autoscheduler) { @@ -269,11 +289,13 @@ void Pipeline::add_autoscheduler(const std::string &autoscheduler_name, const Au m[autoscheduler_name] = autoscheduler; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /* static */ void Pipeline::set_default_autoscheduler_name(const std::string &autoscheduler_name) { (void)find_autoscheduler(autoscheduler_name); // ensure it's valid get_default_autoscheduler_name() = autoscheduler_name; } +#endif Func Pipeline::get_func(size_t index) { // Compute an environment @@ -1186,6 +1208,7 @@ JITExtern::JITExtern(const ExternCFunction &extern_c_function) : extern_c_function_(extern_c_function) { } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API MachineParams MachineParams::generic() { std::string params = Internal::get_env_variable("HL_MACHINE_PARAMS"); if (params.empty()) { @@ -1208,5 +1231,17 @@ MachineParams::MachineParams(const std::string &s) { last_level_cache_size = std::atoll(v[1].c_str()); balance = std::atof(v[2].c_str()); } +#endif + +std::string AutoschedulerParams::to_string() const { + std::ostringstream os; + if (!name.empty()) { + os << "autoscheduler=" << name; + } + for (const auto &kv : extra) { + os << " autoscheduler." 
<< kv.first << "=" << kv.second; + } + return os.str(); +} } // namespace Halide diff --git a/src/Pipeline.h b/src/Pipeline.h index 15e19652c107..bb67391f4a44 100644 --- a/src/Pipeline.h +++ b/src/Pipeline.h @@ -31,6 +31,7 @@ class Callable; class Func; struct PipelineContents; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /** A struct representing the machine parameters to generate the auto-scheduled * code for. */ struct MachineParams { @@ -55,6 +56,40 @@ struct MachineParams { /** Reconstruct a MachineParams from canonical string form. */ explicit MachineParams(const std::string &s); }; +#else +/** Special the Autoscheduler to be used (if any), along with arbitrary + * additional arguments specific to the given Autoscheduler. + * + * The 'name' field specifies the type of Autoscheduler + * to be used (e.g. Adams2019, Mullapudi2016). If this is an empty string, + * no autoscheduling will be done; if not, it mustbe the name of a known Autoscheduler. + * + * At this time, well-known autoschedulers include: + * "Mullapudi2016" -- heuristics-based; the first working autoscheduler; currently built in to libHalide + * see http://graphics.cs.cmu.edu/projects/halidesched/ + * "Adams2019" -- aka "the ML autoscheduler"; currently located in apps/autoscheduler + * see https://halide-lang.org/papers/autoscheduler2019.html + * "Li2018" -- aka "the gradient autoscheduler"; currently located in apps/gradient_autoscheduler. + * see https://people.csail.mit.edu/tzumao/gradient_halide + * + * The key/value pairs in 'extra' are defined on a per-autoscheduler basis. + * An autoscheduler can have any number of required or optional keys. 
+ */ +struct AutoschedulerParams { + std::string name; + std::map extra; + + AutoschedulerParams() = default; + /*not-explicit*/ AutoschedulerParams(const std::string &name) + : name(name) { + } + AutoschedulerParams(const std::string &name, const std::map &extra) + : name(name), extra(extra) { + } + + std::string to_string() const; +}; +#endif namespace Internal { class IRMutator; @@ -88,16 +123,25 @@ struct CustomLoweringPass { struct JITExtern; struct AutoSchedulerResults { - std::string scheduler_name; // name of the autoscheduler used - Target target; // Target specified to the autoscheduler - std::string machine_params_string; // MachineParams specified to the autoscheduler (in string form) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + std::string scheduler_name; // name of the autoscheduler used + Target target; // Target specified to the autoscheduler + std::string machine_params_string; // MachineParams specified to the autoscheduler (in string form) +#else + Target target; // Target specified to the autoscheduler + AutoschedulerParams autoscheduler_params; // The autoscheduler used, along with its params +#endif std::string schedule_source; // The C++ source code of the generated schedule std::vector featurization; // The featurization of the pipeline (if any) }; class Pipeline; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API using AutoSchedulerFn = std::function; +#else +using AutoSchedulerFn = std::function; +#endif /** A class representing a Halide pipeline. Constructed from the Func * or Funcs that it outputs. */ @@ -155,7 +199,9 @@ class Pipeline { static std::map &get_autoscheduler_map(); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API static std::string &get_default_autoscheduler_name(); +#endif static AutoSchedulerFn find_autoscheduler(const std::string &autoscheduler_name); @@ -188,6 +234,7 @@ class Pipeline { /** Get the Funcs this pipeline outputs. 
*/ std::vector outputs() const; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /** Generate a schedule for the pipeline using the currently-default autoscheduler. */ AutoSchedulerResults auto_schedule(const Target &target, const MachineParams &arch_params = MachineParams::generic()) const; @@ -196,11 +243,17 @@ class Pipeline { AutoSchedulerResults auto_schedule(const std::string &autoscheduler_name, const Target &target, const MachineParams &arch_params = MachineParams::generic()) const; +#else + /** Generate a schedule for the pipeline using the specified autoscheduler. */ + AutoSchedulerResults apply_autoscheduler(const Target &target, + const AutoschedulerParams &autoscheduler_params) const; +#endif /** Add a new the autoscheduler method with the given name. Does not affect the current default autoscheduler. * It is an error to call this with the same name multiple times. */ static void add_autoscheduler(const std::string &autoscheduler_name, const AutoSchedulerFn &autoscheduler); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /** Globally set the default autoscheduler method to use whenever * autoscheduling any Pipeline when no name is specified. If the autoscheduler_name isn't in the * current table of known autoschedulers, assert-fail. @@ -214,6 +267,7 @@ class Pipeline { * see https://people.csail.mit.edu/tzumao/gradient_halide */ static void set_default_autoscheduler_name(const std::string &autoscheduler_name); +#endif /** Return handle to the index-th Func within the pipeline based on the * topological order. */ diff --git a/src/autoschedulers/adams2019/AutoSchedule.cpp b/src/autoschedulers/adams2019/AutoSchedule.cpp index 56f8ed3dbda5..baa4c14160f5 100644 --- a/src/autoschedulers/adams2019/AutoSchedule.cpp +++ b/src/autoschedulers/adams2019/AutoSchedule.cpp @@ -31,9 +31,6 @@ Write out a training featurization for the selected schedule into this file. 
Needs to be converted to a sample file with the runtime using featurization_to_sample before it can be used to train. - HL_MACHINE_PARAMS - An architecture description string. Used by Halide master to configure the cost model. We only use the first term. Set it to the number of cores to target. - HL_PERMIT_FAILED_UNROLL Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. @@ -256,7 +253,7 @@ class StateQueue { // Configure a cost model to process a specific pipeline. void configure_pipeline_features(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model) { cost_model->reset(); cost_model->set_pipeline_features(dag, params); @@ -265,7 +262,7 @@ void configure_pipeline_features(const FunctionDAG &dag, // A single pass of coarse-to-fine beam search. IntrusivePtr optimal_schedule_pass(FunctionDAG &dag, const vector &outputs, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, std::mt19937 &rng, int beam_size, @@ -464,7 +461,7 @@ IntrusivePtr optimal_schedule_pass(FunctionDAG &dag, // Performance coarse-to-fine beam search and return the best state found. IntrusivePtr optimal_schedule(FunctionDAG &dag, const vector &outputs, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, std::mt19937 &rng, int beam_size, @@ -543,7 +540,7 @@ int State::cost_calculations = 0; // The main entrypoint to generate a schedule for a pipeline. 
void generate_schedule(const std::vector &outputs, const Target &target, - const MachineParams ¶ms, + const Adams2019Params ¶ms, AutoSchedulerResults *auto_scheduler_results) { aslog(1) << "generate_schedule for target=" << target.to_string() << "\n"; @@ -580,7 +577,7 @@ void generate_schedule(const std::vector &outputs, int64_t memory_limit = memory_limit_str.empty() ? (uint64_t)(-1) : std::atoll(memory_limit_str.c_str()); // Analyse the Halide algorithm and construct our abstract representation of it - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); if (aslog::aslog_level() >= 2) { dag.dump(aslog(2).get_ostream()); } @@ -641,7 +638,9 @@ void generate_schedule(const std::vector &outputs, } if (auto_scheduler_results) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_scheduler_results->scheduler_name = "Adams2019"; +#endif auto_scheduler_results->schedule_source = optimal->schedule_source; { std::ostringstream out; @@ -653,13 +652,37 @@ void generate_schedule(const std::vector &outputs, } struct Adams2019 { - void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms, AutoSchedulerResults *results) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *results) { + std::vector outputs; + for (const Func &f : p.outputs()) { + outputs.push_back(f.function()); + } + Adams2019Params params; + params.parallelism = params_in.parallelism; + Autoscheduler::generate_schedule(outputs, target, params, results); + } +#else + void operator()(const Pipeline &p, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *results) { + internal_assert(params_in.name == "Adams2019"); + // Verify that no unknown keys are set in params_in + const std::set legal_keys = {"parallelism"}; + for (const auto &it : params_in.extra) { + user_assert(legal_keys.count(it.first) == 1) << "The key " << it.first << " is not 
legal to use for the Adams2019 Autoscheduler."; + } + std::vector outputs; for (const Func &f : p.outputs()) { outputs.push_back(f.function()); } + Adams2019Params params; + if (params_in.extra.count("parallelism")) { + params.parallelism = std::stoi(params_in.extra.at("parallelism")); + } Autoscheduler::generate_schedule(outputs, target, params, results); + results->autoscheduler_params = params_in; } +#endif }; REGISTER_AUTOSCHEDULER(Adams2019) @@ -667,7 +690,7 @@ REGISTER_AUTOSCHEDULER(Adams2019) // An alternative entrypoint for other uses void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int beam_size, int64_t memory_limit, diff --git a/src/autoschedulers/adams2019/AutoSchedule.h b/src/autoschedulers/adams2019/AutoSchedule.h index b7a76dc67e50..270ca7a24641 100644 --- a/src/autoschedulers/adams2019/AutoSchedule.h +++ b/src/autoschedulers/adams2019/AutoSchedule.h @@ -11,7 +11,7 @@ namespace Autoscheduler { typedef PerfectHashMap StageMapOfScheduleFeatures; -void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const MachineParams ¶ms, +void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const Adams2019Params ¶ms, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); } // namespace Autoscheduler diff --git a/src/autoschedulers/adams2019/Cache.cpp b/src/autoschedulers/adams2019/Cache.cpp index ef14e9313563..b149accc36c1 100644 --- a/src/autoschedulers/adams2019/Cache.cpp +++ b/src/autoschedulers/adams2019/Cache.cpp @@ -18,7 +18,7 @@ bool Cache::add_memoized_blocks(const State *state, std::function &&)> &accept_child, const FunctionDAG::Node *node, int &num_children, const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int64_t memory_limit) const { if (!options.cache_blocks || !memoized_compute_root_blocks.contains(node)) { diff --git 
a/src/autoschedulers/adams2019/Cache.h b/src/autoschedulers/adams2019/Cache.h index 3272691ab13f..c1cedc23f856 100644 --- a/src/autoschedulers/adams2019/Cache.h +++ b/src/autoschedulers/adams2019/Cache.h @@ -122,7 +122,7 @@ struct Cache { const FunctionDAG::Node *node, int &num_children, const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int64_t memory_limit) const; diff --git a/src/autoschedulers/adams2019/CostModel.h b/src/autoschedulers/adams2019/CostModel.h index 8459932c8dca..82ba413a17b0 100644 --- a/src/autoschedulers/adams2019/CostModel.h +++ b/src/autoschedulers/adams2019/CostModel.h @@ -3,6 +3,7 @@ #include +#include "Featurization.h" #include "FunctionDAG.h" #include "HalideBuffer.h" #include "PerfectHashMap.h" @@ -12,7 +13,14 @@ namespace Halide { namespace Internal { namespace Autoscheduler { + typedef PerfectHashMap StageMapOfScheduleFeatures; + +struct Adams2019Params { + /** Maximum level of parallelism available. */ + int parallelism = 16; +}; + } // namespace Autoscheduler } // namespace Internal @@ -22,7 +30,7 @@ class CostModel { // Configure the cost model for the algorithm to be scheduled. virtual void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) = 0; + const Internal::Autoscheduler::Adams2019Params ¶ms) = 0; // Enqueue a schedule to be evaluated. Will annotate the value located at cost_ptr when the evaluation takes place. // Note that the dag argument should correspond to the dag specified previously when calling set_pipeline_features. 
diff --git a/src/autoschedulers/adams2019/DefaultCostModel.cpp b/src/autoschedulers/adams2019/DefaultCostModel.cpp index 630628c4354e..01307d765131 100644 --- a/src/autoschedulers/adams2019/DefaultCostModel.cpp +++ b/src/autoschedulers/adams2019/DefaultCostModel.cpp @@ -47,7 +47,7 @@ bool ends_with(const std::string &str, const std::string &suffix) { } // namespace void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) { + const Internal::Autoscheduler::Adams2019Params ¶ms) { const int pipeline_feat_size = head1_w * head1_h; // We ignore the first seven pipeline features in the cost diff --git a/src/autoschedulers/adams2019/DefaultCostModel.h b/src/autoschedulers/adams2019/DefaultCostModel.h index 11dff14ef0dc..9f7d6ac6c39b 100644 --- a/src/autoschedulers/adams2019/DefaultCostModel.h +++ b/src/autoschedulers/adams2019/DefaultCostModel.h @@ -7,6 +7,12 @@ namespace Halide { +namespace Internal { +namespace Autoscheduler { +struct Adams2019Params; +} // namespace Autoscheduler +} // namespace Internal + class DefaultCostModel : public CostModel { private: Internal::Weights weights; @@ -37,7 +43,7 @@ class DefaultCostModel : public CostModel { // Configure the cost model for the algorithm to be scheduled. void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) override; + const Internal::Autoscheduler::Adams2019Params ¶ms) override; void set_pipeline_features(const Runtime::Buffer &, int n); // Enqueue a schedule to be evaluated. 
The second version of this method returns a buffer of diff --git a/src/autoschedulers/adams2019/FunctionDAG.cpp b/src/autoschedulers/adams2019/FunctionDAG.cpp index 72bc9dc7e0e1..52a481eed18c 100644 --- a/src/autoschedulers/adams2019/FunctionDAG.cpp +++ b/src/autoschedulers/adams2019/FunctionDAG.cpp @@ -572,7 +572,7 @@ bool depends_on_estimate(const Expr &expr) { return dependency_checker.found_estimate; } -FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams ¶ms, const Target &target) { +FunctionDAG::FunctionDAG(const vector &outputs, const Target &target) { map env = build_environment(outputs); // A mutator to apply parameter estimates to the expressions diff --git a/src/autoschedulers/adams2019/FunctionDAG.h b/src/autoschedulers/adams2019/FunctionDAG.h index 44f0cf315db8..75c75c3c8b07 100644 --- a/src/autoschedulers/adams2019/FunctionDAG.h +++ b/src/autoschedulers/adams2019/FunctionDAG.h @@ -27,6 +27,8 @@ using std::string; using std::unique_ptr; using std::vector; +struct Adams2019Params; + // First we have various utility classes. // An optional rational type used when analyzing memory dependencies. @@ -563,7 +565,7 @@ struct FunctionDAG { // Create the function DAG, and do all the dependency and cost // analysis. This is done once up-front before the tree search. - FunctionDAG(const vector &outputs, const MachineParams ¶ms, const Target &target); + FunctionDAG(const vector &outputs, const Target &target); void dump(std::ostream &os) const; diff --git a/src/autoschedulers/adams2019/LoopNest.cpp b/src/autoschedulers/adams2019/LoopNest.cpp index a5cf19a61274..8568e92df8a9 100644 --- a/src/autoschedulers/adams2019/LoopNest.cpp +++ b/src/autoschedulers/adams2019/LoopNest.cpp @@ -227,7 +227,7 @@ void LoopNest::get_sites(StageMap &sites, // Do a recursive walk over the loop nest computing features to feed the cost model. 
void LoopNest::compute_features(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, const StageMap &sites, int64_t instances, int64_t parallelism, @@ -1355,7 +1355,7 @@ void LoopNest::compute_here(const FunctionDAG::Node *f, bool tileable, int v) { } // Parallelize this loop according to the given tiling. -IntrusivePtr LoopNest::parallelize_in_tiles(const MachineParams ¶ms, +IntrusivePtr LoopNest::parallelize_in_tiles(const Adams2019Params ¶ms, const vector &tiling, const LoopNest *parent) const { @@ -1423,7 +1423,7 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const MachineParams // this loop nest. vector> LoopNest::compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, - const MachineParams ¶ms, + const Adams2019Params ¶ms, int v, bool in_realization) const { internal_assert(f); diff --git a/src/autoschedulers/adams2019/LoopNest.h b/src/autoschedulers/adams2019/LoopNest.h index b937d1133da7..e9cb9e872441 100644 --- a/src/autoschedulers/adams2019/LoopNest.h +++ b/src/autoschedulers/adams2019/LoopNest.h @@ -129,7 +129,7 @@ struct LoopNest { // Do a recursive walk over the loop nest computing features to feed the cost model. void compute_features(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, const StageMap &sites, int64_t instances, int64_t parallelism, @@ -189,7 +189,7 @@ struct LoopNest { void compute_here(const FunctionDAG::Node *f, bool tileable, int v); // Parallelize this loop according to the given tiling. - IntrusivePtr parallelize_in_tiles(const MachineParams ¶ms, + IntrusivePtr parallelize_in_tiles(const Adams2019Params ¶ms, const vector &tiling, const LoopNest *parent) const; @@ -197,7 +197,7 @@ struct LoopNest { // this loop nest. 
std::vector> compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, - const MachineParams ¶ms, + const Adams2019Params ¶ms, int v, bool in_realization) const; diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 050bce258ebd..389565c7109d 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -52,7 +52,7 @@ $(BIN)/auto_schedule_runtime.a: $(BIN)/cost_model.generator $(BIN)/cost_model/%.a: $(BIN)/cost_model.generator @mkdir -p $(@D) - $^ -g $* -o $(BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime auto_schedule=false -e stmt,static_library,h,assembly + $^ -g $* -o $(BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime -e stmt,static_library,h,assembly # It's important to use dynamic lookups for undefined symbols here: all of libHalide # is expected to be present (in the loading binary), so we explicitly make the symbols @@ -107,11 +107,6 @@ $(BIN)/weightsdir_to_weightsfile: $(SRC)/weightsdir_to_weightsfile.cpp $(SRC)/We @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ -# This is the value that machine_params defaults to if no custom value is specified; -# see MachineParams::generic() -HL_MACHINE_PARAMS ?= 32,25165824,160 - - # A sample generator to autoschedule. Note that if it statically links # to libHalide, then it must be build with $(USE_EXPORT_DYNAMIC), or the # autoscheduler can't find the libHalide symbols that it needs. 
@@ -123,7 +118,7 @@ $(GENERATOR_BIN)/demo.generator: $(SRC)/demo_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/demo.a: $(GENERATOR_BIN)/demo.generator $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) @mkdir -p $(@D) HL_WEIGHTS_DIR=$(SRC)/baseline.weights \ - $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* auto_schedule=true -p $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) -s Adams2019 + $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* autoscheduler=Adams2019 autoscheduler.parallelism=32 -p $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) $(BIN)/%/demo.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/demo.registration.cpp $(BIN)/%/demo.a @mkdir -p $(@D) @@ -207,7 +202,7 @@ $(GENERATOR_BIN)/included_schedule_file_none.generator: $(SRC)/included_schedule $(BIN)/%/included_schedule_file.schedule.h: $(GENERATOR_BIN)/included_schedule_file_none.generator $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) @mkdir -p $(@D) HL_WEIGHTS_DIR=$(SRC)/baseline.weights \ - $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* auto_schedule=true -p $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) -s Adams2019 -e schedule + $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* autoscheduler=Adams2019 autoscheduler.parallelism=32 -p $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) -e schedule # Note that this depends on included_schedule_file.schedule.h rather than $(BIN)/%/included_schedule_file.schedule.h -- # the former should be generated by something like diff --git a/src/autoschedulers/adams2019/State.cpp b/src/autoschedulers/adams2019/State.cpp index d85bf91ce6f6..e7cb410d7921 100644 --- a/src/autoschedulers/adams2019/State.cpp +++ b/src/autoschedulers/adams2019/State.cpp @@ -14,7 +14,7 @@ uint64_t State::structural_hash(int depth) const { return h; } -void State::compute_featurization(const FunctionDAG &dag, const MachineParams ¶ms, +void State::compute_featurization(const FunctionDAG &dag, const Adams2019Params ¶ms, StageMap 
*features, const CachingOptions &cache_options) { StageMap sites; sites.make_large(dag.nodes[0].stages[0].max_id); @@ -93,7 +93,7 @@ void State::compute_featurization(const FunctionDAG &dag, const MachineParams &p } } -void State::save_featurization(const FunctionDAG &dag, const MachineParams ¶ms, +void State::save_featurization(const FunctionDAG &dag, const Adams2019Params ¶ms, const CachingOptions &cache_options, std::ostream &out) { StageMap features; compute_featurization(dag, params, &features, cache_options); @@ -123,7 +123,7 @@ void State::save_featurization(const FunctionDAG &dag, const MachineParams ¶ } } -bool State::calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, +bool State::calculate_cost(const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model, const CachingOptions &cache_options, int64_t memory_limit, int verbosity) { StageMap features; @@ -200,7 +200,7 @@ IntrusivePtr State::make_child() const { // Generate the successor states to this state void State::generate_children(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int64_t memory_limit, std::function &&)> &accept_child, @@ -539,7 +539,7 @@ void State::dump(std::ostream &os) const { // Apply the schedule represented by this state to a Halide // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. 
-void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms) { +void State::apply_schedule(const FunctionDAG &dag, const Adams2019Params ¶ms) { StageMap> state_map; root->apply(LoopLevel::root(), state_map, params.parallelism, 0, nullptr, nullptr); diff --git a/src/autoschedulers/adams2019/State.h b/src/autoschedulers/adams2019/State.h index 592b6db8930e..0cb0419fb96f 100644 --- a/src/autoschedulers/adams2019/State.h +++ b/src/autoschedulers/adams2019/State.h @@ -52,20 +52,20 @@ struct State { // Compute the featurization of this state (based on `root`), // and store features in `features`. Defers to `root->compute_features()`. void compute_featurization(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, StageMap *features, const CachingOptions &cache_options); // Calls `compute_featurization` and prints those features to `out`. void save_featurization(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, const CachingOptions &cache_options, std::ostream &out); // Performs some pruning to decide if this state is worth queuing in // the cost_model. If it is, calls `cost_model->enqueue` and returns true, // otherwise sets `cost` equal to a large value and returns false. - bool calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, + bool calculate_cost(const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model, const CachingOptions &cache_options, int64_t memory_limit, int verbosity = 99); @@ -79,7 +79,7 @@ struct State { // If they are not pruned by `calculate_cost()`, // then calls `accept_child()` on them. void generate_children(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int64_t memory_limit, std::function &&)> &accept_child, @@ -92,7 +92,7 @@ struct State { // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. 
// Also fills `schedule_source`. - void apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms); + void apply_schedule(const FunctionDAG &dag, const Adams2019Params ¶ms); }; } // namespace Autoscheduler diff --git a/src/autoschedulers/adams2019/autotune_loop.sh b/src/autoschedulers/adams2019/autotune_loop.sh index f4fd01afa967..14805d830c1c 100755 --- a/src/autoschedulers/adams2019/autotune_loop.sh +++ b/src/autoschedulers/adams2019/autotune_loop.sh @@ -107,7 +107,6 @@ make_featurization() { HL_WEIGHTS_DIR=${WEIGHTS} \ HL_RANDOM_DROPOUT=${dropout} \ HL_BEAM_SIZE=${beam} \ - HL_MACHINE_PARAMS=32,24000000,40 \ ${TIMEOUT_CMD} -k ${COMPILATION_TIMEOUT} ${COMPILATION_TIMEOUT} \ ${GENERATOR} \ -g ${PIPELINE} \ @@ -115,11 +114,11 @@ make_featurization() { -o ${D} \ -e stmt,assembly,static_library,c_header,registration,schedule,featurization \ target=${HL_TARGET} \ - auto_schedule=true \ ${EXTRA_GENERATOR_ARGS} \ -p ${AUTOSCHED_BIN}/libautoschedule_adams2019.${SHARED_EXT} \ - -s Adams2019 \ - 2> ${D}/compile_log.txt || echo "Compilation failed or timed out for ${D}" + autoscheduler=Adams2019 \ + autoscheduler.parallelism=32 \ + 2> ${D}/compile_log.txt || echo "Compilation failed or timed out for ${D}" # We don't need image I/O for this purpose, diff --git a/src/autoschedulers/adams2019/cost_model_generator.cpp b/src/autoschedulers/adams2019/cost_model_generator.cpp index dfca665505b1..4ab6b59c1b57 100644 --- a/src/autoschedulers/adams2019/cost_model_generator.cpp +++ b/src/autoschedulers/adams2019/cost_model_generator.cpp @@ -123,7 +123,7 @@ class CostModel : public Generator> { using Input = GeneratorInput; template using Output = GeneratorOutput; - using Generator>::auto_schedule; + using Generator>::using_autoscheduler; using Generator>::get_pipeline; // Number of pipeline stages @@ -482,9 +482,9 @@ class CostModel : public Generator> { true_runtime.set_estimates({{0, 80}}); // SCHEDULE - if (training && !auto_schedule) { + if (training && 
!using_autoscheduler()) { do_cost_model_schedule(get_pipeline()); - } else if (auto_schedule) { + } else if (using_autoscheduler()) { // Do nothing. } else { // We just write down a good schedule for diff --git a/src/autoschedulers/adams2019/included_schedule_file_generator.cpp b/src/autoschedulers/adams2019/included_schedule_file_generator.cpp index 21ee6ec0918c..cdd2bc7f6bf3 100644 --- a/src/autoschedulers/adams2019/included_schedule_file_generator.cpp +++ b/src/autoschedulers/adams2019/included_schedule_file_generator.cpp @@ -37,7 +37,7 @@ struct IncludedScheduleFile : public Halide::Generator { relu.set_estimates({{0, CO}, {0, W}, {0, H}, {0, N}}); // Schedule - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else { #if defined(GENERATING_SCHEDULE) diff --git a/src/autoschedulers/adams2019/test.cpp b/src/autoschedulers/adams2019/test.cpp index 21e0f0ec20bb..a135c11fe63f 100644 --- a/src/autoschedulers/adams2019/test.cpp +++ b/src/autoschedulers/adams2019/test.cpp @@ -14,7 +14,7 @@ void set_env_variable(const std::string &name, const std::string &value, int ove #endif } -bool test_caching(Pipeline &p1, Pipeline &p2, const Target &target, const MachineParams ¶ms) { +bool test_caching(Pipeline &p1, Pipeline &p2, const Target &target) { static const std::string seed_value = Internal::get_env_variable("HL_SEED"); if (seed_value.empty()) { // If HL_SEED is not set, then set seed for both autoscheduling executions. @@ -22,15 +22,30 @@ bool test_caching(Pipeline &p1, Pipeline &p2, const Target &target, const Machin set_env_variable("HL_SEED", std::to_string(seed), /* overwrite */ 0); } + constexpr int parallelism = 32; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + MachineParams params(parallelism, 16000000, 40); +#else + AutoschedulerParams params = {"Adams2019", {{"parallelism", std::to_string(parallelism)}}}; +#endif + // Turn off caching. 
set_env_variable("HL_DISABLE_MEMOIZED_FEATURES", "1", /* overwrite */ 1); set_env_variable("HL_DISABLE_MEMOIZED_BLOCKS", "1", /* overwrite */ 1); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto results_without_caching = p1.auto_schedule(target, params); +#else + auto results_without_caching = p1.apply_autoscheduler(target, params); +#endif // Turn on caching. set_env_variable("HL_DISABLE_MEMOIZED_FEATURES", "0", /* overwrite */ 1); set_env_variable("HL_DISABLE_MEMOIZED_BLOCKS", "0", /* overwrite */ 1); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto results_with_caching = p2.auto_schedule(target, params); +#else + auto results_with_caching = p2.apply_autoscheduler(target, params); +#endif // Reset environment variables to what they were before (memoization variables are reset in main). if (seed_value.empty()) { @@ -65,7 +80,6 @@ int main(int argc, char **argv) { const std::string cache_features = Internal::get_env_variable("HL_DISABLE_MEMOIZED_FEATURES"); const std::string cache_blocks = Internal::get_env_variable("HL_DISABLE_MEMOIZED_BLOCKS"); - MachineParams params(32, 16000000, 40); // Use a fixed target for the analysis to get consistent results from this test. 
Target target("x86-64-linux-sse41-avx-avx2"); @@ -90,7 +104,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on point-wise pipeline" << std::endl; return 1; } @@ -123,7 +137,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on huge expensive stencils and low memory costs" << std::endl; return 1; } @@ -149,7 +163,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on moderate isotropic stencils" << std::endl; return 1; } @@ -175,7 +189,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on smaller footprint stencil" << std::endl; return 1; } @@ -207,7 +221,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on stencil chain" << std::endl; return 1; } @@ -231,7 +245,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on an outer product" << std::endl; return 1; } @@ -263,7 +277,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on a separable downsample" << std::endl; return 1; } @@ -295,7 +309,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on Func with multiple stages + loops" << std::endl; return 1; } @@ -332,7 +346,7 @@ int main(int argc, char **argv) { } } - if 
(!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on scan with pointwise stages before and after" << std::endl; return 1; } @@ -365,7 +379,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on bad vectorization" << std::endl; return 1; } @@ -397,7 +411,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on matrix multiply + wrapper" << std::endl; return 1; } @@ -440,7 +454,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(pipeline1, pipeline2, target, params)) { + if (!test_caching(pipeline1, pipeline2, target)) { std::cerr << "Caching check failed on scan + downsample" << std::endl; return 1; } @@ -473,7 +487,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on gather with LUT" << std::endl; return 1; } @@ -501,7 +515,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on 'compute inside an rvar'" << std::endl; return 1; } @@ -529,7 +543,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on alternating vectorized dimensions" << std::endl; return 1; } @@ -560,7 +574,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on no-win scenario" << std::endl; return 1; } @@ -585,7 +599,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching 
check failed on boring memcpy" << std::endl; return 1; } @@ -609,7 +623,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on load from a tiny input image" << std::endl; return 1; } @@ -640,7 +654,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on many-dimension func" << std::endl; return 1; } @@ -673,7 +687,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on long transpose chain" << std::endl; return 1; } @@ -711,7 +725,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on inlines + stencil chain" << std::endl; return 1; } @@ -738,7 +752,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on alternating vectorized dimensions" << std::endl; return 1; } @@ -766,7 +780,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on vectorizable with pure var using RoundUp" << std::endl; return 1; } @@ -812,7 +826,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on convolution pyramid" << std::endl; return 1; } @@ -844,7 +858,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on casted scan" << std::endl; return 1; } @@ -874,7 +888,7 @@ int main(int argc, char **argv) { } } - if 
(!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on histogram" << std::endl; return 1; } diff --git a/src/autoschedulers/adams2019/test_function_dag.cpp b/src/autoschedulers/adams2019/test_function_dag.cpp index 253307321ecc..0b4604b9500d 100644 --- a/src/autoschedulers/adams2019/test_function_dag.cpp +++ b/src/autoschedulers/adams2019/test_function_dag.cpp @@ -1,3 +1,4 @@ +#include "Featurization.h" #include "FunctionDAG.h" #include "Halide.h" #include @@ -31,7 +32,7 @@ extern "C" int mul_by_two( return 0; } -void test_coeff_wise(const MachineParams ¶ms, const Target &target) { +void test_coeff_wise(const Target &target) { Var x("x"), y("y"); std::ostringstream with_extern; @@ -55,7 +56,7 @@ void test_coeff_wise(const MachineParams ¶ms, const Target &target) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(with_extern); } @@ -70,7 +71,7 @@ void test_coeff_wise(const MachineParams ¶ms, const Target &target) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(without_extern); } @@ -113,7 +114,7 @@ extern "C" int matmul( return 0; } -void test_matmul(const MachineParams ¶ms, const Target &target) { +void test_matmul(const Target &target) { Var x("x"), y("y"), k("k"); RDom r(0, 200); Halide::Buffer input1(200, 200); @@ -140,7 +141,7 @@ void test_matmul(const MachineParams ¶ms, const Target &target) { h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(with_extern); } 
@@ -153,7 +154,7 @@ void test_matmul(const MachineParams ¶ms, const Target &target) { h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(without_extern); } @@ -164,11 +165,10 @@ void test_matmul(const MachineParams ¶ms, const Target &target) { int main(int argc, char **argv) { // Use a fixed target for the analysis to get consistent results from this test. - MachineParams params(32, 16000000, 40); Target target("x86-64-linux-sse41-avx-avx2"); - test_coeff_wise(params, target); - test_matmul(params, target); + test_coeff_wise(target); + test_matmul(target); return 0; } diff --git a/src/autoschedulers/li2018/GradientAutoscheduler.cpp b/src/autoschedulers/li2018/GradientAutoscheduler.cpp index d2068c3c6a57..aed686ed51c8 100644 --- a/src/autoschedulers/li2018/GradientAutoscheduler.cpp +++ b/src/autoschedulers/li2018/GradientAutoscheduler.cpp @@ -8,6 +8,11 @@ namespace Autoscheduler { namespace { +struct GradientAutoschedulerParams { + /** Maximum level of parallelism available. 
*/ + int parallelism = 16; +}; + std::map inference_bounds(const std::vector &functions, const std::vector &output_bounds) { std::vector funcs; @@ -86,7 +91,7 @@ int natural_vector_size(const Target &target, const Type &t) { template void parallelize_vars_and_rvars_gpu( - const MachineParams ¶ms, + const GradientAutoschedulerParams ¶ms, FuncOrStage func_or_stage, bool is_pure_def, const std::vector &vars, @@ -324,7 +329,7 @@ void parallelize_vars_and_rvars_gpu( template void parallelize_vars_and_rvars_cpu( - const MachineParams ¶ms, + const GradientAutoschedulerParams ¶ms, FuncOrStage func_or_stage, int natural_vector_size, bool is_pure_def, @@ -528,7 +533,7 @@ void parallelize_vars_and_rvars_cpu( template void parallelize_vars_and_rvars( - const MachineParams ¶ms, + const GradientAutoschedulerParams ¶ms, FuncOrStage func_or_stage, int natural_vector_size, bool is_pure_def, @@ -565,7 +570,7 @@ void parallelize_vars_and_rvars( } } -void apply_schedule(const MachineParams ¶ms, +void apply_schedule(const GradientAutoschedulerParams ¶ms, const Target &target, Func func, int update_id, @@ -817,7 +822,7 @@ void apply_schedule(const MachineParams ¶ms, void generate_schedule(const std::vector &outputs, const Target &target, - const MachineParams ¶ms, + const GradientAutoschedulerParams ¶ms, AutoSchedulerResults *auto_scheduler_results) { // The first few steps are the same as src/AutoSchedule.cpp // Make an environment map which is used throughout the auto scheduling process. 
@@ -919,19 +924,45 @@ void generate_schedule(const std::vector &outputs, } } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_scheduler_results->scheduler_name = "Li2018"; +#endif auto_scheduler_results->schedule_source = schedule_source.str(); debug(1) << schedule_source.str() << "\n"; } struct Li2018 { - void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms, AutoSchedulerResults *results) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *results) { std::vector outputs; for (const Func &f : p.outputs()) { outputs.push_back(f.function()); } + GradientAutoschedulerParams params; + params.parallelism = params_in.parallelism; + generate_schedule(outputs, target, params, results); + } +#else + void operator()(const Pipeline &p, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *results) { + internal_assert(params_in.name == "Li2018"); + // Verify that no unknown keys are set in params_in + const std::set legal_keys = {"parallelism"}; + for (const auto &it : params_in.extra) { + user_assert(legal_keys.count(it.first) == 1) << "The key " << it.first << " is not legal to use for the Li2018 Autoscheduler."; + } + + std::vector outputs; + for (const Func &f : p.outputs()) { + outputs.push_back(f.function()); + } + GradientAutoschedulerParams params; + if (params_in.extra.count("parallelism")) { + params.parallelism = std::stoi(params_in.extra.at("parallelism")); + } generate_schedule(outputs, target, params, results); + results->autoscheduler_params = params_in; } +#endif }; REGISTER_AUTOSCHEDULER(Li2018) diff --git a/src/autoschedulers/li2018/Makefile b/src/autoschedulers/li2018/Makefile index 2dc6a1aed289..8bf442918ae2 100644 --- a/src/autoschedulers/li2018/Makefile +++ b/src/autoschedulers/li2018/Makefile @@ -35,7 +35,7 @@ $(GENERATOR_BIN)/demo.generator: $(SRC)/demo_generator.cpp $(GENERATOR_DEPS) # Use the -p 
flag to the generator to load the autoscheduler as a plugin $(BIN)/%/demo.a: $(GENERATOR_BIN)/demo.generator $(BIN)/libautoschedule_li2018.$(SHARED_EXT) @mkdir -p $(@D) - $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* auto_schedule=true -p $(BIN)/libautoschedule_li2018.$(SHARED_EXT) -s Li2018 + $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* autoscheduler=Li2018 -p $(BIN)/libautoschedule_li2018.$(SHARED_EXT) $(BIN)/%/demo.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/demo.registration.cpp $(BIN)/%/demo.a @mkdir -p $(@D) diff --git a/src/autoschedulers/li2018/test.cpp b/src/autoschedulers/li2018/test.cpp index 6518cda38960..f3fb11f7cca7 100644 --- a/src/autoschedulers/li2018/test.cpp +++ b/src/autoschedulers/li2018/test.cpp @@ -10,7 +10,13 @@ int main(int argc, char **argv) { load_plugin(argv[1]); - MachineParams params(32, 16000000, 40); + constexpr int parallelism = 32; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + MachineParams params(parallelism, 16000000, 40); +#else + AutoschedulerParams params = {"Li2018", {{"parallelism", std::to_string(parallelism)}}}; +#endif + Target target; Var x("x"), y("y"); @@ -27,8 +33,11 @@ int main(int argc, char **argv) { f2.set_estimate(x, 0, 10000); - AutoSchedulerResults result = - Pipeline(f2).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f2).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(f2).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 1D pointwise operations:\n" << result.schedule_source << "\n\n"; } @@ -46,8 +55,11 @@ int main(int argc, char **argv) { f2.set_estimate(x, 0, 1000) .set_estimate(y, 0, 1000); - AutoSchedulerResults result = - Pipeline(f2).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f2).auto_schedule(target, params); +#else + AutoSchedulerResults result = 
Pipeline(f2).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D pointwise operations:\n" << result.schedule_source << "\n\n"; } @@ -61,8 +73,11 @@ int main(int argc, char **argv) { f0.set_estimate(x, 0, 1000); - AutoSchedulerResults result = - Pipeline(f0).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f0).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(f0).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 1D convolution:\n" << result.schedule_source << "\n\n"; } @@ -77,8 +92,11 @@ int main(int argc, char **argv) { f0.set_estimate(x, 0, 1000) .set_estimate(y, 0, 1000); - AutoSchedulerResults result = - Pipeline(f0).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f0).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(f0).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D convolution:\n" << result.schedule_source << "\n\n"; } @@ -93,8 +111,11 @@ int main(int argc, char **argv) { hist.set_estimate(x, 0, 10); - AutoSchedulerResults result = - Pipeline(hist).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(hist).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(hist).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 1D histogram:\n" << result.schedule_source << "\n\n"; } @@ -109,8 +130,11 @@ int main(int argc, char **argv) { hist.set_estimate(x, 0, 10); - AutoSchedulerResults result = - Pipeline(hist).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(hist).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(hist).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D 
histogram:\n" << result.schedule_source << "\n\n"; } @@ -125,8 +149,11 @@ int main(int argc, char **argv) { hist.set_estimate(x, 0, 10000); - AutoSchedulerResults result = - Pipeline(hist).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(hist).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(hist).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D histogram with larger domain:\n" << result.schedule_source << "\n\n"; } @@ -146,8 +173,11 @@ int main(int argc, char **argv) { f2.set_estimate(y, 0, 1024) .set_estimate(x, 0, 4); - AutoSchedulerResults result = - Pipeline(f2).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f2).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(f2).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D pointwise operations with small x dimension:\n" << result.schedule_source << "\n\n"; } diff --git a/src/autoschedulers/li2018/test.py b/src/autoschedulers/li2018/test.py index 31971ba556d8..72afc9334540 100644 --- a/src/autoschedulers/li2018/test.py +++ b/src/autoschedulers/li2018/test.py @@ -17,9 +17,8 @@ def main(): f_2.set_estimate(x, 0, 1000) p = hl.Pipeline(f_2) target = hl.Target() - # Only first parameter is used (number of cores on CPU) - params = hl.MachineParams(32, 0, 0); - result = p.auto_schedule('Li2018', target, params) + asp = hl.AutoschedulerParams('Li2018', {'parallelism': 32}) + result = p.apply_autoscheduler(target, asp) print('Schedule:') print(result.schedule_source) diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index 6253b8229c46..3fc82e293508 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -20,6 +20,18 @@ using std::vector; namespace { 
+struct ArchParams { + /** Maximum level of parallelism available. */ + int parallelism = 16; + + /** Size of the last-level cache (in bytes). */ + uint64_t last_level_cache_size = 16 * 1024 * 1024; + + /** Indicates how much more expensive is the cost of a load compared to + * the cost of an arithmetic operation at last level cache. */ + float balance = 40; +}; + // Substitute parameter estimates into the exprs describing the box bounds. void substitute_estimates_box(Box &box) { box.used = substitute_var_estimates(box.used); @@ -1054,7 +1066,7 @@ struct Partitioner { const map &pipeline_bounds; // Parameters of the machine model that is used for estimating the cost of each // group in the pipeline. - const MachineParams &arch_params; + const ArchParams &arch_params; // Dependency analysis of the pipeline. This support queries on regions // accessed and computed for producing some regions of some functions. DependenceAnalysis &dep_analysis; @@ -1065,7 +1077,7 @@ struct Partitioner { const vector &outputs; Partitioner(const map &_pipeline_bounds, - const MachineParams &_arch_params, + const ArchParams &_arch_params, const vector &_outputs, DependenceAnalysis &_dep_analysis, RegionCosts &_costs); @@ -1305,7 +1317,7 @@ void Partitioner::disp_pipeline_costs() { // Construct a partitioner and build the pipeline graph on which the grouping // algorithm operates. Partitioner::Partitioner(const map &_pipeline_bounds, - const MachineParams &_arch_params, + const ArchParams &_arch_params, const vector &_outputs, DependenceAnalysis &_dep_analysis, RegionCosts &_costs) @@ -3166,7 +3178,7 @@ bool inline_unbounded(const vector &outputs, // outputs. This applies the schedules and returns a string representation of // the schedules. The target architecture is specified by 'target'. 
string generate_schedules(const vector &outputs, const Target &target, - const MachineParams &arch_params) { + const ArchParams &arch_params) { // Make an environment map which is used throughout the auto scheduling process. map env; for (const Function &f : outputs) { @@ -3372,21 +3384,56 @@ string generate_schedules(const vector &outputs, const Target &target, } struct Mullapudi2016 { - void operator()(const Pipeline &pipeline, const Target &target, const MachineParams &arch_params, AutoSchedulerResults *outputs) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + void operator()(const Pipeline &pipeline, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *outputs) { AutoSchedulerResults results; results.target = target; - results.machine_params_string = arch_params.to_string(); + results.machine_params_string = params_in.to_string(); results.scheduler_name = "Mullapudi2016"; std::vector pipeline_outputs; for (const Func &f : pipeline.outputs()) { pipeline_outputs.push_back(f.function()); } + ArchParams arch_params{params_in.parallelism, params_in.last_level_cache_size, params_in.balance}; results.schedule_source = generate_schedules(pipeline_outputs, target, arch_params); // this autoscheduler has no featurization + *outputs = std::move(results); + } +#else + void operator()(const Pipeline &pipeline, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *outputs) { + internal_assert(params_in.name == "Mullapudi2016"); + // Verify that no unknown keys are set in params_in + const std::set legal_keys = {"parallelism", "last_level_cache_size", "balance"}; + for (const auto &it : params_in.extra) { + user_assert(legal_keys.count(it.first) == 1) << "The key " << it.first << " is not legal to use for the Mullapudi2016 Autoscheduler."; + } - *outputs = results; + AutoSchedulerResults results; + results.target = target; + results.autoscheduler_params = params_in; + + std::vector pipeline_outputs; + for (const Func &f : 
pipeline.outputs()) { + pipeline_outputs.push_back(f.function()); + } + + ArchParams arch_params; + if (params_in.extra.count("parallelism")) { + arch_params.parallelism = std::stoi(params_in.extra.at("parallelism")); + } + if (params_in.extra.count("last_level_cache_size")) { + arch_params.last_level_cache_size = (uint64_t)std::stol(params_in.extra.at("last_level_cache_size")); + } + if (params_in.extra.count("balance")) { + arch_params.balance = std::stoi(params_in.extra.at("balance")); + } + results.schedule_source = generate_schedules(pipeline_outputs, target, arch_params); + results.autoscheduler_params = params_in; + // this autoscheduler has no featurization + *outputs = std::move(results); } +#endif }; REGISTER_AUTOSCHEDULER(Mullapudi2016) diff --git a/test/auto_schedule/cost_function.cpp b/test/auto_schedule/cost_function.cpp index 683785e9914e..7200c9348f5d 100644 --- a/test/auto_schedule/cost_function.cpp +++ b/test/auto_schedule/cost_function.cpp @@ -48,7 +48,11 @@ int main(int argc, char **argv) { // Auto-schedule the pipeline Target target = get_jit_target_from_environment(); Pipeline p(stencils[num_stencils - 1]); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API AutoSchedulerResults results = p.auto_schedule(target); +#else + AutoSchedulerResults results = p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif std::cout << "\n\n******************************************\nSCHEDULE:\n" << "******************************************\n" diff --git a/test/auto_schedule/data_dependent.cpp b/test/auto_schedule/data_dependent.cpp index 5a54626c4763..828a1061cd3e 100644 --- a/test/auto_schedule/data_dependent.cpp +++ b/test/auto_schedule/data_dependent.cpp @@ -40,7 +40,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); diff 
--git a/test/auto_schedule/extern.cpp b/test/auto_schedule/extern.cpp index 02fe11582c4e..8cd4b5181c2c 100644 --- a/test/auto_schedule/extern.cpp +++ b/test/auto_schedule/extern.cpp @@ -52,7 +52,11 @@ void test_case_1() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); @@ -82,7 +86,11 @@ void test_case_2() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); @@ -114,7 +122,11 @@ void test_case_3() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); diff --git a/test/auto_schedule/fibonacci.cpp b/test/auto_schedule/fibonacci.cpp index a394af50a921..0d2a05a3001b 100644 --- a/test/auto_schedule/fibonacci.cpp +++ b/test/auto_schedule/fibonacci.cpp @@ -22,7 +22,11 @@ double run_test(bool auto_schedule) { if (auto_schedule) { // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } // Inspect the schedule diff --git a/test/auto_schedule/histogram.cpp b/test/auto_schedule/histogram.cpp index c51cac7436b4..0cc4f151030b 100644 --- a/test/auto_schedule/histogram.cpp +++ b/test/auto_schedule/histogram.cpp @@ -64,7 +64,11 @@ double run_test(bool auto_schedule) { // Provide estimates on the pipeline output color.set_estimates({{0, 1920}, {0, 1024}, {0, 3}}); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + 
p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else if (target.has_gpu_feature()) { Var xi("xi"), yi("yi"); Y.compute_root().gpu_tile(x, y, xi, yi, 16, 16); diff --git a/test/auto_schedule/large_window.cpp b/test/auto_schedule/large_window.cpp index 2626b9a2508b..c449d7136873 100644 --- a/test/auto_schedule/large_window.cpp +++ b/test/auto_schedule/large_window.cpp @@ -46,7 +46,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); diff --git a/test/auto_schedule/mat_mul.cpp b/test/auto_schedule/mat_mul.cpp index 07e5fefce2ca..73bac853d393 100644 --- a/test/auto_schedule/mat_mul.cpp +++ b/test/auto_schedule/mat_mul.cpp @@ -40,7 +40,11 @@ double run_test(bool auto_schedule) { // Provide estimates on the pipeline output out.set_estimate(x, 0, size).set_estimate(y, 0, size); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else if (target.has_gpu_feature()) { Var xi("xi"), yi("yi"), xii("xii"), yii("yii"), xt("xt"), yt("yt"); out.tile(x, y, xi, yi, 8, 8).unroll(xi).unroll(yi).gpu_tile(x, y, xt, yt, 8, 8); diff --git a/test/auto_schedule/max_filter.cpp b/test/auto_schedule/max_filter.cpp index fa9b72706d5d..f9d7e0854012 100644 --- a/test/auto_schedule/max_filter.cpp +++ b/test/auto_schedule/max_filter.cpp @@ -72,7 +72,11 @@ double run_test(bool auto_schedule) { .set_estimate(y, 0, in.height()) .set_estimate(c, 0, in.channels()); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else if (target.has_gpu_feature()) { slice_for_radius.compute_root(); filter_height.compute_root(); diff --git 
a/test/auto_schedule/multi_output.cpp b/test/auto_schedule/multi_output.cpp index f00f4ee09fa3..3ad372568e13 100644 --- a/test/auto_schedule/multi_output.cpp +++ b/test/auto_schedule/multi_output.cpp @@ -44,10 +44,14 @@ int main(int argc, char **argv) { std::vector outs; outs.push_back(h); outs.push_back(g); - Pipeline test(outs); + Pipeline p(outs); Target target = get_jit_target_from_environment(); - test.auto_schedule(target); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule h.print_loop_nest(); @@ -56,7 +60,7 @@ int main(int argc, char **argv) { Buffer out_1(999, 999), out_2(999, 999); // Run the schedule - test.realize({out_1, out_2}); + p.realize({out_1, out_2}); printf("Success!\n"); return 0; diff --git a/test/auto_schedule/overlap.cpp b/test/auto_schedule/overlap.cpp index 8fe4a0b5aa1f..2f747879244f 100644 --- a/test/auto_schedule/overlap.cpp +++ b/test/auto_schedule/overlap.cpp @@ -50,7 +50,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(up[num_levels - 1]); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule up[num_levels - 1].print_loop_nest(); diff --git a/test/auto_schedule/param.cpp b/test/auto_schedule/param.cpp index 1db0458d0e2f..7102e1d61217 100644 --- a/test/auto_schedule/param.cpp +++ b/test/auto_schedule/param.cpp @@ -23,7 +23,11 @@ void run_test_1() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); @@ -50,7 +54,11 @@ void run_test_2() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); 
+#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); @@ -77,7 +85,11 @@ void run_test_3() { Target target = get_jit_target_from_environment(); Pipeline p(output); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule output.print_loop_nest(); @@ -107,7 +119,11 @@ void run_test_4() { Target target = get_jit_target_from_environment(); Pipeline p(output); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule output.print_loop_nest(); diff --git a/test/auto_schedule/reorder.cpp b/test/auto_schedule/reorder.cpp index 24c4893051f7..ba15be2544aa 100644 --- a/test/auto_schedule/reorder.cpp +++ b/test/auto_schedule/reorder.cpp @@ -27,7 +27,11 @@ double run_test_1(bool auto_schedule) { // Provide estimates on the pipeline output r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}}); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else { /* r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4); @@ -79,7 +83,11 @@ double run_test_2(bool auto_schedule) { // Provide estimates on the pipeline output diff.set_estimates({{0, left_im.width()}, {0, left_im.height()}, {0, 32}, {0, 3}}); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else { Var t("t"); diff.reorder(c, z).fuse(c, z, t).parallel(t).vectorize(x, 16); @@ -118,7 +126,11 @@ double run_test_3(bool auto_schedule) { // Provide estimates on the pipeline output r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}}); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API 
p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else { Var par("par"); r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4); diff --git a/test/auto_schedule/small_pure_update.cpp b/test/auto_schedule/small_pure_update.cpp index 4ef2649048ee..3954c257015a 100644 --- a/test/auto_schedule/small_pure_update.cpp +++ b/test/auto_schedule/small_pure_update.cpp @@ -28,8 +28,13 @@ int main(int argc, char **argv) { h.set_estimates({{0, 13}, {0, 17}}); in_param.set_estimates({{0, 13}, {0, 17}}); + Target target = get_target_from_environment(); Pipeline p(h); - p.auto_schedule(Target("host")); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif in_param.set(in); diff --git a/test/auto_schedule/tile_vs_inline.cpp b/test/auto_schedule/tile_vs_inline.cpp index 1c067cd81ab7..01ebaa15baca 100644 --- a/test/auto_schedule/tile_vs_inline.cpp +++ b/test/auto_schedule/tile_vs_inline.cpp @@ -44,7 +44,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); diff --git a/test/auto_schedule/unused_func.cpp b/test/auto_schedule/unused_func.cpp index bac796b6baa3..406ba438f0c9 100644 --- a/test/auto_schedule/unused_func.cpp +++ b/test/auto_schedule/unused_func.cpp @@ -28,7 +28,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(f); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule f.print_loop_nest(); diff --git a/test/auto_schedule/vectorize_var_in_update.cpp b/test/auto_schedule/vectorize_var_in_update.cpp index 8b0f6881220f..13f9bf155bb9 
100644 --- a/test/auto_schedule/vectorize_var_in_update.cpp +++ b/test/auto_schedule/vectorize_var_in_update.cpp @@ -50,7 +50,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(h); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule h.print_loop_nest(); diff --git a/test/correctness/custom_auto_scheduler.cpp b/test/correctness/custom_auto_scheduler.cpp index 32eec8b25dae..cda182861340 100644 --- a/test/correctness/custom_auto_scheduler.cpp +++ b/test/correctness/custom_auto_scheduler.cpp @@ -6,7 +6,11 @@ int call_count = 0; void inline_everything(const Pipeline &, const Target &, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API const MachineParams &, +#else + const AutoschedulerParams &, +#endif AutoSchedulerResults *) { call_count++; // Inlining everything is really easy. @@ -22,13 +26,22 @@ int main(int argc, char **argv) { Func f; Var x; f(x) = 3; - Pipeline(f).auto_schedule(kSchedulerName, Target("host")); - - Pipeline::set_default_autoscheduler_name(kSchedulerName); Func g; g(x) = 3; - Pipeline(g).auto_schedule(Target("host")); + + Target t("host"); + +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + Pipeline(f).auto_schedule(kSchedulerName, t); + + Pipeline::set_default_autoscheduler_name(kSchedulerName); + Pipeline(g).auto_schedule(t); +#else + AutoschedulerParams autoscheduler_params(kSchedulerName); + Pipeline(f).apply_autoscheduler(t, autoscheduler_params); + Pipeline(g).apply_autoscheduler(t, autoscheduler_params); +#endif if (call_count != 2) { printf("Should have called the custom autoscheduler twice. 
Instead called it %d times\n", call_count); diff --git a/test/error/auto_schedule_no_parallel.cpp b/test/error/auto_schedule_no_parallel.cpp index 74e2b269025f..2519619a3b1b 100644 --- a/test/error/auto_schedule_no_parallel.cpp +++ b/test/error/auto_schedule_no_parallel.cpp @@ -25,7 +25,11 @@ int main(int argc, char **argv) { // This should throw an error since auto-scheduler does not currently // support partial schedules +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif printf("Success!\n"); return 0; diff --git a/test/error/auto_schedule_no_reorder.cpp b/test/error/auto_schedule_no_reorder.cpp index 8f39114ee9ea..d9fb344473e4 100644 --- a/test/error/auto_schedule_no_reorder.cpp +++ b/test/error/auto_schedule_no_reorder.cpp @@ -25,7 +25,11 @@ int main(int argc, char **argv) { // This should throw an error since auto-scheduler does not currently // support partial schedules +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif printf("Success!\n"); return 0; diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index e6f82c285641..3097fc8b6955 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -109,10 +109,17 @@ endif () # alias_aottest.cpp # alias_generator.cpp -halide_define_aot_test(alias EXTRA_LIBS alias_with_offset_42) -add_halide_library(alias_with_offset_42 - FROM alias.generator - GENERATOR alias_with_offset_42) +set(ALIAS_LIBS alias_with_offset_42 alias_Adams2019 alias_Li2018 alias_Mullapudi2016) +halide_define_aot_test(alias EXTRA_LIBS ${ALIAS_LIBS}) +foreach (LIB IN LISTS ALIAS_LIBS) + # We don't really need all the plugins at once here -- + # It's just easier to specify them all (and adds a test that loading + # multiple plugins works) + add_halide_library(${LIB} + FROM alias.generator + GENERATOR ${LIB} + PLUGINS Halide::Adams2019 
Halide::Li2018 Halide::Mullapudi2016) +endforeach () # argvcall_aottest.cpp # argvcall_generator.cpp diff --git a/test/generator/alias_aottest.cpp b/test/generator/alias_aottest.cpp index 80c2f61a9602..41c1a9f0ae80 100644 --- a/test/generator/alias_aottest.cpp +++ b/test/generator/alias_aottest.cpp @@ -6,6 +6,13 @@ #include "alias.h" #include "alias_with_offset_42.h" +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +// nothing +#else +#include "alias_Adams2019.h" +#include "alias_Li2018.h" +#include "alias_Mullapudi2016.h" +#endif using namespace Halide::Runtime; @@ -18,16 +25,45 @@ int main(int argc, char **argv) { input(x) = x; }); + output.fill(0); alias(input, output); + output.copy_to_host(); input.for_each_element([=](int x) { assert(output(x) == input(x)); }); + output.fill(0); alias_with_offset_42(input, output); + output.copy_to_host(); input.for_each_element([=](int x) { assert(output(x) == input(x) + 42); }); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + // nothing +#else + output.fill(0); + alias_Adams2019(input, output); + output.copy_to_host(); + input.for_each_element([=](int x) { + assert(output(x) == input(x) + 2019); + }); + + output.fill(0); + alias_Li2018(input, output); + output.copy_to_host(); + input.for_each_element([=](int x) { + assert(output(x) == input(x) + 2018); + }); + + output.fill(0); + output.copy_to_host(); + alias_Mullapudi2016(input, output); + input.for_each_element([=](int x) { + assert(output(x) == input(x) + 2016); + }); +#endif + printf("Success!\n"); return 0; } diff --git a/test/generator/alias_generator.cpp b/test/generator/alias_generator.cpp index 84d3e803709f..5661588229d6 100644 --- a/test/generator/alias_generator.cpp +++ b/test/generator/alias_generator.cpp @@ -11,6 +11,15 @@ class Alias : public Halide::Generator { void generate() { Var x; output(x) = input(x) + offset; + + // set estimates for the autoschedulers + input.set_estimates({{0, 32}}); + output.set_estimates({{0, 32}}); + + if (!using_autoscheduler()) { 
+ // Don't really need a default schedule for something this simple, but sure, why not + output.vectorize(x, natural_vector_size()).compute_root(); + } } }; @@ -18,3 +27,12 @@ class Alias : public Halide::Generator { HALIDE_REGISTER_GENERATOR(Alias, alias) HALIDE_REGISTER_GENERATOR_ALIAS(alias_with_offset_42, alias, {{"offset", "42"}}) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +// nothing +#else +// Since autoscheduler-to-use is now an ordinary GeneratorParam, we can specify it in Aliases for convenience. +// (Set unique offsets just to verify these are all separate calls.) +HALIDE_REGISTER_GENERATOR_ALIAS(alias_Adams2019, alias, {{"autoscheduler", "Adams2019"}, {"offset", "2019"}}) +HALIDE_REGISTER_GENERATOR_ALIAS(alias_Li2018, alias, {{"autoscheduler", "Li2018"}, {"offset", "2018"}}) +HALIDE_REGISTER_GENERATOR_ALIAS(alias_Mullapudi2016, alias, {{"autoscheduler", "Mullapudi2016"}, {"offset", "2016"}}) +#endif diff --git a/test/generator/example_generator.cpp b/test/generator/example_generator.cpp index 41ab28e8da2d..9997b6ccfcad 100644 --- a/test/generator/example_generator.cpp +++ b/test/generator/example_generator.cpp @@ -81,7 +81,7 @@ class Example : public Halide::Generator { runtime_factor.set_estimate(1); output.set_estimates({{0, 32}, {0, 32}, {0, 3}}); - if (!auto_schedule) { + if (!using_autoscheduler()) { output .bound(c, 0, channels) .reorder(c, x, y) diff --git a/test/generator/stubtest_generator.cpp b/test/generator/stubtest_generator.cpp index bbb68aaadc17..8f5b41640e41 100644 --- a/test/generator/stubtest_generator.cpp +++ b/test/generator/stubtest_generator.cpp @@ -92,7 +92,7 @@ class StubTest : public Halide::Generator { } void schedule() { - if (!auto_schedule) { + if (!using_autoscheduler()) { intermediate.compute_at(intermediate_level); intermediate.specialize(vectorize).vectorize(x, natural_vector_size()); } diff --git a/test/generator/stubtest_jittest.cpp b/test/generator/stubtest_jittest.cpp index 2973501941f4..1c0aa3f8fc14 100644 --- 
a/test/generator/stubtest_jittest.cpp +++ b/test/generator/stubtest_jittest.cpp @@ -142,7 +142,7 @@ int main(int argc, char **argv) { // from the specific inputs we provide, but for the JIT (and AOT) cases, there are // no such inputs available, so we must be explicit. (Note that these are the same // values specified in our Make/CMake files.) - const std::map gp = { + const GeneratorParamsMap gp = { {"untyped_buffer_input.type", "uint8"}, {"untyped_buffer_input.dim", "3"}, {"simple_input.type", "float32"}, @@ -217,7 +217,7 @@ int main(int argc, char **argv) { // from the specific inputs we provide, but for the JIT (and AOT) cases, there are // no such inputs available, so we must be explicit. (Note that these are the same // values specified in our Make/CMake files.) - const std::map gp = { + const GeneratorParamsMap gp = { {"untyped_buffer_input.type", "uint8"}, {"untyped_buffer_input.dim", "3"}, {"simple_input.type", "float32"}, diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt index 7d86ae5d6ae3..2db06c7d0dfa 100644 --- a/tutorial/CMakeLists.txt +++ b/tutorial/CMakeLists.txt @@ -194,11 +194,14 @@ if (TARGET Halide::Mullapudi2016) add_halide_library(auto_schedule_false FROM lesson_21_auto_scheduler_generate TARGETS cmake - GENERATOR auto_schedule_gen PARAMS auto_schedule=false) + GENERATOR auto_schedule_gen) add_halide_library(auto_schedule_true FROM lesson_21_auto_scheduler_generate TARGETS cmake AUTOSCHEDULER Halide::Mullapudi2016 - GENERATOR auto_schedule_gen PARAMS machine_params=32,16777216,40) + GENERATOR auto_schedule_gen + PARAMS autoscheduler.parallelism=32 + autoscheduler.last_level_cache_size=16777216 + autoscheduler.balance=40) add_executable(lesson_21_auto_scheduler_run lesson_21_auto_scheduler_run.cpp) target_link_libraries(lesson_21_auto_scheduler_run PRIVATE diff --git a/tutorial/lesson_21_auto_scheduler_generate.cpp b/tutorial/lesson_21_auto_scheduler_generate.cpp index 44a1bcac6aea..4258599e8d58 100644 --- 
a/tutorial/lesson_21_auto_scheduler_generate.cpp +++ b/tutorial/lesson_21_auto_scheduler_generate.cpp @@ -2,7 +2,7 @@ // So far we have written Halide schedules by hand, but it is also possible to // ask Halide to suggest a reasonable schedule. We call this auto-scheduling. -// This lesson demonstrates how to use the auto-scheduler to generate a +// This lesson demonstrates how to use the autoscheduler to generate a // copy-pasteable CPU schedule that can be subsequently improved upon. // On linux or os x, you can compile and run it like so: @@ -11,7 +11,7 @@ // export LD_LIBRARY_PATH= # For linux // export DYLD_LIBRARY_PATH= # For OS X // ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_false -e static_library,h,schedule target=host auto_schedule=false -// ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_true -e static_library,h,schedule -p -S Mullapudi2016 target=host auto_schedule=true machine_params=32,16777216,40 +// ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_true -e static_library,h,schedule -p -S Mullapudi2016 target=host autoscheduler=Mullapudi2016 autoscheduler.parallelism=32 autoscheduler.last_level_cache_size=16777216 autoscheduler.balance=40 // g++ lesson_21_auto_scheduler_run.cpp -std=c++17 -I -I auto_schedule_false.a auto_schedule_true.a -ldl -lpthread -o lesson_21_run // ./lesson_21_run @@ -69,8 +69,8 @@ class AutoScheduled : public Halide::Generator { } void schedule() { - if (auto_schedule) { - // The auto-scheduler requires estimates on all the input/output + if (using_autoscheduler()) { + // The autoscheduler requires estimates on all the input/output // sizes and parameter values in order to compare different // alternatives and decide on a good schedule. @@ -95,31 +95,33 @@ class AutoScheduled : public Halide::Generator { // schedule will be. 
// To auto-schedule the pipeline, we don't have to do anything else: - // every Generator implicitly has a GeneratorParam named "auto_schedule"; - // if this is set to true, Halide will call auto_schedule() on all of - // our pipeline's outputs automatically. - - // Every Generator also implicitly has a GeneratorParams named "machine_params", - // which allows you to specify characteristics of the machine architecture - // for the auto-scheduler; it's generally specified in your Makefile. + // every Generator implicitly has a GeneratorParam named "auto_scheduler.name"; + // if this is set to the name of the Autoscheduler we want to use, Halide will + // apply it to all of our pipeline's outputs automatically. + + // Every Generator also implicitly has additional, optional GeneratorParams that are + // dependent on the specific Autoscheduler selected, which allows you to specify + // characteristics of the machine architecture + // for the autoscheduler; it's generally specified in your Makefile. // If none is specified, the default machine parameters for a generic CPU - // architecture will be used by the auto-scheduler. + // architecture will be used by the autoscheduler. - // Let's see some arbitrary but plausible values for the machine parameters.
+ // Let's see some arbitrary but plausible values for the machine parameters + // for the Mullapudi2016 Autoscheduler: // - // const int kParallelism = 32; - // const int kLastLevelCacheSize = 16 * 1024 * 1024; - // const int kBalance = 40; - // MachineParams machine_params(kParallelism, kLastLevelCacheSize, kBalance); + // autoscheduler=Mullapudi2016 + // autoscheduler.parallelism=32 + // autoscheduler.last_level_cache_size=16777216 + // autoscheduler.balance=40 // - // The arguments to MachineParams are the maximum level of parallelism - // available, the size of the last-level cache (in KB), and the ratio + // These are the maximum level of parallelism + // available, the size of the last-level cache (in bytes), and the ratio // between the cost of a miss at the last level cache and the cost // of arithmetic on the target architecture, in that order. - // Note that when using the auto-scheduler, no schedule should have - // been applied to the pipeline; otherwise, the auto-scheduler will - // throw an error. The current auto-scheduler cannot handle a + // Note that when using the autoscheduler, no schedule should have + // been applied to the pipeline; otherwise, the autoscheduler will + // throw an error. The current autoscheduler cannot handle a // partially-scheduled pipeline. // If HL_DEBUG_CODEGEN is set to 3 or greater, the schedule will be dumped @@ -131,12 +133,12 @@ class AutoScheduled : public Halide::Generator { // Halide C++ source, which is readily copy-pasteable back into // this very same source file with few modifications. Programmers // can use this as a starting schedule and iteratively improve the - // schedule. Note that the current auto-scheduler is only able to + // schedule. Note that the current autoscheduler is only able to // generate CPU schedules and only does tiling, simple vectorization // and parallelization. It doesn't deal with line buffering, storage // reordering, or factoring reductions. 
- // At the time of writing, the auto-scheduler will produce the + // At the time of writing, the autoscheduler will produce the // following schedule for the estimates and machine parameters // declared above when run on this pipeline: // @@ -211,7 +213,7 @@ class AutoScheduled : public Halide::Generator { } else { // This is where you would declare the schedule you have written by - // hand or paste the schedule generated by the auto-scheduler. + // hand or paste the schedule generated by the autoscheduler. // We will use a naive schedule here to compare the performance of // the autoschedule with a basic schedule. gray.compute_root(); From 2d907c45a7043361164bba532221d62b8e0fe0d9 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 15 Jul 2022 15:15:18 -0700 Subject: [PATCH 2/2] [vulkan phase0] Add ADTs for containers and memory allocation to runtime (#6829) * Cherry pick runtime internals as standalone commit (preparation work for Vulkan runtime) * Clang format/tidy fixes * Fix runtime test linkage and include paths to not include libHalide * Update test/runtime/CMakeLists.txt Fix typo mismatch for HALIDE_VERSION_PATCH Co-authored-by: Alex Reinking * Add compiler id guard to build options for runtime tests * Avoid building runtime tests on MSVC since Halide runtime headers are not MS compatible Remove CLANG warning flag for runtime test * Change runtime test compile definitions to be PRIVATE. Remove PUBLIC_EXPORTS from runtime test definition. * Add comment about GNU warnings for 'no-builtin-declaration-mismatch' * Change to debug(user_context) for debug messages where context is valid. Wrap verbose debugging with DEBUG_RUNTIME ifdef. Style pass based on review comments. * Add note explaining why we disable the internal runtime tests on MSVC. * Cleanup cmake logic for disabling runtime internal tests for MSVC and add a status message.
* Don't use strncpy for prepend since some implementations may insert a null char regardless of the length used * Workaround varying platform str implementations and handle termination directly. * Clang Tidy/Format pass Co-authored-by: Derek Gerstmann Co-authored-by: Alex Reinking --- Makefile | 15 +- src/runtime/internal/block_allocator.h | 478 ++++++++++++++++++++++++ src/runtime/internal/block_storage.h | 425 +++++++++++++++++++++ src/runtime/internal/linked_list.h | 333 +++++++++++++++++ src/runtime/internal/memory_arena.h | 310 +++++++++++++++ src/runtime/internal/memory_resources.h | 280 ++++++++++++++ src/runtime/internal/pointer_table.h | 366 ++++++++++++++++++ src/runtime/internal/region_allocator.h | 462 +++++++++++++++++++++++ src/runtime/internal/string_storage.h | 216 +++++++++++ src/runtime/internal/string_table.h | 217 +++++++++++ src/runtime/runtime_internal.h | 5 + test/CMakeLists.txt | 21 ++ test/runtime/CMakeLists.txt | 32 ++ test/runtime/block_allocator.cpp | 140 +++++++ test/runtime/block_storage.cpp | 148 ++++++++ test/runtime/common.h | 29 ++ test/runtime/linked_list.cpp | 91 +++++ test/runtime/memory_arena.cpp | 88 +++++ test/runtime/string_storage.cpp | 63 ++++ test/runtime/string_table.cpp | 44 +++ 20 files changed, 3762 insertions(+), 1 deletion(-) create mode 100644 src/runtime/internal/block_allocator.h create mode 100644 src/runtime/internal/block_storage.h create mode 100644 src/runtime/internal/linked_list.h create mode 100644 src/runtime/internal/memory_arena.h create mode 100644 src/runtime/internal/memory_resources.h create mode 100644 src/runtime/internal/pointer_table.h create mode 100644 src/runtime/internal/region_allocator.h create mode 100644 src/runtime/internal/string_storage.h create mode 100644 src/runtime/internal/string_table.h create mode 100644 test/runtime/CMakeLists.txt create mode 100644 test/runtime/block_allocator.cpp create mode 100644 test/runtime/block_storage.cpp create mode 100644 test/runtime/common.h 
create mode 100644 test/runtime/linked_list.cpp create mode 100644 test/runtime/memory_arena.cpp create mode 100644 test/runtime/string_storage.cpp create mode 100644 test/runtime/string_table.cpp diff --git a/Makefile b/Makefile index 97d481012909..640b59fa9a68 100644 --- a/Makefile +++ b/Makefile @@ -1144,6 +1144,7 @@ CORRECTNESS_TESTS = $(shell ls $(ROOT_DIR)/test/correctness/*.cpp) $(shell ls $( PERFORMANCE_TESTS = $(shell ls $(ROOT_DIR)/test/performance/*.cpp) ERROR_TESTS = $(shell ls $(ROOT_DIR)/test/error/*.cpp) WARNING_TESTS = $(shell ls $(ROOT_DIR)/test/warning/*.cpp) +RUNTIME_TESTS = $(shell ls $(ROOT_DIR)/test/runtime/*.cpp) GENERATOR_EXTERNAL_TESTS := $(shell ls $(ROOT_DIR)/test/generator/*test.cpp) GENERATOR_EXTERNAL_TEST_GENERATOR := $(shell ls $(ROOT_DIR)/test/generator/*_generator.cpp) TUTORIALS = $(filter-out %_generate.cpp, $(shell ls $(ROOT_DIR)/tutorial/*.cpp)) @@ -1153,6 +1154,7 @@ test_correctness: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=quiet_c test_performance: $(PERFORMANCE_TESTS:$(ROOT_DIR)/test/performance/%.cpp=performance_%) test_error: $(ERROR_TESTS:$(ROOT_DIR)/test/error/%.cpp=error_%) test_warning: $(WARNING_TESTS:$(ROOT_DIR)/test/warning/%.cpp=warning_%) +test_runtime: $(RUNTIME_TESTS:$(ROOT_DIR)/test/runtime/%.cpp=runtime_%) test_tutorial: $(TUTORIALS:$(ROOT_DIR)/tutorial/%.cpp=tutorial_%) test_valgrind: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=valgrind_%) test_avx512: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=avx512_%) @@ -1239,7 +1241,7 @@ test_generator: $(GENERATOR_AOT_TESTS) $(GENERATOR_AOTCPP_TESTS) $(GENERATOR_JIT $(FILTERS_DIR)/rungen_test $(FILTERS_DIR)/registration_test -ALL_TESTS = test_internal test_correctness test_error test_tutorial test_warning test_generator +ALL_TESTS = test_internal test_correctness test_error test_tutorial test_warning test_runtime test_generator # These targets perform timings of each test. 
For most tests this includes Halide JIT compile times, and run times. # For generator tests they time the compile time only. The times are recorded in CSV files. @@ -1260,6 +1262,7 @@ build_tests: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=$(BIN_DIR)/c $(PERFORMANCE_TESTS:$(ROOT_DIR)/test/performance/%.cpp=$(BIN_DIR)/performance_%) \ $(ERROR_TESTS:$(ROOT_DIR)/test/error/%.cpp=$(BIN_DIR)/error_%) \ $(WARNING_TESTS:$(ROOT_DIR)/test/warning/%.cpp=$(BIN_DIR)/warning_%) \ + $(RUNTIME_TESTS:$(ROOT_DIR)/test/runtime/%.cpp=$(BIN_DIR)/runtime_%) \ $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_aottest.cpp=$(BIN_DIR)/$(TARGET)/generator_aot_%) \ $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_jittest.cpp=$(BIN_DIR)/generator_jit_%) \ $(AUTO_SCHEDULE_TESTS:$(ROOT_DIR)/test/auto_schedule/%.cpp=$(BIN_DIR)/auto_schedule_%) @@ -1332,6 +1335,11 @@ $(BIN_DIR)/error_%: $(ROOT_DIR)/test/error/%.cpp $(BIN_DIR)/libHalide.$(SHARED_E $(BIN_DIR)/warning_%: $(ROOT_DIR)/test/warning/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ +# Runtime tests that test internals +RUNTIME_TESTS_CXXFLAGS = -fno-rtti -fno-exceptions -fno-threadsafe-statics -Wno-builtin-declaration-mismatch -DCOMPILING_HALIDE_RUNTIME -DCOMPILING_HALIDE_RUNTIME_TESTS +$(BIN_DIR)/runtime_%: $(ROOT_DIR)/test/runtime/%.cpp $(ROOT_DIR)/test/runtime/common.h + $(CXX) $(TEST_CXX_FLAGS) $(RUNTIME_TESTS_CXXFLAGS) -I$(ROOT_DIR)/test/runtime -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $< $(COMMON_LD_FLAGS) -o $@ + # Auto schedule tests that link against libHalide $(BIN_DIR)/auto_schedule_%: $(ROOT_DIR)/test/auto_schedule/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ @@ -1929,6 +1937,11 @@ warning_%: $(BIN_DIR)/warning_% cd $(TMP_DIR) ; $(CURDIR)/$< 2>&1 | egrep --q "^Warning" 
@-echo +runtime_%: $(BIN_DIR)/runtime_% + @-mkdir -p $(TMP_DIR) + cd $(TMP_DIR) ; $(CURDIR)/$< + @-echo + generator_jit_%: $(BIN_DIR)/generator_jit_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h new file mode 100644 index 000000000000..f7f0247e441f --- /dev/null +++ b/src/runtime/internal/block_allocator.h @@ -0,0 +1,478 @@ +#ifndef HALIDE_RUNTIME_BLOCK_ALLOCATOR_H +#define HALIDE_RUNTIME_BLOCK_ALLOCATOR_H + +#include "linked_list.h" +#include "memory_resources.h" +#include "region_allocator.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// -- + +/** Allocator class interface for managing large contiguous blocks + * of memory, which are then sub-allocated into smaller regions of + * memory. This class only manages the address creation for the + * regions -- allocation callback functions are used to request the + * memory from the necessary system or API calls. This class is + * intended to be used inside of a higher level memory management + * class that provides thread safety, policy management and API + * integration for a specific runtime API (eg Vulkan, OpenCL, etc) + */ +class BlockAllocator { +public: + // disable copy constructors and assignment + BlockAllocator(const BlockAllocator &) = delete; + BlockAllocator &operator=(const BlockAllocator &) = delete; + + // disable non-factory based construction + BlockAllocator() = delete; + ~BlockAllocator() = delete; + + // Allocators for the different types of memory we need to allocate + struct MemoryAllocators { + SystemMemoryAllocatorFns system; + MemoryBlockAllocatorFns block; + MemoryRegionAllocatorFns region; + }; + + // Runtime configuration parameters to adjust the behaviour of the block allocator + struct Config { + size_t initial_capacity = 0; + size_t minimum_block_size = 0; + size_t maximum_block_size = 0; + size_t maximum_block_count = 0; + }; + + // Factory methods for creation / 
destruction + static BlockAllocator *create(void *user_context, const Config &config, const MemoryAllocators &allocators); + static void destroy(void *user_context, BlockAllocator *block_allocator); + + // Public interface methods + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + void reclaim(void *user_context, MemoryRegion *region); + bool collect(void *user_context); //< returns true if any blocks were removed + void release(void *user_context); + void destroy(void *user_context); + + // Access methods + const MemoryAllocators &current_allocators() const; + const Config &current_config() const; + const Config &default_config() const; + size_t block_count() const; + +private: + // Linked-list for storing the block resources + typedef LinkedList::EntryType BlockEntry; + + // Initializes a new instance + void initialize(void *user_context, const Config &config, const MemoryAllocators &allocators); + + // Reserves a region of memory using the given allocator for the given block resource, returns nullptr on failure + MemoryRegion *reserve_memory_region(void *user_context, RegionAllocator *allocator, const MemoryRequest &request); + + // Creates a new region allocator for the given block resource + RegionAllocator *create_region_allocator(void *user_context, BlockResource *block); + + // Destroys the given region allocator and all associated memory regions + void destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); + + // Reserves a block of memory for the requested size and returns the corresponding block entry, or nullptr on failure + BlockEntry *reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + + // Locates the "best-fit" block entry for the requested size, or nullptr if none was found + BlockEntry *find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + + // Creates a new block entry and adds it to the list + BlockEntry
*create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + + // Releases the block entry from being used, and makes it available for further allocations + void release_block_entry(void *user_context, BlockEntry *block_entry); + + // Destroys the block entry and removes it from the list + void destroy_block_entry(void *user_context, BlockEntry *block_entry); + + // Invokes the allocation callback to allocate memory for the block region + void alloc_memory_block(void *user_context, BlockResource *block); + + // Invokes the deallocation callback to free memory for the memory block + void free_memory_block(void *user_context, BlockResource *block); + + // Returns a constrained size for the requested size based on config parameters + size_t constrain_requested_size(size_t size) const; + + // Returns true if the given block is compatible with the given properties + bool is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const; + + Config config; + LinkedList block_list; + MemoryAllocators allocators; +}; + +BlockAllocator *BlockAllocator::create(void *user_context, const Config &cfg, const MemoryAllocators &allocators) { + halide_abort_if_false(user_context, allocators.system.allocate != nullptr); + BlockAllocator *result = reinterpret_cast( + allocators.system.allocate(user_context, sizeof(BlockAllocator))); + + if (result == nullptr) { + error(user_context) << "BlockAllocator: Failed to create instance! 
Out of memory!\n"; + return nullptr; + } + + result->initialize(user_context, cfg, allocators); + return result; +} + +void BlockAllocator::destroy(void *user_context, BlockAllocator *instance) { + halide_abort_if_false(user_context, instance != nullptr); + const MemoryAllocators &allocators = instance->allocators; + instance->destroy(user_context); + halide_abort_if_false(user_context, allocators.system.deallocate != nullptr); + allocators.system.deallocate(user_context, instance); +} + +void BlockAllocator::initialize(void *user_context, const Config &cfg, const MemoryAllocators &ma) { + config = cfg; + allocators = ma; + block_list.initialize(user_context, + sizeof(BlockResource), + config.initial_capacity, + allocators.system); +} + +MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &request) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Reserve (" + << "user_context=" << (void *)(user_context) << " " + << "offset=" << (uint32_t)request.offset << " " + << "size=" << (uint32_t)request.size << " " + << "dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ...\n"; +#endif + BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); + if (block_entry == nullptr) { + debug(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" + << (int32_t)(request.size) << " bytes)!\n"; + return nullptr; + } + + BlockResource *block = static_cast(block_entry->value); + halide_abort_if_false(user_context, block != nullptr); + halide_abort_if_false(user_context, block->allocator != nullptr); + + MemoryRegion *result = reserve_memory_region(user_context, block->allocator, request); + if (result == nullptr) { + + // Unable to reserve region in an existing block ... create a new block and try again. + size_t actual_size = constrain_requested_size(request.size); + block_entry = create_block_entry(user_context, request.properties, actual_size, request.dedicated); + if (block_entry == nullptr) { + debug(user_context) << "BlockAllocator: Out of memory! 
Failed to allocate empty block of size (" + << (int32_t)(actual_size) << " bytes)!\n"; + return nullptr; + } + + block = static_cast(block_entry->value); + if (block->allocator == nullptr) { + block->allocator = create_region_allocator(user_context, block); + } + + result = reserve_memory_region(user_context, block->allocator, request); + } + return result; +} + +void BlockAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { + halide_abort_if_false(user_context, memory_region != nullptr); + RegionAllocator *allocator = RegionAllocator::find_allocator(user_context, memory_region); + if (allocator == nullptr) { return; } + allocator->reclaim(user_context, memory_region); +} + +bool BlockAllocator::collect(void *user_context) { + bool result = false; + BlockEntry *block_entry = block_list.back(); + while (block_entry != nullptr) { + BlockEntry *prev_entry = block_entry->prev_ptr; + + const BlockResource *block = static_cast(block_entry->value); + if (block->allocator == nullptr) { + block_entry = prev_entry; + continue; + } + + block->allocator->collect(user_context); + if (block->reserved == 0) { + destroy_block_entry(user_context, block_entry); + result = true; + } + + block_entry = prev_entry; + } + return result; +} + +void BlockAllocator::release(void *user_context) { + BlockEntry *block_entry = block_list.back(); + while (block_entry != nullptr) { + BlockEntry *prev_entry = block_entry->prev_ptr; + release_block_entry(user_context, block_entry); + block_entry = prev_entry; + } +} + +void BlockAllocator::destroy(void *user_context) { + BlockEntry *block_entry = block_list.back(); + while (block_entry != nullptr) { + BlockEntry *prev_entry = block_entry->prev_ptr; + destroy_block_entry(user_context, block_entry); + block_entry = prev_entry; + } +} + +MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAllocator *allocator, const MemoryRequest &request) { + MemoryRegion *result = allocator->reserve(user_context, request); 
+ if (result == nullptr) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Failed to allocate region of size (" + << (int32_t)(request.size) << " bytes)!\n"; +#endif + // allocator has enough free space, but not enough contiguous space + // -- collect and try to reallocate + if (allocator->collect(user_context)) { + result = allocator->reserve(user_context, request); + } + } + return result; +} + +BlockAllocator::BlockEntry * +BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { + BlockEntry *block_entry = nullptr; + for (block_entry = block_list.front(); block_entry != nullptr; block_entry = block_entry->next_ptr) { + + const BlockResource *block = static_cast(block_entry->value); + if (!is_compatible_block(block, properties)) { + continue; + } + + // skip blocks that can't be dedicated to a single allocation + if (dedicated && (block->reserved > 0)) { + continue; + } + + // skip dedicated blocks that are already allocated + if (block->memory.dedicated && (block->reserved > 0)) { + continue; + } + + size_t available = (block->memory.size - block->reserved); + if (available >= size) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: find_block_entry (FOUND) (" + << "user_context=" << (void *)(user_context) << " " + << "block_entry=" << (void *)(block_entry) << " " + << "size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; +#endif + break; + } + } + + return block_entry; +} + +BlockAllocator::BlockEntry * +BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { + BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); + if (block_entry == nullptr) { + size_t actual_size = constrain_requested_size(size); + block_entry = create_block_entry(user_context, properties, actual_size, dedicated); + } + + if (block_entry) { + BlockResource *block = static_cast(block_entry->value); + if (block->allocator == nullptr) { + block->allocator = create_region_allocator(user_context, block); + } + } + return block_entry; +} + +RegionAllocator * +BlockAllocator::create_region_allocator(void *user_context, BlockResource *block) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Creating region allocator (" + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)(block) << ")...\n"; +#endif + halide_abort_if_false(user_context, block != nullptr); + RegionAllocator *region_allocator = RegionAllocator::create( + user_context, block, {allocators.system, allocators.region}); + + if (region_allocator == nullptr) { + error(user_context) << "BlockAllocator: Failed to create new region allocator!\n"; + return nullptr; + } + + return region_allocator; +} + +void BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator *region_allocator) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Destroying region allocator (" + << "user_context=" << (void *)(user_context) << " " + << "region_allocator=" << (void *)(region_allocator) << ")...\n"; +#endif + if (region_allocator == nullptr) { return; 
} + region_allocator->destroy(user_context); + RegionAllocator::destroy(user_context, region_allocator); +} + +BlockAllocator::BlockEntry * +BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { + if (config.maximum_block_count && (block_count() >= config.maximum_block_count)) { + debug(user_context) << "BlockAllocator: No free blocks found! Maximum block count reached (" + << (int32_t)(config.maximum_block_count) << ")!\n"; + return nullptr; + } + + BlockEntry *block_entry = block_list.append(user_context); + if (block_entry == nullptr) { + debug(user_context) << "BlockAllocator: Failed to allocate new block entry!\n"; + return nullptr; + } + +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Creating block entry (" + << "block_entry=" << (void *)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << " " + << "allocator=" << (void *)(allocators.block.allocate) << ")...\n"; +#endif + + BlockResource *block = static_cast(block_entry->value); + block->memory.size = size; + block->memory.properties = properties; + block->memory.dedicated = dedicated; + block->reserved = 0; + block->allocator = create_region_allocator(user_context, block); + alloc_memory_block(user_context, block); + return block_entry; +} + +void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Releasing block entry (" + << "block_entry=" << (void *)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << ")...\n"; +#endif + BlockResource *block = static_cast(block_entry->value); + if (block->allocator) { + block->allocator->release(user_context); + } +} + +void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Destroying block entry (" + << "block_entry=" << (void 
*)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << " " + << "deallocator=" << (void *)(allocators.block.deallocate) << ")...\n"; +#endif + BlockResource *block = static_cast(block_entry->value); + if (block->allocator) { + destroy_region_allocator(user_context, block->allocator); + block->allocator = nullptr; + } + free_memory_block(user_context, block); + block_list.remove(user_context, block_entry); +} + +void BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; +#endif + halide_abort_if_false(user_context, allocators.block.allocate != nullptr); + MemoryBlock *memory_block = &(block->memory); + allocators.block.allocate(user_context, memory_block); + block->reserved = 0; +} + +void BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; +#endif + halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); + MemoryBlock *memory_block = &(block->memory); + allocators.block.deallocate(user_context, memory_block); + block->reserved = 0; + block->memory.size = 0; +} + +size_t BlockAllocator::constrain_requested_size(size_t size) const { + size_t actual_size = size; + if (config.minimum_block_size) { + actual_size = ((actual_size < config.minimum_block_size) ? + config.minimum_block_size : + actual_size); + } + if (config.maximum_block_size) { + actual_size = ((actual_size > config.maximum_block_size) ? 
+ config.maximum_block_size : + actual_size); + } + return actual_size; +} + +bool BlockAllocator::is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const { + if (properties.caching != MemoryCaching::DefaultCaching) { + if (properties.caching != block->memory.properties.caching) { + return false; + } + } + + if (properties.visibility != MemoryVisibility::DefaultVisibility) { + if (properties.visibility != block->memory.properties.visibility) { + return false; + } + } + + if (properties.usage != MemoryUsage::DefaultUsage) { + if (properties.usage != block->memory.properties.usage) { + return false; + } + } + + return true; +} + +const BlockAllocator::MemoryAllocators &BlockAllocator::current_allocators() const { + return allocators; +} + +const BlockAllocator::Config &BlockAllocator::current_config() const { + return config; +} + +const BlockAllocator::Config &BlockAllocator::default_config() const { + static Config result; + return result; +} + +size_t BlockAllocator::block_count() const { + return block_list.size(); +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_BLOCK_ALLOCATOR_H diff --git a/src/runtime/internal/block_storage.h b/src/runtime/internal/block_storage.h new file mode 100644 index 000000000000..648f10a84846 --- /dev/null +++ b/src/runtime/internal/block_storage.h @@ -0,0 +1,425 @@ +#ifndef HALIDE_RUNTIME_BLOCK_STORAGE_H +#define HALIDE_RUNTIME_BLOCK_STORAGE_H + +#include "memory_resources.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Dynamically resizable array for block storage (eg plain old data) +// -- No usage of constructors/destructors for value type +// -- Assumes all elements stored are uniformly the same fixed size +// -- Allocations are done in blocks of a fixed size +// -- Implementation uses memcpy/memmove for copying +// -- Customizable allocator ... 
default uses NativeSystemAllocator +class BlockStorage { +public: + static constexpr size_t default_capacity = 32; // smallish + + // Configurable parameters + struct Config { + uint32_t entry_size = 1; // bytes per entry + uint32_t block_size = 32; // bytes per each allocation block + uint32_t minimum_capacity = default_capacity; + }; + + BlockStorage(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &sma = default_allocator()); + BlockStorage(const BlockStorage &other); + ~BlockStorage(); + + void initialize(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &sma = default_allocator()); + + BlockStorage &operator=(const BlockStorage &other); + bool operator==(const BlockStorage &other) const; + bool operator!=(const BlockStorage &other) const; + + void reserve(void *user_context, size_t capacity, bool free_existing = false); + void resize(void *user_context, size_t entry_count, bool realloc = true); + + void assign(void *user_context, size_t index, const void *entry_ptr); + void insert(void *user_context, size_t index, const void *entry_ptr); + void prepend(void *user_context, const void *entry_ptr); + void append(void *user_context, const void *entry_ptr); + void remove(void *user_context, size_t index); + + void fill(void *user_context, const void *array, size_t array_size); + void insert(void *user_context, size_t index, const void *array, size_t array_size); + void replace(void *user_context, size_t index, const void *array, size_t array_size); + void prepend(void *user_context, const void *array, size_t array_size); + void append(void *user_context, const void *array, size_t array_size); + void remove(void *user_context, size_t index, size_t entry_count); + + void pop_front(void *user_context); + void pop_back(void *user_context); + void shrink_to_fit(void *user_context); + void clear(void *user_context); + void destroy(void *user_context); + + bool empty() const; + size_t stride() const; + size_t size() const; + + void 
*operator[](size_t index); ///< logical entry index (returns ptr = data() + (index * stride()) + const void *operator[](size_t index) const; + + void *data(); + void *front(); + void *back(); + + const void *data() const; + const void *front() const; + const void *back() const; + + const Config ¤t_config() const; + static const Config &default_config(); + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + void allocate(void *user_context, size_t capacity); + + void *ptr = nullptr; + size_t count = 0; + size_t capacity = 0; + Config config; + SystemMemoryAllocatorFns allocator; +}; + +BlockStorage::BlockStorage(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &sma) + : config(cfg), allocator(sma) { + halide_abort_if_false(user_context, config.entry_size != 0); + halide_abort_if_false(user_context, allocator.allocate != nullptr); + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + if (config.minimum_capacity) { + reserve(user_context, config.minimum_capacity); + } +} + +BlockStorage::BlockStorage(const BlockStorage &other) + : BlockStorage(nullptr, other.config, other.allocator) { + if (other.count) { + resize(nullptr, other.count); + memcpy(this->ptr, other.ptr, count * config.entry_size); + } +} + +BlockStorage::~BlockStorage() { + destroy(nullptr); +} + +void BlockStorage::destroy(void *user_context) { + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + if (ptr != nullptr) { + allocator.deallocate(user_context, ptr); + } + capacity = count = 0; + ptr = nullptr; +} + +void BlockStorage::initialize(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &sma) { + allocator = sma; + config = cfg; + capacity = count = 0; + ptr = nullptr; + if (config.minimum_capacity) { + reserve(user_context, config.minimum_capacity); + } +} + +BlockStorage &BlockStorage::operator=(const BlockStorage &other) { + if (&other != 
this) { + config = other.config; + resize(nullptr, other.count); + if (count != 0 && other.ptr != nullptr) { + memcpy(ptr, other.ptr, count * config.entry_size); + } + } + return *this; +} + +bool BlockStorage::operator==(const BlockStorage &other) const { + if (config.entry_size != other.config.entry_size) { return false; } + if (count != other.count) { return false; } + return memcmp(this->ptr, other.ptr, this->size() * config.entry_size) == 0; +} + +bool BlockStorage::operator!=(const BlockStorage &other) const { + return !(*this == other); +} + +void BlockStorage::fill(void *user_context, const void *array, size_t array_size) { + if (array_size != 0) { + resize(user_context, array_size); + memcpy(this->ptr, array, array_size * config.entry_size); + count = array_size; + } +} + +void BlockStorage::assign(void *user_context, size_t index, const void *entry_ptr) { + replace(user_context, index, entry_ptr, 1); +} + +void BlockStorage::prepend(void *user_context, const void *entry_ptr) { + insert(user_context, 0, entry_ptr, 1); +} + +void BlockStorage::append(void *user_context, const void *entry_ptr) { + append(user_context, entry_ptr, 1); +} + +void BlockStorage::pop_front(void *user_context) { + halide_debug_assert(user_context, count > 0); + remove(user_context, 0); +} + +void BlockStorage::pop_back(void *user_context) { + halide_debug_assert(user_context, count > 0); + resize(user_context, size() - 1); +} + +void BlockStorage::clear(void *user_context) { + resize(user_context, 0); +} + +void BlockStorage::reserve(void *user_context, size_t new_capacity, bool free_existing) { + new_capacity = max(new_capacity, count); + + if ((new_capacity < capacity) && !free_existing) { + new_capacity = capacity; + } + + allocate(user_context, new_capacity); +} + +void BlockStorage::resize(void *user_context, size_t entry_count, bool realloc) { + size_t current_size = capacity; + size_t requested_size = entry_count; + size_t minimum_size = config.minimum_capacity; + size_t 
actual_size = current_size; + count = requested_size; + + // increase capacity upto 1.5x existing (or at least min_capacity) + if (requested_size > current_size) { + actual_size = max(requested_size, max(current_size * 3 / 2, minimum_size)); + } else if (!realloc) { + return; + } + +#if DEBUG + debug(user_context) << "BlockStorage: Resize (" + << "requested_size=" << (int32_t)requested_size << " " + << "current_size=" << (int32_t)current_size << " " + << "minimum_size=" << (int32_t)minimum_size << " " + << "actual_size=" << (int32_t)actual_size << " " + << "entry_size=" << (int32_t)config.entry_size << " " + << "realloc=" << (realloc ? "true" : "false") << ")...\n"; +#endif + + allocate(user_context, actual_size); +} + +void BlockStorage::shrink_to_fit(void *user_context) { + if (capacity > count) { + void *new_ptr = nullptr; + if (count > 0) { + size_t actual_bytes = count * config.entry_size; + new_ptr = allocator.allocate(user_context, actual_bytes); + memcpy(new_ptr, ptr, actual_bytes); + } + allocator.deallocate(user_context, ptr); + capacity = count; + ptr = new_ptr; + } +} + +void BlockStorage::insert(void *user_context, size_t index, const void *entry_ptr) { + insert(user_context, index, entry_ptr, 1); +} + +void BlockStorage::remove(void *user_context, size_t index) { + remove(user_context, index, 1); +} + +void BlockStorage::remove(void *user_context, size_t index, size_t entry_count) { + halide_debug_assert(user_context, index < count); + const size_t last_index = size(); + if (index < (last_index - entry_count)) { + size_t dst_offset = index * config.entry_size; + size_t src_offset = (index + entry_count) * config.entry_size; + size_t bytes = (last_index - index - entry_count) * config.entry_size; + +#if DEBUG + debug(0) << "BlockStorage: Remove (" + << "index=" << (int32_t)index << " " + << "entry_count=" << (int32_t)entry_count << " " + << "entry_size=" << (int32_t)config.entry_size << " " + << "last_index=" << (int32_t)last_index << " " + << 
"src_offset=" << (int32_t)src_offset << " " + << "dst_offset=" << (int32_t)dst_offset << " " + << "bytes=" << (int32_t)bytes << ")...\n"; +#endif + void *dst_ptr = offset_address(ptr, dst_offset); + void *src_ptr = offset_address(ptr, src_offset); + memmove(dst_ptr, src_ptr, bytes); + } + resize(user_context, last_index - entry_count); +} + +void BlockStorage::replace(void *user_context, size_t index, const void *array, size_t array_size) { + halide_debug_assert(user_context, index < count); + size_t offset = index * config.entry_size; + size_t remaining = count - index; + +#if DEBUG + debug(0) << "BlockStorage: Replace (" + << "index=" << (int32_t)index << " " + << "array_size=" << (int32_t)array_size << " " + << "entry_size=" << (int32_t)config.entry_size << " " + << "offset=" << (int32_t)offset << " " + << "remaining=" << (int32_t)remaining << " " + << "capacity=" << (int32_t)capacity << ")...\n"; +#endif + + halide_debug_assert(user_context, remaining > 0); + size_t copy_count = min(remaining, array_size); + void *dst_ptr = offset_address(ptr, offset); + memcpy(dst_ptr, array, copy_count * config.entry_size); + count = max(count, index + copy_count); +} + +void BlockStorage::insert(void *user_context, size_t index, const void *array, size_t array_size) { + halide_debug_assert(user_context, index <= count); + const size_t last_index = size(); + resize(user_context, last_index + array_size); + if (index < last_index) { + size_t src_offset = index * config.entry_size; + size_t dst_offset = (index + array_size) * config.entry_size; + size_t bytes = (last_index - index) * config.entry_size; + void *src_ptr = offset_address(ptr, src_offset); + void *dst_ptr = offset_address(ptr, dst_offset); + memmove(dst_ptr, src_ptr, bytes); + } + replace(user_context, index, array, array_size); +} + +void BlockStorage::prepend(void *user_context, const void *array, size_t array_size) { + insert(user_context, 0, array, array_size); +} + +void BlockStorage::append(void 
*user_context, const void *array, size_t array_size) { + const size_t last_index = size(); + insert(user_context, last_index, array, array_size); +} + +bool BlockStorage::empty() const { + return count == 0; +} + +size_t BlockStorage::size() const { + return count; +} + +size_t BlockStorage::stride() const { + return config.entry_size; +} + +void *BlockStorage::operator[](size_t index) { + halide_debug_assert(nullptr, index < capacity); + return offset_address(ptr, index * config.entry_size); +} + +const void *BlockStorage::operator[](size_t index) const { + halide_debug_assert(nullptr, index < capacity); + return offset_address(ptr, index * config.entry_size); +} + +void *BlockStorage::data() { + return ptr; +} + +void *BlockStorage::front() { + halide_debug_assert(nullptr, count > 0); + return ptr; +} + +void *BlockStorage::back() { + halide_debug_assert(nullptr, count > 0); + size_t index = count - 1; + return offset_address(ptr, index * config.entry_size); +} + +const void *BlockStorage::data() const { + return ptr; +} + +const void *BlockStorage::front() const { + halide_debug_assert(nullptr, count > 0); + return ptr; +} + +const void *BlockStorage::back() const { + halide_debug_assert(nullptr, count > 0); + size_t index = count - 1; + return offset_address(ptr, index * config.entry_size); +} + +void BlockStorage::allocate(void *user_context, size_t new_capacity) { + if (new_capacity != capacity) { + halide_abort_if_false(user_context, allocator.allocate != nullptr); + size_t requested_bytes = new_capacity * config.entry_size; + size_t block_size = max(config.block_size, config.entry_size); + size_t block_count = (requested_bytes / block_size); + block_count += (requested_bytes % block_size) ? 
1 : 0; + size_t alloc_size = block_count * block_size; +#if DEBUG + debug(0) << "BlockStorage: Allocating (" + << "requested_bytes=" << (int32_t)requested_bytes << " " + << "block_size=" << (int32_t)block_size << " " + << "block_count=" << (int32_t)block_count << " " + << "alloc_size=" << (int32_t)alloc_size << ") ...\n"; +#endif + void *new_ptr = alloc_size ? allocator.allocate(user_context, alloc_size) : nullptr; + if (count != 0 && ptr != nullptr && new_ptr != nullptr) { + memcpy(new_ptr, ptr, count * config.entry_size); + } + if (ptr != nullptr) { + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + allocator.deallocate(user_context, ptr); + } + capacity = new_capacity; + ptr = new_ptr; + } +} + +const SystemMemoryAllocatorFns & +BlockStorage::current_allocator() const { + return this->allocator; +} + +const BlockStorage::Config & +BlockStorage::default_config() { + static Config default_cfg; + return default_cfg; +} + +const BlockStorage::Config & +BlockStorage::current_config() const { + return this->config; +} + +const SystemMemoryAllocatorFns & +BlockStorage::default_allocator() { + static SystemMemoryAllocatorFns native_allocator = { + native_system_malloc, native_system_free}; + return native_allocator; +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_BLOCK_STORAGE_H diff --git a/src/runtime/internal/linked_list.h b/src/runtime/internal/linked_list.h new file mode 100644 index 000000000000..dea22c13285e --- /dev/null +++ b/src/runtime/internal/linked_list.h @@ -0,0 +1,333 @@ +#ifndef HALIDE_RUNTIME_LINKED_LIST_H +#define HALIDE_RUNTIME_LINKED_LIST_H + +#include "memory_arena.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Doubly linked list container +// -- Implemented using MemoryArena for allocation +class LinkedList { +public: + // Disable copy support + LinkedList(const LinkedList &) = delete; + LinkedList &operator=(const LinkedList &) = 
delete; + + // Default initial capacity + static constexpr uint32_t default_capacity = uint32_t(32); // smallish + + // List entry + struct EntryType { + void *value = nullptr; + EntryType *prev_ptr = nullptr; + EntryType *next_ptr = nullptr; + }; + + LinkedList(void *user_context, uint32_t entry_size, uint32_t capacity = default_capacity, + const SystemMemoryAllocatorFns &allocator = default_allocator()); + ~LinkedList(); + + void initialize(void *user_context, uint32_t entry_size, uint32_t capacity = default_capacity, + const SystemMemoryAllocatorFns &allocator = default_allocator()); + + EntryType *front(); + EntryType *back(); + + const EntryType *front() const; + const EntryType *back() const; + + EntryType *prepend(void *user_context); + EntryType *prepend(void *user_context, const void *value); + + EntryType *append(void *user_context); + EntryType *append(void *user_context, const void *value); + + void pop_front(void *user_context); + void pop_back(void *user_context); + + EntryType *insert_before(void *user_context, EntryType *entry_ptr); + EntryType *insert_before(void *user_context, EntryType *entry_ptr, const void *value); + + EntryType *insert_after(void *user_context, EntryType *entry_ptr); + EntryType *insert_after(void *user_context, EntryType *entry_ptr, const void *value); + + void remove(void *user_context, EntryType *entry_ptr); + void clear(void *user_context); + void destroy(void *user_context); + + size_t size() const; + bool empty() const; + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + EntryType *reserve(void *user_context); + void reclaim(void *user_context, EntryType *entry_ptr); + + MemoryArena *link_arena = nullptr; + MemoryArena *data_arena = nullptr; + EntryType *front_ptr = nullptr; + EntryType *back_ptr = nullptr; + size_t entry_count = 0; +}; + +LinkedList::LinkedList(void *user_context, uint32_t entry_size, uint32_t capacity, + const 
SystemMemoryAllocatorFns &sma) { + uint32_t arena_capacity = max(capacity, MemoryArena::default_capacity); + link_arena = MemoryArena::create(user_context, {sizeof(EntryType), arena_capacity, 0}, sma); + data_arena = MemoryArena::create(user_context, {entry_size, arena_capacity, 0}, sma); + front_ptr = nullptr; + back_ptr = nullptr; + entry_count = 0; +} + +LinkedList::~LinkedList() { + destroy(nullptr); +} + +void LinkedList::initialize(void *user_context, uint32_t entry_size, uint32_t capacity, + const SystemMemoryAllocatorFns &sma) { + uint32_t arena_capacity = max(capacity, MemoryArena::default_capacity); + link_arena = MemoryArena::create(user_context, {sizeof(EntryType), arena_capacity, 0}, sma); + data_arena = MemoryArena::create(user_context, {entry_size, arena_capacity, 0}, sma); + front_ptr = nullptr; + back_ptr = nullptr; + entry_count = 0; +} + +void LinkedList::destroy(void *user_context) { + clear(nullptr); + if (link_arena) { MemoryArena::destroy(nullptr, link_arena); } + if (data_arena) { MemoryArena::destroy(nullptr, data_arena); } + link_arena = nullptr; + data_arena = nullptr; + front_ptr = nullptr; + back_ptr = nullptr; + entry_count = 0; +} + +typename LinkedList::EntryType *LinkedList::front() { + return front_ptr; +} + +typename LinkedList::EntryType *LinkedList::back() { + return back_ptr; +} + +const typename LinkedList::EntryType *LinkedList::front() const { + return front_ptr; +} + +const typename LinkedList::EntryType *LinkedList::back() const { + return back_ptr; +} + +typename LinkedList::EntryType * +LinkedList::prepend(void *user_context) { + EntryType *entry_ptr = reserve(user_context); + if (empty()) { + front_ptr = entry_ptr; + back_ptr = entry_ptr; + entry_count = 1; + } else { + entry_ptr->next_ptr = front_ptr; + front_ptr->prev_ptr = entry_ptr; + front_ptr = entry_ptr; + ++entry_count; + } + return entry_ptr; +} + +typename LinkedList::EntryType * +LinkedList::append(void *user_context) { + EntryType *entry_ptr = 
reserve(user_context); + if (empty()) { + front_ptr = entry_ptr; + back_ptr = entry_ptr; + entry_count = 1; + } else { + entry_ptr->prev_ptr = back_ptr; + back_ptr->next_ptr = entry_ptr; + back_ptr = entry_ptr; + ++entry_count; + } + return entry_ptr; +} + +typename LinkedList::EntryType * +LinkedList::prepend(void *user_context, const void *value) { + EntryType *entry_ptr = prepend(user_context); + memcpy(entry_ptr->value, value, data_arena->current_config().entry_size); + return entry_ptr; +} + +typename LinkedList::EntryType * +LinkedList::append(void *user_context, const void *value) { + EntryType *entry_ptr = append(user_context); + memcpy(entry_ptr->value, value, data_arena->current_config().entry_size); + return entry_ptr; +} + +void LinkedList::pop_front(void *user_context) { + halide_abort_if_false(user_context, (entry_count > 0)); + EntryType *remove_ptr = front_ptr; + EntryType *next_ptr = remove_ptr->next_ptr; + if (next_ptr != nullptr) { + next_ptr->prev_ptr = nullptr; + } + front_ptr = next_ptr; + reclaim(user_context, remove_ptr); + --entry_count; +} + +void LinkedList::pop_back(void *user_context) { + halide_abort_if_false(user_context, (entry_count > 0)); + EntryType *remove_ptr = back_ptr; + EntryType *prev_ptr = remove_ptr->prev_ptr; + if (prev_ptr != nullptr) { + prev_ptr->next_ptr = nullptr; + } + back_ptr = prev_ptr; + reclaim(user_context, remove_ptr); + --entry_count; +} + +void LinkedList::clear(void *user_context) { + if (empty() == false) { + EntryType *remove_ptr = back_ptr; + while (remove_ptr != nullptr) { + EntryType *prev_ptr = remove_ptr->prev_ptr; + reclaim(user_context, remove_ptr); + remove_ptr = prev_ptr; + } + front_ptr = nullptr; + back_ptr = nullptr; + entry_count = 0; + } +} + +void LinkedList::remove(void *user_context, EntryType *entry_ptr) { + halide_abort_if_false(user_context, (entry_ptr != nullptr)); + halide_abort_if_false(user_context, (entry_count > 0)); + + if (entry_ptr->prev_ptr != nullptr) { + 
entry_ptr->prev_ptr->next_ptr = entry_ptr->next_ptr; + } else { + halide_abort_if_false(user_context, (front_ptr == entry_ptr)); + front_ptr = entry_ptr->next_ptr; + } + + if (entry_ptr->next_ptr != nullptr) { + entry_ptr->next_ptr->prev_ptr = entry_ptr->prev_ptr; + } else { + halide_abort_if_false(user_context, (back_ptr == entry_ptr)); + back_ptr = entry_ptr->prev_ptr; + } + + reclaim(user_context, entry_ptr); + --entry_count; +} + +typename LinkedList::EntryType * +LinkedList::insert_before(void *user_context, EntryType *entry_ptr) { + if (entry_ptr != nullptr) { + EntryType *prev_ptr = entry_ptr->prev_ptr; + EntryType *new_ptr = reserve(user_context); + new_ptr->prev_ptr = prev_ptr; + new_ptr->next_ptr = entry_ptr; + entry_ptr->prev_ptr = new_ptr; + if (prev_ptr != nullptr) { + prev_ptr->next_ptr = new_ptr; + } else { + halide_abort_if_false(user_context, (front_ptr == entry_ptr)); + front_ptr = new_ptr; + } + ++entry_count; + return new_ptr; + } else { + return append(user_context); + } +} + +typename LinkedList::EntryType * +LinkedList::insert_after(void *user_context, EntryType *entry_ptr) { + if (entry_ptr != nullptr) { + EntryType *next_ptr = entry_ptr->next_ptr; + EntryType *new_ptr = reserve(user_context); + new_ptr->next_ptr = next_ptr; + new_ptr->prev_ptr = entry_ptr; + entry_ptr->next_ptr = new_ptr; + if (next_ptr != nullptr) { + next_ptr->prev_ptr = new_ptr; + } else { + halide_abort_if_false(user_context, (back_ptr == entry_ptr)); + back_ptr = new_ptr; + } + ++entry_count; + return new_ptr; + } else { + return prepend(user_context); + } +} + +typename LinkedList::EntryType * +LinkedList::insert_before(void *user_context, EntryType *entry_ptr, const void *value) { + EntryType *new_ptr = insert_before(user_context, entry_ptr); + memcpy(new_ptr->value, value, data_arena->current_config().entry_size); + return new_ptr; +} + +typename LinkedList::EntryType * +LinkedList::insert_after(void *user_context, EntryType *entry_ptr, const void *value) { + 
EntryType *new_ptr = insert_after(user_context, entry_ptr); + memcpy(new_ptr->value, value, data_arena->current_config().entry_size); + return new_ptr; +} + +size_t LinkedList::size() const { + return entry_count; +} + +bool LinkedList::empty() const { + return entry_count == 0; +} + +const SystemMemoryAllocatorFns & +LinkedList::current_allocator() const { + return link_arena->current_allocator(); +} + +const SystemMemoryAllocatorFns & +LinkedList::default_allocator() { + return MemoryArena::default_allocator(); +} + +typename LinkedList::EntryType * +LinkedList::reserve(void *user_context) { + EntryType *entry_ptr = static_cast( + link_arena->reserve(user_context, true)); + entry_ptr->value = data_arena->reserve(user_context, true); + entry_ptr->next_ptr = nullptr; + entry_ptr->prev_ptr = nullptr; + return entry_ptr; +} + +void LinkedList::reclaim(void *user_context, EntryType *entry_ptr) { + void *value_ptr = entry_ptr->value; + entry_ptr->value = nullptr; + entry_ptr->next_ptr = nullptr; + entry_ptr->prev_ptr = nullptr; + data_arena->reclaim(user_context, value_ptr); + link_arena->reclaim(user_context, entry_ptr); +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_LINKED_LIST_H diff --git a/src/runtime/internal/memory_arena.h b/src/runtime/internal/memory_arena.h new file mode 100644 index 000000000000..27c3d871dccf --- /dev/null +++ b/src/runtime/internal/memory_arena.h @@ -0,0 +1,310 @@ +#ifndef HALIDE_RUNTIME_MEMORY_ARENA_H +#define HALIDE_RUNTIME_MEMORY_ARENA_H + +#include "block_storage.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// -- +// Memory Arena class for region based allocations and caching of same-type data +// -- Implementation uses block_storage, and internally manages lists of allocated entries +// -- Customizable allocator (defaults to BlockStorage::default_allocator()) +// -- Not thread safe ... 
locking must be done by client +// +class MemoryArena { +public: + // Disable copy constructors and assignment + MemoryArena(const MemoryArena &) = delete; + MemoryArena &operator=(const MemoryArena &) = delete; + + // Default initial capacity + static constexpr uint32_t default_capacity = uint32_t(32); // smallish + + // Configurable parameters + struct Config { + uint32_t entry_size = 1; + uint32_t minimum_block_capacity = default_capacity; + uint32_t maximum_block_count = 0; + }; + + MemoryArena(void *user_context, const Config &config = default_config(), + const SystemMemoryAllocatorFns &allocator = default_allocator()); + + ~MemoryArena(); + + // Factory methods for creation / destruction + static MemoryArena *create(void *user_context, const Config &config, const SystemMemoryAllocatorFns &allocator = default_allocator()); + static void destroy(void *user_context, MemoryArena *arena); + + // Initialize a newly created instance + void initialize(void *user_context, const Config &config, + const SystemMemoryAllocatorFns &allocator = default_allocator()); + + // Public interface methods + void *reserve(void *user_context, bool initialize = false); + void reclaim(void *user_context, void *ptr); + bool collect(void *user_context); //< returns true if any blocks were removed + void destroy(void *user_context); + + // Access methods + const Config ¤t_config() const; + static const Config &default_config(); + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + // Sentinal invalid entry value + static const uint32_t invalid_entry = uint32_t(-1); + + // Each block contains: + // - an array of entries + // - an array of indices (for the free list) + // - an array of status flags (indicating usage) + // - free index points to next available entry for the block (or invalid_entry if block is full) + struct Block { + void *entries = nullptr; + uint32_t *indices = nullptr; + AllocationStatus 
*status = nullptr; + uint32_t capacity = 0; + uint32_t free_index = 0; + }; + + Block *create_block(void *user_context); + bool collect_block(void *user_context, Block *block); //< returns true if any blocks were removed + void destroy_block(void *user_context, Block *block); + Block *lookup_block(void *user_context, uint32_t index); + + void *create_entry(void *user_context, Block *block, uint32_t index); + void destroy_entry(void *user_context, Block *block, uint32_t index); + void *lookup_entry(void *user_context, Block *block, uint32_t index); + + Config config; + BlockStorage blocks; +}; + +MemoryArena::MemoryArena(void *user_context, + const Config &cfg, + const SystemMemoryAllocatorFns &alloc) + : config(cfg), + blocks(user_context, {sizeof(MemoryArena::Block), 32, 32}, alloc) { + halide_debug_assert(user_context, config.minimum_block_capacity > 1); +} + +MemoryArena::~MemoryArena() { + destroy(nullptr); +} + +MemoryArena *MemoryArena::create(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &system_allocator) { + halide_abort_if_false(user_context, system_allocator.allocate != nullptr); + MemoryArena *result = reinterpret_cast( + system_allocator.allocate(user_context, sizeof(MemoryArena))); + + if (result == nullptr) { + halide_error(user_context, "MemoryArena: Failed to create instance! 
Out of memory!\n"); + return nullptr; + } + + result->initialize(user_context, cfg, system_allocator); + return result; +} + +void MemoryArena::destroy(void *user_context, MemoryArena *instance) { + halide_abort_if_false(user_context, instance != nullptr); + const SystemMemoryAllocatorFns &system_allocator = instance->blocks.current_allocator(); + instance->destroy(user_context); + halide_abort_if_false(user_context, system_allocator.deallocate != nullptr); + system_allocator.deallocate(user_context, instance); +} + +void MemoryArena::initialize(void *user_context, + const Config &cfg, + const SystemMemoryAllocatorFns &system_allocator) { + config = cfg; + blocks.initialize(user_context, {sizeof(MemoryArena::Block), 32, 32}, system_allocator); + halide_debug_assert(user_context, config.minimum_block_capacity > 1); +} + +void MemoryArena::destroy(void *user_context) { + for (size_t i = blocks.size(); i--;) { + Block *block = lookup_block(user_context, i); + halide_abort_if_false(user_context, block != nullptr); + destroy_block(user_context, block); + } + blocks.destroy(user_context); +} + +bool MemoryArena::collect(void *user_context) { + bool result = false; + for (size_t i = blocks.size(); i--;) { + Block *block = lookup_block(user_context, i); + halide_abort_if_false(user_context, block != nullptr); + if (collect_block(user_context, block)) { + blocks.remove(user_context, i); + result = true; + } + } + return result; +} + +void *MemoryArena::reserve(void *user_context, bool initialize) { + // Scan blocks for a free entry + for (size_t i = blocks.size(); i--;) { + Block *block = lookup_block(user_context, i); + halide_abort_if_false(user_context, block != nullptr); + if (block->free_index != invalid_entry) { + return create_entry(user_context, block, block->free_index); + } + } + + if (config.maximum_block_count && (blocks.size() >= config.maximum_block_count)) { + halide_error(user_context, "MemoryArena: Failed to reserve new entry! 
Maxmimum blocks reached!\n"); + return nullptr; + } + + // All blocks full ... create a new one + uint32_t index = 0; + Block *block = create_block(user_context); + void *entry_ptr = create_entry(user_context, block, index); + + // Optionally clear the allocation if requested + if (initialize) { + memset(entry_ptr, 0, config.entry_size); + } + return entry_ptr; +} + +void MemoryArena::reclaim(void *user_context, void *entry_ptr) { + for (size_t i = blocks.size(); i--;) { + Block *block = lookup_block(user_context, i); + halide_abort_if_false(user_context, block != nullptr); + + // is entry_ptr in the address range of this block. + uint8_t *offset_ptr = static_cast(entry_ptr); + uint8_t *base_ptr = static_cast(block->entries); + uint8_t *end_ptr = static_cast(offset_address(block->entries, block->capacity * config.entry_size)); + if ((entry_ptr >= base_ptr) && (entry_ptr < end_ptr)) { + const uint32_t offset = static_cast(offset_ptr - base_ptr); + const uint32_t index = offset / config.entry_size; + destroy_entry(user_context, block, index); + return; + } + } + halide_error(user_context, "MemoryArena: Pointer address doesn't belong to this memory pool!\n"); +} + +typename MemoryArena::Block *MemoryArena::create_block(void *user_context) { + // resize capacity starting with initial up to 1.5 last capacity + uint32_t new_capacity = config.minimum_block_capacity; + if (!blocks.empty()) { + const Block *last_block = static_cast(blocks.back()); + new_capacity = (last_block->capacity * 3 / 2); + } + + halide_abort_if_false(user_context, current_allocator().allocate != nullptr); + void *new_entries = current_allocator().allocate(user_context, config.entry_size * new_capacity); + memset(new_entries, 0, config.entry_size * new_capacity); + + uint32_t *new_indices = (uint32_t *)current_allocator().allocate(user_context, sizeof(uint32_t) * new_capacity); + AllocationStatus *new_status = (AllocationStatus *)current_allocator().allocate(user_context, sizeof(AllocationStatus) * 
new_capacity); + + for (uint32_t i = 0; i < new_capacity - 1; ++i) { + new_indices[i] = i + 1; // singly-linked list of all free entries in the block + new_status[i] = AllocationStatus::Available; // usage status + } + + new_indices[new_capacity - 1] = invalid_entry; + new_status[new_capacity - 1] = AllocationStatus::InvalidStatus; + + const Block new_block = {new_entries, new_indices, new_status, new_capacity, 0}; + blocks.append(user_context, &new_block); + return static_cast(blocks.back()); +} + +void MemoryArena::destroy_block(void *user_context, Block *block) { + halide_abort_if_false(user_context, block != nullptr); + if (block->entries != nullptr) { + halide_abort_if_false(user_context, current_allocator().deallocate != nullptr); + current_allocator().deallocate(user_context, block->entries); + current_allocator().deallocate(user_context, block->indices); + current_allocator().deallocate(user_context, block->status); + block->entries = nullptr; + block->indices = nullptr; + block->status = nullptr; + } +} + +bool MemoryArena::collect_block(void *user_context, Block *block) { + halide_abort_if_false(user_context, block != nullptr); + if (block->entries != nullptr) { + bool can_collect = true; + for (size_t i = block->capacity; i--;) { + if (block->status[i] == AllocationStatus::InUse) { + can_collect = false; + break; + } + } + if (can_collect) { + destroy_block(user_context, block); + return true; + } + } + return false; +} + +MemoryArena::Block *MemoryArena::lookup_block(void *user_context, uint32_t index) { + return static_cast(blocks[index]); +} + +void *MemoryArena::lookup_entry(void *user_context, Block *block, uint32_t index) { + halide_abort_if_false(user_context, block != nullptr); + halide_abort_if_false(user_context, block->entries != nullptr); + return offset_address(block->entries, index * config.entry_size); +} + +void *MemoryArena::create_entry(void *user_context, Block *block, uint32_t index) { + void *entry_ptr = lookup_entry(user_context, 
block, index); + block->free_index = block->indices[index]; + block->status[index] = AllocationStatus::InUse; +#if DEBUG_RUNTIME + memset(entry_ptr, 0, config.entry_size); +#endif + return entry_ptr; +} + +void MemoryArena::destroy_entry(void *user_context, Block *block, uint32_t index) { + block->status[index] = AllocationStatus::Available; + block->indices[index] = block->free_index; + block->free_index = index; +} + +const typename MemoryArena::Config & +MemoryArena::current_config() const { + return config; +} + +const typename MemoryArena::Config & +MemoryArena::default_config() { + static Config result; + return result; +} + +const SystemMemoryAllocatorFns & +MemoryArena::current_allocator() const { + return blocks.current_allocator(); +} + +const SystemMemoryAllocatorFns & +MemoryArena::default_allocator() { + return BlockStorage::default_allocator(); +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_MEMORY_ARENA_H diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h new file mode 100644 index 000000000000..513892922530 --- /dev/null +++ b/src/runtime/internal/memory_resources.h @@ -0,0 +1,280 @@ +#ifndef HALIDE_RUNTIME_MEMORY_RESOURCES_H +#define HALIDE_RUNTIME_MEMORY_RESOURCES_H + +namespace Halide { +namespace Runtime { +namespace Internal { + +// -- + +// Hint for allocation usage indicating whether or not the resource +// is in use, available, or dedicated (and can't be split or shared) +enum class AllocationStatus { + InvalidStatus, + InUse, + Available, + Dedicated +}; + +// Hint for allocation requests indicating intended usage +// required between host and device address space mappings +enum class MemoryVisibility { + InvalidVisibility, //< invalid enum value + HostOnly, //< host local + DeviceOnly, //< device local + DeviceToHost, //< transfer from device to host + HostToDevice, //< transfer from host to device + DefaultVisibility, //< default 
visibility (use any valid visibility -- unable to determine prior to usage) +}; + +// Hint for allocation requests indicating intended update +// frequency for modifying the contents of the allocation +enum class MemoryUsage { + InvalidUsage, //< invalid enum value + StaticStorage, //< intended for static storage, whereby the contents will be set once and remain unchanged + DynamicStorage, //< intended for dyanmic storage, whereby the contents will be set frequently and change constantly + UniformStorage, //< intended for fast & small fixed read-only uniform storage (intended for passing shader parameters), whereby the contents will be set once and remain unchanged + TransferSrc, //< intended for staging storage updates, whereby the contents will be used as the source of a transfer + TransferDst, //< intended for staging storage updates, whereby the contents will be used as the destination of a transfer + TransferSrcDst, //< intended for staging storage updates, whereby the contents will be used either as a source or destination of a transfer + DefaultUsage //< default usage (use any valid usage -- unable to determine prior to usage) +}; + +// Hint for allocation requests indicating ideal caching support (if available) +enum class MemoryCaching { + InvalidCaching, //< invalid enum value + Cached, //< cached + Uncached, //< uncached + CachedCoherent, //< cached and coherent + UncachedCoherent, //< uncached but still coherent + DefaultCaching //< default caching (use any valid caching behaviour -- unable to determine prior to usage) +}; + +struct MemoryProperties { + MemoryVisibility visibility = MemoryVisibility::InvalidVisibility; + MemoryUsage usage = MemoryUsage::InvalidUsage; + MemoryCaching caching = MemoryCaching::InvalidCaching; +}; + +// Client-facing struct for exchanging memory block allocation requests +struct MemoryBlock { + void *handle = nullptr; //< client data storing native handle (managed by alloc_block_region/free_block_region) + size_t size = 0; 
//< allocated size (in bytes) + bool dedicated = false; //< flag indicating whether allocation is one dedicated resource (or split/shared into other resources) + MemoryProperties properties; //< properties for the allocated block +}; + +// Client-facing struct for exchanging memory region allocation requests +struct MemoryRegion { + void *handle = nullptr; //< client data storing native handle (managed by alloc_block_region/free_block_region) + size_t offset = 0; //< offset from base address in block (in bytes) + size_t size = 0; //< allocated size (in bytes) + bool dedicated = false; //< flag indicating whether allocation is one dedicated resource (or split/shared into other resources) + MemoryProperties properties; //< properties for the allocated region +}; + +// Client-facing struct for issuing memory allocation requests +struct MemoryRequest { + size_t offset = 0; //< offset from base address in block (in bytes) + size_t size = 0; //< allocated size (in bytes) + size_t alignment = 0; //< alignment constraint for address + bool dedicated = false; //< flag indicating whether allocation is one dedicated resource (or split/shared into other resources) + MemoryProperties properties; //< properties for the allocated region +}; + +class RegionAllocator; +struct BlockRegion; + +// Internal struct for block resource state +// -- Note: first field must MemoryBlock +struct BlockResource { + MemoryBlock memory; //< memory info for the allocated block + RegionAllocator *allocator = nullptr; //< designated allocator for the block + BlockRegion *regions = nullptr; //< head of linked list of memory regions + size_t reserved = 0; //< number of bytes already reserved to regions +}; + +// Internal struct for block region state +// -- Note: first field must MemoryRegion +struct BlockRegion { + MemoryRegion memory; //< memory info for the allocated region + AllocationStatus status = AllocationStatus::InvalidStatus; //< allocation status indicator + BlockRegion *next_ptr = nullptr; 
//< pointer to next block region in linked list + BlockRegion *prev_ptr = nullptr; //< pointer to prev block region in linked list + BlockResource *block_ptr = nullptr; //< pointer to parent block resource +}; + +// Returns an aligned byte offset to adjust the given offset based on alignment constraints +// -- Alignment must be power of two! +ALWAYS_INLINE size_t aligned_offset(size_t offset, size_t alignment) { + return (offset + (alignment - 1)) & ~(alignment - 1); +} + +// Returns a padded size to accomodate an adjusted offset due to alignment constraints +// -- Alignment must be power of two! +ALWAYS_INLINE size_t aligned_size(size_t offset, size_t size, size_t alignment) { + size_t actual_offset = aligned_offset(offset, alignment); + size_t padding = actual_offset - offset; + size_t actual_size = padding + size; + return actual_size; +} + +// Clamps the given value to be within the [min_value, max_value] range +ALWAYS_INLINE size_t clamped_size(size_t value, size_t min_value, size_t max_value) { + size_t result = (value < min_value) ? min_value : value; + return (result > max_value) ? 
max_value : result; +} + +// Offset the untyped pointer by the given number of bytes +ALWAYS_INLINE const void *offset_address(const void *address, size_t byte_offset) { + const uintptr_t base = reinterpret_cast(address); + return reinterpret_cast(base + byte_offset); +} + +// Offset the untyped pointer by the given number of bytes +ALWAYS_INLINE void *offset_address(void *address, size_t byte_offset) { + const uintptr_t base = reinterpret_cast(address); + return reinterpret_cast(base + byte_offset); +} + +// -- + +typedef void *(*AllocateSystemFn)(void *, size_t); +typedef void (*DeallocateSystemFn)(void *, void *); + +ALWAYS_INLINE void *native_system_malloc(void *user_context, size_t bytes) { + return malloc(bytes); +} + +ALWAYS_INLINE void native_system_free(void *user_context, void *ptr) { + free(ptr); +} + +struct SystemMemoryAllocatorFns { + AllocateSystemFn allocate = nullptr; + DeallocateSystemFn deallocate = nullptr; +}; + +struct HalideSystemAllocatorFns { + AllocateSystemFn allocate = halide_malloc; + DeallocateSystemFn deallocate = halide_free; +}; + +typedef void (*AllocateBlockFn)(void *, MemoryBlock *); +typedef void (*DeallocateBlockFn)(void *, MemoryBlock *); + +struct MemoryBlockAllocatorFns { + AllocateBlockFn allocate = nullptr; + DeallocateBlockFn deallocate = nullptr; +}; + +typedef void (*AllocateRegionFn)(void *, MemoryRegion *); +typedef void (*DeallocateRegionFn)(void *, MemoryRegion *); + +struct MemoryRegionAllocatorFns { + AllocateRegionFn allocate = nullptr; + DeallocateRegionFn deallocate = nullptr; +}; + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +// -- + +extern "C" { + +WEAK const char *halide_memory_visibility_name(MemoryVisibility value) { + switch (value) { + case MemoryVisibility::InvalidVisibility: { + return "InvalidVisibility"; + } + case MemoryVisibility::DefaultVisibility: { + return "DefaultVisibility"; + } + case MemoryVisibility::HostOnly: { + return "HostOnly"; + } + case 
MemoryVisibility::DeviceOnly: { + return "DeviceOnly"; + } + case MemoryVisibility::HostToDevice: { + return "HostToDevice"; + } + case MemoryVisibility::DeviceToHost: { + return "DeviceToHost"; + } + default: { + return ""; + } + }; + return ""; +} + +WEAK const char *halide_memory_usage_name(MemoryUsage value) { + switch (value) { + case MemoryUsage::InvalidUsage: { + return "InvalidUsage"; + } + case MemoryUsage::DefaultUsage: { + return "DefaultUsage"; + } + case MemoryUsage::StaticStorage: { + return "StaticStorage"; + } + case MemoryUsage::DynamicStorage: { + return "DynamicStorage"; + } + case MemoryUsage::UniformStorage: { + return "UniformStorage"; + } + case MemoryUsage::TransferSrc: { + return "TransferSrc"; + } + case MemoryUsage::TransferDst: { + return "TransferDst"; + } + case MemoryUsage::TransferSrcDst: { + return "TransferSrcDst"; + } + default: { + return ""; + } + }; + return ""; +} + +WEAK const char *halide_memory_caching_name(MemoryCaching value) { + switch (value) { + case MemoryCaching::InvalidCaching: { + return "InvalidCaching"; + } + case MemoryCaching::DefaultCaching: { + return "DefaultCaching"; + } + case MemoryCaching::Cached: { + return "Cached"; + } + case MemoryCaching::Uncached: { + return "Uncached"; + } + case MemoryCaching::CachedCoherent: { + return "CachedCoherent"; + } + case MemoryCaching::UncachedCoherent: { + return "UncachedCoherent"; + } + default: { + return ""; + } + }; + return ""; +} + +} // extern "C" + +// -- + +#endif // HALIDE_RUNTIME_MEMORY_RESOURCES_H diff --git a/src/runtime/internal/pointer_table.h b/src/runtime/internal/pointer_table.h new file mode 100644 index 000000000000..b5ff3bfd6f7c --- /dev/null +++ b/src/runtime/internal/pointer_table.h @@ -0,0 +1,366 @@ +#ifndef HALIDE_RUNTIME_POINTER_TABLE_H +#define HALIDE_RUNTIME_POINTER_TABLE_H + +#include "memory_resources.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Dynamically resizable array for storing untyped pointers +// -- 
Implementation uses memcpy/memmove for copying +// -- Customizable allocator ... default uses NativeSystemAllocator +class PointerTable { +public: + static constexpr size_t default_capacity = 32; // smallish + + PointerTable(void *user_context, size_t initial_capacity = 0, const SystemMemoryAllocatorFns &sma = default_allocator()); + PointerTable(const PointerTable &other); + ~PointerTable(); + + void initialize(void *user_context, size_t initial_capacity = 0, const SystemMemoryAllocatorFns &sma = default_allocator()); + + PointerTable &operator=(const PointerTable &other); + bool operator==(const PointerTable &other) const; + bool operator!=(const PointerTable &other) const; + + void reserve(void *user_context, size_t capacity, bool free_existing = false); + void resize(void *user_context, size_t entry_count, bool realloc = true); + + void assign(void *user_context, size_t index, const void *entry_ptr); + void insert(void *user_context, size_t index, const void *entry_ptr); + void prepend(void *user_context, const void *entry_ptr); + void append(void *user_context, const void *entry_ptr); + void remove(void *user_context, size_t index); + + void fill(void *user_context, const void **array, size_t array_size); + void insert(void *user_context, size_t index, const void **array, size_t array_size); + void replace(void *user_context, size_t index, const void **array, size_t array_size); + void prepend(void *user_context, const void **array, size_t array_size); + void append(void *user_context, const void **array, size_t array_size); + void remove(void *user_context, size_t index, size_t entry_count); + + void pop_front(void *user_context); + void pop_back(void *user_context); + void shrink_to_fit(void *user_context); + void clear(void *user_context); + void destroy(void *user_context); + + bool empty() const; + size_t size() const; + + void *operator[](size_t index); + void *operator[](size_t index) const; + + void **data(); + const void **data() const; + + void 
*front(); + void *back(); + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + void allocate(void *user_context, size_t capacity); + + void **ptr = nullptr; + size_t count = 0; + size_t capacity = 0; + SystemMemoryAllocatorFns allocator; +}; + +PointerTable::PointerTable(void *user_context, size_t initial_capacity, const SystemMemoryAllocatorFns &sma) + : allocator(sma) { + halide_abort_if_false(user_context, allocator.allocate != nullptr); + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + if (initial_capacity) { reserve(user_context, initial_capacity); } +} + +PointerTable::PointerTable(const PointerTable &other) + : PointerTable(nullptr, 0, other.allocator) { + if (other.capacity) { + ptr = static_cast(allocator.allocate(nullptr, other.capacity * sizeof(void *))); + capacity = other.capacity; + } + if (ptr && other.count != 0) { + count = other.count; + memcpy(this->ptr, other.ptr, count * sizeof(void *)); + } +} + +PointerTable::~PointerTable() { + destroy(nullptr); +} + +void PointerTable::destroy(void *user_context) { + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + if (ptr != nullptr) { + allocator.deallocate(user_context, ptr); + } + capacity = count = 0; + ptr = nullptr; +} + +void PointerTable::initialize(void *user_context, size_t initial_capacity, const SystemMemoryAllocatorFns &sma) { + allocator = sma; + capacity = count = 0; + ptr = nullptr; + if (initial_capacity) { + reserve(user_context, initial_capacity); + } +} + +PointerTable &PointerTable::operator=(const PointerTable &other) { + if (&other != this) { + resize(nullptr, other.count); + if (count != 0 && other.ptr != nullptr) { + memcpy(ptr, other.ptr, count * sizeof(void *)); + } + } + return *this; +} + +bool PointerTable::operator==(const PointerTable &other) const { + if (count != other.count) { return false; } + return memcmp(this->ptr, other.ptr, this->size() * 
sizeof(void *)) == 0; +} + +bool PointerTable::operator!=(const PointerTable &other) const { + return !(*this == other); +} + +void PointerTable::fill(void *user_context, const void **array, size_t array_size) { + if (array_size != 0) { + resize(user_context, array_size); + memcpy(this->ptr, array, array_size * sizeof(void *)); + count = array_size; + } +} + +void PointerTable::assign(void *user_context, size_t index, const void *entry_ptr) { + halide_debug_assert(user_context, index < count); + ptr[index] = const_cast(entry_ptr); +} + +void PointerTable::prepend(void *user_context, const void *entry_ptr) { + insert(user_context, 0, &entry_ptr, 1); +} + +void PointerTable::append(void *user_context, const void *entry_ptr) { + append(user_context, &entry_ptr, 1); +} + +void PointerTable::pop_front(void *user_context) { + halide_debug_assert(user_context, count > 0); + remove(user_context, 0); +} + +void PointerTable::pop_back(void *user_context) { + halide_debug_assert(user_context, count > 0); + resize(user_context, size() - 1); +} + +void PointerTable::clear(void *user_context) { + resize(user_context, 0); +} + +void PointerTable::reserve(void *user_context, size_t new_capacity, bool free_existing) { + new_capacity = max(new_capacity, count); + if ((new_capacity < capacity) && !free_existing) { + new_capacity = capacity; + } + allocate(user_context, new_capacity); +} + +void PointerTable::resize(void *user_context, size_t entry_count, bool realloc) { + size_t current_size = capacity; + size_t requested_size = entry_count; + size_t minimum_size = default_capacity; + size_t actual_size = current_size; + count = requested_size; + +#ifdef DEBUG_RUNTIME + debug(user_context) << "PointerTable: Resize (" + << "requested_size=" << (int32_t)requested_size << " " + << "current_size=" << (int32_t)current_size << " " + << "minimum_size=" << (int32_t)minimum_size << " " + << "sizeof(void*)=" << (int32_t)sizeof(void *) << " " + << "realloc=" << (realloc ? 
"true" : "false") << ")...\n"; +#endif + + // increase capacity upto 1.5x existing (or at least min_capacity) + if (requested_size > current_size) { + actual_size = max(requested_size, max(current_size * 3 / 2, minimum_size)); + } else if (!realloc) { + return; + } + + allocate(user_context, actual_size); +} + +void PointerTable::shrink_to_fit(void *user_context) { + if (capacity > count) { + void *new_ptr = nullptr; + if (count > 0) { + size_t bytes = count * sizeof(void *); + new_ptr = allocator.allocate(user_context, bytes); + memcpy(new_ptr, ptr, bytes); + } + allocator.deallocate(user_context, ptr); + capacity = count; + ptr = static_cast(new_ptr); + } +} + +void PointerTable::insert(void *user_context, size_t index, const void *entry_ptr) { + const void *addr = reinterpret_cast(entry_ptr); + insert(user_context, index, &addr, 1); +} + +void PointerTable::remove(void *user_context, size_t index) { + remove(user_context, index, 1); +} + +void PointerTable::remove(void *user_context, size_t index, size_t entry_count) { + halide_debug_assert(user_context, index < count); + const size_t last_index = size(); + if (index < (last_index - entry_count)) { + size_t dst_offset = index * sizeof(void *); + size_t src_offset = (index + entry_count) * sizeof(void *); + size_t bytes = (last_index - index - entry_count) * sizeof(void *); + +#ifdef DEBUG_RUNTIME + debug(user_context) << "PointerTable: Remove (" + << "index=" << (int32_t)index << " " + << "entry_count=" << (int32_t)entry_count << " " + << "last_index=" << (int32_t)last_index << " " + << "src_offset=" << (int32_t)src_offset << " " + << "dst_offset=" << (int32_t)dst_offset << " " + << "bytes=" << (int32_t)bytes << ")...\n"; +#endif + memmove(ptr + dst_offset, ptr + src_offset, bytes); + } + resize(user_context, last_index - entry_count); +} + +void PointerTable::replace(void *user_context, size_t index, const void **array, size_t array_size) { + halide_debug_assert(user_context, index < count); + size_t remaining 
= count - index; + size_t copy_count = min(remaining, array_size); + +#ifdef DEBUG_RUNTIME + + debug(user_context) << "PointerTable: Replace (" + << "index=" << (int32_t)index << " " + << "array_size=" << (int32_t)array_size << " " + << "remaining=" << (int32_t)remaining << " " + << "copy_count=" << (int32_t)copy_count << " " + << "capacity=" << (int32_t)capacity << ")...\n"; +#endif + + halide_debug_assert(user_context, remaining > 0); + memcpy(ptr + index, array, copy_count * sizeof(void *)); + count = max(count, index + copy_count); +} + +void PointerTable::insert(void *user_context, size_t index, const void **array, size_t array_size) { + halide_debug_assert(user_context, index <= count); + const size_t last_index = size(); + resize(user_context, last_index + array_size); + if (index < last_index) { + size_t src_offset = index * sizeof(void *); + size_t dst_offset = (index + array_size) * sizeof(void *); + size_t bytes = (last_index - index) * sizeof(void *); + memmove(ptr + dst_offset, ptr + src_offset, bytes); + } + replace(user_context, index, array, array_size); +} + +void PointerTable::prepend(void *user_context, const void **array, size_t array_size) { + insert(user_context, 0, array, array_size); +} + +void PointerTable::append(void *user_context, const void **array, size_t array_size) { + const size_t last_index = size(); + insert(user_context, last_index, array, array_size); +} + +bool PointerTable::empty() const { + return count == 0; +} + +size_t PointerTable::size() const { + return count; +} + +void *PointerTable::operator[](size_t index) { + halide_debug_assert(nullptr, index < capacity); + return ptr[index]; +} + +void *PointerTable::operator[](size_t index) const { + halide_debug_assert(nullptr, index < capacity); + return ptr[index]; +} + +void **PointerTable::data() { + return ptr; +} + +void *PointerTable::front() { + halide_debug_assert(nullptr, count > 0); + return ptr[0]; +} + +void *PointerTable::back() { + halide_debug_assert(nullptr, 
count > 0); + size_t index = count - 1; + return ptr[index]; +} + +const void **PointerTable::data() const { + return const_cast(ptr); +} + +void PointerTable::allocate(void *user_context, size_t new_capacity) { + if (new_capacity != capacity) { + halide_abort_if_false(user_context, allocator.allocate != nullptr); + size_t bytes = new_capacity * sizeof(void *); + +#ifdef DEBUG_RUNTIME + debug(user_context) << "PointerTable: Allocating (bytes=" << (int32_t)bytes << " allocator=" << (void *)allocator.allocate << ")...\n"; +#endif + + void *new_ptr = bytes ? allocator.allocate(user_context, bytes) : nullptr; + if (count != 0 && ptr != nullptr && new_ptr != nullptr) { + memcpy(new_ptr, ptr, count * sizeof(void *)); + } + if (ptr != nullptr) { + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + allocator.deallocate(user_context, ptr); + } + capacity = new_capacity; + ptr = static_cast(new_ptr); + } +} + +const SystemMemoryAllocatorFns & +PointerTable::current_allocator() const { + return this->allocator; +} + +const SystemMemoryAllocatorFns & +PointerTable::default_allocator() { + static SystemMemoryAllocatorFns native_allocator = { + native_system_malloc, native_system_free}; + return native_allocator; +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_POINTER_TABLE_H diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h new file mode 100644 index 000000000000..8c7f8602abe7 --- /dev/null +++ b/src/runtime/internal/region_allocator.h @@ -0,0 +1,462 @@ +#ifndef HALIDE_RUNTIME_REGION_ALLOCATOR_H +#define HALIDE_RUNTIME_REGION_ALLOCATOR_H + +#include "memory_arena.h" +#include "memory_resources.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// -- + +/** Allocator class interface for sub-allocating a contiguous + * memory block into smaller regions of memory. 
This class only + * manages the address creation for the regions -- allocation + * callback functions are used to request the memory from the + * necessary system or API calls. This class is intended to be + * used inside of a higher level memory management class that + * provides thread safety, policy management and API + * integration for a specific runtime API (eg Vulkan, OpenCL, etc) + */ +class RegionAllocator { +public: + // disable copy constructors and assignment + RegionAllocator(const RegionAllocator &) = delete; + RegionAllocator &operator=(const RegionAllocator &) = delete; + + // disable non-factory based construction + RegionAllocator() = delete; + ~RegionAllocator() = delete; + + // Allocators for the different types of memory we need to allocate + struct MemoryAllocators { + SystemMemoryAllocatorFns system; + MemoryRegionAllocatorFns region; + }; + + // Factory methods for creation / destruction + static RegionAllocator *create(void *user_context, BlockResource *block, const MemoryAllocators &ma); + static void destroy(void *user_context, RegionAllocator *region_allocator); + + // Returns the allocator class instance for the given allocation (or nullptr) + static RegionAllocator *find_allocator(void *user_context, MemoryRegion *memory_region); + + // Public interface methods + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + void reclaim(void *user_context, MemoryRegion *memory_region); + bool collect(void *user_context); //< returns true if any blocks were removed + void release(void *user_context); + void destroy(void *user_context); + + // Returns the currently managed block resource + BlockResource *block_resource() const; + +private: + // Initializes a new instance + void initialize(void *user_context, BlockResource *block, const MemoryAllocators &ma); + + // Search through allocated block regions (Best-Fit) + BlockRegion *find_block_region(void *user_context, const MemoryRequest &request); + + // Returns true if 
neighbouring block regions to the given region can be coalesced into one + bool can_coalesce(BlockRegion *region); + + // Merges available neighbouring block regions into the given region + BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); + + // Returns true if the given region can be split to accomadate the given size + bool can_split(BlockRegion *region, size_t size); + + // Splits the given block region into a smaller region to accomadate the given size, followed by empty space for the remaining + BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); + + // Creates a new block region and adds it to the region list + BlockRegion *create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated); + + // Creates a new block region and adds it to the region list + void destroy_block_region(void *user_context, BlockRegion *region); + + // Invokes the allocation callback to allocate memory for the block region + void alloc_block_region(void *user_context, BlockRegion *region); + + // Releases a block region and leaves it in the list for further allocations + void release_block_region(void *user_context, BlockRegion *region); + + // Invokes the deallocation callback to free memory for the block region + void free_block_region(void *user_context, BlockRegion *region); + + // Returns true if the given block region is compatible with the given properties + bool is_compatible_block_region(const BlockRegion *region, const MemoryProperties &properties) const; + + BlockResource *block = nullptr; + MemoryArena *arena = nullptr; + MemoryAllocators allocators; +}; + +RegionAllocator *RegionAllocator::create(void *user_context, BlockResource *block_resource, const MemoryAllocators &allocators) { + halide_abort_if_false(user_context, allocators.system.allocate != nullptr); + RegionAllocator *result = reinterpret_cast( + 
allocators.system.allocate(user_context, sizeof(RegionAllocator))); + + if (result == nullptr) { + halide_error(user_context, "RegionAllocator: Failed to create instance! Out of memory!\n"); + return nullptr; + } + + result->initialize(user_context, block_resource, allocators); + return result; +} + +void RegionAllocator::destroy(void *user_context, RegionAllocator *instance) { + halide_abort_if_false(user_context, instance != nullptr); + const MemoryAllocators &allocators = instance->allocators; + instance->destroy(user_context); + halide_abort_if_false(user_context, allocators.system.deallocate != nullptr); + allocators.system.deallocate(user_context, instance); +} + +void RegionAllocator::initialize(void *user_context, BlockResource *mb, const MemoryAllocators &ma) { + block = mb; + allocators = ma; + arena = MemoryArena::create(user_context, {sizeof(BlockRegion), MemoryArena::default_capacity, 0}, allocators.system); + halide_abort_if_false(user_context, arena != nullptr); + block->allocator = this; + block->regions = create_block_region( + user_context, + block->memory.properties, + 0, block->memory.size, + block->memory.dedicated); +} + +MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { + halide_abort_if_false(user_context, request.size > 0); + size_t remaining = block->memory.size - block->reserved; + if (remaining < request.size) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " + << "-- requested size (" << (int32_t)(request.size) << " bytes) " + << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; +#endif + return nullptr; + } + + BlockRegion *block_region = find_block_region(user_context, request); + if (block_region == nullptr) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" + << (int32_t)(request.size) << " bytes)!\n"; +#endif + return nullptr; + } + + if 
(can_split(block_region, request.size)) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " + << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; +#endif + split_block_region(user_context, block_region, request.size, request.alignment); + } + + alloc_block_region(user_context, block_region); + return reinterpret_cast(block_region); +} + +void RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { + BlockRegion *block_region = reinterpret_cast(memory_region); + halide_abort_if_false(user_context, block_region != nullptr); + halide_abort_if_false(user_context, block_region->block_ptr == block); + free_block_region(user_context, block_region); + if (can_coalesce(block_region)) { + block_region = coalesce_block_regions(user_context, block_region); + } +} + +RegionAllocator *RegionAllocator::find_allocator(void *user_context, MemoryRegion *memory_region) { + BlockRegion *block_region = reinterpret_cast(memory_region); + halide_abort_if_false(user_context, block_region != nullptr); + halide_abort_if_false(user_context, block_region->block_ptr != nullptr); + return block_region->block_ptr->allocator; +} + +BlockRegion *RegionAllocator::find_block_region(void *user_context, const MemoryRequest &request) { + BlockRegion *result = nullptr; + for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { + + if (block_region->status != AllocationStatus::Available) { + continue; + } + + // skip incompatible block regions for this request + if (!is_compatible_block_region(block_region, request.properties)) { + continue; + } + + // is the requested size larger than the current region? 
+ if (request.size > block_region->memory.size) { + continue; + } + + size_t actual_size = aligned_size(block_region->memory.offset, request.size, request.alignment); + + // is the adjusted size larger than the current region? + if (actual_size > block_region->memory.size) { + continue; + } + + // will the adjusted size fit within the remaining unallocated space? + if ((actual_size + block->reserved) < block->memory.size) { + result = block_region; // best-fit! + break; + } + } + return result; +} + +bool RegionAllocator::can_coalesce(BlockRegion *block_region) { + if (block_region == nullptr) { return false; } + if (block_region->prev_ptr && (block_region->prev_ptr->status == AllocationStatus::Available)) { + return true; + } + if (block_region->next_ptr && (block_region->next_ptr->status == AllocationStatus::Available)) { + return true; + } + return false; +} + +BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRegion *block_region) { + if (block_region->prev_ptr && (block_region->prev_ptr->status == AllocationStatus::Available)) { + BlockRegion *prev_region = block_region->prev_ptr; + +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Coalescing " + << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; +#endif + + prev_region->next_ptr = block_region->next_ptr; + if (block_region->next_ptr) { + block_region->next_ptr->prev_ptr = prev_region; + } + prev_region->memory.size += block_region->memory.size; + destroy_block_region(user_context, block_region); + block_region = prev_region; + } + + if (block_region->next_ptr && (block_region->next_ptr->status == AllocationStatus::Available)) { + BlockRegion *next_region = block_region->next_ptr; + +#ifdef DEBUG_RUNTIME + debug(user_context) << 
"RegionAllocator: Coalescing " + << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; +#endif + + if (next_region->next_ptr) { + next_region->next_ptr->prev_ptr = block_region; + } + block_region->next_ptr = next_region->next_ptr; + block_region->memory.size += next_region->memory.size; + destroy_block_region(user_context, next_region); + } + + return block_region; +} + +bool RegionAllocator::can_split(BlockRegion *block_region, size_t size) { + return (block_region && (block_region->memory.size > size)); +} + +BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { + size_t adjusted_size = aligned_size(block_region->memory.offset, size, alignment); + size_t adjusted_offset = aligned_offset(block_region->memory.offset, alignment); + + size_t empty_offset = adjusted_offset + size; + size_t empty_size = block_region->memory.size - adjusted_size; + +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Splitting " + << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " + << "to create empty region (offset=" << (int32_t)empty_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; +#endif + + BlockRegion *next_region = block_region->next_ptr; + BlockRegion *empty_region = create_block_region(user_context, + block_region->memory.properties, + empty_offset, empty_size, + block_region->memory.dedicated); + halide_abort_if_false(user_context, empty_region != nullptr); + + empty_region->next_ptr = next_region; + if (next_region) { + next_region->prev_ptr = empty_region; + } + block_region->next_ptr = empty_region; + block_region->memory.size = size; + return empty_region; +} + +BlockRegion 
*RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Creating block region (" + << "user_context=" << (void *)(user_context) << " " + << "offset=" << (uint32_t)offset << " " + << "size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; +#endif + + BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); + + if (block_region == nullptr) { + error(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; + return nullptr; + } + +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Added block region (" + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; +#endif + + block_region->memory.offset = offset; + block_region->memory.size = size; + block_region->memory.properties = properties; + block_region->memory.dedicated = dedicated; + block_region->status = AllocationStatus::Available; + block_region->block_ptr = block; + return block_region; +} + +void RegionAllocator::release_block_region(void *user_context, BlockRegion *block_region) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Releasing block region (" + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; +#endif + free_block_region(user_context, block_region); +} + +void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block_region) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Destroying block region (" + << "user_context=" << (void *)(user_context) << " " + << 
"block_region=" << (void *)(block_region) << ") ...\n"; +#endif + + free_block_region(user_context, block_region); + arena->reclaim(user_context, block_region); +} + +void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_region) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Allocating region (size=" << (int32_t)(block_region->memory.size) << ", offset=" << (int32_t)block_region->memory.offset << ")!\n"; +#endif + halide_abort_if_false(user_context, allocators.region.allocate != nullptr); + halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); + MemoryRegion *memory_region = &(block_region->memory); + allocators.region.allocate(user_context, memory_region); + block_region->status = block_region->memory.dedicated ? AllocationStatus::Dedicated : AllocationStatus::InUse; + block->reserved += block_region->memory.size; +} + +void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_region) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Freeing block region (" + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; +#endif + if ((block_region->status == AllocationStatus::InUse) || + (block_region->status == AllocationStatus::Dedicated)) { + debug(user_context) << "RegionAllocator: Deallocating region (size=" << (int32_t)(block_region->memory.size) << ", offset=" << (int32_t)block_region->memory.offset << ")!\n"; + halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); + MemoryRegion *memory_region = &(block_region->memory); + allocators.region.deallocate(user_context, memory_region); + block->reserved -= block_region->memory.size; + } + block_region->status = AllocationStatus::Available; +} + +void RegionAllocator::release(void *user_context) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Releasing all regions (" + << "user_context=" << (void 
*)(user_context) << ") ...\n"; +#endif + for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { + release_block_region(user_context, block_region); + } +} + +bool RegionAllocator::collect(void *user_context) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Collecting free block regions (" + << "user_context=" << (void *)(user_context) << ") ...\n"; +#endif + bool result = false; + for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { + if (block_region->status == AllocationStatus::Available) { + if (can_coalesce(block_region)) { + block_region = coalesce_block_regions(user_context, block_region); + result = true; + } + } + } + return result; +} + +void RegionAllocator::destroy(void *user_context) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Destroying all block regions (" + << "user_context=" << (void *)(user_context) << ") ...\n"; +#endif + for (BlockRegion *block_region = block->regions; block_region != nullptr;) { + + if (block_region->next_ptr == nullptr) { + destroy_block_region(user_context, block_region); + block_region = nullptr; + } else { + BlockRegion *prev_region = block_region; + block_region = block_region->next_ptr; + destroy_block_region(user_context, prev_region); + } + } + block->regions = nullptr; + block->reserved = 0; + arena->destroy(user_context); +} + +bool RegionAllocator::is_compatible_block_region(const BlockRegion *block_region, const MemoryProperties &properties) const { + if (properties.caching != MemoryCaching::DefaultCaching) { + if (properties.caching != block_region->memory.properties.caching) { + return false; + } + } + + if (properties.visibility != MemoryVisibility::DefaultVisibility) { + if (properties.visibility != block_region->memory.properties.visibility) { + return false; + } + } + + if (properties.usage != MemoryUsage::DefaultUsage) { + if (properties.usage != 
block_region->memory.properties.usage) { + return false; + } + } + + return true; +} + +BlockResource *RegionAllocator::block_resource() const { + return block; +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_REGION_ALLOCATOR_H diff --git a/src/runtime/internal/string_storage.h b/src/runtime/internal/string_storage.h new file mode 100644 index 000000000000..6b4daa95ac0a --- /dev/null +++ b/src/runtime/internal/string_storage.h @@ -0,0 +1,216 @@ +#ifndef HALIDE_RUNTIME_STRING_STORAGE_H +#define HALIDE_RUNTIME_STRING_STORAGE_H + +#include "block_storage.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Static utility functions for dealing with string data +struct StringUtils { + static bool is_empty(const char *str) { + if (str == nullptr) { return true; } + if (str[0] == '\0') { return true; } + return false; + } + + // count the number of delimited string tokens + static size_t count_tokens(const char *str, const char *delim) { + if (StringUtils::is_empty(str)) { return 0; } + if (StringUtils::is_empty(delim)) { return 1; } // no delim ... string is one token + + size_t count = 0; + const char *ptr = str; + size_t delim_length = strlen(delim); + while (!StringUtils::is_empty(ptr)) { + const char *next_delim = strstr(ptr, delim); + ptr = (next_delim != nullptr) ? 
(next_delim + delim_length) : nullptr; + ++count; + } + return count; + } + + static size_t count_length(const char *str) { + const char *ptr = str; + while (!StringUtils::is_empty(ptr)) { + ++ptr; + } + return size_t(ptr - str); + } +}; + +// -- +// Storage class for handling c-string data (based on block storage) +// -- Intended for building and maintaining string data w/8-bit chars +// +class StringStorage { +public: + StringStorage(void *user_context = nullptr, uint32_t capacity = 0, const SystemMemoryAllocatorFns &sma = default_allocator()); + StringStorage(const StringStorage &other) = default; + ~StringStorage(); + + void initialize(void *user_context, uint32_t capacity = 0, const SystemMemoryAllocatorFns &sma = default_allocator()); + void destroy(void *user_context); + + StringStorage &operator=(const StringStorage &other); + bool operator==(const StringStorage &other) const; + bool operator!=(const StringStorage &other) const; + + bool contains(const char *str) const; + bool contains(const StringStorage &other) const; + + void reserve(void *user_context, size_t length); + void assign(void *user_context, char ch); + void assign(void *user_context, const char *str, size_t length = 0); // if length is zero, strlen is used + void append(void *user_context, char ch); + void append(void *user_context, const char *str, size_t length = 0); // if length is zero, strlen is used + void prepend(void *user_context, char ch); + void prepend(void *user_context, const char *str, size_t length = 0); // if length is zero, strlen is used + void clear(void *user_context); + void terminate(void *user_context, size_t length); + + size_t length() const; + const char *data() const; + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + BlockStorage contents; +}; + +StringStorage::StringStorage(void *user_context, uint32_t capacity, const SystemMemoryAllocatorFns &sma) + : contents(user_context, 
{sizeof(char), 32, 32}, sma) { + if (capacity) { contents.reserve(user_context, capacity); } +} + +StringStorage::~StringStorage() { + destroy(nullptr); +} + +StringStorage &StringStorage::operator=(const StringStorage &other) { + if (&other != this) { + assign(nullptr, other.data(), other.length()); + } + return *this; +} + +bool StringStorage::contains(const char *str) const { + const char *this_str = static_cast(contents.data()); + return strstr(this_str, str) != nullptr; +} + +bool StringStorage::contains(const StringStorage &other) const { + const char *this_str = static_cast(contents.data()); + const char *other_str = static_cast(other.contents.data()); + return strstr(this_str, other_str) != nullptr; +} + +bool StringStorage::operator==(const StringStorage &other) const { + if (contents.size() != other.contents.size()) { return false; } + const char *this_str = static_cast(contents.data()); + const char *other_str = static_cast(other.contents.data()); + return strncmp(this_str, other_str, contents.size()) == 0; +} + +bool StringStorage::operator!=(const StringStorage &other) const { + return !(*this == other); +} + +void StringStorage::reserve(void *user_context, size_t length) { + contents.reserve(user_context, length + 1); // leave room for termination + contents.resize(user_context, length, false); + terminate(user_context, length); +} + +void StringStorage::assign(void *user_context, char ch) { + contents.resize(user_context, 1); + char *ptr = static_cast(contents[0]); + (*ptr) = ch; +} + +void StringStorage::assign(void *user_context, const char *str, size_t length) { + if (StringUtils::is_empty(str)) { return; } + if (length == 0) { length = strlen(str); } + char *this_str = static_cast(contents.data()); + reserve(user_context, length); + memcpy(this_str, str, length); + terminate(user_context, length); +} + +void StringStorage::append(void *user_context, const char *str, size_t length) { + if (StringUtils::is_empty(str)) { return; } + if (length == 0) 
{ length = strlen(str); } + const size_t old_size = contents.size(); + size_t new_length = old_size + length; + char *this_str = static_cast(contents[old_size]); + reserve(user_context, length); + memcpy(this_str, str, length); + terminate(user_context, new_length); +} + +void StringStorage::append(void *user_context, char ch) { + contents.append(user_context, &ch); +} + +void StringStorage::prepend(void *user_context, const char *str, size_t length) { + if (StringUtils::is_empty(str)) { return; } + if (length == 0) { length = strlen(str); } + const size_t old_size = contents.size(); + size_t new_length = old_size + length; + char *this_str = static_cast(contents.data()); + reserve(user_context, new_length); + memcpy(this_str + length, this_str, old_size); + memcpy(this_str, str, length); + terminate(user_context, new_length); +} + +void StringStorage::prepend(void *user_context, char ch) { + contents.prepend(user_context, &ch); +} + +void StringStorage::terminate(void *user_context, size_t length) { + char *end_ptr = static_cast(contents[length]); + (*end_ptr) = '\0'; +} + +void StringStorage::clear(void *user_context) { + contents.clear(user_context); + if (contents.data()) { terminate(user_context, 0); } +} + +void StringStorage::initialize(void *user_context, uint32_t capacity, const SystemMemoryAllocatorFns &sma) { + contents.initialize(user_context, {sizeof(char), 32, 32}, sma); + if (capacity) { contents.reserve(user_context, capacity); } +} + +void StringStorage::destroy(void *user_context) { + contents.destroy(user_context); +} + +size_t StringStorage::length() const { + return StringUtils::count_length(data()); +} + +const char *StringStorage::data() const { + return static_cast(contents.data()); +} + +const SystemMemoryAllocatorFns & +StringStorage::current_allocator() const { + return contents.current_allocator(); +} + +const SystemMemoryAllocatorFns & +StringStorage::default_allocator() { + return BlockStorage::default_allocator(); +} + +// -- + +} // 
namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_STRING_STORAGE_H diff --git a/src/runtime/internal/string_table.h b/src/runtime/internal/string_table.h new file mode 100644 index 000000000000..07e09f5f97b2 --- /dev/null +++ b/src/runtime/internal/string_table.h @@ -0,0 +1,217 @@ +#ifndef HALIDE_RUNTIME_STRING_TABLE_H +#define HALIDE_RUNTIME_STRING_TABLE_H + +#include "linked_list.h" +#include "pointer_table.h" +#include "string_storage.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Storage class for an array of strings (based on block storage) +// -- Intended for building and maintaining tables of strings +class StringTable { +public: + // Disable copy constructors + StringTable(const StringTable &) = delete; + StringTable &operator=(const StringTable &) = delete; + + StringTable(const SystemMemoryAllocatorFns &allocator = StringStorage::default_allocator()); + StringTable(void *user_context, size_t capacity, const SystemMemoryAllocatorFns &allocator = StringStorage::default_allocator()); + StringTable(void *user_context, const char **array, size_t count, const SystemMemoryAllocatorFns &allocator = StringStorage::default_allocator()); + ~StringTable(); + + void resize(void *user_context, size_t capacity); + void destroy(void *user_context); + void clear(void *user_context); + + // fills the contents of the table (copies strings from given array) + void fill(void *user_context, const char **array, size_t coun); + + // assign the entry at given index the given string + void assign(void *user_context, size_t index, const char *str, size_t length = 0); // if length is zero, strlen is used + + // appends the given string to the end of the table + void append(void *user_context, const char *str, size_t length = 0); // if length is zero, strlen is used + + // prepend the given string to the end of the table + void prepend(void *user_context, const char *str, size_t length = 0); // if length is zero, 
strlen is used + + // parses the given c-string based on given delimiter, stores each substring in the resulting table + size_t parse(void *user_context, const char *str, const char *delim); + + // index-based access operator + const char *operator[](size_t index) const; + + // returns the raw string table pointer + const char **data() const; + + // scans the table for existance of the given string within any entry (linear scan w/string compare!) + bool contains(const char *str) const; + + size_t size() const { + return contents.size(); + } + +private: + LinkedList contents; //< owns string data + PointerTable pointers; //< stores pointers +}; + +// -- + +StringTable::StringTable(const SystemMemoryAllocatorFns &sma) + : contents(nullptr, sizeof(StringStorage), 0, sma), + pointers(nullptr, 0, sma) { + // EMPTY! +} + +StringTable::StringTable(void *user_context, size_t capacity, const SystemMemoryAllocatorFns &sma) + : contents(user_context, sizeof(StringStorage), capacity, sma), + pointers(user_context, capacity, sma) { + if (capacity) { resize(user_context, capacity); } +} + +StringTable::StringTable(void *user_context, const char **array, size_t count, const SystemMemoryAllocatorFns &sma) + : contents(user_context, sizeof(StringStorage), count, sma), + pointers(user_context, count, sma) { + fill(user_context, array, count); +} + +StringTable::~StringTable() { + destroy(nullptr); +} + +void StringTable::resize(void *user_context, size_t capacity) { + for (size_t n = contents.size(); n < capacity; ++n) { + LinkedList::EntryType *entry_ptr = contents.append(user_context); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->initialize(user_context, 0, contents.current_allocator()); + } + pointers.resize(user_context, capacity); +} + +void StringTable::clear(void *user_context) { + for (size_t n = 0; n < contents.size(); ++n) { + LinkedList::EntryType *entry_ptr = contents.front(); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + 
storage_ptr->clear(user_context); + contents.pop_front(user_context); + } + contents.clear(user_context); + pointers.clear(user_context); +} + +void StringTable::destroy(void *user_context) { + for (size_t n = 0; n < contents.size(); ++n) { + LinkedList::EntryType *entry_ptr = contents.front(); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->destroy(user_context); + contents.pop_front(user_context); + } + contents.destroy(user_context); + pointers.destroy(user_context); +} + +const char *StringTable::operator[](size_t index) const { + return static_cast(pointers[index]); +} + +void StringTable::fill(void *user_context, const char **array, size_t count) { + resize(user_context, count); + LinkedList::EntryType *entry_ptr = contents.front(); + for (size_t n = 0; n < count && n < contents.size() && entry_ptr != nullptr; ++n) { + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->assign(user_context, array[n]); + pointers.assign(user_context, n, storage_ptr->data()); + entry_ptr = entry_ptr->next_ptr; + } +} + +void StringTable::assign(void *user_context, size_t index, const char *str, size_t length) { + if (length == 0) { length = strlen(str); } + LinkedList::EntryType *entry_ptr = contents.front(); + for (size_t n = 0; n < contents.size() && entry_ptr != nullptr; ++n) { + if (n == index) { + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->assign(user_context, str, length); + pointers.assign(user_context, n, storage_ptr->data()); + break; + } + entry_ptr = entry_ptr->next_ptr; + } +} + +void StringTable::append(void *user_context, const char *str, size_t length) { + LinkedList::EntryType *entry_ptr = contents.append(user_context); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->initialize(user_context, 0, contents.current_allocator()); + storage_ptr->assign(user_context, str, length); + pointers.append(user_context, storage_ptr->data()); +} + +void 
StringTable::prepend(void *user_context, const char *str, size_t length) { + LinkedList::EntryType *entry_ptr = contents.prepend(user_context); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->initialize(user_context, 0, contents.current_allocator()); + storage_ptr->assign(user_context, str, length); + pointers.prepend(user_context, storage_ptr->data()); +} + +size_t StringTable::parse(void *user_context, const char *str, const char *delim) { + if (StringUtils::is_empty(str)) { return 0; } + + size_t delim_length = strlen(delim); + size_t total_length = strlen(str); + size_t entry_count = StringUtils::count_tokens(str, delim); + if (entry_count < 1) { return 0; } + + resize(user_context, entry_count); + + // save each entry into the table + size_t index = 0; + const char *ptr = str; + LinkedList::EntryType *entry_ptr = contents.front(); + while (!StringUtils::is_empty(ptr) && (index < entry_count)) { + size_t ptr_offset = ptr - str; + const char *next_delim = strstr(ptr, delim); + size_t token_length = (next_delim == nullptr) ? (total_length - ptr_offset) : (next_delim - ptr); + if (token_length > 0 && entry_ptr != nullptr) { + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->assign(user_context, ptr, token_length); + pointers.assign(user_context, index, storage_ptr->data()); + entry_ptr = entry_ptr->next_ptr; + ++index; + } + ptr = (next_delim != nullptr) ? 
(next_delim + delim_length) : nullptr; + } + return entry_count; +} + +bool StringTable::contains(const char *str) const { + if (StringUtils::is_empty(str)) { return false; } + + const LinkedList::EntryType *entry_ptr = contents.front(); + for (size_t n = 0; n < contents.size() && entry_ptr != nullptr; ++n) { + StringStorage *storage_ptr = static_cast(entry_ptr->value); + if (storage_ptr->contains(str)) { + return true; + } + entry_ptr = entry_ptr->next_ptr; + } + + return false; +} + +const char **StringTable::data() const { + return reinterpret_cast(pointers.data()); +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_STRING_STORAGE_H diff --git a/src/runtime/runtime_internal.h b/src/runtime/runtime_internal.h index e551d080613b..2801f9bfedc5 100644 --- a/src/runtime/runtime_internal.h +++ b/src/runtime/runtime_internal.h @@ -1,9 +1,13 @@ #ifndef HALIDE_RUNTIME_INTERNAL_H #define HALIDE_RUNTIME_INTERNAL_H +#ifdef COMPILING_HALIDE_RUNTIME_TESTS +// Only allowed if building Halide runtime tests ... since they use system compiler which may be GCC or MSVS +#else #if __STDC_HOSTED__ #error "Halide runtime files must be compiled with clang in freestanding mode." 
#endif +#endif #ifdef __UINT8_TYPE__ typedef __INT64_TYPE__ int64_t; @@ -92,6 +96,7 @@ int strncmp(const char *s, const char *t, size_t n); size_t strlen(const char *s); const char *strchr(const char *s, int c); void *memcpy(void *s1, const void *s2, size_t n); +void *memmove(void *dest, const void *src, size_t n); int memcmp(const void *s1, const void *s2, size_t n); void *memset(void *s, int val, size_t n); // Use fopen+fileno+fclose instead of open+close - the value of the diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4130f8fddaf3..ca1e3f46acf8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -39,4 +39,25 @@ if (WITH_TEST_GENERATOR) add_subdirectory(generator) endif () +# FIXME: Disable the runtime tests for MSVC until we have a MS compatible header. +# +# The runtime tests include src/runtime/runtime_internal.h which was written +# to only support clang (GCC's front end is close enough it works fine as well). +# We originally setup the tests to compile with clang (in the same way as the actual +# runtime bitcode files), but that wasn't very clean and didn't integrate well with +# the other tests, so we switched to just using the native system compiler. +# Sadly MSVC isn't compatible with the current runtime_internal.h which would need +# some platform specific ifdefs for attributes and types that are causing compile +# errors. 
+# +cmake_dependent_option(WITH_TEST_RUNTIME "Build runtime tests" ON + "NOT MSVC" OFF) + +if (WITH_TEST_RUNTIME) + message(STATUS "Building internal runtime tests enabled") + add_subdirectory(runtime) +else () + message(STATUS "Building internal runtime tests disabled") +endif () + # FIXME: failing_with_issue is dead code :) diff --git a/test/runtime/CMakeLists.txt b/test/runtime/CMakeLists.txt new file mode 100644 index 000000000000..54c219ffa392 --- /dev/null +++ b/test/runtime/CMakeLists.txt @@ -0,0 +1,32 @@ +function(halide_define_runtime_internal_test NAME) + add_executable(runtime_internal_${NAME} ${NAME}.cpp) + target_link_libraries(runtime_internal_${NAME} PRIVATE Halide::Test) + target_include_directories(runtime_internal_${NAME} PRIVATE "${Halide_SOURCE_DIR}/src") + target_include_directories(runtime_internal_${NAME} PRIVATE "${Halide_SOURCE_DIR}/src/runtime") + target_link_libraries(runtime_internal_${NAME} PRIVATE Halide::Runtime) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Halide runtime lib has declarations for memcmp etc that conflict with GNU stdlib + target_compile_options(runtime_internal_${NAME} PRIVATE -Wno-builtin-declaration-mismatch ) + endif() + target_compile_definitions( + runtime_internal_${NAME} + PRIVATE + HALIDE_VERSION=${Halide_VERSION} + HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} + HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} + HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH} + COMPILING_HALIDE_RUNTIME + COMPILING_HALIDE_RUNTIME_TESTS + ) + add_halide_test(runtime_internal_${NAME} GROUPS runtime_internal) +endfunction() + +# NOTE: These tests directly include runtime_internal.h which isn't compatible with MSVC +if(NOT MSVC) + halide_define_runtime_internal_test(block_allocator) + halide_define_runtime_internal_test(block_storage) + halide_define_runtime_internal_test(linked_list) + halide_define_runtime_internal_test(memory_arena) + halide_define_runtime_internal_test(string_storage) + 
halide_define_runtime_internal_test(string_table) +endif() \ No newline at end of file diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp new file mode 100644 index 000000000000..69479901fa95 --- /dev/null +++ b/test/runtime/block_allocator.cpp @@ -0,0 +1,140 @@ +#include "common.h" + +#include "internal/block_allocator.h" +#include "internal/pointer_table.h" + +using namespace Halide::Runtime::Internal; + +namespace { + +size_t allocated_block_memory = 0; +size_t allocated_region_memory = 0; + +void allocate_block(void *user_context, MemoryBlock *block) { + block->handle = native_system_malloc(user_context, block->size); + allocated_block_memory += block->size; + + debug(user_context) << "Test : allocate_block (" + << "block=" << (void *)(block) << " " + << "block_size=" << int32_t(block->size) << " " + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << ") !\n"; +} + +void deallocate_block(void *user_context, MemoryBlock *block) { + native_system_free(user_context, block->handle); + allocated_block_memory -= block->size; + + debug(user_context) << "Test : deallocate_block (" + << "block=" << (void *)(block) << " " + << "block_size=" << int32_t(block->size) << " " + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << ") !\n"; +} + +void allocate_region(void *user_context, MemoryRegion *region) { + region->handle = (void *)1; + allocated_region_memory += region->size; + + debug(user_context) << "Test : allocate_region (" + << "region=" << (void *)(region) << " " + << "region_size=" << int32_t(region->size) << " " + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") !\n"; +} + +void deallocate_region(void *user_context, MemoryRegion *region) { + region->handle = (void *)0; + allocated_region_memory -= region->size; + + debug(user_context) << "Test : deallocate_region (" + << "region=" << (void *)(region) << " " + << "region_size=" << int32_t(region->size) << " 
" + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") !\n"; +} + +} // end namespace + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + SystemMemoryAllocatorFns system_allocator = {native_system_malloc, native_system_free}; + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region}; + + // test class interface + { + BlockAllocator::Config config = {0}; + config.minimum_block_size = 1024; + + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + MemoryRegion *r1 = instance->reserve(user_context, request); + halide_abort_if_false(user_context, r1 != nullptr); + halide_abort_if_false(user_context, allocated_block_memory == config.minimum_block_size); + halide_abort_if_false(user_context, allocated_region_memory == request.size); + + MemoryRegion *r2 = instance->reserve(user_context, request); + halide_abort_if_false(user_context, r2 != nullptr); + halide_abort_if_false(user_context, allocated_block_memory == config.minimum_block_size); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + + instance->reclaim(user_context, r1); + halide_abort_if_false(user_context, allocated_region_memory == (1 * request.size)); + + instance->destroy(user_context); + halide_abort_if_false(user_context, allocated_block_memory == 0); + halide_abort_if_false(user_context, allocated_region_memory == 0); + + BlockAllocator::destroy(user_context, instance); + } + + // stress test + { + 
BlockAllocator::Config config = {0}; + config.minimum_block_size = 1024; + + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + static size_t test_allocations = 1000; + PointerTable pointers(user_context, test_allocations, system_allocator); + for (size_t n = 0; n < test_allocations; ++n) { + size_t count = n % 32; + count = count > 1 ? count : 1; + request.size = count * sizeof(int); + MemoryRegion *region = instance->reserve(user_context, request); + pointers.append(user_context, region); + } + + for (size_t n = 0; n < pointers.size(); ++n) { + MemoryRegion *region = static_cast(pointers[n]); + instance->reclaim(user_context, region); + } + halide_abort_if_false(user_context, allocated_region_memory == 0); + + instance->destroy(user_context); + halide_abort_if_false(user_context, allocated_block_memory == 0); + + BlockAllocator::destroy(user_context, instance); + } + + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/block_storage.cpp b/test/runtime/block_storage.cpp new file mode 100644 index 000000000000..ad7499f84378 --- /dev/null +++ b/test/runtime/block_storage.cpp @@ -0,0 +1,148 @@ +#include "common.h" + +#include "internal/block_storage.h" + +using namespace Halide::Runtime::Internal; + +struct TestStruct { + int8_t i8; + uint16_t ui16; + float f32; +}; + +template +T read_as(const BlockStorage &bs, size_t index) { + const T *ptr = static_cast(bs[index]); + return *ptr; +} + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + BlockStorage::Config config = 
BlockStorage::default_config(); + config.entry_size = sizeof(int); + + BlockStorage bs(user_context, config); + bs.reserve(user_context, 256); + halide_abort_if_false(user_context, bs.size() == 0); + + int a1[4] = {12, 34, 56, 78}; + bs.append(user_context, &a1[0]); + halide_abort_if_false(user_context, bs.size() == 1); + halide_abort_if_false(user_context, read_as(bs, 0) == a1[0]); + + bs.append(user_context, &a1[1]); + halide_abort_if_false(user_context, bs.size() == 2); + halide_abort_if_false(user_context, read_as(bs, 1) == a1[1]); + + bs.insert(user_context, 1, &a1[2]); + halide_abort_if_false(user_context, bs.size() == 3); + halide_abort_if_false(user_context, read_as(bs, 0) == a1[0]); + halide_abort_if_false(user_context, read_as(bs, 1) == a1[2]); // inserted here + halide_abort_if_false(user_context, read_as(bs, 2) == a1[1]); + + bs.prepend(user_context, &a1[3]); + halide_abort_if_false(user_context, bs.size() == 4); + halide_abort_if_false(user_context, read_as(bs, 0) == a1[3]); + + int a2[] = {98, 76, 54, 32, 10}; + size_t a2_size = 5; + bs.fill(user_context, a2, a2_size); + halide_abort_if_false(user_context, bs.size() == a2_size); + halide_abort_if_false(user_context, read_as(bs, 0) == a2[0]); + halide_abort_if_false(user_context, read_as(bs, 1) == a2[1]); + halide_abort_if_false(user_context, read_as(bs, 2) == a2[2]); + halide_abort_if_false(user_context, read_as(bs, 3) == a2[3]); + halide_abort_if_false(user_context, read_as(bs, 4) == a2[4]); + + int a3[] = {77, 66, 55}; + size_t a3_size = 3; + bs.insert(user_context, 2, a3, a3_size); + halide_abort_if_false(user_context, bs.size() == (a2_size + a3_size)); + halide_abort_if_false(user_context, read_as(bs, 0) == a2[0]); + halide_abort_if_false(user_context, read_as(bs, 1) == a2[1]); + halide_abort_if_false(user_context, read_as(bs, 2) == a3[0]); // a3 inserted here + halide_abort_if_false(user_context, read_as(bs, 3) == a3[1]); + halide_abort_if_false(user_context, read_as(bs, 4) == a3[2]); + 
halide_abort_if_false(user_context, read_as(bs, 5) == a2[2]); // a2 resumes here + halide_abort_if_false(user_context, read_as(bs, 6) == a2[3]); + halide_abort_if_false(user_context, read_as(bs, 7) == a2[4]); + + bs.pop_front(user_context); + bs.pop_front(user_context); + + bs.pop_back(user_context); + bs.pop_back(user_context); + + halide_abort_if_false(user_context, bs.size() == (a2_size + a3_size - 4)); + halide_abort_if_false(user_context, read_as(bs, 0) == a3[0]); + halide_abort_if_false(user_context, read_as(bs, 1) == a3[1]); + halide_abort_if_false(user_context, read_as(bs, 2) == a3[2]); + halide_abort_if_false(user_context, read_as(bs, 3) == a2[2]); + + bs.clear(user_context); + halide_abort_if_false(user_context, bs.size() == 0); + } + + // test copy and equality + { + BlockStorage::Config config = BlockStorage::default_config(); + config.entry_size = sizeof(int); + + int a1[] = {98, 76, 54, 32, 10}; + size_t a1_size = 5; + + int a2[] = {77, 66, 55}; + size_t a2_size = 3; + + BlockStorage bs1(user_context, config); + bs1.fill(user_context, a1, a1_size); + + BlockStorage bs2(user_context, config); + bs2.fill(user_context, a2, a2_size); + + BlockStorage bs3(bs1); + + halide_abort_if_false(user_context, bs1.size() == (a1_size)); + halide_abort_if_false(user_context, bs2.size() == (a2_size)); + halide_abort_if_false(user_context, bs3.size() == bs1.size()); + + halide_abort_if_false(user_context, bs1 != bs2); + halide_abort_if_false(user_context, bs1 == bs3); + + bs2 = bs1; + halide_abort_if_false(user_context, bs1 == bs2); + } + + // test struct storage + { + BlockStorage::Config config = BlockStorage::default_config(); + config.entry_size = sizeof(TestStruct); + + BlockStorage bs(user_context, config); + halide_abort_if_false(user_context, bs.size() == 0); + + TestStruct s1 = {8, 16, 32.0f}; + bs.append(user_context, &s1); + halide_abort_if_false(user_context, bs.size() == 1); + + const TestStruct e1 = read_as(bs, 0); + halide_abort_if_false(user_context, 
e1.i8 == s1.i8); + halide_abort_if_false(user_context, e1.ui16 == s1.ui16); + halide_abort_if_false(user_context, e1.f32 == s1.f32); + + TestStruct s2 = {1, 2, 3.0f}; + bs.prepend(user_context, &s2); + halide_abort_if_false(user_context, bs.size() == 2); + + const TestStruct e2 = read_as(bs, 0); + halide_abort_if_false(user_context, e2.i8 == s2.i8); + halide_abort_if_false(user_context, e2.ui16 == s2.ui16); + halide_abort_if_false(user_context, e2.f32 == s2.f32); + } + + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/common.h b/test/runtime/common.h new file mode 100644 index 000000000000..523e3b7e6797 --- /dev/null +++ b/test/runtime/common.h @@ -0,0 +1,29 @@ +#include +#include + +#include "HalideRuntime.h" +#include "msan_stubs.cpp" +#include "runtime_internal.h" +#include "to_string.cpp" + +extern "C" { + +extern int printf(const char *format, ...); + +void halide_print(void *user_context, const char *str) { + printf("%s", str); +} + +void halide_error(void *user_context, const char *msg) { + halide_print(user_context, msg); +} + +void halide_profiler_report(void *user_context) { +} + +void halide_profiler_reset() { +} + +} // extern "C" + +#include "printer.h" diff --git a/test/runtime/linked_list.cpp b/test/runtime/linked_list.cpp new file mode 100644 index 000000000000..4e2ab51da685 --- /dev/null +++ b/test/runtime/linked_list.cpp @@ -0,0 +1,91 @@ +#include "common.h" + +#include "internal/linked_list.h" + +using namespace Halide::Runtime::Internal; + +struct TestStruct { + int8_t i8; + uint16_t ui16; + float f32; +}; + +template +T read_as(const LinkedList::EntryType *entry_ptr) { + const T *ptr = static_cast(entry_ptr->value); + return *ptr; +} + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + LinkedList list(user_context, sizeof(int), 64); + halide_abort_if_false(user_context, list.size() == 0); + + const int i0 = 12; + list.append(user_context, &i0); // contents: 12 + 
halide_abort_if_false(user_context, list.size() == 1); + halide_abort_if_false(user_context, (list.front() != nullptr)); + halide_abort_if_false(user_context, (list.back() != nullptr)); + halide_abort_if_false(user_context, read_as(list.front()) == i0); + halide_abort_if_false(user_context, read_as(list.back()) == i0); + + const int i1 = 34; + list.append(user_context, &i1); // contents: 12, 34 + halide_abort_if_false(user_context, list.size() == 2); + halide_abort_if_false(user_context, read_as(list.back()) == i1); + + const int i2 = 56; + list.insert_before(user_context, list.back(), &i2); // contents: 12, 56, 34 + halide_abort_if_false(user_context, list.size() == 3); + halide_abort_if_false(user_context, read_as(list.back()) == i1); + + const int i3 = 78; + list.prepend(user_context, &i3); // contents: 78, 12, 56, 34 + halide_abort_if_false(user_context, list.size() == 4); + halide_abort_if_false(user_context, read_as(list.front()) == i3); + halide_abort_if_false(user_context, read_as(list.back()) == i1); + + list.pop_front(user_context); // contents: 12, 56, 34 + halide_abort_if_false(user_context, list.size() == 3); + halide_abort_if_false(user_context, read_as(list.front()) == i0); + halide_abort_if_false(user_context, read_as(list.back()) == i1); + + list.pop_back(user_context); // contents: 12, 56 + halide_abort_if_false(user_context, list.size() == 2); + halide_abort_if_false(user_context, read_as(list.front()) == i0); + halide_abort_if_false(user_context, read_as(list.back()) == i2); + + list.clear(user_context); + halide_abort_if_false(user_context, list.size() == 0); + } + + // test struct storage + { + LinkedList list(user_context, sizeof(TestStruct)); + halide_abort_if_false(user_context, list.size() == 0); + + TestStruct s1 = {8, 16, 32.0f}; + list.append(user_context, &s1); + halide_abort_if_false(user_context, list.size() == 1); + + const TestStruct e1 = read_as(list.front()); + halide_abort_if_false(user_context, e1.i8 == s1.i8); + 
halide_abort_if_false(user_context, e1.ui16 == s1.ui16); + halide_abort_if_false(user_context, e1.f32 == s1.f32); + + TestStruct s2 = {1, 2, 3.0f}; + list.prepend(user_context, &s2); + halide_abort_if_false(user_context, list.size() == 2); + + TestStruct e2 = read_as(list.front()); + halide_abort_if_false(user_context, e2.i8 == s2.i8); + halide_abort_if_false(user_context, e2.ui16 == s2.ui16); + halide_abort_if_false(user_context, e2.f32 == s2.f32); + } + + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/memory_arena.cpp b/test/runtime/memory_arena.cpp new file mode 100644 index 000000000000..cce3c7bf1c02 --- /dev/null +++ b/test/runtime/memory_arena.cpp @@ -0,0 +1,88 @@ +#include "common.h" + +#include "internal/memory_arena.h" + +using namespace Halide::Runtime::Internal; + +namespace { + +size_t counter = 0; + +void *allocate_system(void *user_context, size_t bytes) { + ++counter; + return native_system_malloc(user_context, bytes); +} + +void deallocate_system(void *user_context, void *ptr) { + native_system_free(user_context, ptr); + --counter; +} + +} // namespace + +struct TestStruct { + int8_t i8; + uint16_t ui16; + float f32; +}; + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + SystemMemoryAllocatorFns test_allocator = {allocate_system, deallocate_system}; + + MemoryArena::Config config = {sizeof(int), 32, 0}; + MemoryArena arena(user_context, config, test_allocator); + void *p1 = arena.reserve(user_context); + halide_abort_if_false(user_context, counter > 1); + halide_abort_if_false(user_context, p1 != nullptr); + + void *p2 = arena.reserve(user_context, true); + halide_abort_if_false(user_context, counter > 2); + halide_abort_if_false(user_context, p2 != nullptr); + halide_abort_if_false(user_context, (*static_cast(p2)) == 0); + + arena.reclaim(user_context, p1); + arena.destroy(user_context); + + halide_abort_if_false(user_context, counter == 0); + } + + // test struct 
allocations + { + SystemMemoryAllocatorFns test_allocator = {allocate_system, deallocate_system}; + MemoryArena::Config config = {sizeof(TestStruct), 32, 0}; + MemoryArena arena(user_context, config, test_allocator); + void *s1 = arena.reserve(user_context, true); + halide_abort_if_false(user_context, s1 != nullptr); + halide_abort_if_false(user_context, counter > 1); + halide_abort_if_false(user_context, ((TestStruct *)s1)->i8 == int8_t(0)); + halide_abort_if_false(user_context, ((TestStruct *)s1)->ui16 == uint16_t(0)); + halide_abort_if_false(user_context, ((TestStruct *)s1)->f32 == float(0)); + + arena.destroy(user_context); + + size_t count = 4 * 1024; + void *pointers[count]; + for (size_t n = 0; n < count; ++n) { + pointers[n] = arena.reserve(user_context, true); + } + + for (size_t n = 0; n < count; ++n) { + void *s1 = pointers[n]; + halide_abort_if_false(user_context, s1 != nullptr); + halide_abort_if_false(user_context, ((TestStruct *)s1)->i8 == int8_t(0)); + halide_abort_if_false(user_context, ((TestStruct *)s1)->ui16 == uint16_t(0)); + halide_abort_if_false(user_context, ((TestStruct *)s1)->f32 == float(0)); + } + + arena.destroy(user_context); + + halide_abort_if_false(user_context, counter == 0); + } + + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/string_storage.cpp b/test/runtime/string_storage.cpp new file mode 100644 index 000000000000..b7428d4440a3 --- /dev/null +++ b/test/runtime/string_storage.cpp @@ -0,0 +1,63 @@ +#include "common.h" + +#include "internal/string_storage.h" + +using namespace Halide::Runtime::Internal; + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + StringStorage ss; + halide_abort_if_false(user_context, ss.length() == 0); + + const char *ts1 = "Testing!"; + const size_t ts1_length = strlen(ts1); + ss.assign(user_context, ts1); + halide_abort_if_false(user_context, ss.length() == ts1_length); + halide_abort_if_false(user_context, 
ss.contains(ts1)); + + const char *ts2 = "More "; + const size_t ts2_length = strlen(ts2); + ss.prepend(user_context, ts2); + halide_abort_if_false(user_context, ss.length() == (ts1_length + ts2_length)); + halide_abort_if_false(user_context, ss.contains(ts2)); + halide_abort_if_false(user_context, ss.contains(ts1)); + + ss.append(user_context, '!'); + halide_abort_if_false(user_context, ss.length() == (ts1_length + ts2_length + 1)); + + ss.clear(user_context); + halide_abort_if_false(user_context, ss.length() == 0); + } + + // test copy and equality + { + const char *ts1 = "Test One!"; + const size_t ts1_length = strlen(ts1); + + const char *ts2 = "Test Two!"; + const size_t ts2_length = strlen(ts2); + + StringStorage ss1; + ss1.assign(user_context, ts1, ts1_length); + + StringStorage ss2; + ss2.assign(user_context, ts2, ts2_length); + + StringStorage ss3(ss1); + + halide_abort_if_false(user_context, ss1.length() == (ts1_length)); + halide_abort_if_false(user_context, ss2.length() == (ts2_length)); + halide_abort_if_false(user_context, ss3.length() == ss1.length()); + + halide_abort_if_false(user_context, ss1 != ss2); + halide_abort_if_false(user_context, ss1 == ss3); + + ss2 = ss1; + halide_abort_if_false(user_context, ss1 == ss2); + } + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/string_table.cpp b/test/runtime/string_table.cpp new file mode 100644 index 000000000000..82d0525d02f3 --- /dev/null +++ b/test/runtime/string_table.cpp @@ -0,0 +1,44 @@ +#include "common.h" + +#include "internal/string_table.h" + +using namespace Halide::Runtime::Internal; + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + size_t data_size = 4; + const char *data[] = { + "one", "two", "three", "four"}; + + StringTable st1; + halide_abort_if_false(user_context, st1.size() == 0); + + st1.fill(user_context, data, data_size); + halide_abort_if_false(user_context, st1.size() == data_size); + 
halide_abort_if_false(user_context, strncmp(st1[0], data[0], strlen(data[0])) == 0); + halide_abort_if_false(user_context, strncmp(st1[1], data[1], strlen(data[1])) == 0); + halide_abort_if_false(user_context, strncmp(st1[2], data[2], strlen(data[2])) == 0); + halide_abort_if_false(user_context, strncmp(st1[3], data[3], strlen(data[3])) == 0); + halide_abort_if_false(user_context, st1.contains(data[0])); + halide_abort_if_false(user_context, st1.contains(data[1])); + halide_abort_if_false(user_context, st1.contains(data[2])); + halide_abort_if_false(user_context, st1.contains(data[3])); + + st1.clear(user_context); + halide_abort_if_false(user_context, st1.size() == 0); + + size_t entry_count = st1.parse(user_context, "one:two:three:four", ":"); + halide_abort_if_false(user_context, entry_count == data_size); + halide_abort_if_false(user_context, st1.size() == data_size); + halide_abort_if_false(user_context, st1.contains(data[0])); + halide_abort_if_false(user_context, st1.contains(data[1])); + halide_abort_if_false(user_context, st1.contains(data[2])); + halide_abort_if_false(user_context, st1.contains(data[3])); + } + + print(user_context) << "Success!\n"; + return 0; +}