From b1ca3345b81b3a5ebee16278d1a603b4fe7eae31 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 15 Jul 2022 15:13:50 -0700 Subject: [PATCH 1/2] Rework autoscheduler API (#6788) (#6838) * Rework autoscheduler API (#6788) * Oops * Update test_function_dag.cpp * clang-tidy * trigger buildbots * Update Generator.h * Minor cleanups * Update README_cmake.md * Check for malformed autoscheduler_params dicts * Add alias-with-autoscheduler code, plus tweaks * Update stubtest_jittest.cpp * Update Makefile * trigger buildbots * fixes * Update AbstractGenerator.cpp * Update stubtest_generator.cpp * Update Makefile * Add deprecation warning for HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API * Make AutoschedulerParams a real struct * clang-tidy --- Makefile | 29 ++- README_cmake.md | 7 +- apps/HelloPyTorch/Makefile | 36 ++-- apps/HelloPyTorch/src/add_generator.cpp | 4 +- apps/bgu/Makefile | 4 +- apps/bgu/bgu_generator.cpp | 4 +- apps/bilateral_grid/Makefile | 4 +- .../bilateral_grid_generator.cpp | 2 +- apps/camera_pipe/Makefile | 4 +- apps/camera_pipe/camera_pipe_generator.cpp | 12 +- apps/conv_layer/Makefile | 4 +- apps/conv_layer/conv_layer_generator.cpp | 2 +- apps/depthwise_separable_conv/Makefile | 4 +- .../depthwise_separable_conv_generator.cpp | 4 +- apps/harris/Makefile | 4 +- apps/harris/harris_generator.cpp | 2 +- apps/hist/Makefile | 4 +- apps/hist/hist_generator.cpp | 2 +- apps/iir_blur/Makefile | 4 +- apps/iir_blur/iir_blur_generator.cpp | 4 +- apps/interpolate/Makefile | 4 +- apps/interpolate/interpolate_generator.cpp | 2 +- apps/lens_blur/Makefile | 4 +- apps/lens_blur/lens_blur_generator.cpp | 2 +- apps/linear_blur/linear_blur_generator.cpp | 2 +- apps/linear_blur/linear_to_srgb_generator.cpp | 2 +- apps/linear_blur/simple_blur_generator.cpp | 2 +- apps/linear_blur/srgb_to_linear_generator.cpp | 2 +- apps/local_laplacian/Makefile | 4 +- .../local_laplacian_generator.cpp | 2 +- apps/max_filter/Makefile | 4 +- apps/max_filter/max_filter_generator.cpp | 2 +- 
apps/nl_means/Makefile | 4 +- apps/nl_means/nl_means_generator.cpp | 2 +- apps/resnet_50/Makefile | 2 +- apps/stencil_chain/Makefile | 4 +- .../stencil_chain/stencil_chain_generator.cpp | 2 +- apps/support/autoscheduler.inc | 99 ----------- apps/unsharp/Makefile | 4 +- apps/unsharp/unsharp_generator.cpp | 2 +- cmake/HalideGeneratorHelpers.cmake | 5 +- python_bindings/src/PyHalide.cpp | 4 + python_bindings/src/PyMachineParams.cpp | 2 + python_bindings/src/PyMachineParams.h | 2 + python_bindings/src/PyModule.cpp | 6 +- python_bindings/src/PyPipeline.cpp | 32 +++- python_bindings/todo.txt | 1 - src/AbstractGenerator.cpp | 25 ++- src/AbstractGenerator.h | 2 +- src/Generator.cpp | 166 ++++++++++++++++-- src/Generator.h | 92 ++++++++-- src/Module.cpp | 30 +++- src/Pipeline.cpp | 35 ++++ src/Pipeline.h | 60 ++++++- src/autoschedulers/adams2019/AutoSchedule.cpp | 43 +++-- src/autoschedulers/adams2019/AutoSchedule.h | 2 +- src/autoschedulers/adams2019/Cache.cpp | 2 +- src/autoschedulers/adams2019/Cache.h | 2 +- src/autoschedulers/adams2019/CostModel.h | 10 +- .../adams2019/DefaultCostModel.cpp | 2 +- .../adams2019/DefaultCostModel.h | 8 +- src/autoschedulers/adams2019/FunctionDAG.cpp | 2 +- src/autoschedulers/adams2019/FunctionDAG.h | 4 +- src/autoschedulers/adams2019/LoopNest.cpp | 6 +- src/autoschedulers/adams2019/LoopNest.h | 6 +- src/autoschedulers/adams2019/Makefile | 11 +- src/autoschedulers/adams2019/State.cpp | 10 +- src/autoschedulers/adams2019/State.h | 10 +- src/autoschedulers/adams2019/autotune_loop.sh | 7 +- .../adams2019/cost_model_generator.cpp | 6 +- .../included_schedule_file_generator.cpp | 2 +- src/autoschedulers/adams2019/test.cpp | 70 +++++--- .../adams2019/test_function_dag.cpp | 18 +- .../li2018/GradientAutoscheduler.cpp | 43 ++++- src/autoschedulers/li2018/Makefile | 2 +- src/autoschedulers/li2018/test.cpp | 64 +++++-- src/autoschedulers/li2018/test.py | 5 +- .../mullapudi2016/AutoSchedule.cpp | 61 ++++++- test/auto_schedule/cost_function.cpp | 4 + 
test/auto_schedule/data_dependent.cpp | 4 + test/auto_schedule/extern.cpp | 12 ++ test/auto_schedule/fibonacci.cpp | 4 + test/auto_schedule/histogram.cpp | 4 + test/auto_schedule/large_window.cpp | 4 + test/auto_schedule/mat_mul.cpp | 4 + test/auto_schedule/max_filter.cpp | 4 + test/auto_schedule/multi_output.cpp | 10 +- test/auto_schedule/overlap.cpp | 4 + test/auto_schedule/param.cpp | 16 ++ test/auto_schedule/reorder.cpp | 12 ++ test/auto_schedule/small_pure_update.cpp | 7 +- test/auto_schedule/tile_vs_inline.cpp | 4 + test/auto_schedule/unused_func.cpp | 4 + .../auto_schedule/vectorize_var_in_update.cpp | 4 + test/correctness/custom_auto_scheduler.cpp | 21 ++- test/error/auto_schedule_no_parallel.cpp | 4 + test/error/auto_schedule_no_reorder.cpp | 4 + test/generator/CMakeLists.txt | 15 +- test/generator/alias_aottest.cpp | 36 ++++ test/generator/alias_generator.cpp | 18 ++ test/generator/example_generator.cpp | 2 +- test/generator/stubtest_generator.cpp | 2 +- test/generator/stubtest_jittest.cpp | 4 +- tutorial/CMakeLists.txt | 7 +- .../lesson_21_auto_scheduler_generate.cpp | 52 +++--- 105 files changed, 1016 insertions(+), 399 deletions(-) delete mode 100644 apps/support/autoscheduler.inc diff --git a/Makefile b/Makefile index 45440516c540..97d481012909 100644 --- a/Makefile +++ b/Makefile @@ -1443,6 +1443,18 @@ $(FILTERS_DIR)/alias_with_offset_42.a: $(BIN_DIR)/alias.generator @mkdir -p $(@D) $(CURDIR)/$< -g alias_with_offset_42 -f alias_with_offset_42 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime +$(FILTERS_DIR)/alias_Adams2019.a: $(BIN_DIR)/alias.generator autoschedulers + @mkdir -p $(@D) + $(CURDIR)/$< -g alias_Adams2019 -f alias_Adams2019 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_DIR)/libautoschedule_adams2019.$(SHARED_EXT) + +$(FILTERS_DIR)/alias_Li2018.a: $(BIN_DIR)/alias.generator autoschedulers + @mkdir -p $(@D) + $(CURDIR)/$< -g alias_Li2018 -f alias_Li2018 $(GEN_AOT_OUTPUTS) -o 
$(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_DIR)/libautoschedule_li2018.$(SHARED_EXT) + +$(FILTERS_DIR)/alias_Mullapudi2016.a: $(BIN_DIR)/alias.generator autoschedulers + @mkdir -p $(@D) + $(CURDIR)/$< -g alias_Mullapudi2016 -f alias_Mullapudi2016 $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -p $(BIN_DIR)/libautoschedule_mullapudi2016.$(SHARED_EXT) + METADATA_TESTER_GENERATOR_ARGS=\ input.type=uint8 input.dim=3 \ dim_only_input_buffer.type=uint8 \ @@ -1552,7 +1564,7 @@ $(FILTERS_DIR)/stubtest.a: $(BIN_DIR)/stubtest.generator $(FILTERS_DIR)/stubuser_auto.a: $(BIN_DIR)/stubuser.generator $(BIN_MULLAPUDI2016) @mkdir -p $(@D) - $(CURDIR)/$< -g stubuser $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f stubuser_auto target=$(TARGET)-no_runtime auto_schedule=true -s Mullapudi2016 -p $(BIN_MULLAPUDI2016) + $(CURDIR)/$< -g stubuser $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f stubuser_auto target=$(TARGET)-no_runtime autoscheduler=Mullapudi2016 -p $(BIN_MULLAPUDI2016) $(FILTERS_DIR)/external_code.a: $(BIN_DIR)/external_code.generator @mkdir -p $(@D) @@ -1564,7 +1576,7 @@ $(FILTERS_DIR)/external_code.halide_generated.cpp: $(BIN_DIR)/external_code.gene $(FILTERS_DIR)/autograd_grad.a: $(BIN_DIR)/autograd.generator $(BIN_MULLAPUDI2016) @mkdir -p $(@D) - $(CURDIR)/$< -g autograd $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f autograd_grad target=$(TARGET)-no_runtime auto_schedule=true -s Mullapudi2016 -d 1 -p $(BIN_MULLAPUDI2016) + $(CURDIR)/$< -g autograd $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) -f autograd_grad target=$(TARGET)-no_runtime autoscheduler=Mullapudi2016 -d 1 -p $(BIN_MULLAPUDI2016) # Usually, it's considered best practice to have one Generator per # .cpp file, with the generator-name and filename matching; @@ -1611,12 +1623,13 @@ $(BIN_DIR)/$(TARGET)/generator_aot_sanitizercoverage: $(ROOT_DIR)/test/generator @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter-out %.h,$^) $(GEN_AOT_INCLUDES) 
$(GEN_AOT_LD_FLAGS) -o $@ + # alias has additional deps to link in -$(BIN_DIR)/$(TARGET)/generator_aot_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.a $(FILTERS_DIR)/alias_with_offset_42.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a +$(BIN_DIR)/$(TARGET)/generator_aot_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.a $(FILTERS_DIR)/alias_with_offset_42.a $(FILTERS_DIR)/alias_Adams2019.a $(FILTERS_DIR)/alias_Li2018.a $(FILTERS_DIR)/alias_Mullapudi2016.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ -$(BIN_DIR)/$(TARGET)/generator_aotcpp_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.halide_generated.cpp $(FILTERS_DIR)/alias_with_offset_42.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a +$(BIN_DIR)/$(TARGET)/generator_aotcpp_alias: $(ROOT_DIR)/test/generator/alias_aottest.cpp $(FILTERS_DIR)/alias.halide_generated.cpp $(FILTERS_DIR)/alias_with_offset_42.halide_generated.cpp $(FILTERS_DIR)/alias_Adams2019.halide_generated.cpp $(FILTERS_DIR)/alias_Li2018.halide_generated.cpp $(FILTERS_DIR)/alias_Mullapudi2016.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a @mkdir -p $(@D) $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) -o $@ @@ -1841,13 +1854,17 @@ $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate: $(ROOT_DIR)/tutorial/less $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ -# The values in MachineParams are: +# The values are: # - the maximum level of parallelism available, # - the size of the last-level cache (in bytes), # - the ratio between the cost of a miss at the last level cache and the cost # of arithmetic on the 
target architecture # ...in that order. -LESSON_21_MACHINE_PARAMS = 32,16777216,40 +LESSON_21_AUTOSCHEDULER_PARAMS=\ + autoscheduler=Mullapudi2016 \ + autoscheduler.parallelism=32 \ + autoscheduler.last_level_cache_size=16777216 \ + autoscheduler.balance=40 $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_run: $(ROOT_DIR)/tutorial/lesson_21_auto_scheduler_run.cpp $(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate $(BIN_MULLAPUDI2016) @-mkdir -p $(TMP_DIR) diff --git a/README_cmake.md b/README_cmake.md index c6d971e03c71..0ceb4b931dc9 100644 --- a/README_cmake.md +++ b/README_cmake.md @@ -677,8 +677,7 @@ autoscheduler: ```cmake add_halide_library(my_second_generator FROM my_generators - AUTOSCHEDULER Halide::Adams2019 - PARAMS auto_schedule=true) + AUTOSCHEDULER Halide::Adams2019) ``` ### RunGenMain @@ -858,9 +857,9 @@ being created. When `TARGETS` is empty and the `host` target would not cross-compile, then `host` will be used. Otherwise, `cmake` will be used and an author warning will be issued. -To set the default autoscheduler, set the `AUTOSCHEDULER` argument to a target +To use an autoscheduler, set the `AUTOSCHEDULER` argument to a target named like `Namespace::Scheduler`, for example `Halide::Adams19`. This will set -the `-s` flag on the generator command line to `Scheduler` and add the target to +the `autoscheduler` GeneratorParam on the generator command line to `Scheduler` and add the target to the list of plugins. Additional plugins can be loaded by setting the `PLUGINS` argument. If the argument to `AUTOSCHEDULER` does not contain `::` or it does not name a target, it will be passed to the `-s` flag verbatim. 
diff --git a/apps/HelloPyTorch/Makefile b/apps/HelloPyTorch/Makefile index 15dd231b99de..c05d2826a475 100644 --- a/apps/HelloPyTorch/Makefile +++ b/apps/HelloPyTorch/Makefile @@ -84,8 +84,7 @@ $(BIN)/%/add_float32.a: $(GENERATOR_BIN)/add.generator -f add_float32 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$* \ - auto_schedule=false + target=$* $(BIN)/%/add_halidegrad_float32.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -95,11 +94,10 @@ $(BIN)/%/add_halidegrad_float32.a: $(GENERATOR_BIN)/add.generator -f add_halidegrad_float32 \ -e static_library,c_header,pytorch_wrapper \ -p $(HALIDE_DISTRIB_PATH)/lib/libautoschedule_li2018.so \ - -s Li2018 \ -o $(@D) \ -d 1 \ target=$* \ - auto_schedule=true + autoscheduler=Li2018 $(BIN)/%/add_grad_float32.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -109,8 +107,7 @@ $(BIN)/%/add_grad_float32.a: $(GENERATOR_BIN)/add.generator -f add_grad_float32 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$* \ - auto_schedule=false + target=$* $(BIN)/%/add_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -120,8 +117,7 @@ $(BIN)/%/add_float64.a: $(GENERATOR_BIN)/add.generator -f add_float64 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$* \ - auto_schedule=false + target=$* $(BIN)/%/add_halidegrad_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -132,11 +128,10 @@ $(BIN)/%/add_halidegrad_float64.a: $(GENERATOR_BIN)/add.generator -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ -p $(HALIDE_DISTRIB_PATH)/lib/libautoschedule_li2018.so \ - -s Li2018 \ target=$* \ -d 1 \ target=$* \ - auto_schedule=true + autoscheduler=Li2018 $(BIN)/%/add_grad_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -146,8 +141,7 @@ $(BIN)/%/add_grad_float64.a: $(GENERATOR_BIN)/add.generator -f add_grad_float64 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$* \ - auto_schedule=false + target=$* # 
----------------------------------------------------------------------------- @@ -160,8 +154,7 @@ $(BIN)/%/add_cuda_float32.a: $(GENERATOR_BIN)/add.generator -f add_cuda_float32 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$(CUDA_TARGET) \ - auto_schedule=false + target=$(CUDA_TARGET) $(BIN)/%/add_halidegrad_cuda_float32.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -172,10 +165,9 @@ $(BIN)/%/add_halidegrad_cuda_float32.a: $(GENERATOR_BIN)/add.generator -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ -p $(HALIDE_DISTRIB_PATH)/lib/libautoschedule_li2018.so \ - -s Li2018 \ -d 1 \ target=$(CUDA_TARGET) \ - auto_schedule=true + autoscheduler=Li2018 $(BIN)/%/add_grad_cuda_float32.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -185,8 +177,7 @@ $(BIN)/%/add_grad_cuda_float32.a: $(GENERATOR_BIN)/add.generator -f add_grad_cuda_float32 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$(CUDA_TARGET) \ - auto_schedule=false + target=$(CUDA_TARGET) $(BIN)/%/add_cuda_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -196,8 +187,7 @@ $(BIN)/%/add_cuda_float64.a: $(GENERATOR_BIN)/add.generator -f add_cuda_float64 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$(CUDA_TARGET) \ - auto_schedule=false + target=$(CUDA_TARGET) $(BIN)/%/add_halidegrad_cuda_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -208,10 +198,9 @@ $(BIN)/%/add_halidegrad_cuda_float64.a: $(GENERATOR_BIN)/add.generator -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ -p $(HALIDE_DISTRIB_PATH)/lib/libautoschedule_li2018.so \ - -s Li2018 \ -d 1 \ target=$(CUDA_TARGET) \ - auto_schedule=true + autoscheduler=Li2018 $(BIN)/%/add_grad_cuda_float64.a: $(GENERATOR_BIN)/add.generator @mkdir -p $(@D) @@ -221,8 +210,7 @@ $(BIN)/%/add_grad_cuda_float64.a: $(GENERATOR_BIN)/add.generator -f add_grad_cuda_float64 \ -e static_library,c_header,pytorch_wrapper \ -o $(@D) \ - target=$(CUDA_TARGET) \ - auto_schedule=false + 
target=$(CUDA_TARGET) # ----------------------------------------------------------------------------- diff --git a/apps/HelloPyTorch/src/add_generator.cpp b/apps/HelloPyTorch/src/add_generator.cpp index 8f2d8f4d6a81..ccfaa937d5e9 100644 --- a/apps/HelloPyTorch/src/add_generator.cpp +++ b/apps/HelloPyTorch/src/add_generator.cpp @@ -30,7 +30,7 @@ class AddGenerator : public Generator { output.set_estimates({{0, kEdge}, {0, kEdge}, {0, kEdge}, {0, kEdge}}); // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { Var tx("tx"), xy("xy"), cn("cn"), allvars("allvars"); if (get_target().has_gpu_feature()) { output @@ -84,7 +84,7 @@ class AddGradGenerator : public Generator { d_input_b.set_estimates({{0, kEdge}, {0, kEdge}, {0, kEdge}, {0, kEdge}}); // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { Var tx("tx"), xy("xy"), cn("cn"), allvars("allvars"); if (get_target().has_gpu_feature()) { diff --git a/apps/bgu/Makefile b/apps/bgu/Makefile index 297ceaee90b0..8eb687ec064a 100644 --- a/apps/bgu/Makefile +++ b/apps/bgu/Makefile @@ -16,11 +16,11 @@ $(GENERATOR_BIN)/bgu.generator: bgu_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/bgu.a: $(GENERATOR_BIN)/bgu.generator @mkdir -p $(@D) - $< -g bgu -f bgu -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g bgu -f bgu -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/bgu_auto_schedule.a: $(GENERATOR_BIN)/bgu.generator @mkdir -p $(@D) - $< -g bgu -f bgu_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g bgu -f bgu_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/bgu.generator @mkdir -p $(@D) diff --git a/apps/bgu/bgu_generator.cpp b/apps/bgu/bgu_generator.cpp index 054df3e52ba6..1b2cff5b1dc7 100644 --- a/apps/bgu/bgu_generator.cpp +++ b/apps/bgu/bgu_generator.cpp @@ -430,7 +430,7 @@ class BGU : public Generator { b(2, 2) += weighted_lambda * gain; // Now solve Ax = b - Matrix<3, 4> result = transpose(solve_symmetric(A, 
b, line, x, auto_schedule, get_target())); + Matrix<3, 4> result = transpose(solve_symmetric(A, b, line, x, using_autoscheduler(), get_target())); // Pack the resulting matrix into the output Func. line(x, y, z, c) = pack_channels(c, {result(0, 0), @@ -509,7 +509,7 @@ class BGU : public Generator { output = slice; // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { if (!get_target().has_gpu_feature()) { // 7.09 ms on an Intel i9-9960X using 16 threads // diff --git a/apps/bilateral_grid/Makefile b/apps/bilateral_grid/Makefile index 405d3e3c6782..11d79fbcd946 100644 --- a/apps/bilateral_grid/Makefile +++ b/apps/bilateral_grid/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/bilateral_grid.generator: bilateral_grid_generator.cpp $(GENERA $(BIN)/%/bilateral_grid.a: $(GENERATOR_BIN)/bilateral_grid.generator @mkdir -p $(@D) - $^ -g bilateral_grid -e $(GENERATOR_OUTPUTS) -o $(@D) -f bilateral_grid target=$* auto_schedule=false + $^ -g bilateral_grid -e $(GENERATOR_OUTPUTS) -o $(@D) -f bilateral_grid target=$* $(BIN)/%/bilateral_grid_auto_schedule.a: $(GENERATOR_BIN)/bilateral_grid.generator @mkdir -p $(@D) - $^ -g bilateral_grid -e $(GENERATOR_OUTPUTS) -o $(@D) -f bilateral_grid_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g bilateral_grid -e $(GENERATOR_OUTPUTS) -o $(@D) -f bilateral_grid_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/filter: filter.cpp $(BIN)/%/bilateral_grid.a $(BIN)/%/bilateral_grid_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/bilateral_grid/bilateral_grid_generator.cpp b/apps/bilateral_grid/bilateral_grid_generator.cpp index ede57459d5ab..b1e07fb15cdf 100644 --- a/apps/bilateral_grid/bilateral_grid_generator.cpp +++ b/apps/bilateral_grid/bilateral_grid_generator.cpp @@ -80,7 +80,7 @@ class BilateralGrid : public Halide::Generator { blury.set_estimate(z, 0, 12); bilateral_grid.set_estimates({{0, 1536}, {0, 2560}}); - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if 
(get_target().has_gpu_feature()) { // 0.50ms on an RTX 2060 diff --git a/apps/camera_pipe/Makefile b/apps/camera_pipe/Makefile index 38f984d2af3e..b86698cd36ed 100644 --- a/apps/camera_pipe/Makefile +++ b/apps/camera_pipe/Makefile @@ -12,11 +12,11 @@ $(GENERATOR_BIN)/camera_pipe.generator: camera_pipe_generator.cpp $(GENERATOR_DE $(BIN)/%/camera_pipe.a: $(GENERATOR_BIN)/camera_pipe.generator @mkdir -p $(@D) - $^ -g camera_pipe -e $(GENERATOR_OUTPUTS) -o $(@D) -f camera_pipe target=$* auto_schedule=false + $^ -g camera_pipe -e $(GENERATOR_OUTPUTS) -o $(@D) -f camera_pipe target=$* $(BIN)/%/camera_pipe_auto_schedule.a: $(GENERATOR_BIN)/camera_pipe.generator @mkdir -p $(@D) - $^ -g camera_pipe -e $(GENERATOR_OUTPUTS) -o $(@D) -f camera_pipe_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g camera_pipe -e $(GENERATOR_OUTPUTS) -o $(@D) -f camera_pipe_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/camera_pipe.a $(BIN)/%/camera_pipe_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/camera_pipe/camera_pipe_generator.cpp b/apps/camera_pipe/camera_pipe_generator.cpp index ec0323676cd4..06251f5691bb 100644 --- a/apps/camera_pipe/camera_pipe_generator.cpp +++ b/apps/camera_pipe/camera_pipe_generator.cpp @@ -154,7 +154,7 @@ class Demosaic : public Halide::Generator { void schedule() { Pipeline p(output); - if (auto_schedule) { + if (using_autoscheduler()) { // blank } else if (get_target().has_gpu_feature()) { Var xi, yi; @@ -270,7 +270,7 @@ Func CameraPipe::color_correct(Func input) { Expr val = (matrix_3200(x, y) * alpha + matrix_7000(x, y) * (1 - alpha)); matrix(x, y) = cast(val * 256.0f); // Q8.8 fixed point - if (!auto_schedule) { + if (!using_autoscheduler()) { matrix.compute_root(); if (get_target().has_gpu_feature()) { matrix.gpu_single_thread(); @@ -331,7 +331,7 @@ Func CameraPipe::apply_curve(Func input) { // makeLUT add guard band outside of (minRaw, maxRaw]: curve(x) = select(x <= minRaw, 0, select(x 
> maxRaw, 255, val)); - if (!auto_schedule) { + if (!using_autoscheduler()) { // It's a LUT, compute it once ahead of time. curve.compute_root(); if (get_target().has_gpu_feature()) { @@ -370,7 +370,7 @@ Func CameraPipe::sharpen(Func input) { // Convert the sharpening strength to 2.5 fixed point. This allows sharpening in the range [0, 4]. Func sharpen_strength_x32("sharpen_strength_x32"); sharpen_strength_x32() = u8_sat(sharpen_strength * 32); - if (!auto_schedule) { + if (!using_autoscheduler()) { sharpen_strength_x32.compute_root(); if (get_target().has_gpu_feature()) { sharpen_strength_x32.gpu_single_thread(); @@ -439,12 +439,12 @@ void CameraPipe::generate() { processed.set_estimates({{0, 2592}, {0, 1968}, {0, 3}}); // Schedule - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if (get_target().has_gpu_feature()) { // We can generate slightly better code if we know the output is even-sized - if (!auto_schedule) { + if (!using_autoscheduler()) { // TODO: The autoscheduler really ought to be able to // accommodate bounds on the output Func. 
Expr out_width = processed.width(); diff --git a/apps/conv_layer/Makefile b/apps/conv_layer/Makefile index 2ac64101691f..43db9f9ee70a 100644 --- a/apps/conv_layer/Makefile +++ b/apps/conv_layer/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/conv_layer.generator: conv_layer_generator.cpp $(GENERATOR_DEPS $(BIN)/%/conv_layer.a: $(GENERATOR_BIN)/conv_layer.generator @mkdir -p $(@D) - $^ -g conv_layer -e $(GENERATOR_OUTPUTS) -o $(@D) -f conv_layer target=$* auto_schedule=false + $^ -g conv_layer -e $(GENERATOR_OUTPUTS) -o $(@D) -f conv_layer target=$* $(BIN)/%/conv_layer_auto_schedule.a: $(GENERATOR_BIN)/conv_layer.generator @mkdir -p $(@D) - $^ -g conv_layer -e $(GENERATOR_OUTPUTS) -o $(@D) -f conv_layer_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g conv_layer -e $(GENERATOR_OUTPUTS) -o $(@D) -f conv_layer_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/conv_layer.a $(BIN)/%/conv_layer_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/conv_layer/conv_layer_generator.cpp b/apps/conv_layer/conv_layer_generator.cpp index 5b6ff1ee5e10..a27d367a076d 100644 --- a/apps/conv_layer/conv_layer_generator.cpp +++ b/apps/conv_layer/conv_layer_generator.cpp @@ -49,7 +49,7 @@ class ConvolutionLayer : public Halide::Generator { bias.dim(0).set_bounds(0, CO).set_stride(1); - if (auto_schedule) { + if (using_autoscheduler()) { input.dim(0).set_estimate(0, CI); input.dim(1).set_estimate(0, W + 2); input.dim(2).set_estimate(0, H + 2); diff --git a/apps/depthwise_separable_conv/Makefile b/apps/depthwise_separable_conv/Makefile index def2146eb3f6..001e12444809 100644 --- a/apps/depthwise_separable_conv/Makefile +++ b/apps/depthwise_separable_conv/Makefile @@ -8,11 +8,11 @@ $(GENERATOR_BIN)/depthwise_separable_conv.generator: depthwise_separable_conv_ge $(BIN)/%/depthwise_separable_conv.a: $(GENERATOR_BIN)/depthwise_separable_conv.generator @mkdir -p $(@D) - $^ -g depthwise_separable_conv -e $(GENERATOR_OUTPUTS) -o $(@D) 
-f depthwise_separable_conv target=$* auto_schedule=false + $^ -g depthwise_separable_conv -e $(GENERATOR_OUTPUTS) -o $(@D) -f depthwise_separable_conv target=$* $(BIN)/%/depthwise_separable_conv_auto_schedule.a: $(GENERATOR_BIN)/depthwise_separable_conv.generator @mkdir -p $(@D) - $^ -g depthwise_separable_conv -e $(GENERATOR_OUTPUTS) -o $(@D) -f depthwise_separable_conv_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g depthwise_separable_conv -e $(GENERATOR_OUTPUTS) -o $(@D) -f depthwise_separable_conv_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/depthwise_separable_conv.a $(BIN)/%/depthwise_separable_conv_auto_schedule.a @-mkdir -p $(BIN) diff --git a/apps/depthwise_separable_conv/depthwise_separable_conv_generator.cpp b/apps/depthwise_separable_conv/depthwise_separable_conv_generator.cpp index d560a8bea376..ba230ee03653 100644 --- a/apps/depthwise_separable_conv/depthwise_separable_conv_generator.cpp +++ b/apps/depthwise_separable_conv/depthwise_separable_conv_generator.cpp @@ -74,7 +74,7 @@ class DepthwiseSeparableConvolution : public Generator { } // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { Var xi("xi"), yi("yi"); if (get_target().has_gpu_feature()) { // 0.253ms on a 2060 RTX diff --git a/apps/hist/Makefile b/apps/hist/Makefile index 5f4faa1b835a..b0843bda1fb0 100644 --- a/apps/hist/Makefile +++ b/apps/hist/Makefile @@ -12,11 +12,11 @@ $(GENERATOR_BIN)/hist.generator: hist_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/hist.a: $(GENERATOR_BIN)/hist.generator @mkdir -p $(@D) - $< -g hist -f hist -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g hist -f hist -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/hist_auto_schedule.a: $(GENERATOR_BIN)/hist.generator @mkdir -p $(@D) - $< -g hist -f hist_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g hist -f hist_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 
$(BIN)/%/runtime.a: $(GENERATOR_BIN)/hist.generator @mkdir -p $(@D) diff --git a/apps/hist/hist_generator.cpp b/apps/hist/hist_generator.cpp index e3d5de7f5737..32d86d3d0186 100644 --- a/apps/hist/hist_generator.cpp +++ b/apps/hist/hist_generator.cpp @@ -64,7 +64,7 @@ class Hist : public Halide::Generator { } // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { cdf.bound(x, 0, 256); Var xi("xi"), yi("yi"); diff --git a/apps/iir_blur/Makefile b/apps/iir_blur/Makefile index 8c9983c8fa14..49104b3e5fa3 100644 --- a/apps/iir_blur/Makefile +++ b/apps/iir_blur/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/iir_blur.generator: iir_blur_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/iir_blur.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) - $< -g iir_blur -f iir_blur -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g iir_blur -f iir_blur -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/iir_blur_auto_schedule.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) - $< -g iir_blur -f iir_blur_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g iir_blur -f iir_blur_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/iir_blur.generator @mkdir -p $(@D) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 59ef065e79e6..1aeb3e0d1a5f 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -145,10 +145,10 @@ class IirBlur : public Generator { Expr height = input.height(); // First, blur the columns of the input. - Func blury_T = blur_cols_transpose(input, height, alpha, auto_schedule, get_target()); + Func blury_T = blur_cols_transpose(input, height, alpha, using_autoscheduler(), get_target()); // Blur the columns again (the rows of the original). 
- Func blur = blur_cols_transpose(blury_T, width, alpha, auto_schedule, get_target()); + Func blur = blur_cols_transpose(blury_T, width, alpha, using_autoscheduler(), get_target()); // Scheduling is done inside blur_cols_transpose. output = blur; diff --git a/apps/interpolate/Makefile b/apps/interpolate/Makefile index 8e55e16a1283..95c165b533ee 100644 --- a/apps/interpolate/Makefile +++ b/apps/interpolate/Makefile @@ -12,11 +12,11 @@ $(GENERATOR_BIN)/interpolate.generator: interpolate_generator.cpp $(GENERATOR_DE $(BIN)/%/interpolate.a: $(GENERATOR_BIN)/interpolate.generator @mkdir -p $(@D) - $< -g interpolate -e $(GENERATOR_OUTPUTS) -f interpolate -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g interpolate -e $(GENERATOR_OUTPUTS) -f interpolate -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/interpolate_auto_schedule.a: $(GENERATOR_BIN)/interpolate.generator @mkdir -p $(@D) - $< -g interpolate -e $(GENERATOR_OUTPUTS) -f interpolate_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g interpolate -e $(GENERATOR_OUTPUTS) -f interpolate_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/interpolate.generator @mkdir -p $(@D) diff --git a/apps/interpolate/interpolate_generator.cpp b/apps/interpolate/interpolate_generator.cpp index 58d6d65374eb..1e4026b9ef87 100644 --- a/apps/interpolate/interpolate_generator.cpp +++ b/apps/interpolate/interpolate_generator.cpp @@ -72,7 +72,7 @@ class Interpolate : public Halide::Generator { normalize(x, y, c) = interpolated[0](x, y, c) / interpolated[0](x, y, 3); // Schedule - if (auto_schedule) { + if (using_autoscheduler()) { output = normalize; } else { // 0.86ms on a 2060 RTX diff --git a/apps/lens_blur/Makefile b/apps/lens_blur/Makefile index 8ede6b797ffe..c5c424c82edf 100644 --- a/apps/lens_blur/Makefile +++ b/apps/lens_blur/Makefile @@ -11,11 +11,11 @@ $(GENERATOR_BIN)/lens_blur.generator: lens_blur_generator.cpp $(GENERATOR_DEPS) 
$(BIN)/%/lens_blur.a: $(GENERATOR_BIN)/lens_blur.generator @mkdir -p $(@D) - $^ -g lens_blur -e $(GENERATOR_OUTPUTS) -o $(@D) -f lens_blur target=$* auto_schedule=false + $^ -g lens_blur -e $(GENERATOR_OUTPUTS) -o $(@D) -f lens_blur target=$* $(BIN)/%/lens_blur_auto_schedule.a: $(GENERATOR_BIN)/lens_blur.generator @mkdir -p $(@D) - $^ -g lens_blur -e $(GENERATOR_OUTPUTS) -o $(@D) -f lens_blur_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g lens_blur -e $(GENERATOR_OUTPUTS) -o $(@D) -f lens_blur_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/lens_blur.a $(BIN)/%/lens_blur_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/lens_blur/lens_blur_generator.cpp b/apps/lens_blur/lens_blur_generator.cpp index 52fad46cb82b..14aa92c876f2 100644 --- a/apps/lens_blur/lens_blur_generator.cpp +++ b/apps/lens_blur/lens_blur_generator.cpp @@ -166,7 +166,7 @@ class LensBlur : public Halide::Generator { final.set_estimates({{0, 192}, {0, 320}, {0, 3}}); /* THE SCHEDULE */ - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if (get_target().has_gpu_feature()) { // Manual GPU schedule diff --git a/apps/linear_blur/linear_blur_generator.cpp b/apps/linear_blur/linear_blur_generator.cpp index 9b18e4b4bd3d..ec9db2e8097b 100644 --- a/apps/linear_blur/linear_blur_generator.cpp +++ b/apps/linear_blur/linear_blur_generator.cpp @@ -17,7 +17,7 @@ struct LinearBlur : public Halide::Generator { Func srgb = linear_to_srgb::generate(this, {blurred}); output(x, y, c) = srgb(x, y, c); - if (auto_schedule) { + if (using_autoscheduler()) { input.set_estimates({{0, 1536}, {0, 2560}, {0, 4}}); output.set_estimates({{0, 1536}, {0, 2560}, {0, 4}}); } else { diff --git a/apps/linear_blur/linear_to_srgb_generator.cpp b/apps/linear_blur/linear_to_srgb_generator.cpp index adf7b9426712..a45285e3b5a8 100644 --- a/apps/linear_blur/linear_to_srgb_generator.cpp +++ b/apps/linear_blur/linear_to_srgb_generator.cpp @@ -17,7 +17,7 @@ 
struct LinearTosRGB : public Halide::Generator { } void schedule() { - if (auto_schedule) { + if (using_autoscheduler()) { const int W = 1536, H = 2560, C = 4; // Wart: Input are defined with Vars we don't know. // Might be x,y but might be _0,_1. Use the args() to work around. diff --git a/apps/linear_blur/simple_blur_generator.cpp b/apps/linear_blur/simple_blur_generator.cpp index a53a3e26c426..78d23ae253cd 100644 --- a/apps/linear_blur/simple_blur_generator.cpp +++ b/apps/linear_blur/simple_blur_generator.cpp @@ -22,7 +22,7 @@ struct SimpleBlur : public Halide::Generator { } void schedule() { - if (auto_schedule) { + if (using_autoscheduler()) { const int W = 1536, H = 2560, C = 4; // Wart: Input are defined with Vars we don't know. // Might be x,y but might be _0,_1. Use the args() to work around. diff --git a/apps/linear_blur/srgb_to_linear_generator.cpp b/apps/linear_blur/srgb_to_linear_generator.cpp index b03907463c83..95cf203ada85 100644 --- a/apps/linear_blur/srgb_to_linear_generator.cpp +++ b/apps/linear_blur/srgb_to_linear_generator.cpp @@ -17,7 +17,7 @@ struct sRGBToLinear : public Halide::Generator { } void schedule() { - if (auto_schedule) { + if (using_autoscheduler()) { const int W = 1536, H = 2560, C = 4; // Wart: Input are defined with Vars we don't know. // Might be x,y but might be _0,_1. Use the args() to work around. 
diff --git a/apps/local_laplacian/Makefile b/apps/local_laplacian/Makefile index 21fa7bf74f6b..a9f57b4de81a 100644 --- a/apps/local_laplacian/Makefile +++ b/apps/local_laplacian/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/local_laplacian.generator: local_laplacian_generator.cpp $(GENE $(BIN)/%/local_laplacian.a: $(GENERATOR_BIN)/local_laplacian.generator @mkdir -p $(@D) - $^ -g local_laplacian -e $(GENERATOR_OUTPUTS) -o $(@D) -f local_laplacian target=$* auto_schedule=false + $^ -g local_laplacian -e $(GENERATOR_OUTPUTS) -o $(@D) -f local_laplacian target=$* $(BIN)/%/local_laplacian_auto_schedule.a: $(GENERATOR_BIN)/local_laplacian.generator @mkdir -p $(@D) - $^ -g local_laplacian -e $(GENERATOR_OUTPUTS) -o $(@D) -f local_laplacian_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g local_laplacian -e $(GENERATOR_OUTPUTS) -o $(@D) -f local_laplacian_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/local_laplacian.a $(BIN)/%/local_laplacian_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index b1c697a2a3b7..ee6e7dc09c57 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -98,7 +98,7 @@ class LocalLaplacian : public Halide::Generator { output.set_estimates({{0, 1536}, {0, 2560}, {0, 3}}); /* THE SCHEDULE */ - if (auto_schedule) { + if (using_autoscheduler()) { // Nothing. } else if (get_target().has_gpu_feature()) { // GPU schedule. 
diff --git a/apps/max_filter/Makefile b/apps/max_filter/Makefile index bd755774b2f5..ec7fdc7e0739 100644 --- a/apps/max_filter/Makefile +++ b/apps/max_filter/Makefile @@ -12,11 +12,11 @@ $(GENERATOR_BIN)/max_filter.generator: max_filter_generator.cpp $(GENERATOR_DEPS $(BIN)/%/max_filter.a: $(GENERATOR_BIN)/max_filter.generator @mkdir -p $(@D) - $< -g max_filter -f max_filter -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g max_filter -f max_filter -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/max_filter_auto_schedule.a: $(GENERATOR_BIN)/max_filter.generator @mkdir -p $(@D) - $< -g max_filter -f max_filter_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g max_filter -f max_filter_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/max_filter.generator @mkdir -p $(@D) diff --git a/apps/max_filter/max_filter_generator.cpp b/apps/max_filter/max_filter_generator.cpp index 02856a5e4604..bfe0c9457e23 100644 --- a/apps/max_filter/max_filter_generator.cpp +++ b/apps/max_filter/max_filter_generator.cpp @@ -64,7 +64,7 @@ class Max : public Halide::Generator { } // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { if (get_target().has_gpu_feature()) { // 11.8ms on a 2060 RTX diff --git a/apps/nl_means/Makefile b/apps/nl_means/Makefile index 2c7fecdccc47..109cb5af13f7 100644 --- a/apps/nl_means/Makefile +++ b/apps/nl_means/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/nl_means.generator: nl_means_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/nl_means.a: $(GENERATOR_BIN)/nl_means.generator @mkdir -p $(@D) - $^ -g nl_means -e $(GENERATOR_OUTPUTS) -o $(@D) -f nl_means target=$* auto_schedule=false + $^ -g nl_means -e $(GENERATOR_OUTPUTS) -o $(@D) -f nl_means target=$* $(BIN)/%/nl_means_auto_schedule.a: $(GENERATOR_BIN)/nl_means.generator @mkdir -p $(@D) - $^ -g nl_means -e $(GENERATOR_OUTPUTS) -o $(@D) -f nl_means_auto_schedule target=$*-no_runtime auto_schedule=true + $^ -g 
nl_means -e $(GENERATOR_OUTPUTS) -o $(@D) -f nl_means_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/nl_means.a $(BIN)/%/nl_means_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/nl_means/nl_means_generator.cpp b/apps/nl_means/nl_means_generator.cpp index ec51844119ed..5b3e136111ff 100644 --- a/apps/nl_means/nl_means_generator.cpp +++ b/apps/nl_means/nl_means_generator.cpp @@ -81,7 +81,7 @@ class NonLocalMeans : public Halide::Generator { // Provide estimates on the output pipeline non_local_means.set_estimates({{0, 1536}, {0, 2560}, {0, 3}}); - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if (get_target().has_gpu_feature()) { // 22 ms on a 2060 RTX diff --git a/apps/resnet_50/Makefile b/apps/resnet_50/Makefile index 3d1dd30c9ce8..5303bd06e449 100644 --- a/apps/resnet_50/Makefile +++ b/apps/resnet_50/Makefile @@ -17,7 +17,7 @@ $(GENERATOR_BIN)/resnet50.generator: Resnet50Generator.cpp $(GENERATOR_DEPS) $(BIN)/%/resnet50.a: $(GENERATOR_BIN)/resnet50.generator @mkdir -p $(@D) - $^ -g resnet50 -o $(@D) -f resnet50 target=$* auto_schedule=false + $^ -g resnet50 -o $(@D) -f resnet50 target=$* $(BIN)/%/process: process.cpp $(BIN)/%/resnet50.a @mkdir -p $(@D) diff --git a/apps/stencil_chain/Makefile b/apps/stencil_chain/Makefile index 116922d03095..4c2706e66cd5 100644 --- a/apps/stencil_chain/Makefile +++ b/apps/stencil_chain/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/stencil_chain.generator: stencil_chain_generator.cpp $(GENERATO $(BIN)/%/stencil_chain.a: $(GENERATOR_BIN)/stencil_chain.generator @mkdir -p $(@D) - $^ -g stencil_chain -e $(GENERATOR_OUTPUTS) -o $(@D) -f stencil_chain target=$* auto_schedule=false + $^ -g stencil_chain -e $(GENERATOR_OUTPUTS) -o $(@D) -f stencil_chain target=$* $(BIN)/%/stencil_chain_auto_schedule.a: $(GENERATOR_BIN)/stencil_chain.generator @mkdir -p $(@D) - $^ -g stencil_chain -e $(GENERATOR_OUTPUTS) -o $(@D) -f stencil_chain_auto_schedule 
target=$*-no_runtime auto_schedule=true + $^ -g stencil_chain -e $(GENERATOR_OUTPUTS) -o $(@D) -f stencil_chain_auto_schedule target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/process: process.cpp $(BIN)/%/stencil_chain.a $(BIN)/%/stencil_chain_auto_schedule.a @mkdir -p $(@D) diff --git a/apps/stencil_chain/stencil_chain_generator.cpp b/apps/stencil_chain/stencil_chain_generator.cpp index ebe07d51bdba..f62f269d6146 100644 --- a/apps/stencil_chain/stencil_chain_generator.cpp +++ b/apps/stencil_chain/stencil_chain_generator.cpp @@ -45,7 +45,7 @@ class StencilChain : public Halide::Generator { output.set_estimates({{0, width}, {0, height}}); } - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else if (get_target().has_gpu_feature()) { // GPU schedule diff --git a/apps/support/autoscheduler.inc b/apps/support/autoscheduler.inc deleted file mode 100644 index fc3aeb8f1876..000000000000 --- a/apps/support/autoscheduler.inc +++ /dev/null @@ -1,99 +0,0 @@ -ifndef BIN -$(error BIN must be set prior to including autoscheduler.inc) -endif - -AUTOSCHED_SRC ?= $(realpath ../autoscheduler) - -# Default to $(BIN) so that the toplevel Makefile can put all build products -# into the build products directory (rather than into the source tree) -AUTOSCHED_BIN ?= $(BIN) -AUTOSCHED_SAMPLES_OUT ?= $(AUTOSCHED_SRC)/samples - -AUTOSCHED_WEIGHT_OBJECTS=$(AUTOSCHED_BIN)/baseline_weights.o - -# TODO(srj): depending on something not in the distrib folder isn't strictly -# kosher, but this is still experimental -$(AUTOSCHED_BIN)/binary2cpp: ../../tools/binary2cpp.cpp - @mkdir -p $(@D) - $(CXX) $< -o $@ - -$(AUTOSCHED_BIN)/baseline_weights.cpp: $(AUTOSCHED_BIN)/binary2cpp $(AUTOSCHED_SRC)/baseline.weights - @mkdir -p $(@D) - $(AUTOSCHED_BIN)/binary2cpp baseline_weights < $(AUTOSCHED_SRC)/baseline.weights > $@ - -$(AUTOSCHED_BIN)/baseline_weights.o: $(AUTOSCHED_BIN)/baseline_weights.cpp - $(CXX) -c $< -o $@ - -AUTOSCHED_COST_MODEL_LIBS=\ 
-$(AUTOSCHED_BIN)/cost_model/cost_model.a \ -$(AUTOSCHED_BIN)/cost_model/train_cost_model.a \ - -$(AUTOSCHED_BIN)/cost_model.generator: $(AUTOSCHED_SRC)/cost_model_generator.cpp \ - $(AUTOSCHED_SRC)/cost_model_schedule.h \ - $(AUTOSCHED_SRC)/NetworkSize.h \ - $(GENERATOR_DEPS) - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_EXPORT_DYNAMIC) - -$(AUTOSCHED_BIN)/auto_schedule_runtime.a: $(AUTOSCHED_BIN)/cost_model.generator - @mkdir -p $(@D) - $^ -r auto_schedule_runtime -o $(AUTOSCHED_BIN) target=$(HL_TARGET) - -$(AUTOSCHED_BIN)/cost_model/%.a: $(AUTOSCHED_BIN)/cost_model.generator - @mkdir -p $(@D) - $^ -g $* -o $(AUTOSCHED_BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime auto_schedule=false -e stmt,static_library,h,assembly - -# It's important to use dynamic lookups for undefined symbols here: all of libHalide -# is expected to be present (in the loading binary), so we explicitly make the symbols -# undefined rather than dependent on libHalide.so. 
-$(AUTOSCHED_BIN)/libauto_schedule.so: $(AUTOSCHED_SRC)/AutoSchedule.cpp \ - $(AUTOSCHED_SRC)/ASLog.cpp \ - $(AUTOSCHED_SRC)/DefaultCostModel.h \ - $(AUTOSCHED_SRC)/DefaultCostModel.cpp \ - $(AUTOSCHED_SRC)/Weights.h \ - $(AUTOSCHED_SRC)/Weights.cpp \ - $(AUTOSCHED_SRC)/FunctionDAG.h \ - $(AUTOSCHED_SRC)/FunctionDAG.cpp \ - $(AUTOSCHED_SRC)/LoopNest.h \ - $(AUTOSCHED_SRC)/LoopNest.cpp \ - $(AUTOSCHED_SRC)/Featurization.h \ - $(AUTOSCHED_SRC)/CostModel.h \ - $(AUTOSCHED_SRC)/PerfectHashMap.h \ - $(AUTOSCHED_WEIGHT_OBJECTS) \ - $(AUTOSCHED_COST_MODEL_LIBS) \ - $(GENERATOR_DEPS) \ - $(AUTOSCHED_BIN)/auto_schedule_runtime.a - @mkdir -p $(@D) - $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) -I $(AUTOSCHED_BIN)/cost_model $(filter-out %.h $(LIBHALIDE_LDFLAGS),$^) -o $@ $(HALIDE_SYSTEM_LIBS) - -$(AUTOSCHED_BIN)/retrain_cost_model: $(AUTOSCHED_SRC)/retrain_cost_model.cpp \ - $(AUTOSCHED_SRC)/ASLog.cpp \ - $(AUTOSCHED_SRC)/DefaultCostModel.h \ - $(AUTOSCHED_SRC)/DefaultCostModel.cpp \ - $(AUTOSCHED_SRC)/Weights.h \ - $(AUTOSCHED_SRC)/Weights.cpp \ - $(AUTOSCHED_SRC)/CostModel.h \ - $(AUTOSCHED_SRC)/NetworkSize.h \ - $(AUTOSCHED_COST_MODEL_LIBS) \ - $(AUTOSCHED_WEIGHT_OBJECTS) \ - $(AUTOSCHED_BIN)/auto_schedule_runtime.a - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) -frtti -Wall -I ../support -I $(AUTOSCHED_BIN)/cost_model $(OPTIMIZE) $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(USE_OPEN_MP) - -$(AUTOSCHED_BIN)/featurization_to_sample: $(AUTOSCHED_SRC)/featurization_to_sample.cpp - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $< $(OPTIMIZE) -o $@ - -$(AUTOSCHED_BIN)/get_host_target: $(AUTOSCHED_SRC)/get_host_target.cpp $(LIB_HALIDE) $(HALIDE_DISTRIB_PATH)/include/Halide.h - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) $(LIBHALIDE_LDFLAGS) $(OPTIMIZE) -o $@ - -$(AUTOSCHED_BIN)/weightsdir_to_weightsfile: $(AUTOSCHED_SRC)/weightsdir_to_weightsfile.cpp $(AUTOSCHED_SRC)/Weights.cpp - @mkdir -p $(@D) - $(CXX) 
$(CXXFLAGS) $^ $(OPTIMIZE) -o $@ - -# This is the value that machine_params defaults to if no custom value is specified; -# see MachineParams::generic() -HL_MACHINE_PARAMS ?= 32,25165824,160 - - diff --git a/apps/unsharp/Makefile b/apps/unsharp/Makefile index fa912ad172e1..047fc2854fb3 100644 --- a/apps/unsharp/Makefile +++ b/apps/unsharp/Makefile @@ -10,11 +10,11 @@ $(GENERATOR_BIN)/unsharp.generator: unsharp_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/unsharp.a: $(GENERATOR_BIN)/unsharp.generator @mkdir -p $(@D) - $< -g unsharp -f unsharp -o $(BIN)/$* target=$*-no_runtime auto_schedule=false + $< -g unsharp -f unsharp -o $(BIN)/$* target=$*-no_runtime $(BIN)/%/unsharp_auto_schedule.a: $(GENERATOR_BIN)/unsharp.generator @mkdir -p $(@D) - $< -g unsharp -f unsharp_auto_schedule -o $(BIN)/$* target=$*-no_runtime auto_schedule=true + $< -g unsharp -f unsharp_auto_schedule -o $(BIN)/$* target=$*-no_runtime autoscheduler=Mullapudi2016 $(BIN)/%/runtime.a: $(GENERATOR_BIN)/unsharp.generator @mkdir -p $(@D) diff --git a/apps/unsharp/unsharp_generator.cpp b/apps/unsharp/unsharp_generator.cpp index d68702bf1e20..c1070b2753fe 100644 --- a/apps/unsharp/unsharp_generator.cpp +++ b/apps/unsharp/unsharp_generator.cpp @@ -61,7 +61,7 @@ class Unsharp : public Halide::Generator { } // Schedule - if (!auto_schedule) { + if (!using_autoscheduler()) { // Some Intel Mac Minis have GPUs that require tile sizes smaller than 32x32 // for this pipeline because they have too few registers. Drop to 16x16 to // avoid unexpected crashes in CI. 
diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake index fa724e73bc33..b4465bf74d9b 100644 --- a/cmake/HalideGeneratorHelpers.cmake +++ b/cmake/HalideGeneratorHelpers.cmake @@ -281,7 +281,6 @@ function(add_halide_library TARGET) # Attach an autoscheduler if the user requested it ## - set(autoscheduler "") if (ARG_AUTOSCHEDULER) if ("${ARG_AUTOSCHEDULER}" MATCHES "::") if (NOT TARGET "${ARG_AUTOSCHEDULER}") @@ -295,8 +294,7 @@ function(add_halide_library TARGET) elseif (NOT ARG_PLUGINS) message(AUTHOR_WARNING "AUTOSCHEDULER set to a scheduler name but no plugins were loaded") endif () - set(autoscheduler -s "${ARG_AUTOSCHEDULER}") - list(PREPEND ARG_PARAMS auto_schedule=true) + list(PREPEND ARG_PARAMS "autoscheduler=${ARG_AUTOSCHEDULER}") endif () ## @@ -334,7 +332,6 @@ function(add_halide_library TARGET) -f "${ARG_FUNCTION_NAME}" -e "$>" ${generator_plugins} - ${autoscheduler} -o . "target=$>" ${ARG_PARAMS} diff --git a/python_bindings/src/PyHalide.cpp b/python_bindings/src/PyHalide.cpp index da598b3b0bc8..a7348dcffc67 100644 --- a/python_bindings/src/PyHalide.cpp +++ b/python_bindings/src/PyHalide.cpp @@ -15,7 +15,9 @@ #include "PyImageParam.h" #include "PyInlineReductions.h" #include "PyLambda.h" +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #include "PyMachineParams.h" +#endif #include "PyModule.h" #include "PyParam.h" #include "PyPipeline.h" @@ -53,7 +55,9 @@ PYBIND11_MODULE(HALIDE_PYBIND_MODULE_NAME, m) { define_extern_func_argument(m); define_var(m); define_rdom(m); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API define_machine_params(m); +#endif define_module(m); define_callable(m); define_func(m); diff --git a/python_bindings/src/PyMachineParams.cpp b/python_bindings/src/PyMachineParams.cpp index e99dd594b11d..93c49d97fae6 100644 --- a/python_bindings/src/PyMachineParams.cpp +++ b/python_bindings/src/PyMachineParams.cpp @@ -1,3 +1,4 @@ +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #include "PyMachineParams.h" namespace Halide 
{ @@ -23,3 +24,4 @@ void define_machine_params(py::module &m) { } // namespace PythonBindings } // namespace Halide +#endif diff --git a/python_bindings/src/PyMachineParams.h b/python_bindings/src/PyMachineParams.h index aa15ee73c069..82b4ff3ac441 100644 --- a/python_bindings/src/PyMachineParams.h +++ b/python_bindings/src/PyMachineParams.h @@ -1,6 +1,7 @@ #ifndef HALIDE_PYTHON_BINDINGS_PYMACHINEPARAMS_H #define HALIDE_PYTHON_BINDINGS_PYMACHINEPARAMS_H +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #include "PyHalide.h" namespace Halide { @@ -11,4 +12,5 @@ void define_machine_params(py::module &m); } // namespace PythonBindings } // namespace Halide +#endif #endif // HALIDE_PYTHON_BINDINGS_PYMACHINEPARAMS_H diff --git a/python_bindings/src/PyModule.cpp b/python_bindings/src/PyModule.cpp index ac98de2d58e3..2527ae035121 100644 --- a/python_bindings/src/PyModule.cpp +++ b/python_bindings/src/PyModule.cpp @@ -12,9 +12,13 @@ void define_module(py::module &m) { auto auto_scheduler_results_class = py::class_(m, "AutoSchedulerResults") .def(py::init<>()) - .def_readwrite("scheduler_name", &AutoSchedulerResults::scheduler_name) .def_readwrite("target", &AutoSchedulerResults::target) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + .def_readwrite("scheduler_name", &AutoSchedulerResults::scheduler_name) .def_readwrite("machine_params_string", &AutoSchedulerResults::machine_params_string) +#else + .def_readwrite("autoscheduler_params", &AutoSchedulerResults::autoscheduler_params) +#endif .def_readwrite("schedule_source", &AutoSchedulerResults::schedule_source) .def_readwrite("featurization", &AutoSchedulerResults::featurization) .def("__repr__", [](const AutoSchedulerResults &o) -> std::string { diff --git a/python_bindings/src/PyPipeline.cpp b/python_bindings/src/PyPipeline.cpp index 68caf873e2f2..18d932f01649 100644 --- a/python_bindings/src/PyPipeline.cpp +++ b/python_bindings/src/PyPipeline.cpp @@ -41,6 +41,32 @@ void define_pipeline(py::module &m) { // - set_custom_trace() 
// - set_custom_print() +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +// nothing +#else + py::class_<AutoschedulerParams>(m, "AutoschedulerParams") + .def(py::init<>()) + .def(py::init<const std::string &>(), py::arg("name")) + .def(py::init([](const std::string &name, const py::dict &extra) -> AutoschedulerParams { + // Manually convert the dict: + // we want to allow Python to pass in dicts that have non-string values for some keys; + // PyBind will reject these as a type failure. We'll stringify them here explicitly. + AutoschedulerParams asp(name); + for (auto item : extra) { + const std::string name = py::str(item.first).cast<std::string>(); + const std::string value = py::str(item.second).cast<std::string>(); + asp.extra[name] = value; + } + return asp; + }), + py::arg("name"), py::arg("extra")) + .def_readwrite("name", &AutoschedulerParams::name) + .def_readwrite("extra", &AutoschedulerParams::extra) + .def("__repr__", [](const AutoschedulerParams &o) -> std::string { + return "<halide.AutoschedulerParams " + o.to_string() + ">"; + }); +#endif + auto pipeline_class = py::class_<Pipeline>(m, "Pipeline") .def(py::init<>()) @@ -49,6 +75,7 @@ void define_pipeline(py::module &m) { .def("outputs", &Pipeline::outputs) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API .def("auto_schedule", (AutoSchedulerResults(Pipeline::*)(const std::string &, const Target &, const MachineParams &) const) & Pipeline::auto_schedule, py::arg("autoscheduler_name"), py::arg("target"), py::arg("machine_params") = MachineParams::generic()) .def("auto_schedule", (AutoSchedulerResults(Pipeline::*)(const Target &, const MachineParams &) const) & Pipeline::auto_schedule, @@ -56,7 +83,10 @@ .def_static("set_default_autoscheduler_name", &Pipeline::set_default_autoscheduler_name, py::arg("autoscheduler_name")) - +#else + .def("apply_autoscheduler", (AutoSchedulerResults(Pipeline::*)(const Target &, const AutoschedulerParams &) const) & Pipeline::apply_autoscheduler, + py::arg("target"), py::arg("autoscheduler_params")) +#endif .def("get_func", 
py::arg("index")) .def("print_loop_nest", &Pipeline::print_loop_nest) diff --git a/python_bindings/todo.txt b/python_bindings/todo.txt index dfb2bdb780bb..c73685c38443 100644 --- a/python_bindings/todo.txt +++ b/python_bindings/todo.txt @@ -25,7 +25,6 @@ - InlineReductions - IROperator - LoopLevel - - MachineParams - Module - OutputImageParam - Pipeline diff --git a/src/AbstractGenerator.cpp b/src/AbstractGenerator.cpp index cdcc80c4800a..52bd89553e38 100644 --- a/src/AbstractGenerator.cpp +++ b/src/AbstractGenerator.cpp @@ -25,9 +25,19 @@ Module AbstractGenerator::build_module(const std::string &function_name) { AutoSchedulerResults auto_schedule_results; const auto context = this->context(); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (context.auto_schedule()) { auto_schedule_results = pipeline.auto_schedule(context.target(), context.machine_params()); } +#else + const auto &asp = context.autoscheduler_params(); + if (!asp.name.empty()) { + debug(1) << "Applying autoscheduler " << asp.name << " to Generator " << name() << " ...\n"; + auto_schedule_results = pipeline.apply_autoscheduler(context.target(), asp); + } else { + debug(1) << "Applying autoscheduler (NONE) to Generator " << name() << " ...\n"; + } +#endif std::vector filter_arguments; const auto arg_infos = arginfos(); @@ -215,9 +225,17 @@ Module AbstractGenerator::build_gradient_module(const std::string &function_name AutoSchedulerResults auto_schedule_results; const auto context = this->context(); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (context.auto_schedule()) { auto_schedule_results = grad_pipeline.auto_schedule(context.target(), context.machine_params()); - } else { + } +#else + const auto &asp = context.autoscheduler_params(); + if (!asp.name.empty()) { + auto_schedule_results = grad_pipeline.apply_autoscheduler(context.target(), asp); + } +#endif + else { user_warning << "Autoscheduling is not enabled in build_gradient_module(), so the resulting " "gradient module will be 
unscheduled; this is very unlikely to be what you want.\n"; } @@ -257,8 +275,13 @@ Callable AbstractGenerator::compile_to_callable(const JITHandlers *jit_handlers, void AbstractGenerator::set_generatorparam_values(const GeneratorParamsMap &m) { for (const auto &c : m) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API user_assert(c.first != "target" && c.first != "auto_schedule" && c.first != "machine_params") << "The GeneratorParam '" << c.first << "' cannot be specified via string here; use GeneratorContext instead."; +#else + user_assert(c.first != "target" && c.first != "autoscheduler") + << "The GeneratorParam '" << c.first << "' cannot be specified via string here; use GeneratorContext instead."; +#endif set_generatorparam_value(c.first, c.second); } } diff --git a/src/AbstractGenerator.h b/src/AbstractGenerator.h index 28dc9335b0f3..95e904dfd9aa 100644 --- a/src/AbstractGenerator.h +++ b/src/AbstractGenerator.h @@ -81,7 +81,7 @@ class AbstractGenerator { * used to register it.) */ virtual std::string name() = 0; - /** Return the Target, autoscheduler flag, and MachineParams that this Generator + /** Return the Target and autoscheduler info that this Generator * was created with. Always legal to call on any AbstractGenerator instance, * regardless of what other methods have been called. 
(All AbstractGenerator instances * are expected to be created with immutable values for these, which can't be diff --git a/src/Generator.cpp b/src/Generator.cpp index d0254c3206bf..4fe37a305417 100644 --- a/src/Generator.cpp +++ b/src/Generator.cpp @@ -21,35 +21,59 @@ namespace Halide { #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE GeneratorContext::GeneratorContext(const Target &target, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool auto_schedule, const MachineParams &machine_params, +#else + const AutoschedulerParams &autoscheduler_params, +#endif std::shared_ptr externs_map) : target_(target), +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_schedule_(auto_schedule), machine_params_(machine_params), +#else + autoscheduler_params_(autoscheduler_params), +#endif externs_map_(std::move(externs_map)) { } -#endif +#endif // HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API GeneratorContext::GeneratorContext(const Target &target, bool auto_schedule, const MachineParams &machine_params) : target_(target), auto_schedule_(auto_schedule), - machine_params_(machine_params) -#ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE - , - externs_map_(std::make_shared()) -#endif -{ + machine_params_(machine_params) { +} +#else +GeneratorContext::GeneratorContext(const Target &target) + : target_(target), + autoscheduler_params_() { } +GeneratorContext::GeneratorContext(const Target &target, + const AutoschedulerParams &autoscheduler_params) + : target_(target), + autoscheduler_params_(autoscheduler_params) { +} +#endif + GeneratorContext GeneratorContext::with_target(const Target &t) const { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE return GeneratorContext(t, auto_schedule_, machine_params_, externs_map_); #else return GeneratorContext(t, auto_schedule_, machine_params_); #endif +#else +#ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE + return GeneratorContext(t, autoscheduler_params_, externs_map_); +#else + 
return GeneratorContext(t, autoscheduler_params_); +#endif +#endif } namespace Internal { @@ -183,11 +207,18 @@ class StubEmitter { std::vector out; for (auto *p : in) { // These are always propagated specially. +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (p->name() == "target" || p->name() == "auto_schedule" || p->name() == "machine_params") { continue; } +#else + if (p->name() == "target" || + p->name() == "autoscheduler") { + continue; + } +#endif if (p->is_synthetic_param()) { continue; } @@ -225,7 +256,11 @@ void StubEmitter::emit_generator_params_struct() { indent_level++; std::string comma = ""; for (auto *p : v) { - stream << get_indent() << comma << p->get_c_type() << " " << p->name() << "\n"; + std::string c_type = p->get_c_type(); + if (c_type == "AutoschedulerParams") { + c_type = "const AutoschedulerParams&"; + } + stream << get_indent() << comma << c_type << " " << p->name() << "\n"; comma = ", "; } indent_level--; @@ -683,8 +718,6 @@ gengen find one. Flags across all of the targets that do not affect runtime code generation, such as `no_asserts` and `no_runtime`, are ignored. - -s The name of an autoscheduler to set as the default. - -t Timeout for the Generator to run, in seconds; mainly useful to ensure that bugs and/or degenerate cases don't stall build systems. Defaults to 900 (=15 minutes). Specify 0 to allow ~infinite time. 
@@ -700,7 +733,9 @@ gengen {"-o", ""}, {"-p", ""}, {"-r", ""}, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API {"-s", ""}, +#endif {"-t", "900"}, // 15 minutes }; @@ -717,6 +752,15 @@ gengen ++i; continue; } else { + if (!strcmp(argv[i], "-s")) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + user_warning << "HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API is deprecated in Halide 15 " + "(and will be removed in Halide 16).\n"; +#else + user_error << "-s is no longer supported for setting autoscheduler; specify autoscheduler=NAME instead.\n" + << kUsage; +#endif + } user_error << "Unknown flag: " << argv[i] << "\n" << kUsage; } @@ -730,10 +774,21 @@ gengen } } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API const auto autoscheduler_name = flags_info["-s"]; if (!autoscheduler_name.empty()) { Pipeline::set_default_autoscheduler_name(autoscheduler_name); } +#else + if (args.generator_params.count("auto_schedule")) { + user_error << "auto_schedule=true is no longer supported for enabling autoscheduling; specify autoscheduler=NAME instead.\n" + << kUsage; + } + if (args.generator_params.count("machine_params")) { + user_error << "machine_params is no longer supported as a GeneratorParam; specify autoscheduler.FIELD=VALUE instead.\n" + << kUsage; + } +#endif const auto &d_val = flags_info["-d"]; user_assert(d_val == "1" || d_val == "0") << "-d must be 0 or 1\n" @@ -855,14 +910,17 @@ gengen if (do_compiler_logging) { const bool obfuscate_compiler_logging = get_env_variable("HL_OBFUSCATE_COMPILER_LOGGER") == "1"; args.compiler_logger_factory = - [obfuscate_compiler_logging, &args, &autoscheduler_name](const std::string &function_name, const Target &target) -> std::unique_ptr<CompilerLogger> { + [obfuscate_compiler_logging, &args](const std::string &function_name, const Target &target) -> std::unique_ptr<CompilerLogger> { // rebuild generator_args from the map so that they are always canonical - std::string generator_args_string; + std::string generator_args_string, autoscheduler_name; std::string sep; for (const 
auto &it : args.generator_params) { std::string quote = it.second.find(' ') != std::string::npos ? "\\\"" : ""; generator_args_string += sep + it.first + "=" + quote + it.second + quote; sep = " "; + if (it.first == "autoscheduler") { + autoscheduler_name = it.second; + } } std::unique_ptr t(new JSONCompilerLogger( obfuscate_compiler_logging ? "" : args.generator_name, @@ -1091,6 +1149,7 @@ void execute_generator(const ExecuteGeneratorArgs &args_in) { // Don't bother with this if we're just emitting a cpp_stub. if (!cpp_stub_only) { auto output_files = compute_output_files(args.targets[0], base_path, args.output_types); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API const auto get_gp = [&](const std::string &key) { auto it = args.generator_params.find(key); return it != args.generator_params.end() ? it->second : ""; @@ -1099,8 +1158,10 @@ void execute_generator(const ExecuteGeneratorArgs &args_in) { const auto machine_params_string = get_gp("machine_params"); const bool auto_schedule = auto_schedule_string == "true" || auto_schedule_string == "True"; const MachineParams machine_params = !machine_params_string.empty() ? MachineParams(machine_params_string) : MachineParams::generic(); +#endif auto module_factory = [&](const std::string &function_name, const Target &target) -> Module { - // Must re-create each time since each instance will have a different Target. + // Must re-create each time since each instance will have a different Target. 
+#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto gen = args.create_generator(args.generator_name, GeneratorContext(target, auto_schedule, machine_params)); for (const auto &kv : args.generator_params) { if (kv.first == "target" || @@ -1110,6 +1171,15 @@ void execute_generator(const ExecuteGeneratorArgs &args_in) { } gen->set_generatorparam_value(kv.first, kv.second); } +#else + auto gen = args.create_generator(args.generator_name, GeneratorContext(target)); + for (const auto &kv : args.generator_params) { + if (kv.first == "target") { + continue; + } + gen->set_generatorparam_value(kv.first, kv.second); + } +#endif return args.build_mode == ExecuteGeneratorArgs::Gradient ? gen->build_gradient_module(function_name) : gen->build_module(function_name); @@ -1131,11 +1201,18 @@ GeneratorParamBase::~GeneratorParamBase() { void GeneratorParamBase::check_value_readable() const { // These are always readable. +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (name() == "target" || name() == "auto_schedule" || name() == "machine_params") { return; } +#else + if (name() == "target" || + name() == "autoscheduler") { + return; + } +#endif user_assert(generator && generator->phase >= GeneratorBase::ConfigureCalled) << "The GeneratorParam \"" << name() << "\" cannot be read before configure()/generate() is called.\n"; } @@ -1153,6 +1230,50 @@ void GeneratorParamBase::fail_wrong_type(const char *type) { user_error << "The GeneratorParam \"" << name() << "\" cannot be set with a value of type " << type << ".\n"; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +// nothing +#else +GeneratorParam_AutoSchedulerParams::GeneratorParam_AutoSchedulerParams() + : GeneratorParamImpl("autoscheduler", {}) { +} + +void GeneratorParam_AutoSchedulerParams::set_from_string(const std::string &new_value_string) { + internal_error << "This method should never be called."; +} + +std::string GeneratorParam_AutoSchedulerParams::get_default_value() const { + internal_error << "This method should never 
be called."; + return ""; +} + +std::string GeneratorParam_AutoSchedulerParams::call_to_string(const std::string &v) const { + internal_error << "This method should never be called."; + return ""; +} + +std::string GeneratorParam_AutoSchedulerParams::get_c_type() const { + internal_error << "This method should never be called."; + return ""; +} + +bool GeneratorParam_AutoSchedulerParams::try_set(const std::string &key, const std::string &value) { + const auto &n = this->name(); + if (key == n) { + user_assert(this->value_.name.empty()) << "The GeneratorParam " << key << " cannot be set more than once.\n"; + this->value_.name = value; + return true; + } else if (starts_with(key, n + ".")) { + const auto sub_key = key.substr(n.size() + 1); + user_assert(this->value_.extra.count(sub_key) == 0) << "The GeneratorParam " << key << " cannot be set more than once.\n"; + this->value_.extra[sub_key] = value; + return true; + } else { + return false; + } +} + +#endif + /* static */ GeneratorRegistry &GeneratorRegistry::get_registry() { static GeneratorRegistry *registry = new GeneratorRegistry; @@ -1302,17 +1423,29 @@ GeneratorOutputBase *GeneratorBase::find_output_by_name(const std::string &name) } GeneratorContext GeneratorBase::context() const { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE return GeneratorContext(target, auto_schedule, machine_params, externs_map); #else return GeneratorContext(target, auto_schedule, machine_params); #endif +#else +#ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE + return GeneratorContext(target, autoscheduler_.value(), externs_map); +#else + return GeneratorContext(target, autoscheduler_.value()); +#endif +#endif } void GeneratorBase::init_from_context(const Halide::GeneratorContext &context) { target.set(context.target_); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_schedule.set(context.auto_schedule_); machine_params.set(context.machine_params_); +#else + 
autoscheduler_.set(context.autoscheduler_params_); +#endif #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE externs_map = context.externs_map_; @@ -1465,12 +1598,19 @@ void GeneratorBase::check_input_kind(Internal::GeneratorInputBase *in, Internal: } void GeneratorBase::set_generatorparam_value(const std::string &name, const std::string &value) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API if (name == "target" || name == "auto_schedule" || name == "machine_params") { user_error << "The GeneratorParam named " << name << " cannot be set by set_generatorparam_value().\n"; } +#else + user_assert(name != "target") << "The GeneratorParam named " << name << " cannot be set by set_generatorparam_value().\n"; + if (autoscheduler_.try_set(name, value)) { + return; + } +#endif GeneratorParamInfo &pi = param_info(); diff --git a/src/Generator.h b/src/Generator.h index e5f254e82b36..d40fddc79141 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -208,27 +208,27 @@ * }; * \endcode * - * All Generators have three GeneratorParams that are implicitly provided + * All Generators have two GeneratorParams that are implicitly provided * by the base class: * * GeneratorParam target{"target", Target()}; - * GeneratorParam auto_schedule{"auto_schedule", false}; - * GeneratorParam machine_params{"machine_params", MachineParams::generic()}; + * GeneratorParam autoscheduler{"autoscheduler", {}} * * - 'target' is the Halide::Target for which the Generator is producing code. * It is read-only during the Generator's lifetime, and must not be modified; * its value should always be filled in by the calling code: either the Halide * build system (for ahead-of-time compilation), or ordinary C++ code * (for JIT compilation). - * - 'auto_schedule' indicates whether the auto-scheduler should be run for this - * Generator: - * - if 'false', the Generator should schedule its Funcs as it sees fit. 
- * - if 'true', the Generator should only provide estimate()s for its Funcs, - * and not call any other scheduling methods. - * - 'machine_params' is only used if auto_schedule is true; it is ignored - * if auto_schedule is false. It provides details about the machine architecture - * being targeted which may be used to enhance the automatically-generated - * schedule. + * - 'autoscheduler' is a string-to-string map that is used to indicate whether + * and how an auto-scheduler should be run for this Generator: + * - if empty, the Generator should schedule its Funcs as it sees fit; no autoscheduler will be run. + * - if the 'name' key is set, it should be one of the known autoschedulers + * provided with this release of Halide, which will be used to schedule + * the Funcs in the Generator. In this case, the Generator should only + * provide estimate()s for its Funcs, and not call any other scheduling methods. + * - Other keys may be specified in the params, on a per-autoscheduler + * basis, to optimize or enhance the automatically-generated schedule. + * See documentation for each autoscheduler for options.
* * Generators are added to a global registry to simplify AOT build mechanics; this * is done by simply using the HALIDE_REGISTER_GENERATOR macro at global scope: @@ -426,7 +426,11 @@ class GeneratorParamBase { HALIDE_GENERATOR_PARAM_TYPED_SETTER(float) HALIDE_GENERATOR_PARAM_TYPED_SETTER(double) HALIDE_GENERATOR_PARAM_TYPED_SETTER(Target) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API HALIDE_GENERATOR_PARAM_TYPED_SETTER(MachineParams) +#else + HALIDE_GENERATOR_PARAM_TYPED_SETTER(AutoschedulerParams) +#endif HALIDE_GENERATOR_PARAM_TYPED_SETTER(Type) HALIDE_GENERATOR_PARAM_TYPED_SETTER(LoopLevel) @@ -540,7 +544,11 @@ class GeneratorParamImpl : public GeneratorParamBase { HALIDE_GENERATOR_PARAM_TYPED_SETTER(float) HALIDE_GENERATOR_PARAM_TYPED_SETTER(double) HALIDE_GENERATOR_PARAM_TYPED_SETTER(Target) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API HALIDE_GENERATOR_PARAM_TYPED_SETTER(MachineParams) +#else + HALIDE_GENERATOR_PARAM_TYPED_SETTER(AutoschedulerParams) +#endif HALIDE_GENERATOR_PARAM_TYPED_SETTER(Type) HALIDE_GENERATOR_PARAM_TYPED_SETTER(LoopLevel) @@ -634,6 +642,7 @@ class GeneratorParam_Target : public GeneratorParamImpl { } }; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API template class GeneratorParam_MachineParams : public GeneratorParamImpl { public: @@ -659,6 +668,22 @@ class GeneratorParam_MachineParams : public GeneratorParamImpl { return "MachineParams"; } }; +#else +class GeneratorParam_AutoSchedulerParams : public GeneratorParamImpl { +public: + GeneratorParam_AutoSchedulerParams(); + + void set_from_string(const std::string &new_value_string) override; + std::string get_default_value() const override; + std::string call_to_string(const std::string &v) const override; + std::string get_c_type() const override; + +private: + friend class GeneratorBase; + + bool try_set(const std::string &key, const std::string &value); +}; +#endif class GeneratorParam_LoopLevel : public GeneratorParamImpl { public: @@ -954,7 +979,9 @@ template using 
GeneratorParamImplBase = typename select_type< cond::value, GeneratorParam_Target>, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API cond::value, GeneratorParam_MachineParams>, +#endif cond::value, GeneratorParam_LoopLevel>, cond::value, GeneratorParam_String>, cond::value, GeneratorParam_Type>, @@ -3009,7 +3036,7 @@ class GeneratorParam_Synthetic : public GeneratorParamImpl { * \endcode * * Note that all Generators embed a GeneratorContext, so if you are using a Stub - * from within a Generator, you can just pass 'contex()' for the GeneratorContext: + * from within a Generator, you can just pass 'context()' for the GeneratorContext: * \code * struct SomeGen : Generator { * void generate() { @@ -3034,9 +3061,15 @@ class GeneratorContext { using ExternsMap = std::map; #endif +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API explicit GeneratorContext(const Target &t, bool auto_schedule = false, const MachineParams &machine_params = MachineParams::generic()); +#else + explicit GeneratorContext(const Target &t); + explicit GeneratorContext(const Target &t, + const AutoschedulerParams &autoscheduler_params); +#endif GeneratorContext() = default; GeneratorContext(const GeneratorContext &) = default; @@ -3047,17 +3080,24 @@ class GeneratorContext { const Target &target() const { return target_; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool auto_schedule() const { return auto_schedule_; } const MachineParams &machine_params() const { return machine_params_; } +#else + const AutoschedulerParams &autoscheduler_params() const { + return autoscheduler_params_; + } +#endif HALIDE_ATTRIBUTE_DEPRECATED("Call GeneratorContext::target() instead of GeneratorContext::get_target().") const Target &get_target() const { return target_; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API HALIDE_ATTRIBUTE_DEPRECATED("Call GeneratorContext::auto_schedule() instead of GeneratorContext::get_auto_schedule().") bool get_auto_schedule() const { return auto_schedule_; @@ -3066,6 +3106,7 @@ class 
GeneratorContext { const MachineParams &get_machine_params() const { return machine_params_; } +#endif // Return a copy of this GeneratorContext that uses the given Target. // This method is rarely needed; it's really provided as a convenience @@ -3085,18 +3126,26 @@ class GeneratorContext { private: Target target_; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool auto_schedule_ = false; MachineParams machine_params_ = MachineParams::generic(); +#else + AutoschedulerParams autoscheduler_params_; +#endif #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE - std::shared_ptr externs_map_; + std::shared_ptr externs_map_ = std::make_shared(); #endif #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE GeneratorContext(const Target &target, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool auto_schedule, const MachineParams &machine_params, - std::shared_ptr externs_map); +#else + const AutoschedulerParams &autoscheduler_params, #endif + std::shared_ptr externs_map); +#endif // HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE }; class NamesInterface { @@ -3516,12 +3565,21 @@ class GeneratorBase : public NamesInterface, public AbstractGenerator { Target get_target() const { return target; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API bool get_auto_schedule() const { return auto_schedule; } MachineParams get_machine_params() const { return machine_params; } + bool using_autoscheduler() const { + return get_auto_schedule(); + } +#else + bool using_autoscheduler() const { + return !autoscheduler_.value().name.empty(); + } +#endif #ifdef HALIDE_ALLOW_GENERATOR_EXTERNAL_CODE /** Generators can register ExternalCode objects onto @@ -3550,8 +3608,12 @@ class GeneratorBase : public NamesInterface, public AbstractGenerator { // These must remain here for legacy code that access the fields directly. 
GeneratorParam target{"target", Target()}; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API GeneratorParam auto_schedule{"auto_schedule", false}; GeneratorParam machine_params{"machine_params", MachineParams::generic()}; +#else + GeneratorParam_AutoSchedulerParams autoscheduler_; +#endif private: friend void ::Halide::Internal::generator_test(); diff --git a/src/Module.cpp b/src/Module.cpp index aae1064bd65c..7eb9d02b703f 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -253,7 +253,11 @@ std::string indent_string(const std::string &src, const std::string &indent) { void emit_schedule_file(const std::string &name, const std::vector &targets, const std::string &scheduler_name, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API const std::string &machine_params_string, +#else + const std::string &autoscheduler_params_string, +#endif const std::string &body, std::ostream &stream) { std::string s = R"INLINE_CODE(#ifndef $CLEANNAME$_SCHEDULE_H @@ -262,7 +266,7 @@ void emit_schedule_file(const std::string &name, // MACHINE GENERATED -- DO NOT EDIT // This schedule was automatically generated by $SCHEDULER$ // for target=$TARGET$ // NOLINT -// with machine_params=$MACHINEPARAMS$ +// with $MPNAME$=$MACHINEPARAMS$ #include "Halide.h" @@ -316,7 +320,13 @@ inline void apply_schedule_$SHORTNAME$( s = replace_all(s, "$NAMESPACECLOSE$", nsclose); s = replace_all(s, "$TARGET$", target_string); s = replace_all(s, "$BODY$", body_text); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + s = replace_all(s, "$MPNAME$", "machine_params"); s = replace_all(s, "$MACHINEPARAMS$", machine_params_string); +#else + s = replace_all(s, "$MPNAME$", "autoscheduler_params"); + s = replace_all(s, "$MACHINEPARAMS$", autoscheduler_params_string); +#endif stream << s; } @@ -671,10 +681,16 @@ void Module::compile(const std::map &output_files) debug(1) << "Module.compile(): schedule " << output_files.at(OutputFileType::schedule) << "\n"; std::ofstream file(output_files.at(OutputFileType::schedule)); auto *r = 
contents->auto_scheduler_results.get(); + std::string body = r && !r->schedule_source.empty() ? r->schedule_source : "// No autoscheduler has been run for this Generator.\n"; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API std::string scheduler = r ? r->scheduler_name : "(None)"; std::string machine_params = r ? r->machine_params_string : "(None)"; - std::string body = r && !r->schedule_source.empty() ? r->schedule_source : "// No autoscheduler has been run for this Generator.\n"; emit_schedule_file(name(), {target()}, scheduler, machine_params, body, file); +#else + std::string scheduler = r ? r->autoscheduler_params.name : "(None)"; + std::string autoscheduler_params_string = r ? r->autoscheduler_params.to_string() : "(None)"; + emit_schedule_file(name(), {target()}, scheduler, autoscheduler_params_string, body, file); +#endif } if (contains(output_files, OutputFileType::featurization)) { debug(1) << "Module.compile(): featurization " << output_files.at(OutputFileType::featurization) << "\n"; @@ -1004,6 +1020,7 @@ void compile_multitarget(const std::string &fn_name, if (contains(output_files, OutputFileType::schedule)) { debug(1) << "compile_multitarget: schedule " << output_files.at(OutputFileType::schedule) << "\n"; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API std::string scheduler = auto_scheduler_results.front().scheduler_name; if (scheduler.empty()) { scheduler = "(None)"; @@ -1012,6 +1029,11 @@ void compile_multitarget(const std::string &fn_name, if (machine_params.empty()) { machine_params = "(None)"; } +#else + const auto &autoscheduler_params = auto_scheduler_results.front().autoscheduler_params; + std::string scheduler = autoscheduler_params.name.empty() ? "(None)" : autoscheduler_params.name; + std::string autoscheduler_params_string = autoscheduler_params.name.empty() ? "(None)" : autoscheduler_params.to_string(); +#endif // Find the features that are unique to each stage (vs the baseline case). 
const auto &baseline_target = auto_scheduler_results.back().target; @@ -1053,7 +1075,11 @@ void compile_multitarget(const std::string &fn_name, } std::ofstream file(output_files.at(OutputFileType::schedule)); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API emit_schedule_file(fn_name, targets, scheduler, machine_params, body.str(), file); +#else + emit_schedule_file(fn_name, targets, scheduler, autoscheduler_params_string, body.str(), file); +#endif } if (contains(output_files, OutputFileType::static_library)) { diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index 19e088d517ea..d2b669614470 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -220,6 +220,7 @@ std::map &Pipeline::get_autoscheduler_map() { return autoschedulers; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /* static */ std::string &Pipeline::get_default_autoscheduler_name() { static std::string autoscheduler_name = ""; @@ -228,6 +229,7 @@ std::string &Pipeline::get_default_autoscheduler_name() { } return autoscheduler_name; } +#endif /* static */ AutoSchedulerFn Pipeline::find_autoscheduler(const std::string &autoscheduler_name) { @@ -244,6 +246,7 @@ AutoSchedulerFn Pipeline::find_autoscheduler(const std::string &autoscheduler_na return it->second; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API AutoSchedulerResults Pipeline::auto_schedule(const std::string &autoscheduler_name, const Target &target, const MachineParams &arch_params) const { auto autoscheduler_fn = find_autoscheduler(autoscheduler_name); user_assert(autoscheduler_fn) @@ -261,6 +264,23 @@ AutoSchedulerResults Pipeline::auto_schedule(const std::string &autoscheduler_na AutoSchedulerResults Pipeline::auto_schedule(const Target &target, const MachineParams &arch_params) const { return auto_schedule(get_default_autoscheduler_name(), target, arch_params); } +#else +AutoSchedulerResults Pipeline::apply_autoscheduler(const Target &target, const AutoschedulerParams &autoscheduler_params) const { + 
user_assert(!autoscheduler_params.name.empty()) << "apply_autoscheduler was called with no Autoscheduler specified."; + + auto autoscheduler_fn = find_autoscheduler(autoscheduler_params.name); + user_assert(autoscheduler_fn) + << "Could not find autoscheduler named '" << autoscheduler_params.name << "'.\n" + << "Did you remember to load the plugin?"; + + AutoSchedulerResults results; + results.target = target; + results.autoscheduler_params = autoscheduler_params; + + autoscheduler_fn(*this, target, autoscheduler_params, &results); + return results; +} +#endif /* static */ void Pipeline::add_autoscheduler(const std::string &autoscheduler_name, const AutoSchedulerFn &autoscheduler) { @@ -269,11 +289,13 @@ void Pipeline::add_autoscheduler(const std::string &autoscheduler_name, const Au m[autoscheduler_name] = autoscheduler; } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /* static */ void Pipeline::set_default_autoscheduler_name(const std::string &autoscheduler_name) { (void)find_autoscheduler(autoscheduler_name); // ensure it's valid get_default_autoscheduler_name() = autoscheduler_name; } +#endif Func Pipeline::get_func(size_t index) { // Compute an environment @@ -1186,6 +1208,7 @@ JITExtern::JITExtern(const ExternCFunction &extern_c_function) : extern_c_function_(extern_c_function) { } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API MachineParams MachineParams::generic() { std::string params = Internal::get_env_variable("HL_MACHINE_PARAMS"); if (params.empty()) { @@ -1208,5 +1231,17 @@ MachineParams::MachineParams(const std::string &s) { last_level_cache_size = std::atoll(v[1].c_str()); balance = std::atof(v[2].c_str()); } +#endif + +std::string AutoschedulerParams::to_string() const { + std::ostringstream os; + if (!name.empty()) { + os << "autoscheduler=" << name; + } + for (const auto &kv : extra) { + os << " autoscheduler." 
<< kv.first << "=" << kv.second; + } + return os.str(); +} } // namespace Halide diff --git a/src/Pipeline.h b/src/Pipeline.h index 15e19652c107..bb67391f4a44 100644 --- a/src/Pipeline.h +++ b/src/Pipeline.h @@ -31,6 +31,7 @@ class Callable; class Func; struct PipelineContents; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /** A struct representing the machine parameters to generate the auto-scheduled * code for. */ struct MachineParams { @@ -55,6 +56,40 @@ struct MachineParams { /** Reconstruct a MachineParams from canonical string form. */ explicit MachineParams(const std::string &s); }; +#else +/** Special the Autoscheduler to be used (if any), along with arbitrary + * additional arguments specific to the given Autoscheduler. + * + * The 'name' field specifies the type of Autoscheduler + * to be used (e.g. Adams2019, Mullapudi2016). If this is an empty string, + * no autoscheduling will be done; if not, it mustbe the name of a known Autoscheduler. + * + * At this time, well-known autoschedulers include: + * "Mullapudi2016" -- heuristics-based; the first working autoscheduler; currently built in to libHalide + * see http://graphics.cs.cmu.edu/projects/halidesched/ + * "Adams2019" -- aka "the ML autoscheduler"; currently located in apps/autoscheduler + * see https://halide-lang.org/papers/autoscheduler2019.html + * "Li2018" -- aka "the gradient autoscheduler"; currently located in apps/gradient_autoscheduler. + * see https://people.csail.mit.edu/tzumao/gradient_halide + * + * The key/value pairs in 'extra' are defined on a per-autoscheduler basis. + * An autoscheduler can have any number of required or optional keys. 
+ */ +struct AutoschedulerParams { + std::string name; + std::map extra; + + AutoschedulerParams() = default; + /*not-explicit*/ AutoschedulerParams(const std::string &name) + : name(name) { + } + AutoschedulerParams(const std::string &name, const std::map &extra) + : name(name), extra(extra) { + } + + std::string to_string() const; +}; +#endif namespace Internal { class IRMutator; @@ -88,16 +123,25 @@ struct CustomLoweringPass { struct JITExtern; struct AutoSchedulerResults { - std::string scheduler_name; // name of the autoscheduler used - Target target; // Target specified to the autoscheduler - std::string machine_params_string; // MachineParams specified to the autoscheduler (in string form) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + std::string scheduler_name; // name of the autoscheduler used + Target target; // Target specified to the autoscheduler + std::string machine_params_string; // MachineParams specified to the autoscheduler (in string form) +#else + Target target; // Target specified to the autoscheduler + AutoschedulerParams autoscheduler_params; // The autoscheduler used, along with its params +#endif std::string schedule_source; // The C++ source code of the generated schedule std::vector featurization; // The featurization of the pipeline (if any) }; class Pipeline; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API using AutoSchedulerFn = std::function; +#else +using AutoSchedulerFn = std::function; +#endif /** A class representing a Halide pipeline. Constructed from the Func * or Funcs that it outputs. */ @@ -155,7 +199,9 @@ class Pipeline { static std::map &get_autoscheduler_map(); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API static std::string &get_default_autoscheduler_name(); +#endif static AutoSchedulerFn find_autoscheduler(const std::string &autoscheduler_name); @@ -188,6 +234,7 @@ class Pipeline { /** Get the Funcs this pipeline outputs. 
*/ std::vector outputs() const; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /** Generate a schedule for the pipeline using the currently-default autoscheduler. */ AutoSchedulerResults auto_schedule(const Target &target, const MachineParams &arch_params = MachineParams::generic()) const; @@ -196,11 +243,17 @@ class Pipeline { AutoSchedulerResults auto_schedule(const std::string &autoscheduler_name, const Target &target, const MachineParams &arch_params = MachineParams::generic()) const; +#else + /** Generate a schedule for the pipeline using the specified autoscheduler. */ + AutoSchedulerResults apply_autoscheduler(const Target &target, + const AutoschedulerParams &autoscheduler_params) const; +#endif /** Add a new the autoscheduler method with the given name. Does not affect the current default autoscheduler. * It is an error to call this with the same name multiple times. */ static void add_autoscheduler(const std::string &autoscheduler_name, const AutoSchedulerFn &autoscheduler); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API /** Globally set the default autoscheduler method to use whenever * autoscheduling any Pipeline when no name is specified. If the autoscheduler_name isn't in the * current table of known autoschedulers, assert-fail. @@ -214,6 +267,7 @@ class Pipeline { * see https://people.csail.mit.edu/tzumao/gradient_halide */ static void set_default_autoscheduler_name(const std::string &autoscheduler_name); +#endif /** Return handle to the index-th Func within the pipeline based on the * topological order. */ diff --git a/src/autoschedulers/adams2019/AutoSchedule.cpp b/src/autoschedulers/adams2019/AutoSchedule.cpp index 56f8ed3dbda5..baa4c14160f5 100644 --- a/src/autoschedulers/adams2019/AutoSchedule.cpp +++ b/src/autoschedulers/adams2019/AutoSchedule.cpp @@ -31,9 +31,6 @@ Write out a training featurization for the selected schedule into this file. 
Needs to be converted to a sample file with the runtime using featurization_to_sample before it can be used to train. - HL_MACHINE_PARAMS - An architecture description string. Used by Halide master to configure the cost model. We only use the first term. Set it to the number of cores to target. - HL_PERMIT_FAILED_UNROLL Set to 1 to tell Halide not to freak out if we try to unroll a loop that doesn't have a constant extent. Should generally not be necessary, but sometimes the autoscheduler's model for what will and will not turn into a constant during lowering is inaccurate, because Halide isn't perfect at constant-folding. @@ -256,7 +253,7 @@ class StateQueue { // Configure a cost model to process a specific pipeline. void configure_pipeline_features(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model) { cost_model->reset(); cost_model->set_pipeline_features(dag, params); @@ -265,7 +262,7 @@ void configure_pipeline_features(const FunctionDAG &dag, // A single pass of coarse-to-fine beam search. IntrusivePtr optimal_schedule_pass(FunctionDAG &dag, const vector &outputs, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, std::mt19937 &rng, int beam_size, @@ -464,7 +461,7 @@ IntrusivePtr optimal_schedule_pass(FunctionDAG &dag, // Performance coarse-to-fine beam search and return the best state found. IntrusivePtr optimal_schedule(FunctionDAG &dag, const vector &outputs, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, std::mt19937 &rng, int beam_size, @@ -543,7 +540,7 @@ int State::cost_calculations = 0; // The main entrypoint to generate a schedule for a pipeline. 
void generate_schedule(const std::vector &outputs, const Target &target, - const MachineParams ¶ms, + const Adams2019Params ¶ms, AutoSchedulerResults *auto_scheduler_results) { aslog(1) << "generate_schedule for target=" << target.to_string() << "\n"; @@ -580,7 +577,7 @@ void generate_schedule(const std::vector &outputs, int64_t memory_limit = memory_limit_str.empty() ? (uint64_t)(-1) : std::atoll(memory_limit_str.c_str()); // Analyse the Halide algorithm and construct our abstract representation of it - FunctionDAG dag(outputs, params, target); + FunctionDAG dag(outputs, target); if (aslog::aslog_level() >= 2) { dag.dump(aslog(2).get_ostream()); } @@ -641,7 +638,9 @@ void generate_schedule(const std::vector &outputs, } if (auto_scheduler_results) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_scheduler_results->scheduler_name = "Adams2019"; +#endif auto_scheduler_results->schedule_source = optimal->schedule_source; { std::ostringstream out; @@ -653,13 +652,37 @@ void generate_schedule(const std::vector &outputs, } struct Adams2019 { - void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms, AutoSchedulerResults *results) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *results) { + std::vector outputs; + for (const Func &f : p.outputs()) { + outputs.push_back(f.function()); + } + Adams2019Params params; + params.parallelism = params_in.parallelism; + Autoscheduler::generate_schedule(outputs, target, params, results); + } +#else + void operator()(const Pipeline &p, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *results) { + internal_assert(params_in.name == "Adams2019"); + // Verify that no unknown keys are set in params_in + const std::set legal_keys = {"parallelism"}; + for (const auto &it : params_in.extra) { + user_assert(legal_keys.count(it.first) == 1) << "The key " << it.first << " is not 
legal to use for the Adams2019 Autoscheduler."; + } + std::vector outputs; for (const Func &f : p.outputs()) { outputs.push_back(f.function()); } + Adams2019Params params; + if (params_in.extra.count("parallelism")) { + params.parallelism = std::stoi(params_in.extra.at("parallelism")); + } Autoscheduler::generate_schedule(outputs, target, params, results); + results->autoscheduler_params = params_in; } +#endif }; REGISTER_AUTOSCHEDULER(Adams2019) @@ -667,7 +690,7 @@ REGISTER_AUTOSCHEDULER(Adams2019) // An alternative entrypoint for other uses void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int beam_size, int64_t memory_limit, diff --git a/src/autoschedulers/adams2019/AutoSchedule.h b/src/autoschedulers/adams2019/AutoSchedule.h index b7a76dc67e50..270ca7a24641 100644 --- a/src/autoschedulers/adams2019/AutoSchedule.h +++ b/src/autoschedulers/adams2019/AutoSchedule.h @@ -11,7 +11,7 @@ namespace Autoscheduler { typedef PerfectHashMap StageMapOfScheduleFeatures; -void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const MachineParams ¶ms, +void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const Adams2019Params ¶ms, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); } // namespace Autoscheduler diff --git a/src/autoschedulers/adams2019/Cache.cpp b/src/autoschedulers/adams2019/Cache.cpp index ef14e9313563..b149accc36c1 100644 --- a/src/autoschedulers/adams2019/Cache.cpp +++ b/src/autoschedulers/adams2019/Cache.cpp @@ -18,7 +18,7 @@ bool Cache::add_memoized_blocks(const State *state, std::function &&)> &accept_child, const FunctionDAG::Node *node, int &num_children, const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int64_t memory_limit) const { if (!options.cache_blocks || !memoized_compute_root_blocks.contains(node)) { diff --git 
a/src/autoschedulers/adams2019/Cache.h b/src/autoschedulers/adams2019/Cache.h index 3272691ab13f..c1cedc23f856 100644 --- a/src/autoschedulers/adams2019/Cache.h +++ b/src/autoschedulers/adams2019/Cache.h @@ -122,7 +122,7 @@ struct Cache { const FunctionDAG::Node *node, int &num_children, const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int64_t memory_limit) const; diff --git a/src/autoschedulers/adams2019/CostModel.h b/src/autoschedulers/adams2019/CostModel.h index 8459932c8dca..82ba413a17b0 100644 --- a/src/autoschedulers/adams2019/CostModel.h +++ b/src/autoschedulers/adams2019/CostModel.h @@ -3,6 +3,7 @@ #include +#include "Featurization.h" #include "FunctionDAG.h" #include "HalideBuffer.h" #include "PerfectHashMap.h" @@ -12,7 +13,14 @@ namespace Halide { namespace Internal { namespace Autoscheduler { + typedef PerfectHashMap StageMapOfScheduleFeatures; + +struct Adams2019Params { + /** Maximum level of parallelism available. */ + int parallelism = 16; +}; + } // namespace Autoscheduler } // namespace Internal @@ -22,7 +30,7 @@ class CostModel { // Configure the cost model for the algorithm to be scheduled. virtual void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) = 0; + const Internal::Autoscheduler::Adams2019Params ¶ms) = 0; // Enqueue a schedule to be evaluated. Will annotate the value located at cost_ptr when the evaluation takes place. // Note that the dag argument should correspond to the dag specified previously when calling set_pipeline_features. 
diff --git a/src/autoschedulers/adams2019/DefaultCostModel.cpp b/src/autoschedulers/adams2019/DefaultCostModel.cpp index 630628c4354e..01307d765131 100644 --- a/src/autoschedulers/adams2019/DefaultCostModel.cpp +++ b/src/autoschedulers/adams2019/DefaultCostModel.cpp @@ -47,7 +47,7 @@ bool ends_with(const std::string &str, const std::string &suffix) { } // namespace void DefaultCostModel::set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) { + const Internal::Autoscheduler::Adams2019Params ¶ms) { const int pipeline_feat_size = head1_w * head1_h; // We ignore the first seven pipeline features in the cost diff --git a/src/autoschedulers/adams2019/DefaultCostModel.h b/src/autoschedulers/adams2019/DefaultCostModel.h index 11dff14ef0dc..9f7d6ac6c39b 100644 --- a/src/autoschedulers/adams2019/DefaultCostModel.h +++ b/src/autoschedulers/adams2019/DefaultCostModel.h @@ -7,6 +7,12 @@ namespace Halide { +namespace Internal { +namespace Autoscheduler { +struct Adams2019Params; +} // namespace Autoscheduler +} // namespace Internal + class DefaultCostModel : public CostModel { private: Internal::Weights weights; @@ -37,7 +43,7 @@ class DefaultCostModel : public CostModel { // Configure the cost model for the algorithm to be scheduled. void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, - const MachineParams ¶ms) override; + const Internal::Autoscheduler::Adams2019Params ¶ms) override; void set_pipeline_features(const Runtime::Buffer &, int n); // Enqueue a schedule to be evaluated. 
The second version of this method returns a buffer of diff --git a/src/autoschedulers/adams2019/FunctionDAG.cpp b/src/autoschedulers/adams2019/FunctionDAG.cpp index 72bc9dc7e0e1..52a481eed18c 100644 --- a/src/autoschedulers/adams2019/FunctionDAG.cpp +++ b/src/autoschedulers/adams2019/FunctionDAG.cpp @@ -572,7 +572,7 @@ bool depends_on_estimate(const Expr &expr) { return dependency_checker.found_estimate; } -FunctionDAG::FunctionDAG(const vector &outputs, const MachineParams ¶ms, const Target &target) { +FunctionDAG::FunctionDAG(const vector &outputs, const Target &target) { map env = build_environment(outputs); // A mutator to apply parameter estimates to the expressions diff --git a/src/autoschedulers/adams2019/FunctionDAG.h b/src/autoschedulers/adams2019/FunctionDAG.h index 44f0cf315db8..75c75c3c8b07 100644 --- a/src/autoschedulers/adams2019/FunctionDAG.h +++ b/src/autoschedulers/adams2019/FunctionDAG.h @@ -27,6 +27,8 @@ using std::string; using std::unique_ptr; using std::vector; +struct Adams2019Params; + // First we have various utility classes. // An optional rational type used when analyzing memory dependencies. @@ -563,7 +565,7 @@ struct FunctionDAG { // Create the function DAG, and do all the dependency and cost // analysis. This is done once up-front before the tree search. - FunctionDAG(const vector &outputs, const MachineParams ¶ms, const Target &target); + FunctionDAG(const vector &outputs, const Target &target); void dump(std::ostream &os) const; diff --git a/src/autoschedulers/adams2019/LoopNest.cpp b/src/autoschedulers/adams2019/LoopNest.cpp index a5cf19a61274..8568e92df8a9 100644 --- a/src/autoschedulers/adams2019/LoopNest.cpp +++ b/src/autoschedulers/adams2019/LoopNest.cpp @@ -227,7 +227,7 @@ void LoopNest::get_sites(StageMap &sites, // Do a recursive walk over the loop nest computing features to feed the cost model. 
void LoopNest::compute_features(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, const StageMap &sites, int64_t instances, int64_t parallelism, @@ -1355,7 +1355,7 @@ void LoopNest::compute_here(const FunctionDAG::Node *f, bool tileable, int v) { } // Parallelize this loop according to the given tiling. -IntrusivePtr LoopNest::parallelize_in_tiles(const MachineParams ¶ms, +IntrusivePtr LoopNest::parallelize_in_tiles(const Adams2019Params ¶ms, const vector &tiling, const LoopNest *parent) const { @@ -1423,7 +1423,7 @@ IntrusivePtr LoopNest::parallelize_in_tiles(const MachineParams // this loop nest. vector> LoopNest::compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, - const MachineParams ¶ms, + const Adams2019Params ¶ms, int v, bool in_realization) const { internal_assert(f); diff --git a/src/autoschedulers/adams2019/LoopNest.h b/src/autoschedulers/adams2019/LoopNest.h index b937d1133da7..e9cb9e872441 100644 --- a/src/autoschedulers/adams2019/LoopNest.h +++ b/src/autoschedulers/adams2019/LoopNest.h @@ -129,7 +129,7 @@ struct LoopNest { // Do a recursive walk over the loop nest computing features to feed the cost model. void compute_features(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, const StageMap &sites, int64_t instances, int64_t parallelism, @@ -189,7 +189,7 @@ struct LoopNest { void compute_here(const FunctionDAG::Node *f, bool tileable, int v); // Parallelize this loop according to the given tiling. - IntrusivePtr parallelize_in_tiles(const MachineParams ¶ms, + IntrusivePtr parallelize_in_tiles(const Adams2019Params ¶ms, const vector &tiling, const LoopNest *parent) const; @@ -197,7 +197,7 @@ struct LoopNest { // this loop nest. 
std::vector> compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, - const MachineParams ¶ms, + const Adams2019Params ¶ms, int v, bool in_realization) const; diff --git a/src/autoschedulers/adams2019/Makefile b/src/autoschedulers/adams2019/Makefile index 050bce258ebd..389565c7109d 100644 --- a/src/autoschedulers/adams2019/Makefile +++ b/src/autoschedulers/adams2019/Makefile @@ -52,7 +52,7 @@ $(BIN)/auto_schedule_runtime.a: $(BIN)/cost_model.generator $(BIN)/cost_model/%.a: $(BIN)/cost_model.generator @mkdir -p $(@D) - $^ -g $* -o $(BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime auto_schedule=false -e stmt,static_library,h,assembly + $^ -g $* -o $(BIN)/cost_model -f $* target=$(HL_TARGET)-no_runtime -e stmt,static_library,h,assembly # It's important to use dynamic lookups for undefined symbols here: all of libHalide # is expected to be present (in the loading binary), so we explicitly make the symbols @@ -107,11 +107,6 @@ $(BIN)/weightsdir_to_weightsfile: $(SRC)/weightsdir_to_weightsfile.cpp $(SRC)/We @mkdir -p $(@D) $(CXX) $(CXXFLAGS) $^ $(OPTIMIZE) -o $@ -# This is the value that machine_params defaults to if no custom value is specified; -# see MachineParams::generic() -HL_MACHINE_PARAMS ?= 32,25165824,160 - - # A sample generator to autoschedule. Note that if it statically links # to libHalide, then it must be build with $(USE_EXPORT_DYNAMIC), or the # autoscheduler can't find the libHalide symbols that it needs. 
@@ -123,7 +118,7 @@ $(GENERATOR_BIN)/demo.generator: $(SRC)/demo_generator.cpp $(GENERATOR_DEPS) $(BIN)/%/demo.a: $(GENERATOR_BIN)/demo.generator $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) @mkdir -p $(@D) HL_WEIGHTS_DIR=$(SRC)/baseline.weights \ - $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* auto_schedule=true -p $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) -s Adams2019 + $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* autoscheduler=Adams2019 autoscheduler.parallelism=32 -p $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) $(BIN)/%/demo.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/demo.registration.cpp $(BIN)/%/demo.a @mkdir -p $(@D) @@ -207,7 +202,7 @@ $(GENERATOR_BIN)/included_schedule_file_none.generator: $(SRC)/included_schedule $(BIN)/%/included_schedule_file.schedule.h: $(GENERATOR_BIN)/included_schedule_file_none.generator $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) @mkdir -p $(@D) HL_WEIGHTS_DIR=$(SRC)/baseline.weights \ - $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* auto_schedule=true -p $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) -s Adams2019 -e schedule + $< -g included_schedule_file -o $(@D) -f included_schedule_file target=$* autoscheduler=Adams2019 autoscheduler.parallelism=32 -p $(BIN)/libautoschedule_adams2019.$(SHARED_EXT) -e schedule # Note that this depends on included_schedule_file.schedule.h rather than $(BIN)/%/included_schedule_file.schedule.h -- # the former should be generated by something like diff --git a/src/autoschedulers/adams2019/State.cpp b/src/autoschedulers/adams2019/State.cpp index d85bf91ce6f6..e7cb410d7921 100644 --- a/src/autoschedulers/adams2019/State.cpp +++ b/src/autoschedulers/adams2019/State.cpp @@ -14,7 +14,7 @@ uint64_t State::structural_hash(int depth) const { return h; } -void State::compute_featurization(const FunctionDAG &dag, const MachineParams ¶ms, +void State::compute_featurization(const FunctionDAG &dag, const Adams2019Params ¶ms, StageMap 
*features, const CachingOptions &cache_options) { StageMap sites; sites.make_large(dag.nodes[0].stages[0].max_id); @@ -93,7 +93,7 @@ void State::compute_featurization(const FunctionDAG &dag, const MachineParams &p } } -void State::save_featurization(const FunctionDAG &dag, const MachineParams ¶ms, +void State::save_featurization(const FunctionDAG &dag, const Adams2019Params ¶ms, const CachingOptions &cache_options, std::ostream &out) { StageMap features; compute_featurization(dag, params, &features, cache_options); @@ -123,7 +123,7 @@ void State::save_featurization(const FunctionDAG &dag, const MachineParams ¶ } } -bool State::calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, +bool State::calculate_cost(const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model, const CachingOptions &cache_options, int64_t memory_limit, int verbosity) { StageMap features; @@ -200,7 +200,7 @@ IntrusivePtr State::make_child() const { // Generate the successor states to this state void State::generate_children(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int64_t memory_limit, std::function &&)> &accept_child, @@ -539,7 +539,7 @@ void State::dump(std::ostream &os) const { // Apply the schedule represented by this state to a Halide // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. 
-void State::apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms) { +void State::apply_schedule(const FunctionDAG &dag, const Adams2019Params ¶ms) { StageMap> state_map; root->apply(LoopLevel::root(), state_map, params.parallelism, 0, nullptr, nullptr); diff --git a/src/autoschedulers/adams2019/State.h b/src/autoschedulers/adams2019/State.h index 592b6db8930e..0cb0419fb96f 100644 --- a/src/autoschedulers/adams2019/State.h +++ b/src/autoschedulers/adams2019/State.h @@ -52,20 +52,20 @@ struct State { // Compute the featurization of this state (based on `root`), // and store features in `features`. Defers to `root->compute_features()`. void compute_featurization(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, StageMap *features, const CachingOptions &cache_options); // Calls `compute_featurization` and prints those features to `out`. void save_featurization(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, const CachingOptions &cache_options, std::ostream &out); // Performs some pruning to decide if this state is worth queuing in // the cost_model. If it is, calls `cost_model->enqueue` and returns true, // otherwise sets `cost` equal to a large value and returns false. - bool calculate_cost(const FunctionDAG &dag, const MachineParams ¶ms, + bool calculate_cost(const FunctionDAG &dag, const Adams2019Params ¶ms, CostModel *cost_model, const CachingOptions &cache_options, int64_t memory_limit, int verbosity = 99); @@ -79,7 +79,7 @@ struct State { // If they are not pruned by `calculate_cost()`, // then calls `accept_child()` on them. void generate_children(const FunctionDAG &dag, - const MachineParams ¶ms, + const Adams2019Params ¶ms, CostModel *cost_model, int64_t memory_limit, std::function &&)> &accept_child, @@ -92,7 +92,7 @@ struct State { // Pipeline. Also generate source code for the schedule for the // user to copy-paste to freeze this schedule as permanent artifact. 
// Also fills `schedule_source`. - void apply_schedule(const FunctionDAG &dag, const MachineParams ¶ms); + void apply_schedule(const FunctionDAG &dag, const Adams2019Params ¶ms); }; } // namespace Autoscheduler diff --git a/src/autoschedulers/adams2019/autotune_loop.sh b/src/autoschedulers/adams2019/autotune_loop.sh index f4fd01afa967..14805d830c1c 100755 --- a/src/autoschedulers/adams2019/autotune_loop.sh +++ b/src/autoschedulers/adams2019/autotune_loop.sh @@ -107,7 +107,6 @@ make_featurization() { HL_WEIGHTS_DIR=${WEIGHTS} \ HL_RANDOM_DROPOUT=${dropout} \ HL_BEAM_SIZE=${beam} \ - HL_MACHINE_PARAMS=32,24000000,40 \ ${TIMEOUT_CMD} -k ${COMPILATION_TIMEOUT} ${COMPILATION_TIMEOUT} \ ${GENERATOR} \ -g ${PIPELINE} \ @@ -115,11 +114,11 @@ make_featurization() { -o ${D} \ -e stmt,assembly,static_library,c_header,registration,schedule,featurization \ target=${HL_TARGET} \ - auto_schedule=true \ ${EXTRA_GENERATOR_ARGS} \ -p ${AUTOSCHED_BIN}/libautoschedule_adams2019.${SHARED_EXT} \ - -s Adams2019 \ - 2> ${D}/compile_log.txt || echo "Compilation failed or timed out for ${D}" + autoscheduler=Adams2019 \ + autoscheduler.parallelism=32 \ + 2> ${D}/compile_log.txt || echo "Compilation failed or timed out for ${D}" # We don't need image I/O for this purpose, diff --git a/src/autoschedulers/adams2019/cost_model_generator.cpp b/src/autoschedulers/adams2019/cost_model_generator.cpp index dfca665505b1..4ab6b59c1b57 100644 --- a/src/autoschedulers/adams2019/cost_model_generator.cpp +++ b/src/autoschedulers/adams2019/cost_model_generator.cpp @@ -123,7 +123,7 @@ class CostModel : public Generator> { using Input = GeneratorInput; template using Output = GeneratorOutput; - using Generator>::auto_schedule; + using Generator>::using_autoscheduler; using Generator>::get_pipeline; // Number of pipeline stages @@ -482,9 +482,9 @@ class CostModel : public Generator> { true_runtime.set_estimates({{0, 80}}); // SCHEDULE - if (training && !auto_schedule) { + if (training && 
!using_autoscheduler()) { do_cost_model_schedule(get_pipeline()); - } else if (auto_schedule) { + } else if (using_autoscheduler()) { // Do nothing. } else { // We just write down a good schedule for diff --git a/src/autoschedulers/adams2019/included_schedule_file_generator.cpp b/src/autoschedulers/adams2019/included_schedule_file_generator.cpp index 21ee6ec0918c..cdd2bc7f6bf3 100644 --- a/src/autoschedulers/adams2019/included_schedule_file_generator.cpp +++ b/src/autoschedulers/adams2019/included_schedule_file_generator.cpp @@ -37,7 +37,7 @@ struct IncludedScheduleFile : public Halide::Generator { relu.set_estimates({{0, CO}, {0, W}, {0, H}, {0, N}}); // Schedule - if (auto_schedule) { + if (using_autoscheduler()) { // nothing } else { #if defined(GENERATING_SCHEDULE) diff --git a/src/autoschedulers/adams2019/test.cpp b/src/autoschedulers/adams2019/test.cpp index 21e0f0ec20bb..a135c11fe63f 100644 --- a/src/autoschedulers/adams2019/test.cpp +++ b/src/autoschedulers/adams2019/test.cpp @@ -14,7 +14,7 @@ void set_env_variable(const std::string &name, const std::string &value, int ove #endif } -bool test_caching(Pipeline &p1, Pipeline &p2, const Target &target, const MachineParams ¶ms) { +bool test_caching(Pipeline &p1, Pipeline &p2, const Target &target) { static const std::string seed_value = Internal::get_env_variable("HL_SEED"); if (seed_value.empty()) { // If HL_SEED is not set, then set seed for both autoscheduling executions. @@ -22,15 +22,30 @@ bool test_caching(Pipeline &p1, Pipeline &p2, const Target &target, const Machin set_env_variable("HL_SEED", std::to_string(seed), /* overwrite */ 0); } + constexpr int parallelism = 32; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + MachineParams params(parallelism, 16000000, 40); +#else + AutoschedulerParams params = {"Adams2019", {{"parallelism", std::to_string(parallelism)}}}; +#endif + // Turn off caching. 
set_env_variable("HL_DISABLE_MEMOIZED_FEATURES", "1", /* overwrite */ 1); set_env_variable("HL_DISABLE_MEMOIZED_BLOCKS", "1", /* overwrite */ 1); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto results_without_caching = p1.auto_schedule(target, params); +#else + auto results_without_caching = p1.apply_autoscheduler(target, params); +#endif // Turn on caching. set_env_variable("HL_DISABLE_MEMOIZED_FEATURES", "0", /* overwrite */ 1); set_env_variable("HL_DISABLE_MEMOIZED_BLOCKS", "0", /* overwrite */ 1); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto results_with_caching = p2.auto_schedule(target, params); +#else + auto results_with_caching = p2.apply_autoscheduler(target, params); +#endif // Reset environment variables to what they were before (memoization variables are reset in main). if (seed_value.empty()) { @@ -65,7 +80,6 @@ int main(int argc, char **argv) { const std::string cache_features = Internal::get_env_variable("HL_DISABLE_MEMOIZED_FEATURES"); const std::string cache_blocks = Internal::get_env_variable("HL_DISABLE_MEMOIZED_BLOCKS"); - MachineParams params(32, 16000000, 40); // Use a fixed target for the analysis to get consistent results from this test. 
Target target("x86-64-linux-sse41-avx-avx2"); @@ -90,7 +104,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on point-wise pipeline" << std::endl; return 1; } @@ -123,7 +137,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on huge expensive stencils and low memory costs" << std::endl; return 1; } @@ -149,7 +163,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on moderate isotropic stencils" << std::endl; return 1; } @@ -175,7 +189,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on smaller footprint stencil" << std::endl; return 1; } @@ -207,7 +221,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on stencil chain" << std::endl; return 1; } @@ -231,7 +245,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on an outer product" << std::endl; return 1; } @@ -263,7 +277,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on a separable downsample" << std::endl; return 1; } @@ -295,7 +309,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on Func with multiple stages + loops" << std::endl; return 1; } @@ -332,7 +346,7 @@ int main(int argc, char **argv) { } } - if 
(!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on scan with pointwise stages before and after" << std::endl; return 1; } @@ -365,7 +379,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on bad vectorization" << std::endl; return 1; } @@ -397,7 +411,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on matrix multiply + wrapper" << std::endl; return 1; } @@ -440,7 +454,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(pipeline1, pipeline2, target, params)) { + if (!test_caching(pipeline1, pipeline2, target)) { std::cerr << "Caching check failed on scan + downsample" << std::endl; return 1; } @@ -473,7 +487,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on gather with LUT" << std::endl; return 1; } @@ -501,7 +515,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on 'compute inside an rvar'" << std::endl; return 1; } @@ -529,7 +543,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on alternating vectorized dimensions" << std::endl; return 1; } @@ -560,7 +574,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on no-win scenario" << std::endl; return 1; } @@ -585,7 +599,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching 
check failed on boring memcpy" << std::endl; return 1; } @@ -609,7 +623,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on load from a tiny input image" << std::endl; return 1; } @@ -640,7 +654,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on many-dimension func" << std::endl; return 1; } @@ -673,7 +687,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on long transpose chain" << std::endl; return 1; } @@ -711,7 +725,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on inlines + stencil chain" << std::endl; return 1; } @@ -738,7 +752,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on alternating vectorized dimensions" << std::endl; return 1; } @@ -766,7 +780,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on vectorizable with pure var using RoundUp" << std::endl; return 1; } @@ -812,7 +826,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on convolution pyramid" << std::endl; return 1; } @@ -844,7 +858,7 @@ int main(int argc, char **argv) { } } - if (!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on casted scan" << std::endl; return 1; } @@ -874,7 +888,7 @@ int main(int argc, char **argv) { } } - if 
(!test_caching(p1, p2, target, params)) { + if (!test_caching(p1, p2, target)) { std::cerr << "Caching check failed on histogram" << std::endl; return 1; } diff --git a/src/autoschedulers/adams2019/test_function_dag.cpp b/src/autoschedulers/adams2019/test_function_dag.cpp index 253307321ecc..0b4604b9500d 100644 --- a/src/autoschedulers/adams2019/test_function_dag.cpp +++ b/src/autoschedulers/adams2019/test_function_dag.cpp @@ -1,3 +1,4 @@ +#include "Featurization.h" #include "FunctionDAG.h" #include "Halide.h" #include @@ -31,7 +32,7 @@ extern "C" int mul_by_two( return 0; } -void test_coeff_wise(const MachineParams ¶ms, const Target &target) { +void test_coeff_wise(const Target &target) { Var x("x"), y("y"); std::ostringstream with_extern; @@ -55,7 +56,7 @@ void test_coeff_wise(const MachineParams ¶ms, const Target &target) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(with_extern); } @@ -70,7 +71,7 @@ void test_coeff_wise(const MachineParams ¶ms, const Target &target) { h.set_estimate(x, 0, 1000).set_estimate(y, 0, 1000); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(without_extern); } @@ -113,7 +114,7 @@ extern "C" int matmul( return 0; } -void test_matmul(const MachineParams ¶ms, const Target &target) { +void test_matmul(const Target &target) { Var x("x"), y("y"), k("k"); RDom r(0, 200); Halide::Buffer input1(200, 200); @@ -140,7 +141,7 @@ void test_matmul(const MachineParams ¶ms, const Target &target) { h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(with_extern); } 
@@ -153,7 +154,7 @@ void test_matmul(const MachineParams ¶ms, const Target &target) { h.set_estimate(x, 0, 200).set_estimate(y, 0, 200); std::vector v; v.push_back(h.function()); - Halide::Internal::Autoscheduler::FunctionDAG d(v, params, target); + Halide::Internal::Autoscheduler::FunctionDAG d(v, target); d.dump(without_extern); } @@ -164,11 +165,10 @@ void test_matmul(const MachineParams ¶ms, const Target &target) { int main(int argc, char **argv) { // Use a fixed target for the analysis to get consistent results from this test. - MachineParams params(32, 16000000, 40); Target target("x86-64-linux-sse41-avx-avx2"); - test_coeff_wise(params, target); - test_matmul(params, target); + test_coeff_wise(target); + test_matmul(target); return 0; } diff --git a/src/autoschedulers/li2018/GradientAutoscheduler.cpp b/src/autoschedulers/li2018/GradientAutoscheduler.cpp index d2068c3c6a57..aed686ed51c8 100644 --- a/src/autoschedulers/li2018/GradientAutoscheduler.cpp +++ b/src/autoschedulers/li2018/GradientAutoscheduler.cpp @@ -8,6 +8,11 @@ namespace Autoscheduler { namespace { +struct GradientAutoschedulerParams { + /** Maximum level of parallelism available. 
*/ + int parallelism = 16; +}; + std::map inference_bounds(const std::vector &functions, const std::vector &output_bounds) { std::vector funcs; @@ -86,7 +91,7 @@ int natural_vector_size(const Target &target, const Type &t) { template void parallelize_vars_and_rvars_gpu( - const MachineParams ¶ms, + const GradientAutoschedulerParams ¶ms, FuncOrStage func_or_stage, bool is_pure_def, const std::vector &vars, @@ -324,7 +329,7 @@ void parallelize_vars_and_rvars_gpu( template void parallelize_vars_and_rvars_cpu( - const MachineParams ¶ms, + const GradientAutoschedulerParams ¶ms, FuncOrStage func_or_stage, int natural_vector_size, bool is_pure_def, @@ -528,7 +533,7 @@ void parallelize_vars_and_rvars_cpu( template void parallelize_vars_and_rvars( - const MachineParams ¶ms, + const GradientAutoschedulerParams ¶ms, FuncOrStage func_or_stage, int natural_vector_size, bool is_pure_def, @@ -565,7 +570,7 @@ void parallelize_vars_and_rvars( } } -void apply_schedule(const MachineParams ¶ms, +void apply_schedule(const GradientAutoschedulerParams ¶ms, const Target &target, Func func, int update_id, @@ -817,7 +822,7 @@ void apply_schedule(const MachineParams ¶ms, void generate_schedule(const std::vector &outputs, const Target &target, - const MachineParams ¶ms, + const GradientAutoschedulerParams ¶ms, AutoSchedulerResults *auto_scheduler_results) { // The first few steps are the same as src/AutoSchedule.cpp // Make an environment map which is used throughout the auto scheduling process. 
@@ -919,19 +924,45 @@ void generate_schedule(const std::vector &outputs, } } +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API auto_scheduler_results->scheduler_name = "Li2018"; +#endif auto_scheduler_results->schedule_source = schedule_source.str(); debug(1) << schedule_source.str() << "\n"; } struct Li2018 { - void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms, AutoSchedulerResults *results) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + void operator()(const Pipeline &p, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *results) { std::vector outputs; for (const Func &f : p.outputs()) { outputs.push_back(f.function()); } + GradientAutoschedulerParams params; + params.parallelism = params_in.parallelism; + generate_schedule(outputs, target, params, results); + } +#else + void operator()(const Pipeline &p, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *results) { + internal_assert(params_in.name == "Li2018"); + // Verify that no unknown keys are set in params_in + const std::set legal_keys = {"parallelism"}; + for (const auto &it : params_in.extra) { + user_assert(legal_keys.count(it.first) == 1) << "The key " << it.first << " is not legal to use for the Li2018 Autoscheduler."; + } + + std::vector outputs; + for (const Func &f : p.outputs()) { + outputs.push_back(f.function()); + } + GradientAutoschedulerParams params; + if (params_in.extra.count("parallelism")) { + params.parallelism = std::stoi(params_in.extra.at("parallelism")); + } generate_schedule(outputs, target, params, results); + results->autoscheduler_params = params_in; } +#endif }; REGISTER_AUTOSCHEDULER(Li2018) diff --git a/src/autoschedulers/li2018/Makefile b/src/autoschedulers/li2018/Makefile index 2dc6a1aed289..8bf442918ae2 100644 --- a/src/autoschedulers/li2018/Makefile +++ b/src/autoschedulers/li2018/Makefile @@ -35,7 +35,7 @@ $(GENERATOR_BIN)/demo.generator: $(SRC)/demo_generator.cpp $(GENERATOR_DEPS) # Use the -p 
flag to the generator to load the autoscheduler as a plugin $(BIN)/%/demo.a: $(GENERATOR_BIN)/demo.generator $(BIN)/libautoschedule_li2018.$(SHARED_EXT) @mkdir -p $(@D) - $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* auto_schedule=true -p $(BIN)/libautoschedule_li2018.$(SHARED_EXT) -s Li2018 + $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* autoscheduler=Li2018 -p $(BIN)/libautoschedule_li2018.$(SHARED_EXT) $(BIN)/%/demo.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/demo.registration.cpp $(BIN)/%/demo.a @mkdir -p $(@D) diff --git a/src/autoschedulers/li2018/test.cpp b/src/autoschedulers/li2018/test.cpp index 6518cda38960..f3fb11f7cca7 100644 --- a/src/autoschedulers/li2018/test.cpp +++ b/src/autoschedulers/li2018/test.cpp @@ -10,7 +10,13 @@ int main(int argc, char **argv) { load_plugin(argv[1]); - MachineParams params(32, 16000000, 40); + constexpr int parallelism = 32; +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + MachineParams params(parallelism, 16000000, 40); +#else + AutoschedulerParams params = {"Li2018", {{"parallelism", std::to_string(parallelism)}}}; +#endif + Target target; Var x("x"), y("y"); @@ -27,8 +33,11 @@ int main(int argc, char **argv) { f2.set_estimate(x, 0, 10000); - AutoSchedulerResults result = - Pipeline(f2).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f2).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(f2).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 1D pointwise operations:\n" << result.schedule_source << "\n\n"; } @@ -46,8 +55,11 @@ int main(int argc, char **argv) { f2.set_estimate(x, 0, 1000) .set_estimate(y, 0, 1000); - AutoSchedulerResults result = - Pipeline(f2).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f2).auto_schedule(target, params); +#else + AutoSchedulerResults result = 
Pipeline(f2).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D pointwise operations:\n" << result.schedule_source << "\n\n"; } @@ -61,8 +73,11 @@ int main(int argc, char **argv) { f0.set_estimate(x, 0, 1000); - AutoSchedulerResults result = - Pipeline(f0).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f0).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(f0).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 1D convolution:\n" << result.schedule_source << "\n\n"; } @@ -77,8 +92,11 @@ int main(int argc, char **argv) { f0.set_estimate(x, 0, 1000) .set_estimate(y, 0, 1000); - AutoSchedulerResults result = - Pipeline(f0).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f0).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(f0).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D convolution:\n" << result.schedule_source << "\n\n"; } @@ -93,8 +111,11 @@ int main(int argc, char **argv) { hist.set_estimate(x, 0, 10); - AutoSchedulerResults result = - Pipeline(hist).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(hist).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(hist).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 1D histogram:\n" << result.schedule_source << "\n\n"; } @@ -109,8 +130,11 @@ int main(int argc, char **argv) { hist.set_estimate(x, 0, 10); - AutoSchedulerResults result = - Pipeline(hist).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(hist).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(hist).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D 
histogram:\n" << result.schedule_source << "\n\n"; } @@ -125,8 +149,11 @@ int main(int argc, char **argv) { hist.set_estimate(x, 0, 10000); - AutoSchedulerResults result = - Pipeline(hist).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(hist).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(hist).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D histogram with larger domain:\n" << result.schedule_source << "\n\n"; } @@ -146,8 +173,11 @@ int main(int argc, char **argv) { f2.set_estimate(y, 0, 1024) .set_estimate(x, 0, 4); - AutoSchedulerResults result = - Pipeline(f2).auto_schedule(target, params); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + AutoSchedulerResults result = Pipeline(f2).auto_schedule(target, params); +#else + AutoSchedulerResults result = Pipeline(f2).apply_autoscheduler(target, params); +#endif std::cout << "Schedule for 2D pointwise operations with small x dimension:\n" << result.schedule_source << "\n\n"; } diff --git a/src/autoschedulers/li2018/test.py b/src/autoschedulers/li2018/test.py index 31971ba556d8..72afc9334540 100644 --- a/src/autoschedulers/li2018/test.py +++ b/src/autoschedulers/li2018/test.py @@ -17,9 +17,8 @@ def main(): f_2.set_estimate(x, 0, 1000) p = hl.Pipeline(f_2) target = hl.Target() - # Only first parameter is used (number of cores on CPU) - params = hl.MachineParams(32, 0, 0); - result = p.auto_schedule('Li2018', target, params) + asp = hl.AutoschedulerParams('Li2018', {'parallelism': 32}) + result = p.apply_autoscheduler(target, asp) print('Schedule:') print(result.schedule_source) diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index 6253b8229c46..3fc82e293508 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -20,6 +20,18 @@ using std::vector; namespace { 
+struct ArchParams { + /** Maximum level of parallelism available. */ + int parallelism = 16; + + /** Size of the last-level cache (in bytes). */ + uint64_t last_level_cache_size = 16 * 1024 * 1024; + + /** Indicates how much more expensive is the cost of a load compared to + * the cost of an arithmetic operation at last level cache. */ + float balance = 40; +}; + // Substitute parameter estimates into the exprs describing the box bounds. void substitute_estimates_box(Box &box) { box.used = substitute_var_estimates(box.used); @@ -1054,7 +1066,7 @@ struct Partitioner { const map &pipeline_bounds; // Parameters of the machine model that is used for estimating the cost of each // group in the pipeline. - const MachineParams &arch_params; + const ArchParams &arch_params; // Dependency analysis of the pipeline. This support queries on regions // accessed and computed for producing some regions of some functions. DependenceAnalysis &dep_analysis; @@ -1065,7 +1077,7 @@ struct Partitioner { const vector &outputs; Partitioner(const map &_pipeline_bounds, - const MachineParams &_arch_params, + const ArchParams &_arch_params, const vector &_outputs, DependenceAnalysis &_dep_analysis, RegionCosts &_costs); @@ -1305,7 +1317,7 @@ void Partitioner::disp_pipeline_costs() { // Construct a partitioner and build the pipeline graph on which the grouping // algorithm operates. Partitioner::Partitioner(const map &_pipeline_bounds, - const MachineParams &_arch_params, + const ArchParams &_arch_params, const vector &_outputs, DependenceAnalysis &_dep_analysis, RegionCosts &_costs) @@ -3166,7 +3178,7 @@ bool inline_unbounded(const vector &outputs, // outputs. This applies the schedules and returns a string representation of // the schedules. The target architecture is specified by 'target'. 
string generate_schedules(const vector &outputs, const Target &target, - const MachineParams &arch_params) { + const ArchParams &arch_params) { // Make an environment map which is used throughout the auto scheduling process. map env; for (const Function &f : outputs) { @@ -3372,21 +3384,56 @@ string generate_schedules(const vector &outputs, const Target &target, } struct Mullapudi2016 { - void operator()(const Pipeline &pipeline, const Target &target, const MachineParams &arch_params, AutoSchedulerResults *outputs) { +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + void operator()(const Pipeline &pipeline, const Target &target, const MachineParams ¶ms_in, AutoSchedulerResults *outputs) { AutoSchedulerResults results; results.target = target; - results.machine_params_string = arch_params.to_string(); + results.machine_params_string = params_in.to_string(); results.scheduler_name = "Mullapudi2016"; std::vector pipeline_outputs; for (const Func &f : pipeline.outputs()) { pipeline_outputs.push_back(f.function()); } + ArchParams arch_params{params_in.parallelism, params_in.last_level_cache_size, params_in.balance}; results.schedule_source = generate_schedules(pipeline_outputs, target, arch_params); // this autoscheduler has no featurization + *outputs = std::move(results); + } +#else + void operator()(const Pipeline &pipeline, const Target &target, const AutoschedulerParams ¶ms_in, AutoSchedulerResults *outputs) { + internal_assert(params_in.name == "Mullapudi2016"); + // Verify that no unknown keys are set in params_in + const std::set legal_keys = {"parallelism", "last_level_cache_size", "balance"}; + for (const auto &it : params_in.extra) { + user_assert(legal_keys.count(it.first) == 1) << "The key " << it.first << " is not legal to use for the Mullapudi2016 Autoscheduler."; + } - *outputs = results; + AutoSchedulerResults results; + results.target = target; + results.autoscheduler_params = params_in; + + std::vector pipeline_outputs; + for (const Func &f : 
pipeline.outputs()) { + pipeline_outputs.push_back(f.function()); + } + + ArchParams arch_params; + if (params_in.extra.count("parallelism")) { + arch_params.parallelism = std::stoi(params_in.extra.at("parallelism")); + } + if (params_in.extra.count("last_level_cache_size")) { + arch_params.last_level_cache_size = (uint64_t)std::stol(params_in.extra.at("last_level_cache_size")); + } + if (params_in.extra.count("balance")) { + arch_params.balance = std::stoi(params_in.extra.at("balance")); + } + results.schedule_source = generate_schedules(pipeline_outputs, target, arch_params); + results.autoscheduler_params = params_in; + // this autoscheduler has no featurization + *outputs = std::move(results); } +#endif }; REGISTER_AUTOSCHEDULER(Mullapudi2016) diff --git a/test/auto_schedule/cost_function.cpp b/test/auto_schedule/cost_function.cpp index 683785e9914e..7200c9348f5d 100644 --- a/test/auto_schedule/cost_function.cpp +++ b/test/auto_schedule/cost_function.cpp @@ -48,7 +48,11 @@ int main(int argc, char **argv) { // Auto-schedule the pipeline Target target = get_jit_target_from_environment(); Pipeline p(stencils[num_stencils - 1]); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API AutoSchedulerResults results = p.auto_schedule(target); +#else + AutoSchedulerResults results = p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif std::cout << "\n\n******************************************\nSCHEDULE:\n" << "******************************************\n" diff --git a/test/auto_schedule/data_dependent.cpp b/test/auto_schedule/data_dependent.cpp index 5a54626c4763..828a1061cd3e 100644 --- a/test/auto_schedule/data_dependent.cpp +++ b/test/auto_schedule/data_dependent.cpp @@ -40,7 +40,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); diff 
--git a/test/auto_schedule/extern.cpp b/test/auto_schedule/extern.cpp index 02fe11582c4e..8cd4b5181c2c 100644 --- a/test/auto_schedule/extern.cpp +++ b/test/auto_schedule/extern.cpp @@ -52,7 +52,11 @@ void test_case_1() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); @@ -82,7 +86,11 @@ void test_case_2() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); @@ -114,7 +122,11 @@ void test_case_3() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); diff --git a/test/auto_schedule/fibonacci.cpp b/test/auto_schedule/fibonacci.cpp index a394af50a921..0d2a05a3001b 100644 --- a/test/auto_schedule/fibonacci.cpp +++ b/test/auto_schedule/fibonacci.cpp @@ -22,7 +22,11 @@ double run_test(bool auto_schedule) { if (auto_schedule) { // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } // Inspect the schedule diff --git a/test/auto_schedule/histogram.cpp b/test/auto_schedule/histogram.cpp index c51cac7436b4..0cc4f151030b 100644 --- a/test/auto_schedule/histogram.cpp +++ b/test/auto_schedule/histogram.cpp @@ -64,7 +64,11 @@ double run_test(bool auto_schedule) { // Provide estimates on the pipeline output color.set_estimates({{0, 1920}, {0, 1024}, {0, 3}}); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + 
p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else if (target.has_gpu_feature()) { Var xi("xi"), yi("yi"); Y.compute_root().gpu_tile(x, y, xi, yi, 16, 16); diff --git a/test/auto_schedule/large_window.cpp b/test/auto_schedule/large_window.cpp index 2626b9a2508b..c449d7136873 100644 --- a/test/auto_schedule/large_window.cpp +++ b/test/auto_schedule/large_window.cpp @@ -46,7 +46,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); diff --git a/test/auto_schedule/mat_mul.cpp b/test/auto_schedule/mat_mul.cpp index 07e5fefce2ca..73bac853d393 100644 --- a/test/auto_schedule/mat_mul.cpp +++ b/test/auto_schedule/mat_mul.cpp @@ -40,7 +40,11 @@ double run_test(bool auto_schedule) { // Provide estimates on the pipeline output out.set_estimate(x, 0, size).set_estimate(y, 0, size); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else if (target.has_gpu_feature()) { Var xi("xi"), yi("yi"), xii("xii"), yii("yii"), xt("xt"), yt("yt"); out.tile(x, y, xi, yi, 8, 8).unroll(xi).unroll(yi).gpu_tile(x, y, xt, yt, 8, 8); diff --git a/test/auto_schedule/max_filter.cpp b/test/auto_schedule/max_filter.cpp index fa9b72706d5d..f9d7e0854012 100644 --- a/test/auto_schedule/max_filter.cpp +++ b/test/auto_schedule/max_filter.cpp @@ -72,7 +72,11 @@ double run_test(bool auto_schedule) { .set_estimate(y, 0, in.height()) .set_estimate(c, 0, in.channels()); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else if (target.has_gpu_feature()) { slice_for_radius.compute_root(); filter_height.compute_root(); diff --git 
a/test/auto_schedule/multi_output.cpp b/test/auto_schedule/multi_output.cpp index f00f4ee09fa3..3ad372568e13 100644 --- a/test/auto_schedule/multi_output.cpp +++ b/test/auto_schedule/multi_output.cpp @@ -44,10 +44,14 @@ int main(int argc, char **argv) { std::vector outs; outs.push_back(h); outs.push_back(g); - Pipeline test(outs); + Pipeline p(outs); Target target = get_jit_target_from_environment(); - test.auto_schedule(target); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule h.print_loop_nest(); @@ -56,7 +60,7 @@ int main(int argc, char **argv) { Buffer out_1(999, 999), out_2(999, 999); // Run the schedule - test.realize({out_1, out_2}); + p.realize({out_1, out_2}); printf("Success!\n"); return 0; diff --git a/test/auto_schedule/overlap.cpp b/test/auto_schedule/overlap.cpp index 8fe4a0b5aa1f..2f747879244f 100644 --- a/test/auto_schedule/overlap.cpp +++ b/test/auto_schedule/overlap.cpp @@ -50,7 +50,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(up[num_levels - 1]); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule up[num_levels - 1].print_loop_nest(); diff --git a/test/auto_schedule/param.cpp b/test/auto_schedule/param.cpp index 1db0458d0e2f..7102e1d61217 100644 --- a/test/auto_schedule/param.cpp +++ b/test/auto_schedule/param.cpp @@ -23,7 +23,11 @@ void run_test_1() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); @@ -50,7 +54,11 @@ void run_test_2() { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); 
+#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); @@ -77,7 +85,11 @@ void run_test_3() { Target target = get_jit_target_from_environment(); Pipeline p(output); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule output.print_loop_nest(); @@ -107,7 +119,11 @@ void run_test_4() { Target target = get_jit_target_from_environment(); Pipeline p(output); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule output.print_loop_nest(); diff --git a/test/auto_schedule/reorder.cpp b/test/auto_schedule/reorder.cpp index 24c4893051f7..ba15be2544aa 100644 --- a/test/auto_schedule/reorder.cpp +++ b/test/auto_schedule/reorder.cpp @@ -27,7 +27,11 @@ double run_test_1(bool auto_schedule) { // Provide estimates on the pipeline output r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}}); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else { /* r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4); @@ -79,7 +83,11 @@ double run_test_2(bool auto_schedule) { // Provide estimates on the pipeline output diff.set_estimates({{0, left_im.width()}, {0, left_im.height()}, {0, 32}, {0, 3}}); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else { Var t("t"); diff.reorder(c, z).fuse(c, z, t).parallel(t).vectorize(x, 16); @@ -118,7 +126,11 @@ double run_test_3(bool auto_schedule) { // Provide estimates on the pipeline output r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}}); // Auto-schedule the pipeline +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API 
p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif } else { Var par("par"); r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4); diff --git a/test/auto_schedule/small_pure_update.cpp b/test/auto_schedule/small_pure_update.cpp index 4ef2649048ee..3954c257015a 100644 --- a/test/auto_schedule/small_pure_update.cpp +++ b/test/auto_schedule/small_pure_update.cpp @@ -28,8 +28,13 @@ int main(int argc, char **argv) { h.set_estimates({{0, 13}, {0, 17}}); in_param.set_estimates({{0, 13}, {0, 17}}); + Target target = get_target_from_environment(); Pipeline p(h); - p.auto_schedule(Target("host")); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif in_param.set(in); diff --git a/test/auto_schedule/tile_vs_inline.cpp b/test/auto_schedule/tile_vs_inline.cpp index 1c067cd81ab7..01ebaa15baca 100644 --- a/test/auto_schedule/tile_vs_inline.cpp +++ b/test/auto_schedule/tile_vs_inline.cpp @@ -44,7 +44,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(g); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule g.print_loop_nest(); diff --git a/test/auto_schedule/unused_func.cpp b/test/auto_schedule/unused_func.cpp index bac796b6baa3..406ba438f0c9 100644 --- a/test/auto_schedule/unused_func.cpp +++ b/test/auto_schedule/unused_func.cpp @@ -28,7 +28,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(f); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule f.print_loop_nest(); diff --git a/test/auto_schedule/vectorize_var_in_update.cpp b/test/auto_schedule/vectorize_var_in_update.cpp index 8b0f6881220f..13f9bf155bb9 
100644 --- a/test/auto_schedule/vectorize_var_in_update.cpp +++ b/test/auto_schedule/vectorize_var_in_update.cpp @@ -50,7 +50,11 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); Pipeline p(h); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif // Inspect the schedule h.print_loop_nest(); diff --git a/test/correctness/custom_auto_scheduler.cpp b/test/correctness/custom_auto_scheduler.cpp index 32eec8b25dae..cda182861340 100644 --- a/test/correctness/custom_auto_scheduler.cpp +++ b/test/correctness/custom_auto_scheduler.cpp @@ -6,7 +6,11 @@ int call_count = 0; void inline_everything(const Pipeline &, const Target &, +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API const MachineParams &, +#else + const AutoschedulerParams &, +#endif AutoSchedulerResults *) { call_count++; // Inlining everything is really easy. @@ -22,13 +26,22 @@ int main(int argc, char **argv) { Func f; Var x; f(x) = 3; - Pipeline(f).auto_schedule(kSchedulerName, Target("host")); - - Pipeline::set_default_autoscheduler_name(kSchedulerName); Func g; g(x) = 3; - Pipeline(g).auto_schedule(Target("host")); + + Target t("host"); + +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + Pipeline(f).auto_schedule(kSchedulerName, t); + + Pipeline::set_default_autoscheduler_name(kSchedulerName); + Pipeline(g).auto_schedule(t); +#else + AutoschedulerParams autoscheduler_params(kSchedulerName); + Pipeline(f).apply_autoscheduler(t, autoscheduler_params); + Pipeline(g).apply_autoscheduler(t, autoscheduler_params); +#endif if (call_count != 2) { printf("Should have called the custom autoscheduler twice. 
Instead called it %d times\n", call_count); diff --git a/test/error/auto_schedule_no_parallel.cpp b/test/error/auto_schedule_no_parallel.cpp index 74e2b269025f..2519619a3b1b 100644 --- a/test/error/auto_schedule_no_parallel.cpp +++ b/test/error/auto_schedule_no_parallel.cpp @@ -25,7 +25,11 @@ int main(int argc, char **argv) { // This should throw an error since auto-scheduler does not currently // support partial schedules +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif printf("Success!\n"); return 0; diff --git a/test/error/auto_schedule_no_reorder.cpp b/test/error/auto_schedule_no_reorder.cpp index 8f39114ee9ea..d9fb344473e4 100644 --- a/test/error/auto_schedule_no_reorder.cpp +++ b/test/error/auto_schedule_no_reorder.cpp @@ -25,7 +25,11 @@ int main(int argc, char **argv) { // This should throw an error since auto-scheduler does not currently // support partial schedules +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API p.auto_schedule(target); +#else + p.apply_autoscheduler(target, {"Mullapudi2016"}); +#endif printf("Success!\n"); return 0; diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index e6f82c285641..3097fc8b6955 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -109,10 +109,17 @@ endif () # alias_aottest.cpp # alias_generator.cpp -halide_define_aot_test(alias EXTRA_LIBS alias_with_offset_42) -add_halide_library(alias_with_offset_42 - FROM alias.generator - GENERATOR alias_with_offset_42) +set(ALIAS_LIBS alias_with_offset_42 alias_Adams2019 alias_Li2018 alias_Mullapudi2016) +halide_define_aot_test(alias EXTRA_LIBS ${ALIAS_LIBS}) +foreach (LIB IN LISTS ALIAS_LIBS) + # We don't really need all the plugins at once here -- + # It's just easier to specify them all (and adds a test that loading + # multiple plugins works) + add_halide_library(${LIB} + FROM alias.generator + GENERATOR ${LIB} + PLUGINS Halide::Adams2019 
Halide::Li2018 Halide::Mullapudi2016) +endforeach () # argvcall_aottest.cpp # argvcall_generator.cpp diff --git a/test/generator/alias_aottest.cpp b/test/generator/alias_aottest.cpp index 80c2f61a9602..41c1a9f0ae80 100644 --- a/test/generator/alias_aottest.cpp +++ b/test/generator/alias_aottest.cpp @@ -6,6 +6,13 @@ #include "alias.h" #include "alias_with_offset_42.h" +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +// nothing +#else +#include "alias_Adams2019.h" +#include "alias_Li2018.h" +#include "alias_Mullapudi2016.h" +#endif using namespace Halide::Runtime; @@ -18,16 +25,45 @@ int main(int argc, char **argv) { input(x) = x; }); + output.fill(0); alias(input, output); + output.copy_to_host(); input.for_each_element([=](int x) { assert(output(x) == input(x)); }); + output.fill(0); alias_with_offset_42(input, output); + output.copy_to_host(); input.for_each_element([=](int x) { assert(output(x) == input(x) + 42); }); +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API + // nothing +#else + output.fill(0); + alias_Adams2019(input, output); + output.copy_to_host(); + input.for_each_element([=](int x) { + assert(output(x) == input(x) + 2019); + }); + + output.fill(0); + alias_Li2018(input, output); + output.copy_to_host(); + input.for_each_element([=](int x) { + assert(output(x) == input(x) + 2018); + }); + + output.fill(0); + output.copy_to_host(); + alias_Mullapudi2016(input, output); + input.for_each_element([=](int x) { + assert(output(x) == input(x) + 2016); + }); +#endif + printf("Success!\n"); return 0; } diff --git a/test/generator/alias_generator.cpp b/test/generator/alias_generator.cpp index 84d3e803709f..5661588229d6 100644 --- a/test/generator/alias_generator.cpp +++ b/test/generator/alias_generator.cpp @@ -11,6 +11,15 @@ class Alias : public Halide::Generator { void generate() { Var x; output(x) = input(x) + offset; + + // set estimates for the autoschedulers + input.set_estimates({{0, 32}}); + output.set_estimates({{0, 32}}); + + if (!using_autoscheduler()) { 
+ // Don't really need a default schedule for something this simple, but sure, why not + output.vectorize(x, natural_vector_size()).compute_root(); + } } }; @@ -18,3 +27,12 @@ class Alias : public Halide::Generator { HALIDE_REGISTER_GENERATOR(Alias, alias) HALIDE_REGISTER_GENERATOR_ALIAS(alias_with_offset_42, alias, {{"offset", "42"}}) +#ifdef HALIDE_ALLOW_LEGACY_AUTOSCHEDULER_API +// nothing +#else +// Since autoscheduler-to-use is now an ordinary GeneratorParam, we can specify it in Aliases for convenience. +// (Set unique offsets just to verify these are all separate calls.) +HALIDE_REGISTER_GENERATOR_ALIAS(alias_Adams2019, alias, {{"autoscheduler", "Adams2019"}, {"offset", "2019"}}) +HALIDE_REGISTER_GENERATOR_ALIAS(alias_Li2018, alias, {{"autoscheduler", "Li2018"}, {"offset", "2018"}}) +HALIDE_REGISTER_GENERATOR_ALIAS(alias_Mullapudi2016, alias, {{"autoscheduler", "Mullapudi2016"}, {"offset", "2016"}}) +#endif diff --git a/test/generator/example_generator.cpp b/test/generator/example_generator.cpp index 41ab28e8da2d..9997b6ccfcad 100644 --- a/test/generator/example_generator.cpp +++ b/test/generator/example_generator.cpp @@ -81,7 +81,7 @@ class Example : public Halide::Generator { runtime_factor.set_estimate(1); output.set_estimates({{0, 32}, {0, 32}, {0, 3}}); - if (!auto_schedule) { + if (!using_autoscheduler()) { output .bound(c, 0, channels) .reorder(c, x, y) diff --git a/test/generator/stubtest_generator.cpp b/test/generator/stubtest_generator.cpp index bbb68aaadc17..8f5b41640e41 100644 --- a/test/generator/stubtest_generator.cpp +++ b/test/generator/stubtest_generator.cpp @@ -92,7 +92,7 @@ class StubTest : public Halide::Generator { } void schedule() { - if (!auto_schedule) { + if (!using_autoscheduler()) { intermediate.compute_at(intermediate_level); intermediate.specialize(vectorize).vectorize(x, natural_vector_size()); } diff --git a/test/generator/stubtest_jittest.cpp b/test/generator/stubtest_jittest.cpp index 2973501941f4..1c0aa3f8fc14 100644 --- 
a/test/generator/stubtest_jittest.cpp +++ b/test/generator/stubtest_jittest.cpp @@ -142,7 +142,7 @@ int main(int argc, char **argv) { // from the specific inputs we provide, but for the JIT (and AOT) cases, there are // no such inputs available, so we must be explicit. (Note that these are the same // values specified in our Make/CMake files.) - const std::map gp = { + const GeneratorParamsMap gp = { {"untyped_buffer_input.type", "uint8"}, {"untyped_buffer_input.dim", "3"}, {"simple_input.type", "float32"}, @@ -217,7 +217,7 @@ int main(int argc, char **argv) { // from the specific inputs we provide, but for the JIT (and AOT) cases, there are // no such inputs available, so we must be explicit. (Note that these are the same // values specified in our Make/CMake files.) - const std::map gp = { + const GeneratorParamsMap gp = { {"untyped_buffer_input.type", "uint8"}, {"untyped_buffer_input.dim", "3"}, {"simple_input.type", "float32"}, diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt index 7d86ae5d6ae3..2db06c7d0dfa 100644 --- a/tutorial/CMakeLists.txt +++ b/tutorial/CMakeLists.txt @@ -194,11 +194,14 @@ if (TARGET Halide::Mullapudi2016) add_halide_library(auto_schedule_false FROM lesson_21_auto_scheduler_generate TARGETS cmake - GENERATOR auto_schedule_gen PARAMS auto_schedule=false) + GENERATOR auto_schedule_gen) add_halide_library(auto_schedule_true FROM lesson_21_auto_scheduler_generate TARGETS cmake AUTOSCHEDULER Halide::Mullapudi2016 - GENERATOR auto_schedule_gen PARAMS machine_params=32,16777216,40) + GENERATOR auto_schedule_gen + PARAMS autoscheduler.parallelism=32 + autoscheduler.last_level_cache_size=16777216 + autoscheduler.balance=40) add_executable(lesson_21_auto_scheduler_run lesson_21_auto_scheduler_run.cpp) target_link_libraries(lesson_21_auto_scheduler_run PRIVATE diff --git a/tutorial/lesson_21_auto_scheduler_generate.cpp b/tutorial/lesson_21_auto_scheduler_generate.cpp index 44a1bcac6aea..4258599e8d58 100644 --- 
a/tutorial/lesson_21_auto_scheduler_generate.cpp +++ b/tutorial/lesson_21_auto_scheduler_generate.cpp @@ -2,7 +2,7 @@ // So far we have written Halide schedules by hand, but it is also possible to // ask Halide to suggest a reasonable schedule. We call this auto-scheduling. -// This lesson demonstrates how to use the auto-scheduler to generate a +// This lesson demonstrates how to use the autoscheduler to generate a // copy-pasteable CPU schedule that can be subsequently improved upon. // On linux or os x, you can compile and run it like so: @@ -11,7 +11,7 @@ // export LD_LIBRARY_PATH= # For linux // export DYLD_LIBRARY_PATH= # For OS X // ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_false -e static_library,h,schedule target=host auto_schedule=false -// ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_true -e static_library,h,schedule -p -S Mullapudi2016 target=host auto_schedule=true machine_params=32,16777216,40 +// ./lesson_21_generate -o . -g auto_schedule_gen -f auto_schedule_true -e static_library,h,schedule -p -S Mullapudi2016 target=host autoscheduler=Mullapudi2016 autoscheduler.parallelism=32 autoscheduler.last_level_cache_size=16777216 autoscheduler.balance=40 // g++ lesson_21_auto_scheduler_run.cpp -std=c++17 -I -I auto_schedule_false.a auto_schedule_true.a -ldl -lpthread -o lesson_21_run // ./lesson_21_run @@ -69,8 +69,8 @@ class AutoScheduled : public Halide::Generator { } void schedule() { - if (auto_schedule) { - // The auto-scheduler requires estimates on all the input/output + if (using_autoscheduler()) { + // The autoscheduler requires estimates on all the input/output // sizes and parameter values in order to compare different // alternatives and decide on a good schedule. @@ -95,31 +95,33 @@ class AutoScheduled : public Halide::Generator { // schedule will be. 
// To auto-schedule the pipeline, we don't have to do anything else: - // every Generator implicitly has a GeneratorParam named "auto_schedule"; - // if this is set to true, Halide will call auto_schedule() on all of - // our pipeline's outputs automatically. - - // Every Generator also implicitly has a GeneratorParams named "machine_params", - // which allows you to specify characteristics of the machine architecture - // for the auto-scheduler; it's generally specified in your Makefile. + // every Generator implicitly has a GeneratorParam named "auto_scheduler.name"; + // if this is set to the name of the Autoscheduler we want to use, Halide will + // apply it to all of our pipeline's outputs automatically. + + // Every Generator also implicitly has additional, optional GeneratorParams that are + // dependent on the specific Autoscheduler selected, which allows you to specify + // characteristics of the machine architecture + // for the autoscheduler; it's generally specified in your Makefile. // If none is specified, the default machine parameters for a generic CPU - // architecture will be used by the auto-scheduler. + // architecture will be used by the autoscheduler. - // Let's see some arbitrary but plausible values for the machine parameters.
+ // Let's see some arbitrary but plausible values for the machine parameters + // for the Mullapudi2016 Autoscheduler: // - // const int kParallelism = 32; - // const int kLastLevelCacheSize = 16 * 1024 * 1024; - // const int kBalance = 40; - // MachineParams machine_params(kParallelism, kLastLevelCacheSize, kBalance); + // autoscheduler=Mullapudi2016 + // autoscheduler.parallelism=32 + // autoscheduler.last_level_cache_size=16777216 + // autoscheduler.balance=40 // - // The arguments to MachineParams are the maximum level of parallelism - // available, the size of the last-level cache (in KB), and the ratio + // These are the maximum level of parallelism + // available, the size of the last-level cache (in bytes), and the ratio // between the cost of a miss at the last level cache and the cost // of arithmetic on the target architecture, in that order. - // Note that when using the auto-scheduler, no schedule should have - // been applied to the pipeline; otherwise, the auto-scheduler will - // throw an error. The current auto-scheduler cannot handle a + // Note that when using the autoscheduler, no schedule should have + // been applied to the pipeline; otherwise, the autoscheduler will + // throw an error. The current autoscheduler cannot handle a // partially-scheduled pipeline. // If HL_DEBUG_CODEGEN is set to 3 or greater, the schedule will be dumped @@ -131,12 +133,12 @@ class AutoScheduled : public Halide::Generator { // Halide C++ source, which is readily copy-pasteable back into // this very same source file with few modifications. Programmers // can use this as a starting schedule and iteratively improve the - // schedule. Note that the current auto-scheduler is only able to + // schedule. Note that the current autoscheduler is only able to // generate CPU schedules and only does tiling, simple vectorization // and parallelization. It doesn't deal with line buffering, storage // reordering, or factoring reductions. 
- // At the time of writing, the auto-scheduler will produce the + // At the time of writing, the autoscheduler will produce the // following schedule for the estimates and machine parameters // declared above when run on this pipeline: // @@ -211,7 +213,7 @@ class AutoScheduled : public Halide::Generator { } else { // This is where you would declare the schedule you have written by - // hand or paste the schedule generated by the auto-scheduler. + // hand or paste the schedule generated by the autoscheduler. // We will use a naive schedule here to compare the performance of // the autoschedule with a basic schedule. gray.compute_root(); From 2d907c45a7043361164bba532221d62b8e0fe0d9 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Fri, 15 Jul 2022 15:15:18 -0700 Subject: [PATCH 2/2] [vulkan phase0] Add ADTs for containers and memory allocation to runtime (#6829) * Cherry pick runtime internals as standalone commit (preparation work for Vulkan runtime) * Clang format/tidy fixes * Fix runtime test linkage and include paths to not include libHalide * Update test/runtime/CMakeLists.txt Fix typo mismatch for HALIDE_VERSION_PATCH Co-authored-by: Alex Reinking * Add compiler id guard to build options for runtime tests * Avoid building runtime tests on MSVC since Halide runtime headers are not MS compatible Remove CLANG warning flag for runtime test * Change runtime test compile definitions to be PRIVATE. Remove PUBLIC_EXPORTS from runtime test definition. * Add comment about GNU warnings for 'no-builtin-declaration-mismatch' * Change to debug(user_context) for debug messages where context is valid. Wrap verbose debugging with DEBUG_RUNTIME ifdef. Style pass based on review comments. * Add note explaining why we disable the internal runtime tests on MSVC. * Cleanup cmake logic for disabling runtime internal tests for MSVC and add a status message.
* Don't use strncpy for prepend since some implementations may insert a null char regardless of the length used * Workaround varying platform str implementations and handle termination directly. * Clang Tidy/Format pass Co-authored-by: Derek Gerstmann Co-authored-by: Alex Reinking --- Makefile | 15 +- src/runtime/internal/block_allocator.h | 478 ++++++++++++++++++++++++ src/runtime/internal/block_storage.h | 425 +++++++++++++++++++++ src/runtime/internal/linked_list.h | 333 +++++++++++++++++ src/runtime/internal/memory_arena.h | 310 +++++++++++++++ src/runtime/internal/memory_resources.h | 280 ++++++++++++++ src/runtime/internal/pointer_table.h | 366 ++++++++++++++++++ src/runtime/internal/region_allocator.h | 462 +++++++++++++++++++++++ src/runtime/internal/string_storage.h | 216 +++++++++++ src/runtime/internal/string_table.h | 217 +++++++++++ src/runtime/runtime_internal.h | 5 + test/CMakeLists.txt | 21 ++ test/runtime/CMakeLists.txt | 32 ++ test/runtime/block_allocator.cpp | 140 +++++++ test/runtime/block_storage.cpp | 148 ++++++++ test/runtime/common.h | 29 ++ test/runtime/linked_list.cpp | 91 +++++ test/runtime/memory_arena.cpp | 88 +++++ test/runtime/string_storage.cpp | 63 ++++ test/runtime/string_table.cpp | 44 +++ 20 files changed, 3762 insertions(+), 1 deletion(-) create mode 100644 src/runtime/internal/block_allocator.h create mode 100644 src/runtime/internal/block_storage.h create mode 100644 src/runtime/internal/linked_list.h create mode 100644 src/runtime/internal/memory_arena.h create mode 100644 src/runtime/internal/memory_resources.h create mode 100644 src/runtime/internal/pointer_table.h create mode 100644 src/runtime/internal/region_allocator.h create mode 100644 src/runtime/internal/string_storage.h create mode 100644 src/runtime/internal/string_table.h create mode 100644 test/runtime/CMakeLists.txt create mode 100644 test/runtime/block_allocator.cpp create mode 100644 test/runtime/block_storage.cpp create mode 100644 test/runtime/common.h 
create mode 100644 test/runtime/linked_list.cpp create mode 100644 test/runtime/memory_arena.cpp create mode 100644 test/runtime/string_storage.cpp create mode 100644 test/runtime/string_table.cpp diff --git a/Makefile b/Makefile index 97d481012909..640b59fa9a68 100644 --- a/Makefile +++ b/Makefile @@ -1144,6 +1144,7 @@ CORRECTNESS_TESTS = $(shell ls $(ROOT_DIR)/test/correctness/*.cpp) $(shell ls $( PERFORMANCE_TESTS = $(shell ls $(ROOT_DIR)/test/performance/*.cpp) ERROR_TESTS = $(shell ls $(ROOT_DIR)/test/error/*.cpp) WARNING_TESTS = $(shell ls $(ROOT_DIR)/test/warning/*.cpp) +RUNTIME_TESTS = $(shell ls $(ROOT_DIR)/test/runtime/*.cpp) GENERATOR_EXTERNAL_TESTS := $(shell ls $(ROOT_DIR)/test/generator/*test.cpp) GENERATOR_EXTERNAL_TEST_GENERATOR := $(shell ls $(ROOT_DIR)/test/generator/*_generator.cpp) TUTORIALS = $(filter-out %_generate.cpp, $(shell ls $(ROOT_DIR)/tutorial/*.cpp)) @@ -1153,6 +1154,7 @@ test_correctness: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=quiet_c test_performance: $(PERFORMANCE_TESTS:$(ROOT_DIR)/test/performance/%.cpp=performance_%) test_error: $(ERROR_TESTS:$(ROOT_DIR)/test/error/%.cpp=error_%) test_warning: $(WARNING_TESTS:$(ROOT_DIR)/test/warning/%.cpp=warning_%) +test_runtime: $(RUNTIME_TESTS:$(ROOT_DIR)/test/runtime/%.cpp=runtime_%) test_tutorial: $(TUTORIALS:$(ROOT_DIR)/tutorial/%.cpp=tutorial_%) test_valgrind: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=valgrind_%) test_avx512: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=avx512_%) @@ -1239,7 +1241,7 @@ test_generator: $(GENERATOR_AOT_TESTS) $(GENERATOR_AOTCPP_TESTS) $(GENERATOR_JIT $(FILTERS_DIR)/rungen_test $(FILTERS_DIR)/registration_test -ALL_TESTS = test_internal test_correctness test_error test_tutorial test_warning test_generator +ALL_TESTS = test_internal test_correctness test_error test_tutorial test_warning test_runtime test_generator # These targets perform timings of each test. 
For most tests this includes Halide JIT compile times, and run times. # For generator tests they time the compile time only. The times are recorded in CSV files. @@ -1260,6 +1262,7 @@ build_tests: $(CORRECTNESS_TESTS:$(ROOT_DIR)/test/correctness/%.cpp=$(BIN_DIR)/c $(PERFORMANCE_TESTS:$(ROOT_DIR)/test/performance/%.cpp=$(BIN_DIR)/performance_%) \ $(ERROR_TESTS:$(ROOT_DIR)/test/error/%.cpp=$(BIN_DIR)/error_%) \ $(WARNING_TESTS:$(ROOT_DIR)/test/warning/%.cpp=$(BIN_DIR)/warning_%) \ + $(RUNTIME_TESTS:$(ROOT_DIR)/test/runtime/%.cpp=$(BIN_DIR)/runtime_%) \ $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_aottest.cpp=$(BIN_DIR)/$(TARGET)/generator_aot_%) \ $(GENERATOR_EXTERNAL_TESTS:$(ROOT_DIR)/test/generator/%_jittest.cpp=$(BIN_DIR)/generator_jit_%) \ $(AUTO_SCHEDULE_TESTS:$(ROOT_DIR)/test/auto_schedule/%.cpp=$(BIN_DIR)/auto_schedule_%) @@ -1332,6 +1335,11 @@ $(BIN_DIR)/error_%: $(ROOT_DIR)/test/error/%.cpp $(BIN_DIR)/libHalide.$(SHARED_E $(BIN_DIR)/warning_%: $(ROOT_DIR)/test/warning/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ +# Runtime tests that test internals +RUNTIME_TESTS_CXXFLAGS = -fno-rtti -fno-exceptions -fno-threadsafe-statics -Wno-builtin-declaration-mismatch -DCOMPILING_HALIDE_RUNTIME -DCOMPILING_HALIDE_RUNTIME_TESTS +$(BIN_DIR)/runtime_%: $(ROOT_DIR)/test/runtime/%.cpp $(ROOT_DIR)/test/runtime/common.h + $(CXX) $(TEST_CXX_FLAGS) $(RUNTIME_TESTS_CXXFLAGS) -I$(ROOT_DIR)/test/runtime -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $< $(COMMON_LD_FLAGS) -o $@ + # Auto schedule tests that link against libHalide $(BIN_DIR)/auto_schedule_%: $(ROOT_DIR)/test/auto_schedule/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ @@ -1929,6 +1937,11 @@ warning_%: $(BIN_DIR)/warning_% cd $(TMP_DIR) ; $(CURDIR)/$< 2>&1 | egrep --q "^Warning" 
@-echo +runtime_%: $(BIN_DIR)/runtime_% + @-mkdir -p $(TMP_DIR) + cd $(TMP_DIR) ; $(CURDIR)/$< + @-echo + generator_jit_%: $(BIN_DIR)/generator_jit_% @-mkdir -p $(TMP_DIR) cd $(TMP_DIR) ; $(CURDIR)/$< diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h new file mode 100644 index 000000000000..f7f0247e441f --- /dev/null +++ b/src/runtime/internal/block_allocator.h @@ -0,0 +1,478 @@ +#ifndef HALIDE_RUNTIME_BLOCK_ALLOCATOR_H +#define HALIDE_RUNTIME_BLOCK_ALLOCATOR_H + +#include "linked_list.h" +#include "memory_resources.h" +#include "region_allocator.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// -- + +/** Allocator class interface for managing large contiguous blocks + * of memory, which are then sub-allocated into smaller regions of + * memory. This class only manages the address creation for the + * regions -- allocation callback functions are used to request the + * memory from the necessary system or API calls. This class is + * intended to be used inside of a higher level memory management + * class that provides thread safety, policy management and API + * integration for a specific runtime API (eg Vulkan, OpenCL, etc) + */ +class BlockAllocator { +public: + // disable copy constructors and assignment + BlockAllocator(const BlockAllocator &) = delete; + BlockAllocator &operator=(const BlockAllocator &) = delete; + + // disable non-factory based construction + BlockAllocator() = delete; + ~BlockAllocator() = delete; + + // Allocators for the different types of memory we need to allocate + struct MemoryAllocators { + SystemMemoryAllocatorFns system; + MemoryBlockAllocatorFns block; + MemoryRegionAllocatorFns region; + }; + + // Runtime configuration parameters to adjust the behaviour of the block allocator + struct Config { + size_t initial_capacity = 0; + size_t minimum_block_size = 0; + size_t maximum_block_size = 0; + size_t maximum_block_count = 0; + }; + + // Factory methods for creation / 
destruction + static BlockAllocator *create(void *user_context, const Config &config, const MemoryAllocators &allocators); + static void destroy(void *user_context, BlockAllocator *block_allocator); + + // Public interface methods + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + void reclaim(void *user_context, MemoryRegion *region); + bool collect(void *user_context); //< returns true if any blocks were removed + void release(void *user_context); + void destroy(void *user_context); + + // Access methods + const MemoryAllocators &current_allocators() const; + const Config &current_config() const; + const Config &default_config() const; + size_t block_count() const; + +private: + // Linked-list for storing the block resources + typedef LinkedList::EntryType BlockEntry; + + // Initializes a new instance + void initialize(void *user_context, const Config &config, const MemoryAllocators &allocators); + + // Reserves a region of memory using the given allocator for the given block resource, returns nullptr on failure + MemoryRegion *reserve_memory_region(void *user_context, RegionAllocator *allocator, const MemoryRequest &request); + + // Creates a new region allocator for the given block resource + RegionAllocator *create_region_allocator(void *user_context, BlockResource *block); + + // Destroys the given region allocator and all associated memory regions + void destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); + + // Reserves a block of memory for the requested size and returns the corresponding block entry, or nullptr on failure + BlockEntry *reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + + // Locates the "best-fit" block entry for the requested size, or nullptr if none was found + BlockEntry *find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + + // Creates a new block entry and adds it to the list + BlockEntry
*create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + + // Releases the block entry from being used, and makes it available for further allocations + void release_block_entry(void *user_context, BlockEntry *block_entry); + + // Destroys the block entry and removes it from the list + void destroy_block_entry(void *user_context, BlockEntry *block_entry); + + // Invokes the allocation callback to allocate memory for the block region + void alloc_memory_block(void *user_context, BlockResource *block); + + // Invokes the deallocation callback to free memory for the memory block + void free_memory_block(void *user_context, BlockResource *block); + + // Returns a constrained size for the requested size based on config parameters + size_t constrain_requested_size(size_t size) const; + + // Returns true if the given block is compatible with the given properties + bool is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const; + + Config config; + LinkedList block_list; + MemoryAllocators allocators; +}; + +BlockAllocator *BlockAllocator::create(void *user_context, const Config &cfg, const MemoryAllocators &allocators) { + halide_abort_if_false(user_context, allocators.system.allocate != nullptr); + BlockAllocator *result = reinterpret_cast( + allocators.system.allocate(user_context, sizeof(BlockAllocator))); + + if (result == nullptr) { + error(user_context) << "BlockAllocator: Failed to create instance! 
Out of memory!\n"; + return nullptr; + } + + result->initialize(user_context, cfg, allocators); + return result; +} + +void BlockAllocator::destroy(void *user_context, BlockAllocator *instance) { + halide_abort_if_false(user_context, instance != nullptr); + const MemoryAllocators &allocators = instance->allocators; + instance->destroy(user_context); + halide_abort_if_false(user_context, allocators.system.deallocate != nullptr); + allocators.system.deallocate(user_context, instance); +} + +void BlockAllocator::initialize(void *user_context, const Config &cfg, const MemoryAllocators &ma) { + config = cfg; + allocators = ma; + block_list.initialize(user_context, + sizeof(BlockResource), + config.initial_capacity, + allocators.system); +} + +MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &request) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Reserve (" + << "user_context=" << (void *)(user_context) << " " + << "offset=" << (uint32_t)request.offset << " " + << "size=" << (uint32_t)request.size << " " + << "dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ...\n"; +#endif + BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); + if (block_entry == nullptr) { + debug(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" + << (int32_t)(request.size) << " bytes)!\n"; + return nullptr; + } + + BlockResource *block = static_cast(block_entry->value); + halide_abort_if_false(user_context, block != nullptr); + halide_abort_if_false(user_context, block->allocator != nullptr); + + MemoryRegion *result = reserve_memory_region(user_context, block->allocator, request); + if (result == nullptr) { + + // Unable to reserve region in an existing block ... create a new block and try again. + size_t actual_size = constrain_requested_size(request.size); + block_entry = create_block_entry(user_context, request.properties, actual_size, request.dedicated); + if (block_entry == nullptr) { + debug(user_context) << "BlockAllocator: Out of memory! 
Failed to allocate empty block of size (" + << (int32_t)(actual_size) << " bytes)!\n"; + return nullptr; + } + + block = static_cast(block_entry->value); + if (block->allocator == nullptr) { + block->allocator = create_region_allocator(user_context, block); + } + + result = reserve_memory_region(user_context, block->allocator, request); + } + return result; +} + +void BlockAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { + halide_abort_if_false(user_context, memory_region != nullptr); + RegionAllocator *allocator = RegionAllocator::find_allocator(user_context, memory_region); + if (allocator == nullptr) { return; } + allocator->reclaim(user_context, memory_region); +} + +bool BlockAllocator::collect(void *user_context) { + bool result = false; + BlockEntry *block_entry = block_list.back(); + while (block_entry != nullptr) { + BlockEntry *prev_entry = block_entry->prev_ptr; + + const BlockResource *block = static_cast(block_entry->value); + if (block->allocator == nullptr) { + block_entry = prev_entry; + continue; + } + + block->allocator->collect(user_context); + if (block->reserved == 0) { + destroy_block_entry(user_context, block_entry); + result = true; + } + + block_entry = prev_entry; + } + return result; +} + +void BlockAllocator::release(void *user_context) { + BlockEntry *block_entry = block_list.back(); + while (block_entry != nullptr) { + BlockEntry *prev_entry = block_entry->prev_ptr; + release_block_entry(user_context, block_entry); + block_entry = prev_entry; + } +} + +void BlockAllocator::destroy(void *user_context) { + BlockEntry *block_entry = block_list.back(); + while (block_entry != nullptr) { + BlockEntry *prev_entry = block_entry->prev_ptr; + destroy_block_entry(user_context, block_entry); + block_entry = prev_entry; + } +} + +MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAllocator *allocator, const MemoryRequest &request) { + MemoryRegion *result = allocator->reserve(user_context, request); 
+ if (result == nullptr) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Failed to allocate region of size (" + << (int32_t)(request.size) << " bytes)!\n"; +#endif + // allocator has enough free space, but not enough contiguous space + // -- collect and try to reallocate + if (allocator->collect(user_context)) { + result = allocator->reserve(user_context, request); + } + } + return result; +} + +BlockAllocator::BlockEntry * +BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { + BlockEntry *block_entry = nullptr; + for (block_entry = block_list.front(); block_entry != nullptr; block_entry = block_entry->next_ptr) { + + const BlockResource *block = static_cast(block_entry->value); + if (!is_compatible_block(block, properties)) { + continue; + } + + // skip blocks that can't be dedicated to a single allocation + if (dedicated && (block->reserved > 0)) { + continue; + } + + // skip dedicated blocks that are already allocated + if (block->memory.dedicated && (block->reserved > 0)) { + continue; + } + + size_t available = (block->memory.size - block->reserved); + if (available >= size) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: find_block_entry (FOUND) (" + << "user_context=" << (void *)(user_context) << " " + << "block_entry=" << (void *)(block_entry) << " " + << "size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; +#endif + break; + } + } + + return block_entry; +} + +BlockAllocator::BlockEntry * +BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { + BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); + if (block_entry == nullptr) { + size_t actual_size = constrain_requested_size(size); + block_entry = create_block_entry(user_context, properties, actual_size, dedicated); + } + + if (block_entry) { + BlockResource *block = static_cast(block_entry->value); + if (block->allocator == nullptr) { + block->allocator = create_region_allocator(user_context, block); + } + } + return block_entry; +} + +RegionAllocator * +BlockAllocator::create_region_allocator(void *user_context, BlockResource *block) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Creating region allocator (" + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)(block) << ")...\n"; +#endif + halide_abort_if_false(user_context, block != nullptr); + RegionAllocator *region_allocator = RegionAllocator::create( + user_context, block, {allocators.system, allocators.region}); + + if (region_allocator == nullptr) { + error(user_context) << "BlockAllocator: Failed to create new region allocator!\n"; + return nullptr; + } + + return region_allocator; +} + +void BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator *region_allocator) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Destroying region allocator (" + << "user_context=" << (void *)(user_context) << " " + << "region_allocator=" << (void *)(region_allocator) << ")...\n"; +#endif + if (region_allocator == nullptr) { return; 
} + region_allocator->destroy(user_context); + RegionAllocator::destroy(user_context, region_allocator); +} + +BlockAllocator::BlockEntry * +BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { + if (config.maximum_block_count && (block_count() >= config.maximum_block_count)) { + debug(user_context) << "BlockAllocator: No free blocks found! Maximum block count reached (" + << (int32_t)(config.maximum_block_count) << ")!\n"; + return nullptr; + } + + BlockEntry *block_entry = block_list.append(user_context); + if (block_entry == nullptr) { + debug(user_context) << "BlockAllocator: Failed to allocate new block entry!\n"; + return nullptr; + } + +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Creating block entry (" + << "block_entry=" << (void *)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << " " + << "allocator=" << (void *)(allocators.block.allocate) << ")...\n"; +#endif + + BlockResource *block = static_cast(block_entry->value); + block->memory.size = size; + block->memory.properties = properties; + block->memory.dedicated = dedicated; + block->reserved = 0; + block->allocator = create_region_allocator(user_context, block); + alloc_memory_block(user_context, block); + return block_entry; +} + +void BlockAllocator::release_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Releasing block entry (" + << "block_entry=" << (void *)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << ")...\n"; +#endif + BlockResource *block = static_cast(block_entry->value); + if (block->allocator) { + block->allocator->release(user_context); + } +} + +void BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::BlockEntry *block_entry) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Destroying block entry (" + << "block_entry=" << (void 
*)(block_entry) << " " + << "block=" << (void *)(block_entry->value) << " " + << "deallocator=" << (void *)(allocators.block.deallocate) << ")...\n"; +#endif + BlockResource *block = static_cast(block_entry->value); + if (block->allocator) { + destroy_region_allocator(user_context, block->allocator); + block->allocator = nullptr; + } + free_memory_block(user_context, block); + block_list.remove(user_context, block_entry); +} + +void BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; +#endif + halide_abort_if_false(user_context, allocators.block.allocate != nullptr); + MemoryBlock *memory_block = &(block->memory); + allocators.block.allocate(user_context, memory_block); + block->reserved = 0; +} + +void BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; +#endif + halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); + MemoryBlock *memory_block = &(block->memory); + allocators.block.deallocate(user_context, memory_block); + block->reserved = 0; + block->memory.size = 0; +} + +size_t BlockAllocator::constrain_requested_size(size_t size) const { + size_t actual_size = size; + if (config.minimum_block_size) { + actual_size = ((actual_size < config.minimum_block_size) ? + config.minimum_block_size : + actual_size); + } + if (config.maximum_block_size) { + actual_size = ((actual_size > config.maximum_block_size) ? 
+ config.maximum_block_size : + actual_size); + } + return actual_size; +} + +bool BlockAllocator::is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const { + if (properties.caching != MemoryCaching::DefaultCaching) { + if (properties.caching != block->memory.properties.caching) { + return false; + } + } + + if (properties.visibility != MemoryVisibility::DefaultVisibility) { + if (properties.visibility != block->memory.properties.visibility) { + return false; + } + } + + if (properties.usage != MemoryUsage::DefaultUsage) { + if (properties.usage != block->memory.properties.usage) { + return false; + } + } + + return true; +} + +const BlockAllocator::MemoryAllocators &BlockAllocator::current_allocators() const { + return allocators; +} + +const BlockAllocator::Config &BlockAllocator::current_config() const { + return config; +} + +const BlockAllocator::Config &BlockAllocator::default_config() const { + static Config result; + return result; +} + +size_t BlockAllocator::block_count() const { + return block_list.size(); +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_BLOCK_ALLOCATOR_H diff --git a/src/runtime/internal/block_storage.h b/src/runtime/internal/block_storage.h new file mode 100644 index 000000000000..648f10a84846 --- /dev/null +++ b/src/runtime/internal/block_storage.h @@ -0,0 +1,425 @@ +#ifndef HALIDE_RUNTIME_BLOCK_STORAGE_H +#define HALIDE_RUNTIME_BLOCK_STORAGE_H + +#include "memory_resources.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Dynamically resizable array for block storage (eg plain old data) +// -- No usage of constructors/destructors for value type +// -- Assumes all elements stored are uniformly the same fixed size +// -- Allocations are done in blocks of a fixed size +// -- Implementation uses memcpy/memmove for copying +// -- Customizable allocator ... 
default uses NativeSystemAllocator +class BlockStorage { +public: + static constexpr size_t default_capacity = 32; // smallish + + // Configurable parameters + struct Config { + uint32_t entry_size = 1; // bytes per entry + uint32_t block_size = 32; // bytes per each allocation block + uint32_t minimum_capacity = default_capacity; + }; + + BlockStorage(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &sma = default_allocator()); + BlockStorage(const BlockStorage &other); + ~BlockStorage(); + + void initialize(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &sma = default_allocator()); + + BlockStorage &operator=(const BlockStorage &other); + bool operator==(const BlockStorage &other) const; + bool operator!=(const BlockStorage &other) const; + + void reserve(void *user_context, size_t capacity, bool free_existing = false); + void resize(void *user_context, size_t entry_count, bool realloc = true); + + void assign(void *user_context, size_t index, const void *entry_ptr); + void insert(void *user_context, size_t index, const void *entry_ptr); + void prepend(void *user_context, const void *entry_ptr); + void append(void *user_context, const void *entry_ptr); + void remove(void *user_context, size_t index); + + void fill(void *user_context, const void *array, size_t array_size); + void insert(void *user_context, size_t index, const void *array, size_t array_size); + void replace(void *user_context, size_t index, const void *array, size_t array_size); + void prepend(void *user_context, const void *array, size_t array_size); + void append(void *user_context, const void *array, size_t array_size); + void remove(void *user_context, size_t index, size_t entry_count); + + void pop_front(void *user_context); + void pop_back(void *user_context); + void shrink_to_fit(void *user_context); + void clear(void *user_context); + void destroy(void *user_context); + + bool empty() const; + size_t stride() const; + size_t size() const; + + void 
*operator[](size_t index); ///< logical entry index (returns ptr = data() + (index * stride()) + const void *operator[](size_t index) const; + + void *data(); + void *front(); + void *back(); + + const void *data() const; + const void *front() const; + const void *back() const; + + const Config ¤t_config() const; + static const Config &default_config(); + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + void allocate(void *user_context, size_t capacity); + + void *ptr = nullptr; + size_t count = 0; + size_t capacity = 0; + Config config; + SystemMemoryAllocatorFns allocator; +}; + +BlockStorage::BlockStorage(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &sma) + : config(cfg), allocator(sma) { + halide_abort_if_false(user_context, config.entry_size != 0); + halide_abort_if_false(user_context, allocator.allocate != nullptr); + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + if (config.minimum_capacity) { + reserve(user_context, config.minimum_capacity); + } +} + +BlockStorage::BlockStorage(const BlockStorage &other) + : BlockStorage(nullptr, other.config, other.allocator) { + if (other.count) { + resize(nullptr, other.count); + memcpy(this->ptr, other.ptr, count * config.entry_size); + } +} + +BlockStorage::~BlockStorage() { + destroy(nullptr); +} + +void BlockStorage::destroy(void *user_context) { + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + if (ptr != nullptr) { + allocator.deallocate(user_context, ptr); + } + capacity = count = 0; + ptr = nullptr; +} + +void BlockStorage::initialize(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &sma) { + allocator = sma; + config = cfg; + capacity = count = 0; + ptr = nullptr; + if (config.minimum_capacity) { + reserve(user_context, config.minimum_capacity); + } +} + +BlockStorage &BlockStorage::operator=(const BlockStorage &other) { + if (&other != 
this) { + config = other.config; + resize(nullptr, other.count); + if (count != 0 && other.ptr != nullptr) { + memcpy(ptr, other.ptr, count * config.entry_size); + } + } + return *this; +} + +bool BlockStorage::operator==(const BlockStorage &other) const { + if (config.entry_size != other.config.entry_size) { return false; } + if (count != other.count) { return false; } + return memcmp(this->ptr, other.ptr, this->size() * config.entry_size) == 0; +} + +bool BlockStorage::operator!=(const BlockStorage &other) const { + return !(*this == other); +} + +void BlockStorage::fill(void *user_context, const void *array, size_t array_size) { + if (array_size != 0) { + resize(user_context, array_size); + memcpy(this->ptr, array, array_size * config.entry_size); + count = array_size; + } +} + +void BlockStorage::assign(void *user_context, size_t index, const void *entry_ptr) { + replace(user_context, index, entry_ptr, 1); +} + +void BlockStorage::prepend(void *user_context, const void *entry_ptr) { + insert(user_context, 0, entry_ptr, 1); +} + +void BlockStorage::append(void *user_context, const void *entry_ptr) { + append(user_context, entry_ptr, 1); +} + +void BlockStorage::pop_front(void *user_context) { + halide_debug_assert(user_context, count > 0); + remove(user_context, 0); +} + +void BlockStorage::pop_back(void *user_context) { + halide_debug_assert(user_context, count > 0); + resize(user_context, size() - 1); +} + +void BlockStorage::clear(void *user_context) { + resize(user_context, 0); +} + +void BlockStorage::reserve(void *user_context, size_t new_capacity, bool free_existing) { + new_capacity = max(new_capacity, count); + + if ((new_capacity < capacity) && !free_existing) { + new_capacity = capacity; + } + + allocate(user_context, new_capacity); +} + +void BlockStorage::resize(void *user_context, size_t entry_count, bool realloc) { + size_t current_size = capacity; + size_t requested_size = entry_count; + size_t minimum_size = config.minimum_capacity; + size_t 
actual_size = current_size; + count = requested_size; + + // increase capacity upto 1.5x existing (or at least min_capacity) + if (requested_size > current_size) { + actual_size = max(requested_size, max(current_size * 3 / 2, minimum_size)); + } else if (!realloc) { + return; + } + +#if DEBUG + debug(user_context) << "BlockStorage: Resize (" + << "requested_size=" << (int32_t)requested_size << " " + << "current_size=" << (int32_t)current_size << " " + << "minimum_size=" << (int32_t)minimum_size << " " + << "actual_size=" << (int32_t)actual_size << " " + << "entry_size=" << (int32_t)config.entry_size << " " + << "realloc=" << (realloc ? "true" : "false") << ")...\n"; +#endif + + allocate(user_context, actual_size); +} + +void BlockStorage::shrink_to_fit(void *user_context) { + if (capacity > count) { + void *new_ptr = nullptr; + if (count > 0) { + size_t actual_bytes = count * config.entry_size; + new_ptr = allocator.allocate(user_context, actual_bytes); + memcpy(new_ptr, ptr, actual_bytes); + } + allocator.deallocate(user_context, ptr); + capacity = count; + ptr = new_ptr; + } +} + +void BlockStorage::insert(void *user_context, size_t index, const void *entry_ptr) { + insert(user_context, index, entry_ptr, 1); +} + +void BlockStorage::remove(void *user_context, size_t index) { + remove(user_context, index, 1); +} + +void BlockStorage::remove(void *user_context, size_t index, size_t entry_count) { + halide_debug_assert(user_context, index < count); + const size_t last_index = size(); + if (index < (last_index - entry_count)) { + size_t dst_offset = index * config.entry_size; + size_t src_offset = (index + entry_count) * config.entry_size; + size_t bytes = (last_index - index - entry_count) * config.entry_size; + +#if DEBUG + debug(0) << "BlockStorage: Remove (" + << "index=" << (int32_t)index << " " + << "entry_count=" << (int32_t)entry_count << " " + << "entry_size=" << (int32_t)config.entry_size << " " + << "last_index=" << (int32_t)last_index << " " + << 
"src_offset=" << (int32_t)src_offset << " " + << "dst_offset=" << (int32_t)dst_offset << " " + << "bytes=" << (int32_t)bytes << ")...\n"; +#endif + void *dst_ptr = offset_address(ptr, dst_offset); + void *src_ptr = offset_address(ptr, src_offset); + memmove(dst_ptr, src_ptr, bytes); + } + resize(user_context, last_index - entry_count); +} + +void BlockStorage::replace(void *user_context, size_t index, const void *array, size_t array_size) { + halide_debug_assert(user_context, index < count); + size_t offset = index * config.entry_size; + size_t remaining = count - index; + +#if DEBUG + debug(0) << "BlockStorage: Replace (" + << "index=" << (int32_t)index << " " + << "array_size=" << (int32_t)array_size << " " + << "entry_size=" << (int32_t)config.entry_size << " " + << "offset=" << (int32_t)offset << " " + << "remaining=" << (int32_t)remaining << " " + << "capacity=" << (int32_t)capacity << ")...\n"; +#endif + + halide_debug_assert(user_context, remaining > 0); + size_t copy_count = min(remaining, array_size); + void *dst_ptr = offset_address(ptr, offset); + memcpy(dst_ptr, array, copy_count * config.entry_size); + count = max(count, index + copy_count); +} + +void BlockStorage::insert(void *user_context, size_t index, const void *array, size_t array_size) { + halide_debug_assert(user_context, index <= count); + const size_t last_index = size(); + resize(user_context, last_index + array_size); + if (index < last_index) { + size_t src_offset = index * config.entry_size; + size_t dst_offset = (index + array_size) * config.entry_size; + size_t bytes = (last_index - index) * config.entry_size; + void *src_ptr = offset_address(ptr, src_offset); + void *dst_ptr = offset_address(ptr, dst_offset); + memmove(dst_ptr, src_ptr, bytes); + } + replace(user_context, index, array, array_size); +} + +void BlockStorage::prepend(void *user_context, const void *array, size_t array_size) { + insert(user_context, 0, array, array_size); +} + +void BlockStorage::append(void 
*user_context, const void *array, size_t array_size) { + const size_t last_index = size(); + insert(user_context, last_index, array, array_size); +} + +bool BlockStorage::empty() const { + return count == 0; +} + +size_t BlockStorage::size() const { + return count; +} + +size_t BlockStorage::stride() const { + return config.entry_size; +} + +void *BlockStorage::operator[](size_t index) { + halide_debug_assert(nullptr, index < capacity); + return offset_address(ptr, index * config.entry_size); +} + +const void *BlockStorage::operator[](size_t index) const { + halide_debug_assert(nullptr, index < capacity); + return offset_address(ptr, index * config.entry_size); +} + +void *BlockStorage::data() { + return ptr; +} + +void *BlockStorage::front() { + halide_debug_assert(nullptr, count > 0); + return ptr; +} + +void *BlockStorage::back() { + halide_debug_assert(nullptr, count > 0); + size_t index = count - 1; + return offset_address(ptr, index * config.entry_size); +} + +const void *BlockStorage::data() const { + return ptr; +} + +const void *BlockStorage::front() const { + halide_debug_assert(nullptr, count > 0); + return ptr; +} + +const void *BlockStorage::back() const { + halide_debug_assert(nullptr, count > 0); + size_t index = count - 1; + return offset_address(ptr, index * config.entry_size); +} + +void BlockStorage::allocate(void *user_context, size_t new_capacity) { + if (new_capacity != capacity) { + halide_abort_if_false(user_context, allocator.allocate != nullptr); + size_t requested_bytes = new_capacity * config.entry_size; + size_t block_size = max(config.block_size, config.entry_size); + size_t block_count = (requested_bytes / block_size); + block_count += (requested_bytes % block_size) ? 
1 : 0; + size_t alloc_size = block_count * block_size; +#if DEBUG + debug(0) << "BlockStorage: Allocating (" + << "requested_bytes=" << (int32_t)requested_bytes << " " + << "block_size=" << (int32_t)block_size << " " + << "block_count=" << (int32_t)block_count << " " + << "alloc_size=" << (int32_t)alloc_size << ") ...\n"; +#endif + void *new_ptr = alloc_size ? allocator.allocate(user_context, alloc_size) : nullptr; + if (count != 0 && ptr != nullptr && new_ptr != nullptr) { + memcpy(new_ptr, ptr, count * config.entry_size); + } + if (ptr != nullptr) { + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + allocator.deallocate(user_context, ptr); + } + capacity = new_capacity; + ptr = new_ptr; + } +} + +const SystemMemoryAllocatorFns & +BlockStorage::current_allocator() const { + return this->allocator; +} + +const BlockStorage::Config & +BlockStorage::default_config() { + static Config default_cfg; + return default_cfg; +} + +const BlockStorage::Config & +BlockStorage::current_config() const { + return this->config; +} + +const SystemMemoryAllocatorFns & +BlockStorage::default_allocator() { + static SystemMemoryAllocatorFns native_allocator = { + native_system_malloc, native_system_free}; + return native_allocator; +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_BLOCK_STORAGE_H diff --git a/src/runtime/internal/linked_list.h b/src/runtime/internal/linked_list.h new file mode 100644 index 000000000000..dea22c13285e --- /dev/null +++ b/src/runtime/internal/linked_list.h @@ -0,0 +1,333 @@ +#ifndef HALIDE_RUNTIME_LINKED_LIST_H +#define HALIDE_RUNTIME_LINKED_LIST_H + +#include "memory_arena.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Doubly linked list container +// -- Implemented using MemoryArena for allocation +class LinkedList { +public: + // Disable copy support + LinkedList(const LinkedList &) = delete; + LinkedList &operator=(const LinkedList &) = 
delete; + + // Default initial capacity + static constexpr uint32_t default_capacity = uint32_t(32); // smallish + + // List entry + struct EntryType { + void *value = nullptr; + EntryType *prev_ptr = nullptr; + EntryType *next_ptr = nullptr; + }; + + LinkedList(void *user_context, uint32_t entry_size, uint32_t capacity = default_capacity, + const SystemMemoryAllocatorFns &allocator = default_allocator()); + ~LinkedList(); + + void initialize(void *user_context, uint32_t entry_size, uint32_t capacity = default_capacity, + const SystemMemoryAllocatorFns &allocator = default_allocator()); + + EntryType *front(); + EntryType *back(); + + const EntryType *front() const; + const EntryType *back() const; + + EntryType *prepend(void *user_context); + EntryType *prepend(void *user_context, const void *value); + + EntryType *append(void *user_context); + EntryType *append(void *user_context, const void *value); + + void pop_front(void *user_context); + void pop_back(void *user_context); + + EntryType *insert_before(void *user_context, EntryType *entry_ptr); + EntryType *insert_before(void *user_context, EntryType *entry_ptr, const void *value); + + EntryType *insert_after(void *user_context, EntryType *entry_ptr); + EntryType *insert_after(void *user_context, EntryType *entry_ptr, const void *value); + + void remove(void *user_context, EntryType *entry_ptr); + void clear(void *user_context); + void destroy(void *user_context); + + size_t size() const; + bool empty() const; + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + EntryType *reserve(void *user_context); + void reclaim(void *user_context, EntryType *entry_ptr); + + MemoryArena *link_arena = nullptr; + MemoryArena *data_arena = nullptr; + EntryType *front_ptr = nullptr; + EntryType *back_ptr = nullptr; + size_t entry_count = 0; +}; + +LinkedList::LinkedList(void *user_context, uint32_t entry_size, uint32_t capacity, + const 
SystemMemoryAllocatorFns &sma) { + uint32_t arena_capacity = max(capacity, MemoryArena::default_capacity); + link_arena = MemoryArena::create(user_context, {sizeof(EntryType), arena_capacity, 0}, sma); + data_arena = MemoryArena::create(user_context, {entry_size, arena_capacity, 0}, sma); + front_ptr = nullptr; + back_ptr = nullptr; + entry_count = 0; +} + +LinkedList::~LinkedList() { + destroy(nullptr); +} + +void LinkedList::initialize(void *user_context, uint32_t entry_size, uint32_t capacity, + const SystemMemoryAllocatorFns &sma) { + uint32_t arena_capacity = max(capacity, MemoryArena::default_capacity); + link_arena = MemoryArena::create(user_context, {sizeof(EntryType), arena_capacity, 0}, sma); + data_arena = MemoryArena::create(user_context, {entry_size, arena_capacity, 0}, sma); + front_ptr = nullptr; + back_ptr = nullptr; + entry_count = 0; +} + +void LinkedList::destroy(void *user_context) { + clear(nullptr); + if (link_arena) { MemoryArena::destroy(nullptr, link_arena); } + if (data_arena) { MemoryArena::destroy(nullptr, data_arena); } + link_arena = nullptr; + data_arena = nullptr; + front_ptr = nullptr; + back_ptr = nullptr; + entry_count = 0; +} + +typename LinkedList::EntryType *LinkedList::front() { + return front_ptr; +} + +typename LinkedList::EntryType *LinkedList::back() { + return back_ptr; +} + +const typename LinkedList::EntryType *LinkedList::front() const { + return front_ptr; +} + +const typename LinkedList::EntryType *LinkedList::back() const { + return back_ptr; +} + +typename LinkedList::EntryType * +LinkedList::prepend(void *user_context) { + EntryType *entry_ptr = reserve(user_context); + if (empty()) { + front_ptr = entry_ptr; + back_ptr = entry_ptr; + entry_count = 1; + } else { + entry_ptr->next_ptr = front_ptr; + front_ptr->prev_ptr = entry_ptr; + front_ptr = entry_ptr; + ++entry_count; + } + return entry_ptr; +} + +typename LinkedList::EntryType * +LinkedList::append(void *user_context) { + EntryType *entry_ptr = 
reserve(user_context); + if (empty()) { + front_ptr = entry_ptr; + back_ptr = entry_ptr; + entry_count = 1; + } else { + entry_ptr->prev_ptr = back_ptr; + back_ptr->next_ptr = entry_ptr; + back_ptr = entry_ptr; + ++entry_count; + } + return entry_ptr; +} + +typename LinkedList::EntryType * +LinkedList::prepend(void *user_context, const void *value) { + EntryType *entry_ptr = prepend(user_context); + memcpy(entry_ptr->value, value, data_arena->current_config().entry_size); + return entry_ptr; +} + +typename LinkedList::EntryType * +LinkedList::append(void *user_context, const void *value) { + EntryType *entry_ptr = append(user_context); + memcpy(entry_ptr->value, value, data_arena->current_config().entry_size); + return entry_ptr; +} + +void LinkedList::pop_front(void *user_context) { + halide_abort_if_false(user_context, (entry_count > 0)); + EntryType *remove_ptr = front_ptr; + EntryType *next_ptr = remove_ptr->next_ptr; + if (next_ptr != nullptr) { + next_ptr->prev_ptr = nullptr; + } + front_ptr = next_ptr; + reclaim(user_context, remove_ptr); + --entry_count; +} + +void LinkedList::pop_back(void *user_context) { + halide_abort_if_false(user_context, (entry_count > 0)); + EntryType *remove_ptr = back_ptr; + EntryType *prev_ptr = remove_ptr->prev_ptr; + if (prev_ptr != nullptr) { + prev_ptr->next_ptr = nullptr; + } + back_ptr = prev_ptr; + reclaim(user_context, remove_ptr); + --entry_count; +} + +void LinkedList::clear(void *user_context) { + if (empty() == false) { + EntryType *remove_ptr = back_ptr; + while (remove_ptr != nullptr) { + EntryType *prev_ptr = remove_ptr->prev_ptr; + reclaim(user_context, remove_ptr); + remove_ptr = prev_ptr; + } + front_ptr = nullptr; + back_ptr = nullptr; + entry_count = 0; + } +} + +void LinkedList::remove(void *user_context, EntryType *entry_ptr) { + halide_abort_if_false(user_context, (entry_ptr != nullptr)); + halide_abort_if_false(user_context, (entry_count > 0)); + + if (entry_ptr->prev_ptr != nullptr) { + 
entry_ptr->prev_ptr->next_ptr = entry_ptr->next_ptr; + } else { + halide_abort_if_false(user_context, (front_ptr == entry_ptr)); + front_ptr = entry_ptr->next_ptr; + } + + if (entry_ptr->next_ptr != nullptr) { + entry_ptr->next_ptr->prev_ptr = entry_ptr->prev_ptr; + } else { + halide_abort_if_false(user_context, (back_ptr == entry_ptr)); + back_ptr = entry_ptr->prev_ptr; + } + + reclaim(user_context, entry_ptr); + --entry_count; +} + +typename LinkedList::EntryType * +LinkedList::insert_before(void *user_context, EntryType *entry_ptr) { + if (entry_ptr != nullptr) { + EntryType *prev_ptr = entry_ptr->prev_ptr; + EntryType *new_ptr = reserve(user_context); + new_ptr->prev_ptr = prev_ptr; + new_ptr->next_ptr = entry_ptr; + entry_ptr->prev_ptr = new_ptr; + if (prev_ptr != nullptr) { + prev_ptr->next_ptr = new_ptr; + } else { + halide_abort_if_false(user_context, (front_ptr == entry_ptr)); + front_ptr = new_ptr; + } + ++entry_count; + return new_ptr; + } else { + return append(user_context); + } +} + +typename LinkedList::EntryType * +LinkedList::insert_after(void *user_context, EntryType *entry_ptr) { + if (entry_ptr != nullptr) { + EntryType *next_ptr = entry_ptr->next_ptr; + EntryType *new_ptr = reserve(user_context); + new_ptr->next_ptr = next_ptr; + new_ptr->prev_ptr = entry_ptr; + entry_ptr->next_ptr = new_ptr; + if (next_ptr != nullptr) { + next_ptr->prev_ptr = new_ptr; + } else { + halide_abort_if_false(user_context, (back_ptr == entry_ptr)); + back_ptr = new_ptr; + } + ++entry_count; + return new_ptr; + } else { + return prepend(user_context); + } +} + +typename LinkedList::EntryType * +LinkedList::insert_before(void *user_context, EntryType *entry_ptr, const void *value) { + EntryType *new_ptr = insert_before(user_context, entry_ptr); + memcpy(new_ptr->value, value, data_arena->current_config().entry_size); + return new_ptr; +} + +typename LinkedList::EntryType * +LinkedList::insert_after(void *user_context, EntryType *entry_ptr, const void *value) { + 
EntryType *new_ptr = insert_after(user_context, entry_ptr); + memcpy(new_ptr->value, value, data_arena->current_config().entry_size); + return new_ptr; +} + +size_t LinkedList::size() const { + return entry_count; +} + +bool LinkedList::empty() const { + return entry_count == 0; +} + +const SystemMemoryAllocatorFns & +LinkedList::current_allocator() const { + return link_arena->current_allocator(); +} + +const SystemMemoryAllocatorFns & +LinkedList::default_allocator() { + return MemoryArena::default_allocator(); +} + +typename LinkedList::EntryType * +LinkedList::reserve(void *user_context) { + EntryType *entry_ptr = static_cast( + link_arena->reserve(user_context, true)); + entry_ptr->value = data_arena->reserve(user_context, true); + entry_ptr->next_ptr = nullptr; + entry_ptr->prev_ptr = nullptr; + return entry_ptr; +} + +void LinkedList::reclaim(void *user_context, EntryType *entry_ptr) { + void *value_ptr = entry_ptr->value; + entry_ptr->value = nullptr; + entry_ptr->next_ptr = nullptr; + entry_ptr->prev_ptr = nullptr; + data_arena->reclaim(user_context, value_ptr); + link_arena->reclaim(user_context, entry_ptr); +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_LINKED_LIST_H diff --git a/src/runtime/internal/memory_arena.h b/src/runtime/internal/memory_arena.h new file mode 100644 index 000000000000..27c3d871dccf --- /dev/null +++ b/src/runtime/internal/memory_arena.h @@ -0,0 +1,310 @@ +#ifndef HALIDE_RUNTIME_MEMORY_ARENA_H +#define HALIDE_RUNTIME_MEMORY_ARENA_H + +#include "block_storage.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// -- +// Memory Arena class for region based allocations and caching of same-type data +// -- Implementation uses block_storage, and internally manages lists of allocated entries +// -- Customizable allocator (defaults to BlockStorage::default_allocator()) +// -- Not thread safe ... 
locking must be done by client +// +class MemoryArena { +public: + // Disable copy constructors and assignment + MemoryArena(const MemoryArena &) = delete; + MemoryArena &operator=(const MemoryArena &) = delete; + + // Default initial capacity + static constexpr uint32_t default_capacity = uint32_t(32); // smallish + + // Configurable parameters + struct Config { + uint32_t entry_size = 1; + uint32_t minimum_block_capacity = default_capacity; + uint32_t maximum_block_count = 0; + }; + + MemoryArena(void *user_context, const Config &config = default_config(), + const SystemMemoryAllocatorFns &allocator = default_allocator()); + + ~MemoryArena(); + + // Factory methods for creation / destruction + static MemoryArena *create(void *user_context, const Config &config, const SystemMemoryAllocatorFns &allocator = default_allocator()); + static void destroy(void *user_context, MemoryArena *arena); + + // Initialize a newly created instance + void initialize(void *user_context, const Config &config, + const SystemMemoryAllocatorFns &allocator = default_allocator()); + + // Public interface methods + void *reserve(void *user_context, bool initialize = false); + void reclaim(void *user_context, void *ptr); + bool collect(void *user_context); //< returns true if any blocks were removed + void destroy(void *user_context); + + // Access methods + const Config ¤t_config() const; + static const Config &default_config(); + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + // Sentinal invalid entry value + static const uint32_t invalid_entry = uint32_t(-1); + + // Each block contains: + // - an array of entries + // - an array of indices (for the free list) + // - an array of status flags (indicating usage) + // - free index points to next available entry for the block (or invalid_entry if block is full) + struct Block { + void *entries = nullptr; + uint32_t *indices = nullptr; + AllocationStatus 
*status = nullptr; + uint32_t capacity = 0; + uint32_t free_index = 0; + }; + + Block *create_block(void *user_context); + bool collect_block(void *user_context, Block *block); //< returns true if any blocks were removed + void destroy_block(void *user_context, Block *block); + Block *lookup_block(void *user_context, uint32_t index); + + void *create_entry(void *user_context, Block *block, uint32_t index); + void destroy_entry(void *user_context, Block *block, uint32_t index); + void *lookup_entry(void *user_context, Block *block, uint32_t index); + + Config config; + BlockStorage blocks; +}; + +MemoryArena::MemoryArena(void *user_context, + const Config &cfg, + const SystemMemoryAllocatorFns &alloc) + : config(cfg), + blocks(user_context, {sizeof(MemoryArena::Block), 32, 32}, alloc) { + halide_debug_assert(user_context, config.minimum_block_capacity > 1); +} + +MemoryArena::~MemoryArena() { + destroy(nullptr); +} + +MemoryArena *MemoryArena::create(void *user_context, const Config &cfg, const SystemMemoryAllocatorFns &system_allocator) { + halide_abort_if_false(user_context, system_allocator.allocate != nullptr); + MemoryArena *result = reinterpret_cast( + system_allocator.allocate(user_context, sizeof(MemoryArena))); + + if (result == nullptr) { + halide_error(user_context, "MemoryArena: Failed to create instance! 
Out of memory!\n"); + return nullptr; + } + + result->initialize(user_context, cfg, system_allocator); + return result; +} + +void MemoryArena::destroy(void *user_context, MemoryArena *instance) { + halide_abort_if_false(user_context, instance != nullptr); + const SystemMemoryAllocatorFns &system_allocator = instance->blocks.current_allocator(); + instance->destroy(user_context); + halide_abort_if_false(user_context, system_allocator.deallocate != nullptr); + system_allocator.deallocate(user_context, instance); +} + +void MemoryArena::initialize(void *user_context, + const Config &cfg, + const SystemMemoryAllocatorFns &system_allocator) { + config = cfg; + blocks.initialize(user_context, {sizeof(MemoryArena::Block), 32, 32}, system_allocator); + halide_debug_assert(user_context, config.minimum_block_capacity > 1); +} + +void MemoryArena::destroy(void *user_context) { + for (size_t i = blocks.size(); i--;) { + Block *block = lookup_block(user_context, i); + halide_abort_if_false(user_context, block != nullptr); + destroy_block(user_context, block); + } + blocks.destroy(user_context); +} + +bool MemoryArena::collect(void *user_context) { + bool result = false; + for (size_t i = blocks.size(); i--;) { + Block *block = lookup_block(user_context, i); + halide_abort_if_false(user_context, block != nullptr); + if (collect_block(user_context, block)) { + blocks.remove(user_context, i); + result = true; + } + } + return result; +} + +void *MemoryArena::reserve(void *user_context, bool initialize) { + // Scan blocks for a free entry + for (size_t i = blocks.size(); i--;) { + Block *block = lookup_block(user_context, i); + halide_abort_if_false(user_context, block != nullptr); + if (block->free_index != invalid_entry) { + return create_entry(user_context, block, block->free_index); + } + } + + if (config.maximum_block_count && (blocks.size() >= config.maximum_block_count)) { + halide_error(user_context, "MemoryArena: Failed to reserve new entry! 
Maxmimum blocks reached!\n"); + return nullptr; + } + + // All blocks full ... create a new one + uint32_t index = 0; + Block *block = create_block(user_context); + void *entry_ptr = create_entry(user_context, block, index); + + // Optionally clear the allocation if requested + if (initialize) { + memset(entry_ptr, 0, config.entry_size); + } + return entry_ptr; +} + +void MemoryArena::reclaim(void *user_context, void *entry_ptr) { + for (size_t i = blocks.size(); i--;) { + Block *block = lookup_block(user_context, i); + halide_abort_if_false(user_context, block != nullptr); + + // is entry_ptr in the address range of this block. + uint8_t *offset_ptr = static_cast(entry_ptr); + uint8_t *base_ptr = static_cast(block->entries); + uint8_t *end_ptr = static_cast(offset_address(block->entries, block->capacity * config.entry_size)); + if ((entry_ptr >= base_ptr) && (entry_ptr < end_ptr)) { + const uint32_t offset = static_cast(offset_ptr - base_ptr); + const uint32_t index = offset / config.entry_size; + destroy_entry(user_context, block, index); + return; + } + } + halide_error(user_context, "MemoryArena: Pointer address doesn't belong to this memory pool!\n"); +} + +typename MemoryArena::Block *MemoryArena::create_block(void *user_context) { + // resize capacity starting with initial up to 1.5 last capacity + uint32_t new_capacity = config.minimum_block_capacity; + if (!blocks.empty()) { + const Block *last_block = static_cast(blocks.back()); + new_capacity = (last_block->capacity * 3 / 2); + } + + halide_abort_if_false(user_context, current_allocator().allocate != nullptr); + void *new_entries = current_allocator().allocate(user_context, config.entry_size * new_capacity); + memset(new_entries, 0, config.entry_size * new_capacity); + + uint32_t *new_indices = (uint32_t *)current_allocator().allocate(user_context, sizeof(uint32_t) * new_capacity); + AllocationStatus *new_status = (AllocationStatus *)current_allocator().allocate(user_context, sizeof(AllocationStatus) * 
new_capacity); + + for (uint32_t i = 0; i < new_capacity - 1; ++i) { + new_indices[i] = i + 1; // singly-linked list of all free entries in the block + new_status[i] = AllocationStatus::Available; // usage status + } + + new_indices[new_capacity - 1] = invalid_entry; + new_status[new_capacity - 1] = AllocationStatus::InvalidStatus; + + const Block new_block = {new_entries, new_indices, new_status, new_capacity, 0}; + blocks.append(user_context, &new_block); + return static_cast(blocks.back()); +} + +void MemoryArena::destroy_block(void *user_context, Block *block) { + halide_abort_if_false(user_context, block != nullptr); + if (block->entries != nullptr) { + halide_abort_if_false(user_context, current_allocator().deallocate != nullptr); + current_allocator().deallocate(user_context, block->entries); + current_allocator().deallocate(user_context, block->indices); + current_allocator().deallocate(user_context, block->status); + block->entries = nullptr; + block->indices = nullptr; + block->status = nullptr; + } +} + +bool MemoryArena::collect_block(void *user_context, Block *block) { + halide_abort_if_false(user_context, block != nullptr); + if (block->entries != nullptr) { + bool can_collect = true; + for (size_t i = block->capacity; i--;) { + if (block->status[i] == AllocationStatus::InUse) { + can_collect = false; + break; + } + } + if (can_collect) { + destroy_block(user_context, block); + return true; + } + } + return false; +} + +MemoryArena::Block *MemoryArena::lookup_block(void *user_context, uint32_t index) { + return static_cast(blocks[index]); +} + +void *MemoryArena::lookup_entry(void *user_context, Block *block, uint32_t index) { + halide_abort_if_false(user_context, block != nullptr); + halide_abort_if_false(user_context, block->entries != nullptr); + return offset_address(block->entries, index * config.entry_size); +} + +void *MemoryArena::create_entry(void *user_context, Block *block, uint32_t index) { + void *entry_ptr = lookup_entry(user_context, 
block, index); + block->free_index = block->indices[index]; + block->status[index] = AllocationStatus::InUse; +#if DEBUG_RUNTIME + memset(entry_ptr, 0, config.entry_size); +#endif + return entry_ptr; +} + +void MemoryArena::destroy_entry(void *user_context, Block *block, uint32_t index) { + block->status[index] = AllocationStatus::Available; + block->indices[index] = block->free_index; + block->free_index = index; +} + +const typename MemoryArena::Config & +MemoryArena::current_config() const { + return config; +} + +const typename MemoryArena::Config & +MemoryArena::default_config() { + static Config result; + return result; +} + +const SystemMemoryAllocatorFns & +MemoryArena::current_allocator() const { + return blocks.current_allocator(); +} + +const SystemMemoryAllocatorFns & +MemoryArena::default_allocator() { + return BlockStorage::default_allocator(); +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_MEMORY_ARENA_H diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h new file mode 100644 index 000000000000..513892922530 --- /dev/null +++ b/src/runtime/internal/memory_resources.h @@ -0,0 +1,280 @@ +#ifndef HALIDE_RUNTIME_MEMORY_RESOURCES_H +#define HALIDE_RUNTIME_MEMORY_RESOURCES_H + +namespace Halide { +namespace Runtime { +namespace Internal { + +// -- + +// Hint for allocation usage indicating whether or not the resource +// is in use, available, or dedicated (and can't be split or shared) +enum class AllocationStatus { + InvalidStatus, + InUse, + Available, + Dedicated +}; + +// Hint for allocation requests indicating intended usage +// required between host and device address space mappings +enum class MemoryVisibility { + InvalidVisibility, //< invalid enum value + HostOnly, //< host local + DeviceOnly, //< device local + DeviceToHost, //< transfer from device to host + HostToDevice, //< transfer from host to device + DefaultVisibility, //< default 
visibility (use any valid visibility -- unable to determine prior to usage) +}; + +// Hint for allocation requests indicating intended update +// frequency for modifying the contents of the allocation +enum class MemoryUsage { + InvalidUsage, //< invalid enum value + StaticStorage, //< intended for static storage, whereby the contents will be set once and remain unchanged + DynamicStorage, //< intended for dyanmic storage, whereby the contents will be set frequently and change constantly + UniformStorage, //< intended for fast & small fixed read-only uniform storage (intended for passing shader parameters), whereby the contents will be set once and remain unchanged + TransferSrc, //< intended for staging storage updates, whereby the contents will be used as the source of a transfer + TransferDst, //< intended for staging storage updates, whereby the contents will be used as the destination of a transfer + TransferSrcDst, //< intended for staging storage updates, whereby the contents will be used either as a source or destination of a transfer + DefaultUsage //< default usage (use any valid usage -- unable to determine prior to usage) +}; + +// Hint for allocation requests indicating ideal caching support (if available) +enum class MemoryCaching { + InvalidCaching, //< invalid enum value + Cached, //< cached + Uncached, //< uncached + CachedCoherent, //< cached and coherent + UncachedCoherent, //< uncached but still coherent + DefaultCaching //< default caching (use any valid caching behaviour -- unable to determine prior to usage) +}; + +struct MemoryProperties { + MemoryVisibility visibility = MemoryVisibility::InvalidVisibility; + MemoryUsage usage = MemoryUsage::InvalidUsage; + MemoryCaching caching = MemoryCaching::InvalidCaching; +}; + +// Client-facing struct for exchanging memory block allocation requests +struct MemoryBlock { + void *handle = nullptr; //< client data storing native handle (managed by alloc_block_region/free_block_region) + size_t size = 0; 
//< allocated size (in bytes) + bool dedicated = false; //< flag indicating whether allocation is one dedicated resource (or split/shared into other resources) + MemoryProperties properties; //< properties for the allocated block +}; + +// Client-facing struct for exchanging memory region allocation requests +struct MemoryRegion { + void *handle = nullptr; //< client data storing native handle (managed by alloc_block_region/free_block_region) + size_t offset = 0; //< offset from base address in block (in bytes) + size_t size = 0; //< allocated size (in bytes) + bool dedicated = false; //< flag indicating whether allocation is one dedicated resource (or split/shared into other resources) + MemoryProperties properties; //< properties for the allocated region +}; + +// Client-facing struct for issuing memory allocation requests +struct MemoryRequest { + size_t offset = 0; //< offset from base address in block (in bytes) + size_t size = 0; //< allocated size (in bytes) + size_t alignment = 0; //< alignment constraint for address + bool dedicated = false; //< flag indicating whether allocation is one dedicated resource (or split/shared into other resources) + MemoryProperties properties; //< properties for the allocated region +}; + +class RegionAllocator; +struct BlockRegion; + +// Internal struct for block resource state +// -- Note: first field must MemoryBlock +struct BlockResource { + MemoryBlock memory; //< memory info for the allocated block + RegionAllocator *allocator = nullptr; //< designated allocator for the block + BlockRegion *regions = nullptr; //< head of linked list of memory regions + size_t reserved = 0; //< number of bytes already reserved to regions +}; + +// Internal struct for block region state +// -- Note: first field must MemoryRegion +struct BlockRegion { + MemoryRegion memory; //< memory info for the allocated region + AllocationStatus status = AllocationStatus::InvalidStatus; //< allocation status indicator + BlockRegion *next_ptr = nullptr; 
//< pointer to next block region in linked list + BlockRegion *prev_ptr = nullptr; //< pointer to prev block region in linked list + BlockResource *block_ptr = nullptr; //< pointer to parent block resource +}; + +// Returns an aligned byte offset to adjust the given offset based on alignment constraints +// -- Alignment must be power of two! +ALWAYS_INLINE size_t aligned_offset(size_t offset, size_t alignment) { + return (offset + (alignment - 1)) & ~(alignment - 1); +} + +// Returns a padded size to accomodate an adjusted offset due to alignment constraints +// -- Alignment must be power of two! +ALWAYS_INLINE size_t aligned_size(size_t offset, size_t size, size_t alignment) { + size_t actual_offset = aligned_offset(offset, alignment); + size_t padding = actual_offset - offset; + size_t actual_size = padding + size; + return actual_size; +} + +// Clamps the given value to be within the [min_value, max_value] range +ALWAYS_INLINE size_t clamped_size(size_t value, size_t min_value, size_t max_value) { + size_t result = (value < min_value) ? min_value : value; + return (result > max_value) ? 
max_value : result; +} + +// Offset the untyped pointer by the given number of bytes +ALWAYS_INLINE const void *offset_address(const void *address, size_t byte_offset) { + const uintptr_t base = reinterpret_cast(address); + return reinterpret_cast(base + byte_offset); +} + +// Offset the untyped pointer by the given number of bytes +ALWAYS_INLINE void *offset_address(void *address, size_t byte_offset) { + const uintptr_t base = reinterpret_cast(address); + return reinterpret_cast(base + byte_offset); +} + +// -- + +typedef void *(*AllocateSystemFn)(void *, size_t); +typedef void (*DeallocateSystemFn)(void *, void *); + +ALWAYS_INLINE void *native_system_malloc(void *user_context, size_t bytes) { + return malloc(bytes); +} + +ALWAYS_INLINE void native_system_free(void *user_context, void *ptr) { + free(ptr); +} + +struct SystemMemoryAllocatorFns { + AllocateSystemFn allocate = nullptr; + DeallocateSystemFn deallocate = nullptr; +}; + +struct HalideSystemAllocatorFns { + AllocateSystemFn allocate = halide_malloc; + DeallocateSystemFn deallocate = halide_free; +}; + +typedef void (*AllocateBlockFn)(void *, MemoryBlock *); +typedef void (*DeallocateBlockFn)(void *, MemoryBlock *); + +struct MemoryBlockAllocatorFns { + AllocateBlockFn allocate = nullptr; + DeallocateBlockFn deallocate = nullptr; +}; + +typedef void (*AllocateRegionFn)(void *, MemoryRegion *); +typedef void (*DeallocateRegionFn)(void *, MemoryRegion *); + +struct MemoryRegionAllocatorFns { + AllocateRegionFn allocate = nullptr; + DeallocateRegionFn deallocate = nullptr; +}; + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +// -- + +extern "C" { + +WEAK const char *halide_memory_visibility_name(MemoryVisibility value) { + switch (value) { + case MemoryVisibility::InvalidVisibility: { + return "InvalidVisibility"; + } + case MemoryVisibility::DefaultVisibility: { + return "DefaultVisibility"; + } + case MemoryVisibility::HostOnly: { + return "HostOnly"; + } + case 
MemoryVisibility::DeviceOnly: { + return "DeviceOnly"; + } + case MemoryVisibility::HostToDevice: { + return "HostToDevice"; + } + case MemoryVisibility::DeviceToHost: { + return "DeviceToHost"; + } + default: { + return ""; + } + }; + return ""; +} + +WEAK const char *halide_memory_usage_name(MemoryUsage value) { + switch (value) { + case MemoryUsage::InvalidUsage: { + return "InvalidUsage"; + } + case MemoryUsage::DefaultUsage: { + return "DefaultUsage"; + } + case MemoryUsage::StaticStorage: { + return "StaticStorage"; + } + case MemoryUsage::DynamicStorage: { + return "DynamicStorage"; + } + case MemoryUsage::UniformStorage: { + return "UniformStorage"; + } + case MemoryUsage::TransferSrc: { + return "TransferSrc"; + } + case MemoryUsage::TransferDst: { + return "TransferDst"; + } + case MemoryUsage::TransferSrcDst: { + return "TransferSrcDst"; + } + default: { + return ""; + } + }; + return ""; +} + +WEAK const char *halide_memory_caching_name(MemoryCaching value) { + switch (value) { + case MemoryCaching::InvalidCaching: { + return "InvalidCaching"; + } + case MemoryCaching::DefaultCaching: { + return "DefaultCaching"; + } + case MemoryCaching::Cached: { + return "Cached"; + } + case MemoryCaching::Uncached: { + return "Uncached"; + } + case MemoryCaching::CachedCoherent: { + return "CachedCoherent"; + } + case MemoryCaching::UncachedCoherent: { + return "UncachedCoherent"; + } + default: { + return ""; + } + }; + return ""; +} + +} // extern "C" + +// -- + +#endif // HALIDE_RUNTIME_MEMORY_RESOURCES_H diff --git a/src/runtime/internal/pointer_table.h b/src/runtime/internal/pointer_table.h new file mode 100644 index 000000000000..b5ff3bfd6f7c --- /dev/null +++ b/src/runtime/internal/pointer_table.h @@ -0,0 +1,366 @@ +#ifndef HALIDE_RUNTIME_POINTER_TABLE_H +#define HALIDE_RUNTIME_POINTER_TABLE_H + +#include "memory_resources.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Dynamically resizable array for storing untyped pointers +// -- 
Implementation uses memcpy/memmove for copying +// -- Customizable allocator ... default uses NativeSystemAllocator +class PointerTable { +public: + static constexpr size_t default_capacity = 32; // smallish + + PointerTable(void *user_context, size_t initial_capacity = 0, const SystemMemoryAllocatorFns &sma = default_allocator()); + PointerTable(const PointerTable &other); + ~PointerTable(); + + void initialize(void *user_context, size_t initial_capacity = 0, const SystemMemoryAllocatorFns &sma = default_allocator()); + + PointerTable &operator=(const PointerTable &other); + bool operator==(const PointerTable &other) const; + bool operator!=(const PointerTable &other) const; + + void reserve(void *user_context, size_t capacity, bool free_existing = false); + void resize(void *user_context, size_t entry_count, bool realloc = true); + + void assign(void *user_context, size_t index, const void *entry_ptr); + void insert(void *user_context, size_t index, const void *entry_ptr); + void prepend(void *user_context, const void *entry_ptr); + void append(void *user_context, const void *entry_ptr); + void remove(void *user_context, size_t index); + + void fill(void *user_context, const void **array, size_t array_size); + void insert(void *user_context, size_t index, const void **array, size_t array_size); + void replace(void *user_context, size_t index, const void **array, size_t array_size); + void prepend(void *user_context, const void **array, size_t array_size); + void append(void *user_context, const void **array, size_t array_size); + void remove(void *user_context, size_t index, size_t entry_count); + + void pop_front(void *user_context); + void pop_back(void *user_context); + void shrink_to_fit(void *user_context); + void clear(void *user_context); + void destroy(void *user_context); + + bool empty() const; + size_t size() const; + + void *operator[](size_t index); + void *operator[](size_t index) const; + + void **data(); + const void **data() const; + + void 
*front(); + void *back(); + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + void allocate(void *user_context, size_t capacity); + + void **ptr = nullptr; + size_t count = 0; + size_t capacity = 0; + SystemMemoryAllocatorFns allocator; +}; + +PointerTable::PointerTable(void *user_context, size_t initial_capacity, const SystemMemoryAllocatorFns &sma) + : allocator(sma) { + halide_abort_if_false(user_context, allocator.allocate != nullptr); + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + if (initial_capacity) { reserve(user_context, initial_capacity); } +} + +PointerTable::PointerTable(const PointerTable &other) + : PointerTable(nullptr, 0, other.allocator) { + if (other.capacity) { + ptr = static_cast(allocator.allocate(nullptr, other.capacity * sizeof(void *))); + capacity = other.capacity; + } + if (ptr && other.count != 0) { + count = other.count; + memcpy(this->ptr, other.ptr, count * sizeof(void *)); + } +} + +PointerTable::~PointerTable() { + destroy(nullptr); +} + +void PointerTable::destroy(void *user_context) { + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + if (ptr != nullptr) { + allocator.deallocate(user_context, ptr); + } + capacity = count = 0; + ptr = nullptr; +} + +void PointerTable::initialize(void *user_context, size_t initial_capacity, const SystemMemoryAllocatorFns &sma) { + allocator = sma; + capacity = count = 0; + ptr = nullptr; + if (initial_capacity) { + reserve(user_context, initial_capacity); + } +} + +PointerTable &PointerTable::operator=(const PointerTable &other) { + if (&other != this) { + resize(nullptr, other.count); + if (count != 0 && other.ptr != nullptr) { + memcpy(ptr, other.ptr, count * sizeof(void *)); + } + } + return *this; +} + +bool PointerTable::operator==(const PointerTable &other) const { + if (count != other.count) { return false; } + return memcmp(this->ptr, other.ptr, this->size() * 
sizeof(void *)) == 0; +} + +bool PointerTable::operator!=(const PointerTable &other) const { + return !(*this == other); +} + +void PointerTable::fill(void *user_context, const void **array, size_t array_size) { + if (array_size != 0) { + resize(user_context, array_size); + memcpy(this->ptr, array, array_size * sizeof(void *)); + count = array_size; + } +} + +void PointerTable::assign(void *user_context, size_t index, const void *entry_ptr) { + halide_debug_assert(user_context, index < count); + ptr[index] = const_cast(entry_ptr); +} + +void PointerTable::prepend(void *user_context, const void *entry_ptr) { + insert(user_context, 0, &entry_ptr, 1); +} + +void PointerTable::append(void *user_context, const void *entry_ptr) { + append(user_context, &entry_ptr, 1); +} + +void PointerTable::pop_front(void *user_context) { + halide_debug_assert(user_context, count > 0); + remove(user_context, 0); +} + +void PointerTable::pop_back(void *user_context) { + halide_debug_assert(user_context, count > 0); + resize(user_context, size() - 1); +} + +void PointerTable::clear(void *user_context) { + resize(user_context, 0); +} + +void PointerTable::reserve(void *user_context, size_t new_capacity, bool free_existing) { + new_capacity = max(new_capacity, count); + if ((new_capacity < capacity) && !free_existing) { + new_capacity = capacity; + } + allocate(user_context, new_capacity); +} + +void PointerTable::resize(void *user_context, size_t entry_count, bool realloc) { + size_t current_size = capacity; + size_t requested_size = entry_count; + size_t minimum_size = default_capacity; + size_t actual_size = current_size; + count = requested_size; + +#ifdef DEBUG_RUNTIME + debug(user_context) << "PointerTable: Resize (" + << "requested_size=" << (int32_t)requested_size << " " + << "current_size=" << (int32_t)current_size << " " + << "minimum_size=" << (int32_t)minimum_size << " " + << "sizeof(void*)=" << (int32_t)sizeof(void *) << " " + << "realloc=" << (realloc ? 
"true" : "false") << ")...\n"; +#endif + + // increase capacity upto 1.5x existing (or at least min_capacity) + if (requested_size > current_size) { + actual_size = max(requested_size, max(current_size * 3 / 2, minimum_size)); + } else if (!realloc) { + return; + } + + allocate(user_context, actual_size); +} + +void PointerTable::shrink_to_fit(void *user_context) { + if (capacity > count) { + void *new_ptr = nullptr; + if (count > 0) { + size_t bytes = count * sizeof(void *); + new_ptr = allocator.allocate(user_context, bytes); + memcpy(new_ptr, ptr, bytes); + } + allocator.deallocate(user_context, ptr); + capacity = count; + ptr = static_cast(new_ptr); + } +} + +void PointerTable::insert(void *user_context, size_t index, const void *entry_ptr) { + const void *addr = reinterpret_cast(entry_ptr); + insert(user_context, index, &addr, 1); +} + +void PointerTable::remove(void *user_context, size_t index) { + remove(user_context, index, 1); +} + +void PointerTable::remove(void *user_context, size_t index, size_t entry_count) { + halide_debug_assert(user_context, index < count); + const size_t last_index = size(); + if (index < (last_index - entry_count)) { + size_t dst_offset = index * sizeof(void *); + size_t src_offset = (index + entry_count) * sizeof(void *); + size_t bytes = (last_index - index - entry_count) * sizeof(void *); + +#ifdef DEBUG_RUNTIME + debug(user_context) << "PointerTable: Remove (" + << "index=" << (int32_t)index << " " + << "entry_count=" << (int32_t)entry_count << " " + << "last_index=" << (int32_t)last_index << " " + << "src_offset=" << (int32_t)src_offset << " " + << "dst_offset=" << (int32_t)dst_offset << " " + << "bytes=" << (int32_t)bytes << ")...\n"; +#endif + memmove(ptr + dst_offset, ptr + src_offset, bytes); + } + resize(user_context, last_index - entry_count); +} + +void PointerTable::replace(void *user_context, size_t index, const void **array, size_t array_size) { + halide_debug_assert(user_context, index < count); + size_t remaining 
= count - index; + size_t copy_count = min(remaining, array_size); + +#ifdef DEBUG_RUNTIME + + debug(user_context) << "PointerTable: Replace (" + << "index=" << (int32_t)index << " " + << "array_size=" << (int32_t)array_size << " " + << "remaining=" << (int32_t)remaining << " " + << "copy_count=" << (int32_t)copy_count << " " + << "capacity=" << (int32_t)capacity << ")...\n"; +#endif + + halide_debug_assert(user_context, remaining > 0); + memcpy(ptr + index, array, copy_count * sizeof(void *)); + count = max(count, index + copy_count); +} + +void PointerTable::insert(void *user_context, size_t index, const void **array, size_t array_size) { + halide_debug_assert(user_context, index <= count); + const size_t last_index = size(); + resize(user_context, last_index + array_size); + if (index < last_index) { + size_t src_offset = index * sizeof(void *); + size_t dst_offset = (index + array_size) * sizeof(void *); + size_t bytes = (last_index - index) * sizeof(void *); + memmove(ptr + dst_offset, ptr + src_offset, bytes); + } + replace(user_context, index, array, array_size); +} + +void PointerTable::prepend(void *user_context, const void **array, size_t array_size) { + insert(user_context, 0, array, array_size); +} + +void PointerTable::append(void *user_context, const void **array, size_t array_size) { + const size_t last_index = size(); + insert(user_context, last_index, array, array_size); +} + +bool PointerTable::empty() const { + return count == 0; +} + +size_t PointerTable::size() const { + return count; +} + +void *PointerTable::operator[](size_t index) { + halide_debug_assert(nullptr, index < capacity); + return ptr[index]; +} + +void *PointerTable::operator[](size_t index) const { + halide_debug_assert(nullptr, index < capacity); + return ptr[index]; +} + +void **PointerTable::data() { + return ptr; +} + +void *PointerTable::front() { + halide_debug_assert(nullptr, count > 0); + return ptr[0]; +} + +void *PointerTable::back() { + halide_debug_assert(nullptr, 
count > 0); + size_t index = count - 1; + return ptr[index]; +} + +const void **PointerTable::data() const { + return const_cast(ptr); +} + +void PointerTable::allocate(void *user_context, size_t new_capacity) { + if (new_capacity != capacity) { + halide_abort_if_false(user_context, allocator.allocate != nullptr); + size_t bytes = new_capacity * sizeof(void *); + +#ifdef DEBUG_RUNTIME + debug(user_context) << "PointerTable: Allocating (bytes=" << (int32_t)bytes << " allocator=" << (void *)allocator.allocate << ")...\n"; +#endif + + void *new_ptr = bytes ? allocator.allocate(user_context, bytes) : nullptr; + if (count != 0 && ptr != nullptr && new_ptr != nullptr) { + memcpy(new_ptr, ptr, count * sizeof(void *)); + } + if (ptr != nullptr) { + halide_abort_if_false(user_context, allocator.deallocate != nullptr); + allocator.deallocate(user_context, ptr); + } + capacity = new_capacity; + ptr = static_cast(new_ptr); + } +} + +const SystemMemoryAllocatorFns & +PointerTable::current_allocator() const { + return this->allocator; +} + +const SystemMemoryAllocatorFns & +PointerTable::default_allocator() { + static SystemMemoryAllocatorFns native_allocator = { + native_system_malloc, native_system_free}; + return native_allocator; +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_POINTER_TABLE_H diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h new file mode 100644 index 000000000000..8c7f8602abe7 --- /dev/null +++ b/src/runtime/internal/region_allocator.h @@ -0,0 +1,462 @@ +#ifndef HALIDE_RUNTIME_REGION_ALLOCATOR_H +#define HALIDE_RUNTIME_REGION_ALLOCATOR_H + +#include "memory_arena.h" +#include "memory_resources.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// -- + +/** Allocator class interface for sub-allocating a contiguous + * memory block into smaller regions of memory. 
This class only + * manages the address creation for the regions -- allocation + * callback functions are used to request the memory from the + * necessary system or API calls. This class is intended to be + * used inside of a higher level memory management class that + * provides thread safety, policy management and API + * integration for a specific runtime API (eg Vulkan, OpenCL, etc) + */ +class RegionAllocator { +public: + // disable copy constructors and assignment + RegionAllocator(const RegionAllocator &) = delete; + RegionAllocator &operator=(const RegionAllocator &) = delete; + + // disable non-factory based construction + RegionAllocator() = delete; + ~RegionAllocator() = delete; + + // Allocators for the different types of memory we need to allocate + struct MemoryAllocators { + SystemMemoryAllocatorFns system; + MemoryRegionAllocatorFns region; + }; + + // Factory methods for creation / destruction + static RegionAllocator *create(void *user_context, BlockResource *block, const MemoryAllocators &ma); + static void destroy(void *user_context, RegionAllocator *region_allocator); + + // Returns the allocator class instance for the given allocation (or nullptr) + static RegionAllocator *find_allocator(void *user_context, MemoryRegion *memory_region); + + // Public interface methods + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + void reclaim(void *user_context, MemoryRegion *memory_region); + bool collect(void *user_context); //< returns true if any blocks were removed + void release(void *user_context); + void destroy(void *user_context); + + // Returns the currently managed block resource + BlockResource *block_resource() const; + +private: + // Initializes a new instance + void initialize(void *user_context, BlockResource *block, const MemoryAllocators &ma); + + // Search through allocated block regions (Best-Fit) + BlockRegion *find_block_region(void *user_context, const MemoryRequest &request); + + // Returns true if 
neighbouring block regions to the given region can be coalesced into one + bool can_coalesce(BlockRegion *region); + + // Merges available neighbouring block regions into the given region + BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); + + // Returns true if the given region can be split to accomadate the given size + bool can_split(BlockRegion *region, size_t size); + + // Splits the given block region into a smaller region to accomadate the given size, followed by empty space for the remaining + BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); + + // Creates a new block region and adds it to the region list + BlockRegion *create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated); + + // Creates a new block region and adds it to the region list + void destroy_block_region(void *user_context, BlockRegion *region); + + // Invokes the allocation callback to allocate memory for the block region + void alloc_block_region(void *user_context, BlockRegion *region); + + // Releases a block region and leaves it in the list for further allocations + void release_block_region(void *user_context, BlockRegion *region); + + // Invokes the deallocation callback to free memory for the block region + void free_block_region(void *user_context, BlockRegion *region); + + // Returns true if the given block region is compatible with the given properties + bool is_compatible_block_region(const BlockRegion *region, const MemoryProperties &properties) const; + + BlockResource *block = nullptr; + MemoryArena *arena = nullptr; + MemoryAllocators allocators; +}; + +RegionAllocator *RegionAllocator::create(void *user_context, BlockResource *block_resource, const MemoryAllocators &allocators) { + halide_abort_if_false(user_context, allocators.system.allocate != nullptr); + RegionAllocator *result = reinterpret_cast( + 
allocators.system.allocate(user_context, sizeof(RegionAllocator))); + + if (result == nullptr) { + halide_error(user_context, "RegionAllocator: Failed to create instance! Out of memory!\n"); + return nullptr; + } + + result->initialize(user_context, block_resource, allocators); + return result; +} + +void RegionAllocator::destroy(void *user_context, RegionAllocator *instance) { + halide_abort_if_false(user_context, instance != nullptr); + const MemoryAllocators &allocators = instance->allocators; + instance->destroy(user_context); + halide_abort_if_false(user_context, allocators.system.deallocate != nullptr); + allocators.system.deallocate(user_context, instance); +} + +void RegionAllocator::initialize(void *user_context, BlockResource *mb, const MemoryAllocators &ma) { + block = mb; + allocators = ma; + arena = MemoryArena::create(user_context, {sizeof(BlockRegion), MemoryArena::default_capacity, 0}, allocators.system); + halide_abort_if_false(user_context, arena != nullptr); + block->allocator = this; + block->regions = create_block_region( + user_context, + block->memory.properties, + 0, block->memory.size, + block->memory.dedicated); +} + +MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { + halide_abort_if_false(user_context, request.size > 0); + size_t remaining = block->memory.size - block->reserved; + if (remaining < request.size) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " + << "-- requested size (" << (int32_t)(request.size) << " bytes) " + << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; +#endif + return nullptr; + } + + BlockRegion *block_region = find_block_region(user_context, request); + if (block_region == nullptr) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" + << (int32_t)(request.size) << " bytes)!\n"; +#endif + return nullptr; + } + + if 
(can_split(block_region, request.size)) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " + << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; +#endif + split_block_region(user_context, block_region, request.size, request.alignment); + } + + alloc_block_region(user_context, block_region); + return reinterpret_cast(block_region); +} + +void RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { + BlockRegion *block_region = reinterpret_cast(memory_region); + halide_abort_if_false(user_context, block_region != nullptr); + halide_abort_if_false(user_context, block_region->block_ptr == block); + free_block_region(user_context, block_region); + if (can_coalesce(block_region)) { + block_region = coalesce_block_regions(user_context, block_region); + } +} + +RegionAllocator *RegionAllocator::find_allocator(void *user_context, MemoryRegion *memory_region) { + BlockRegion *block_region = reinterpret_cast(memory_region); + halide_abort_if_false(user_context, block_region != nullptr); + halide_abort_if_false(user_context, block_region->block_ptr != nullptr); + return block_region->block_ptr->allocator; +} + +BlockRegion *RegionAllocator::find_block_region(void *user_context, const MemoryRequest &request) { + BlockRegion *result = nullptr; + for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { + + if (block_region->status != AllocationStatus::Available) { + continue; + } + + // skip incompatible block regions for this request + if (!is_compatible_block_region(block_region, request.properties)) { + continue; + } + + // is the requested size larger than the current region? 
+ if (request.size > block_region->memory.size) { + continue; + } + + size_t actual_size = aligned_size(block_region->memory.offset, request.size, request.alignment); + + // is the adjusted size larger than the current region? + if (actual_size > block_region->memory.size) { + continue; + } + + // will the adjusted size fit within the remaining unallocated space? + if ((actual_size + block->reserved) < block->memory.size) { + result = block_region; // best-fit! + break; + } + } + return result; +} + +bool RegionAllocator::can_coalesce(BlockRegion *block_region) { + if (block_region == nullptr) { return false; } + if (block_region->prev_ptr && (block_region->prev_ptr->status == AllocationStatus::Available)) { + return true; + } + if (block_region->next_ptr && (block_region->next_ptr->status == AllocationStatus::Available)) { + return true; + } + return false; +} + +BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRegion *block_region) { + if (block_region->prev_ptr && (block_region->prev_ptr->status == AllocationStatus::Available)) { + BlockRegion *prev_region = block_region->prev_ptr; + +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Coalescing " + << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; +#endif + + prev_region->next_ptr = block_region->next_ptr; + if (block_region->next_ptr) { + block_region->next_ptr->prev_ptr = prev_region; + } + prev_region->memory.size += block_region->memory.size; + destroy_block_region(user_context, block_region); + block_region = prev_region; + } + + if (block_region->next_ptr && (block_region->next_ptr->status == AllocationStatus::Available)) { + BlockRegion *next_region = block_region->next_ptr; + +#ifdef DEBUG_RUNTIME + debug(user_context) << 
"RegionAllocator: Coalescing " + << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; +#endif + + if (next_region->next_ptr) { + next_region->next_ptr->prev_ptr = block_region; + } + block_region->next_ptr = next_region->next_ptr; + block_region->memory.size += next_region->memory.size; + destroy_block_region(user_context, next_region); + } + + return block_region; +} + +bool RegionAllocator::can_split(BlockRegion *block_region, size_t size) { + return (block_region && (block_region->memory.size > size)); +} + +BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { + size_t adjusted_size = aligned_size(block_region->memory.offset, size, alignment); + size_t adjusted_offset = aligned_offset(block_region->memory.offset, alignment); + + size_t empty_offset = adjusted_offset + size; + size_t empty_size = block_region->memory.size - adjusted_size; + +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Splitting " + << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " + << "to create empty region (offset=" << (int32_t)empty_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; +#endif + + BlockRegion *next_region = block_region->next_ptr; + BlockRegion *empty_region = create_block_region(user_context, + block_region->memory.properties, + empty_offset, empty_size, + block_region->memory.dedicated); + halide_abort_if_false(user_context, empty_region != nullptr); + + empty_region->next_ptr = next_region; + if (next_region) { + next_region->prev_ptr = empty_region; + } + block_region->next_ptr = empty_region; + block_region->memory.size = size; + return empty_region; +} + +BlockRegion 
*RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Creating block region (" + << "user_context=" << (void *)(user_context) << " " + << "offset=" << (uint32_t)offset << " " + << "size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; +#endif + + BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); + + if (block_region == nullptr) { + error(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; + return nullptr; + } + +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Added block region (" + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; +#endif + + block_region->memory.offset = offset; + block_region->memory.size = size; + block_region->memory.properties = properties; + block_region->memory.dedicated = dedicated; + block_region->status = AllocationStatus::Available; + block_region->block_ptr = block; + return block_region; +} + +void RegionAllocator::release_block_region(void *user_context, BlockRegion *block_region) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Releasing block region (" + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; +#endif + free_block_region(user_context, block_region); +} + +void RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block_region) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Destroying block region (" + << "user_context=" << (void *)(user_context) << " " + << 
"block_region=" << (void *)(block_region) << ") ...\n"; +#endif + + free_block_region(user_context, block_region); + arena->reclaim(user_context, block_region); +} + +void RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_region) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Allocating region (size=" << (int32_t)(block_region->memory.size) << ", offset=" << (int32_t)block_region->memory.offset << ")!\n"; +#endif + halide_abort_if_false(user_context, allocators.region.allocate != nullptr); + halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); + MemoryRegion *memory_region = &(block_region->memory); + allocators.region.allocate(user_context, memory_region); + block_region->status = block_region->memory.dedicated ? AllocationStatus::Dedicated : AllocationStatus::InUse; + block->reserved += block_region->memory.size; +} + +void RegionAllocator::free_block_region(void *user_context, BlockRegion *block_region) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Freeing block region (" + << "user_context=" << (void *)(user_context) << " " + << "block_region=" << (void *)(block_region) << ") ...\n"; +#endif + if ((block_region->status == AllocationStatus::InUse) || + (block_region->status == AllocationStatus::Dedicated)) { + debug(user_context) << "RegionAllocator: Deallocating region (size=" << (int32_t)(block_region->memory.size) << ", offset=" << (int32_t)block_region->memory.offset << ")!\n"; + halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); + MemoryRegion *memory_region = &(block_region->memory); + allocators.region.deallocate(user_context, memory_region); + block->reserved -= block_region->memory.size; + } + block_region->status = AllocationStatus::Available; +} + +void RegionAllocator::release(void *user_context) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Releasing all regions (" + << "user_context=" << (void 
*)(user_context) << ") ...\n"; +#endif + for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { + release_block_region(user_context, block_region); + } +} + +bool RegionAllocator::collect(void *user_context) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Collecting free block regions (" + << "user_context=" << (void *)(user_context) << ") ...\n"; +#endif + bool result = false; + for (BlockRegion *block_region = block->regions; block_region != nullptr; block_region = block_region->next_ptr) { + if (block_region->status == AllocationStatus::Available) { + if (can_coalesce(block_region)) { + block_region = coalesce_block_regions(user_context, block_region); + result = true; + } + } + } + return result; +} + +void RegionAllocator::destroy(void *user_context) { +#ifdef DEBUG_RUNTIME + debug(user_context) << "RegionAllocator: Destroying all block regions (" + << "user_context=" << (void *)(user_context) << ") ...\n"; +#endif + for (BlockRegion *block_region = block->regions; block_region != nullptr;) { + + if (block_region->next_ptr == nullptr) { + destroy_block_region(user_context, block_region); + block_region = nullptr; + } else { + BlockRegion *prev_region = block_region; + block_region = block_region->next_ptr; + destroy_block_region(user_context, prev_region); + } + } + block->regions = nullptr; + block->reserved = 0; + arena->destroy(user_context); +} + +bool RegionAllocator::is_compatible_block_region(const BlockRegion *block_region, const MemoryProperties &properties) const { + if (properties.caching != MemoryCaching::DefaultCaching) { + if (properties.caching != block_region->memory.properties.caching) { + return false; + } + } + + if (properties.visibility != MemoryVisibility::DefaultVisibility) { + if (properties.visibility != block_region->memory.properties.visibility) { + return false; + } + } + + if (properties.usage != MemoryUsage::DefaultUsage) { + if (properties.usage != 
block_region->memory.properties.usage) { + return false; + } + } + + return true; +} + +BlockResource *RegionAllocator::block_resource() const { + return block; +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_REGION_ALLOCATOR_H diff --git a/src/runtime/internal/string_storage.h b/src/runtime/internal/string_storage.h new file mode 100644 index 000000000000..6b4daa95ac0a --- /dev/null +++ b/src/runtime/internal/string_storage.h @@ -0,0 +1,216 @@ +#ifndef HALIDE_RUNTIME_STRING_STORAGE_H +#define HALIDE_RUNTIME_STRING_STORAGE_H + +#include "block_storage.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Static utility functions for dealing with string data +struct StringUtils { + static bool is_empty(const char *str) { + if (str == nullptr) { return true; } + if (str[0] == '\0') { return true; } + return false; + } + + // count the number of delimited string tokens + static size_t count_tokens(const char *str, const char *delim) { + if (StringUtils::is_empty(str)) { return 0; } + if (StringUtils::is_empty(delim)) { return 1; } // no delim ... string is one token + + size_t count = 0; + const char *ptr = str; + size_t delim_length = strlen(delim); + while (!StringUtils::is_empty(ptr)) { + const char *next_delim = strstr(ptr, delim); + ptr = (next_delim != nullptr) ? 
(next_delim + delim_length) : nullptr; + ++count; + } + return count; + } + + static size_t count_length(const char *str) { + const char *ptr = str; + while (!StringUtils::is_empty(ptr)) { + ++ptr; + } + return size_t(ptr - str); + } +}; + +// -- +// Storage class for handling c-string data (based on block storage) +// -- Intended for building and maintaining string data w/8-bit chars +// +class StringStorage { +public: + StringStorage(void *user_context = nullptr, uint32_t capacity = 0, const SystemMemoryAllocatorFns &sma = default_allocator()); + StringStorage(const StringStorage &other) = default; + ~StringStorage(); + + void initialize(void *user_context, uint32_t capacity = 0, const SystemMemoryAllocatorFns &sma = default_allocator()); + void destroy(void *user_context); + + StringStorage &operator=(const StringStorage &other); + bool operator==(const StringStorage &other) const; + bool operator!=(const StringStorage &other) const; + + bool contains(const char *str) const; + bool contains(const StringStorage &other) const; + + void reserve(void *user_context, size_t length); + void assign(void *user_context, char ch); + void assign(void *user_context, const char *str, size_t length = 0); // if length is zero, strlen is used + void append(void *user_context, char ch); + void append(void *user_context, const char *str, size_t length = 0); // if length is zero, strlen is used + void prepend(void *user_context, char ch); + void prepend(void *user_context, const char *str, size_t length = 0); // if length is zero, strlen is used + void clear(void *user_context); + void terminate(void *user_context, size_t length); + + size_t length() const; + const char *data() const; + + const SystemMemoryAllocatorFns ¤t_allocator() const; + static const SystemMemoryAllocatorFns &default_allocator(); + +private: + BlockStorage contents; +}; + +StringStorage::StringStorage(void *user_context, uint32_t capacity, const SystemMemoryAllocatorFns &sma) + : contents(user_context, 
{sizeof(char), 32, 32}, sma) { + if (capacity) { contents.reserve(user_context, capacity); } +} + +StringStorage::~StringStorage() { + destroy(nullptr); +} + +StringStorage &StringStorage::operator=(const StringStorage &other) { + if (&other != this) { + assign(nullptr, other.data(), other.length()); + } + return *this; +} + +bool StringStorage::contains(const char *str) const { + const char *this_str = static_cast(contents.data()); + return strstr(this_str, str) != nullptr; +} + +bool StringStorage::contains(const StringStorage &other) const { + const char *this_str = static_cast(contents.data()); + const char *other_str = static_cast(other.contents.data()); + return strstr(this_str, other_str) != nullptr; +} + +bool StringStorage::operator==(const StringStorage &other) const { + if (contents.size() != other.contents.size()) { return false; } + const char *this_str = static_cast(contents.data()); + const char *other_str = static_cast(other.contents.data()); + return strncmp(this_str, other_str, contents.size()) == 0; +} + +bool StringStorage::operator!=(const StringStorage &other) const { + return !(*this == other); +} + +void StringStorage::reserve(void *user_context, size_t length) { + contents.reserve(user_context, length + 1); // leave room for termination + contents.resize(user_context, length, false); + terminate(user_context, length); +} + +void StringStorage::assign(void *user_context, char ch) { + contents.resize(user_context, 1); + char *ptr = static_cast(contents[0]); + (*ptr) = ch; +} + +void StringStorage::assign(void *user_context, const char *str, size_t length) { + if (StringUtils::is_empty(str)) { return; } + if (length == 0) { length = strlen(str); } + char *this_str = static_cast(contents.data()); + reserve(user_context, length); + memcpy(this_str, str, length); + terminate(user_context, length); +} + +void StringStorage::append(void *user_context, const char *str, size_t length) { + if (StringUtils::is_empty(str)) { return; } + if (length == 0) 
{ length = strlen(str); } + const size_t old_size = contents.size(); + size_t new_length = old_size + length; + char *this_str = static_cast(contents[old_size]); + reserve(user_context, length); + memcpy(this_str, str, length); + terminate(user_context, new_length); +} + +void StringStorage::append(void *user_context, char ch) { + contents.append(user_context, &ch); +} + +void StringStorage::prepend(void *user_context, const char *str, size_t length) { + if (StringUtils::is_empty(str)) { return; } + if (length == 0) { length = strlen(str); } + const size_t old_size = contents.size(); + size_t new_length = old_size + length; + char *this_str = static_cast(contents.data()); + reserve(user_context, new_length); + memcpy(this_str + length, this_str, old_size); + memcpy(this_str, str, length); + terminate(user_context, new_length); +} + +void StringStorage::prepend(void *user_context, char ch) { + contents.prepend(user_context, &ch); +} + +void StringStorage::terminate(void *user_context, size_t length) { + char *end_ptr = static_cast(contents[length]); + (*end_ptr) = '\0'; +} + +void StringStorage::clear(void *user_context) { + contents.clear(user_context); + if (contents.data()) { terminate(user_context, 0); } +} + +void StringStorage::initialize(void *user_context, uint32_t capacity, const SystemMemoryAllocatorFns &sma) { + contents.initialize(user_context, {sizeof(char), 32, 32}, sma); + if (capacity) { contents.reserve(user_context, capacity); } +} + +void StringStorage::destroy(void *user_context) { + contents.destroy(user_context); +} + +size_t StringStorage::length() const { + return StringUtils::count_length(data()); +} + +const char *StringStorage::data() const { + return static_cast(contents.data()); +} + +const SystemMemoryAllocatorFns & +StringStorage::current_allocator() const { + return contents.current_allocator(); +} + +const SystemMemoryAllocatorFns & +StringStorage::default_allocator() { + return BlockStorage::default_allocator(); +} + +// -- + +} // 
namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_STRING_STORAGE_H diff --git a/src/runtime/internal/string_table.h b/src/runtime/internal/string_table.h new file mode 100644 index 000000000000..07e09f5f97b2 --- /dev/null +++ b/src/runtime/internal/string_table.h @@ -0,0 +1,217 @@ +#ifndef HALIDE_RUNTIME_STRING_TABLE_H +#define HALIDE_RUNTIME_STRING_TABLE_H + +#include "linked_list.h" +#include "pointer_table.h" +#include "string_storage.h" + +namespace Halide { +namespace Runtime { +namespace Internal { + +// Storage class for an array of strings (based on block storage) +// -- Intended for building and maintaining tables of strings +class StringTable { +public: + // Disable copy constructors + StringTable(const StringTable &) = delete; + StringTable &operator=(const StringTable &) = delete; + + StringTable(const SystemMemoryAllocatorFns &allocator = StringStorage::default_allocator()); + StringTable(void *user_context, size_t capacity, const SystemMemoryAllocatorFns &allocator = StringStorage::default_allocator()); + StringTable(void *user_context, const char **array, size_t count, const SystemMemoryAllocatorFns &allocator = StringStorage::default_allocator()); + ~StringTable(); + + void resize(void *user_context, size_t capacity); + void destroy(void *user_context); + void clear(void *user_context); + + // fills the contents of the table (copies strings from given array) + void fill(void *user_context, const char **array, size_t coun); + + // assign the entry at given index the given string + void assign(void *user_context, size_t index, const char *str, size_t length = 0); // if length is zero, strlen is used + + // appends the given string to the end of the table + void append(void *user_context, const char *str, size_t length = 0); // if length is zero, strlen is used + + // prepend the given string to the end of the table + void prepend(void *user_context, const char *str, size_t length = 0); // if length is zero, 
strlen is used + + // parses the given c-string based on given delimiter, stores each substring in the resulting table + size_t parse(void *user_context, const char *str, const char *delim); + + // index-based access operator + const char *operator[](size_t index) const; + + // returns the raw string table pointer + const char **data() const; + + // scans the table for existance of the given string within any entry (linear scan w/string compare!) + bool contains(const char *str) const; + + size_t size() const { + return contents.size(); + } + +private: + LinkedList contents; //< owns string data + PointerTable pointers; //< stores pointers +}; + +// -- + +StringTable::StringTable(const SystemMemoryAllocatorFns &sma) + : contents(nullptr, sizeof(StringStorage), 0, sma), + pointers(nullptr, 0, sma) { + // EMPTY! +} + +StringTable::StringTable(void *user_context, size_t capacity, const SystemMemoryAllocatorFns &sma) + : contents(user_context, sizeof(StringStorage), capacity, sma), + pointers(user_context, capacity, sma) { + if (capacity) { resize(user_context, capacity); } +} + +StringTable::StringTable(void *user_context, const char **array, size_t count, const SystemMemoryAllocatorFns &sma) + : contents(user_context, sizeof(StringStorage), count, sma), + pointers(user_context, count, sma) { + fill(user_context, array, count); +} + +StringTable::~StringTable() { + destroy(nullptr); +} + +void StringTable::resize(void *user_context, size_t capacity) { + for (size_t n = contents.size(); n < capacity; ++n) { + LinkedList::EntryType *entry_ptr = contents.append(user_context); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->initialize(user_context, 0, contents.current_allocator()); + } + pointers.resize(user_context, capacity); +} + +void StringTable::clear(void *user_context) { + for (size_t n = 0; n < contents.size(); ++n) { + LinkedList::EntryType *entry_ptr = contents.front(); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + 
storage_ptr->clear(user_context); + contents.pop_front(user_context); + } + contents.clear(user_context); + pointers.clear(user_context); +} + +void StringTable::destroy(void *user_context) { + for (size_t n = 0; n < contents.size(); ++n) { + LinkedList::EntryType *entry_ptr = contents.front(); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->destroy(user_context); + contents.pop_front(user_context); + } + contents.destroy(user_context); + pointers.destroy(user_context); +} + +const char *StringTable::operator[](size_t index) const { + return static_cast(pointers[index]); +} + +void StringTable::fill(void *user_context, const char **array, size_t count) { + resize(user_context, count); + LinkedList::EntryType *entry_ptr = contents.front(); + for (size_t n = 0; n < count && n < contents.size() && entry_ptr != nullptr; ++n) { + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->assign(user_context, array[n]); + pointers.assign(user_context, n, storage_ptr->data()); + entry_ptr = entry_ptr->next_ptr; + } +} + +void StringTable::assign(void *user_context, size_t index, const char *str, size_t length) { + if (length == 0) { length = strlen(str); } + LinkedList::EntryType *entry_ptr = contents.front(); + for (size_t n = 0; n < contents.size() && entry_ptr != nullptr; ++n) { + if (n == index) { + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->assign(user_context, str, length); + pointers.assign(user_context, n, storage_ptr->data()); + break; + } + entry_ptr = entry_ptr->next_ptr; + } +} + +void StringTable::append(void *user_context, const char *str, size_t length) { + LinkedList::EntryType *entry_ptr = contents.append(user_context); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->initialize(user_context, 0, contents.current_allocator()); + storage_ptr->assign(user_context, str, length); + pointers.append(user_context, storage_ptr->data()); +} + +void 
StringTable::prepend(void *user_context, const char *str, size_t length) { + LinkedList::EntryType *entry_ptr = contents.prepend(user_context); + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->initialize(user_context, 0, contents.current_allocator()); + storage_ptr->assign(user_context, str, length); + pointers.prepend(user_context, storage_ptr->data()); +} + +size_t StringTable::parse(void *user_context, const char *str, const char *delim) { + if (StringUtils::is_empty(str)) { return 0; } + + size_t delim_length = strlen(delim); + size_t total_length = strlen(str); + size_t entry_count = StringUtils::count_tokens(str, delim); + if (entry_count < 1) { return 0; } + + resize(user_context, entry_count); + + // save each entry into the table + size_t index = 0; + const char *ptr = str; + LinkedList::EntryType *entry_ptr = contents.front(); + while (!StringUtils::is_empty(ptr) && (index < entry_count)) { + size_t ptr_offset = ptr - str; + const char *next_delim = strstr(ptr, delim); + size_t token_length = (next_delim == nullptr) ? (total_length - ptr_offset) : (next_delim - ptr); + if (token_length > 0 && entry_ptr != nullptr) { + StringStorage *storage_ptr = static_cast(entry_ptr->value); + storage_ptr->assign(user_context, ptr, token_length); + pointers.assign(user_context, index, storage_ptr->data()); + entry_ptr = entry_ptr->next_ptr; + ++index; + } + ptr = (next_delim != nullptr) ? 
(next_delim + delim_length) : nullptr; + } + return entry_count; +} + +bool StringTable::contains(const char *str) const { + if (StringUtils::is_empty(str)) { return false; } + + const LinkedList::EntryType *entry_ptr = contents.front(); + for (size_t n = 0; n < contents.size() && entry_ptr != nullptr; ++n) { + StringStorage *storage_ptr = static_cast(entry_ptr->value); + if (storage_ptr->contains(str)) { + return true; + } + entry_ptr = entry_ptr->next_ptr; + } + + return false; +} + +const char **StringTable::data() const { + return reinterpret_cast(pointers.data()); +} + +// -- + +} // namespace Internal +} // namespace Runtime +} // namespace Halide + +#endif // HALIDE_RUNTIME_STRING_STORAGE_H diff --git a/src/runtime/runtime_internal.h b/src/runtime/runtime_internal.h index e551d080613b..2801f9bfedc5 100644 --- a/src/runtime/runtime_internal.h +++ b/src/runtime/runtime_internal.h @@ -1,9 +1,13 @@ #ifndef HALIDE_RUNTIME_INTERNAL_H #define HALIDE_RUNTIME_INTERNAL_H +#ifdef COMPILING_HALIDE_RUNTIME_TESTS +// Only allowed if building Halide runtime tests ... since they use system compiler which may be GCC or MSVS +#else #if __STDC_HOSTED__ #error "Halide runtime files must be compiled with clang in freestanding mode." 
#endif +#endif #ifdef __UINT8_TYPE__ typedef __INT64_TYPE__ int64_t; @@ -92,6 +96,7 @@ int strncmp(const char *s, const char *t, size_t n); size_t strlen(const char *s); const char *strchr(const char *s, int c); void *memcpy(void *s1, const void *s2, size_t n); +void *memmove(void *dest, const void *src, size_t n); int memcmp(const void *s1, const void *s2, size_t n); void *memset(void *s, int val, size_t n); // Use fopen+fileno+fclose instead of open+close - the value of the diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4130f8fddaf3..ca1e3f46acf8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -39,4 +39,25 @@ if (WITH_TEST_GENERATOR) add_subdirectory(generator) endif () +# FIXME: Disable the runtime tests for MSVC until we have a MS compatible header. +# +# The runtime tests include src/runtime/runtime_internal.h which was written +# to only support clang (GCC's front end is close enough it works fine as well). +# We originally setup the tests to compile with clang (in the same way as the actual +# runtime bitcode files), but that wasn't very clean and didn't integrate well with +# the other tests, so we switched to just using the native system compiler. +# Sadly MSVC isn't compatible with the current runtime_internal.h which would need +# some platform specific ifdefs for attributes and types that are causing compile +# errors. 
+# +cmake_dependent_option(WITH_TEST_RUNTIME "Build runtime tests" ON + "NOT MSVC" OFF) + +if (WITH_TEST_RUNTIME) + message(STATUS "Building internal runtime tests enabled") + add_subdirectory(runtime) +else () + message(STATUS "Building internal runtime tests disabled") +endif () + # FIXME: failing_with_issue is dead code :) diff --git a/test/runtime/CMakeLists.txt b/test/runtime/CMakeLists.txt new file mode 100644 index 000000000000..54c219ffa392 --- /dev/null +++ b/test/runtime/CMakeLists.txt @@ -0,0 +1,32 @@ +function(halide_define_runtime_internal_test NAME) + add_executable(runtime_internal_${NAME} ${NAME}.cpp) + target_link_libraries(runtime_internal_${NAME} PRIVATE Halide::Test) + target_include_directories(runtime_internal_${NAME} PRIVATE "${Halide_SOURCE_DIR}/src") + target_include_directories(runtime_internal_${NAME} PRIVATE "${Halide_SOURCE_DIR}/src/runtime") + target_link_libraries(runtime_internal_${NAME} PRIVATE Halide::Runtime) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # Halide runtime lib has declarations for memcmp etc that conflict with GNU stdlib + target_compile_options(runtime_internal_${NAME} PRIVATE -Wno-builtin-declaration-mismatch ) + endif() + target_compile_definitions( + runtime_internal_${NAME} + PRIVATE + HALIDE_VERSION=${Halide_VERSION} + HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} + HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} + HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH} + COMPILING_HALIDE_RUNTIME + COMPILING_HALIDE_RUNTIME_TESTS + ) + add_halide_test(runtime_internal_${NAME} GROUPS runtime_internal) +endfunction() + +# NOTE: These tests directly include runtime_internal.h which isn't compatible with MSVC +if(NOT MSVC) + halide_define_runtime_internal_test(block_allocator) + halide_define_runtime_internal_test(block_storage) + halide_define_runtime_internal_test(linked_list) + halide_define_runtime_internal_test(memory_arena) + halide_define_runtime_internal_test(string_storage) + 
halide_define_runtime_internal_test(string_table) +endif() \ No newline at end of file diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp new file mode 100644 index 000000000000..69479901fa95 --- /dev/null +++ b/test/runtime/block_allocator.cpp @@ -0,0 +1,140 @@ +#include "common.h" + +#include "internal/block_allocator.h" +#include "internal/pointer_table.h" + +using namespace Halide::Runtime::Internal; + +namespace { + +size_t allocated_block_memory = 0; +size_t allocated_region_memory = 0; + +void allocate_block(void *user_context, MemoryBlock *block) { + block->handle = native_system_malloc(user_context, block->size); + allocated_block_memory += block->size; + + debug(user_context) << "Test : allocate_block (" + << "block=" << (void *)(block) << " " + << "block_size=" << int32_t(block->size) << " " + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << ") !\n"; +} + +void deallocate_block(void *user_context, MemoryBlock *block) { + native_system_free(user_context, block->handle); + allocated_block_memory -= block->size; + + debug(user_context) << "Test : deallocate_block (" + << "block=" << (void *)(block) << " " + << "block_size=" << int32_t(block->size) << " " + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << ") !\n"; +} + +void allocate_region(void *user_context, MemoryRegion *region) { + region->handle = (void *)1; + allocated_region_memory += region->size; + + debug(user_context) << "Test : allocate_region (" + << "region=" << (void *)(region) << " " + << "region_size=" << int32_t(region->size) << " " + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") !\n"; +} + +void deallocate_region(void *user_context, MemoryRegion *region) { + region->handle = (void *)0; + allocated_region_memory -= region->size; + + debug(user_context) << "Test : deallocate_region (" + << "region=" << (void *)(region) << " " + << "region_size=" << int32_t(region->size) << " 
" + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") !\n"; +} + +} // end namespace + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + SystemMemoryAllocatorFns system_allocator = {native_system_malloc, native_system_free}; + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region}; + + // test class interface + { + BlockAllocator::Config config = {0}; + config.minimum_block_size = 1024; + + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + MemoryRegion *r1 = instance->reserve(user_context, request); + halide_abort_if_false(user_context, r1 != nullptr); + halide_abort_if_false(user_context, allocated_block_memory == config.minimum_block_size); + halide_abort_if_false(user_context, allocated_region_memory == request.size); + + MemoryRegion *r2 = instance->reserve(user_context, request); + halide_abort_if_false(user_context, r2 != nullptr); + halide_abort_if_false(user_context, allocated_block_memory == config.minimum_block_size); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + + instance->reclaim(user_context, r1); + halide_abort_if_false(user_context, allocated_region_memory == (1 * request.size)); + + instance->destroy(user_context); + halide_abort_if_false(user_context, allocated_block_memory == 0); + halide_abort_if_false(user_context, allocated_region_memory == 0); + + BlockAllocator::destroy(user_context, instance); + } + + // stress test + { + 
BlockAllocator::Config config = {0}; + config.minimum_block_size = 1024; + + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + static size_t test_allocations = 1000; + PointerTable pointers(user_context, test_allocations, system_allocator); + for (size_t n = 0; n < test_allocations; ++n) { + size_t count = n % 32; + count = count > 1 ? count : 1; + request.size = count * sizeof(int); + MemoryRegion *region = instance->reserve(user_context, request); + pointers.append(user_context, region); + } + + for (size_t n = 0; n < pointers.size(); ++n) { + MemoryRegion *region = static_cast(pointers[n]); + instance->reclaim(user_context, region); + } + halide_abort_if_false(user_context, allocated_region_memory == 0); + + instance->destroy(user_context); + halide_abort_if_false(user_context, allocated_block_memory == 0); + + BlockAllocator::destroy(user_context, instance); + } + + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/block_storage.cpp b/test/runtime/block_storage.cpp new file mode 100644 index 000000000000..ad7499f84378 --- /dev/null +++ b/test/runtime/block_storage.cpp @@ -0,0 +1,148 @@ +#include "common.h" + +#include "internal/block_storage.h" + +using namespace Halide::Runtime::Internal; + +struct TestStruct { + int8_t i8; + uint16_t ui16; + float f32; +}; + +template +T read_as(const BlockStorage &bs, size_t index) { + const T *ptr = static_cast(bs[index]); + return *ptr; +} + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + BlockStorage::Config config = 
BlockStorage::default_config(); + config.entry_size = sizeof(int); + + BlockStorage bs(user_context, config); + bs.reserve(user_context, 256); + halide_abort_if_false(user_context, bs.size() == 0); + + int a1[4] = {12, 34, 56, 78}; + bs.append(user_context, &a1[0]); + halide_abort_if_false(user_context, bs.size() == 1); + halide_abort_if_false(user_context, read_as(bs, 0) == a1[0]); + + bs.append(user_context, &a1[1]); + halide_abort_if_false(user_context, bs.size() == 2); + halide_abort_if_false(user_context, read_as(bs, 1) == a1[1]); + + bs.insert(user_context, 1, &a1[2]); + halide_abort_if_false(user_context, bs.size() == 3); + halide_abort_if_false(user_context, read_as(bs, 0) == a1[0]); + halide_abort_if_false(user_context, read_as(bs, 1) == a1[2]); // inserted here + halide_abort_if_false(user_context, read_as(bs, 2) == a1[1]); + + bs.prepend(user_context, &a1[3]); + halide_abort_if_false(user_context, bs.size() == 4); + halide_abort_if_false(user_context, read_as(bs, 0) == a1[3]); + + int a2[] = {98, 76, 54, 32, 10}; + size_t a2_size = 5; + bs.fill(user_context, a2, a2_size); + halide_abort_if_false(user_context, bs.size() == a2_size); + halide_abort_if_false(user_context, read_as(bs, 0) == a2[0]); + halide_abort_if_false(user_context, read_as(bs, 1) == a2[1]); + halide_abort_if_false(user_context, read_as(bs, 2) == a2[2]); + halide_abort_if_false(user_context, read_as(bs, 3) == a2[3]); + halide_abort_if_false(user_context, read_as(bs, 4) == a2[4]); + + int a3[] = {77, 66, 55}; + size_t a3_size = 3; + bs.insert(user_context, 2, a3, a3_size); + halide_abort_if_false(user_context, bs.size() == (a2_size + a3_size)); + halide_abort_if_false(user_context, read_as(bs, 0) == a2[0]); + halide_abort_if_false(user_context, read_as(bs, 1) == a2[1]); + halide_abort_if_false(user_context, read_as(bs, 2) == a3[0]); // a3 inserted here + halide_abort_if_false(user_context, read_as(bs, 3) == a3[1]); + halide_abort_if_false(user_context, read_as(bs, 4) == a3[2]); + 
halide_abort_if_false(user_context, read_as(bs, 5) == a2[2]); // a2 resumes here + halide_abort_if_false(user_context, read_as(bs, 6) == a2[3]); + halide_abort_if_false(user_context, read_as(bs, 7) == a2[4]); + + bs.pop_front(user_context); + bs.pop_front(user_context); + + bs.pop_back(user_context); + bs.pop_back(user_context); + + halide_abort_if_false(user_context, bs.size() == (a2_size + a3_size - 4)); + halide_abort_if_false(user_context, read_as(bs, 0) == a3[0]); + halide_abort_if_false(user_context, read_as(bs, 1) == a3[1]); + halide_abort_if_false(user_context, read_as(bs, 2) == a3[2]); + halide_abort_if_false(user_context, read_as(bs, 3) == a2[2]); + + bs.clear(user_context); + halide_abort_if_false(user_context, bs.size() == 0); + } + + // test copy and equality + { + BlockStorage::Config config = BlockStorage::default_config(); + config.entry_size = sizeof(int); + + int a1[] = {98, 76, 54, 32, 10}; + size_t a1_size = 5; + + int a2[] = {77, 66, 55}; + size_t a2_size = 3; + + BlockStorage bs1(user_context, config); + bs1.fill(user_context, a1, a1_size); + + BlockStorage bs2(user_context, config); + bs2.fill(user_context, a2, a2_size); + + BlockStorage bs3(bs1); + + halide_abort_if_false(user_context, bs1.size() == (a1_size)); + halide_abort_if_false(user_context, bs2.size() == (a2_size)); + halide_abort_if_false(user_context, bs3.size() == bs1.size()); + + halide_abort_if_false(user_context, bs1 != bs2); + halide_abort_if_false(user_context, bs1 == bs3); + + bs2 = bs1; + halide_abort_if_false(user_context, bs1 == bs2); + } + + // test struct storage + { + BlockStorage::Config config = BlockStorage::default_config(); + config.entry_size = sizeof(TestStruct); + + BlockStorage bs(user_context, config); + halide_abort_if_false(user_context, bs.size() == 0); + + TestStruct s1 = {8, 16, 32.0f}; + bs.append(user_context, &s1); + halide_abort_if_false(user_context, bs.size() == 1); + + const TestStruct e1 = read_as(bs, 0); + halide_abort_if_false(user_context, 
e1.i8 == s1.i8); + halide_abort_if_false(user_context, e1.ui16 == s1.ui16); + halide_abort_if_false(user_context, e1.f32 == s1.f32); + + TestStruct s2 = {1, 2, 3.0f}; + bs.prepend(user_context, &s2); + halide_abort_if_false(user_context, bs.size() == 2); + + const TestStruct e2 = read_as(bs, 0); + halide_abort_if_false(user_context, e2.i8 == s2.i8); + halide_abort_if_false(user_context, e2.ui16 == s2.ui16); + halide_abort_if_false(user_context, e2.f32 == s2.f32); + } + + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/common.h b/test/runtime/common.h new file mode 100644 index 000000000000..523e3b7e6797 --- /dev/null +++ b/test/runtime/common.h @@ -0,0 +1,29 @@ +#include +#include + +#include "HalideRuntime.h" +#include "msan_stubs.cpp" +#include "runtime_internal.h" +#include "to_string.cpp" + +extern "C" { + +extern int printf(const char *format, ...); + +void halide_print(void *user_context, const char *str) { + printf("%s", str); +} + +void halide_error(void *user_context, const char *msg) { + halide_print(user_context, msg); +} + +void halide_profiler_report(void *user_context) { +} + +void halide_profiler_reset() { +} + +} // extern "C" + +#include "printer.h" diff --git a/test/runtime/linked_list.cpp b/test/runtime/linked_list.cpp new file mode 100644 index 000000000000..4e2ab51da685 --- /dev/null +++ b/test/runtime/linked_list.cpp @@ -0,0 +1,91 @@ +#include "common.h" + +#include "internal/linked_list.h" + +using namespace Halide::Runtime::Internal; + +struct TestStruct { + int8_t i8; + uint16_t ui16; + float f32; +}; + +template +T read_as(const LinkedList::EntryType *entry_ptr) { + const T *ptr = static_cast(entry_ptr->value); + return *ptr; +} + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + LinkedList list(user_context, sizeof(int), 64); + halide_abort_if_false(user_context, list.size() == 0); + + const int i0 = 12; + list.append(user_context, &i0); // contents: 12 + 
halide_abort_if_false(user_context, list.size() == 1); + halide_abort_if_false(user_context, (list.front() != nullptr)); + halide_abort_if_false(user_context, (list.back() != nullptr)); + halide_abort_if_false(user_context, read_as(list.front()) == i0); + halide_abort_if_false(user_context, read_as(list.back()) == i0); + + const int i1 = 34; + list.append(user_context, &i1); // contents: 12, 34 + halide_abort_if_false(user_context, list.size() == 2); + halide_abort_if_false(user_context, read_as(list.back()) == i1); + + const int i2 = 56; + list.insert_before(user_context, list.back(), &i2); // contents: 12, 56, 34 + halide_abort_if_false(user_context, list.size() == 3); + halide_abort_if_false(user_context, read_as(list.back()) == i1); + + const int i3 = 78; + list.prepend(user_context, &i3); // contents: 78, 12, 56, 34 + halide_abort_if_false(user_context, list.size() == 4); + halide_abort_if_false(user_context, read_as(list.front()) == i3); + halide_abort_if_false(user_context, read_as(list.back()) == i1); + + list.pop_front(user_context); // contents: 12, 56, 34 + halide_abort_if_false(user_context, list.size() == 3); + halide_abort_if_false(user_context, read_as(list.front()) == i0); + halide_abort_if_false(user_context, read_as(list.back()) == i1); + + list.pop_back(user_context); // contents: 12, 56 + halide_abort_if_false(user_context, list.size() == 2); + halide_abort_if_false(user_context, read_as(list.front()) == i0); + halide_abort_if_false(user_context, read_as(list.back()) == i2); + + list.clear(user_context); + halide_abort_if_false(user_context, list.size() == 0); + } + + // test struct storage + { + LinkedList list(user_context, sizeof(TestStruct)); + halide_abort_if_false(user_context, list.size() == 0); + + TestStruct s1 = {8, 16, 32.0f}; + list.append(user_context, &s1); + halide_abort_if_false(user_context, list.size() == 1); + + const TestStruct e1 = read_as(list.front()); + halide_abort_if_false(user_context, e1.i8 == s1.i8); + 
halide_abort_if_false(user_context, e1.ui16 == s1.ui16); + halide_abort_if_false(user_context, e1.f32 == s1.f32); + + TestStruct s2 = {1, 2, 3.0f}; + list.prepend(user_context, &s2); + halide_abort_if_false(user_context, list.size() == 2); + + TestStruct e2 = read_as(list.front()); + halide_abort_if_false(user_context, e2.i8 == s2.i8); + halide_abort_if_false(user_context, e2.ui16 == s2.ui16); + halide_abort_if_false(user_context, e2.f32 == s2.f32); + } + + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/memory_arena.cpp b/test/runtime/memory_arena.cpp new file mode 100644 index 000000000000..cce3c7bf1c02 --- /dev/null +++ b/test/runtime/memory_arena.cpp @@ -0,0 +1,88 @@ +#include "common.h" + +#include "internal/memory_arena.h" + +using namespace Halide::Runtime::Internal; + +namespace { + +size_t counter = 0; + +void *allocate_system(void *user_context, size_t bytes) { + ++counter; + return native_system_malloc(user_context, bytes); +} + +void deallocate_system(void *user_context, void *ptr) { + native_system_free(user_context, ptr); + --counter; +} + +} // namespace + +struct TestStruct { + int8_t i8; + uint16_t ui16; + float f32; +}; + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + SystemMemoryAllocatorFns test_allocator = {allocate_system, deallocate_system}; + + MemoryArena::Config config = {sizeof(int), 32, 0}; + MemoryArena arena(user_context, config, test_allocator); + void *p1 = arena.reserve(user_context); + halide_abort_if_false(user_context, counter > 1); + halide_abort_if_false(user_context, p1 != nullptr); + + void *p2 = arena.reserve(user_context, true); + halide_abort_if_false(user_context, counter > 2); + halide_abort_if_false(user_context, p2 != nullptr); + halide_abort_if_false(user_context, (*static_cast(p2)) == 0); + + arena.reclaim(user_context, p1); + arena.destroy(user_context); + + halide_abort_if_false(user_context, counter == 0); + } + + // test struct 
allocations + { + SystemMemoryAllocatorFns test_allocator = {allocate_system, deallocate_system}; + MemoryArena::Config config = {sizeof(TestStruct), 32, 0}; + MemoryArena arena(user_context, config, test_allocator); + void *s1 = arena.reserve(user_context, true); + halide_abort_if_false(user_context, s1 != nullptr); + halide_abort_if_false(user_context, counter > 1); + halide_abort_if_false(user_context, ((TestStruct *)s1)->i8 == int8_t(0)); + halide_abort_if_false(user_context, ((TestStruct *)s1)->ui16 == uint16_t(0)); + halide_abort_if_false(user_context, ((TestStruct *)s1)->f32 == float(0)); + + arena.destroy(user_context); + + size_t count = 4 * 1024; + void *pointers[count]; + for (size_t n = 0; n < count; ++n) { + pointers[n] = arena.reserve(user_context, true); + } + + for (size_t n = 0; n < count; ++n) { + void *s1 = pointers[n]; + halide_abort_if_false(user_context, s1 != nullptr); + halide_abort_if_false(user_context, ((TestStruct *)s1)->i8 == int8_t(0)); + halide_abort_if_false(user_context, ((TestStruct *)s1)->ui16 == uint16_t(0)); + halide_abort_if_false(user_context, ((TestStruct *)s1)->f32 == float(0)); + } + + arena.destroy(user_context); + + halide_abort_if_false(user_context, counter == 0); + } + + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/string_storage.cpp b/test/runtime/string_storage.cpp new file mode 100644 index 000000000000..b7428d4440a3 --- /dev/null +++ b/test/runtime/string_storage.cpp @@ -0,0 +1,63 @@ +#include "common.h" + +#include "internal/string_storage.h" + +using namespace Halide::Runtime::Internal; + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + StringStorage ss; + halide_abort_if_false(user_context, ss.length() == 0); + + const char *ts1 = "Testing!"; + const size_t ts1_length = strlen(ts1); + ss.assign(user_context, ts1); + halide_abort_if_false(user_context, ss.length() == ts1_length); + halide_abort_if_false(user_context, 
ss.contains(ts1)); + + const char *ts2 = "More "; + const size_t ts2_length = strlen(ts2); + ss.prepend(user_context, ts2); + halide_abort_if_false(user_context, ss.length() == (ts1_length + ts2_length)); + halide_abort_if_false(user_context, ss.contains(ts2)); + halide_abort_if_false(user_context, ss.contains(ts1)); + + ss.append(user_context, '!'); + halide_abort_if_false(user_context, ss.length() == (ts1_length + ts2_length + 1)); + + ss.clear(user_context); + halide_abort_if_false(user_context, ss.length() == 0); + } + + // test copy and equality + { + const char *ts1 = "Test One!"; + const size_t ts1_length = strlen(ts1); + + const char *ts2 = "Test Two!"; + const size_t ts2_length = strlen(ts2); + + StringStorage ss1; + ss1.assign(user_context, ts1, ts1_length); + + StringStorage ss2; + ss2.assign(user_context, ts2, ts2_length); + + StringStorage ss3(ss1); + + halide_abort_if_false(user_context, ss1.length() == (ts1_length)); + halide_abort_if_false(user_context, ss2.length() == (ts2_length)); + halide_abort_if_false(user_context, ss3.length() == ss1.length()); + + halide_abort_if_false(user_context, ss1 != ss2); + halide_abort_if_false(user_context, ss1 == ss3); + + ss2 = ss1; + halide_abort_if_false(user_context, ss1 == ss2); + } + print(user_context) << "Success!\n"; + return 0; +} diff --git a/test/runtime/string_table.cpp b/test/runtime/string_table.cpp new file mode 100644 index 000000000000..82d0525d02f3 --- /dev/null +++ b/test/runtime/string_table.cpp @@ -0,0 +1,44 @@ +#include "common.h" + +#include "internal/string_table.h" + +using namespace Halide::Runtime::Internal; + +int main(int argc, char **argv) { + void *user_context = (void *)1; + + // test class interface + { + size_t data_size = 4; + const char *data[] = { + "one", "two", "three", "four"}; + + StringTable st1; + halide_abort_if_false(user_context, st1.size() == 0); + + st1.fill(user_context, data, data_size); + halide_abort_if_false(user_context, st1.size() == data_size); + 
halide_abort_if_false(user_context, strncmp(st1[0], data[0], strlen(data[0])) == 0); + halide_abort_if_false(user_context, strncmp(st1[1], data[1], strlen(data[1])) == 0); + halide_abort_if_false(user_context, strncmp(st1[2], data[2], strlen(data[2])) == 0); + halide_abort_if_false(user_context, strncmp(st1[3], data[3], strlen(data[3])) == 0); + halide_abort_if_false(user_context, st1.contains(data[0])); + halide_abort_if_false(user_context, st1.contains(data[1])); + halide_abort_if_false(user_context, st1.contains(data[2])); + halide_abort_if_false(user_context, st1.contains(data[3])); + + st1.clear(user_context); + halide_abort_if_false(user_context, st1.size() == 0); + + size_t entry_count = st1.parse(user_context, "one:two:three:four", ":"); + halide_abort_if_false(user_context, entry_count == data_size); + halide_abort_if_false(user_context, st1.size() == data_size); + halide_abort_if_false(user_context, st1.contains(data[0])); + halide_abort_if_false(user_context, st1.contains(data[1])); + halide_abort_if_false(user_context, st1.contains(data[2])); + halide_abort_if_false(user_context, st1.contains(data[3])); + } + + print(user_context) << "Success!\n"; + return 0; +}