[tutorials] Add tutorial on JIT compile/execute performance (halide#7838)

* Add tutorial on JIT compile/execute performance

* Addressing comments from review. Fix punctuation and comment nits.
Add timing estimates as comments.
Add std::function example.
Enable advanced scheduling directives.

* Addressing comments from review.

Added cases that match real usage patterns:

1. Defining and compiling the whole pipeline every time you want to run it (i.e. in the benchmarking loop)
2. Defining the pipeline outside the benchmarking loop, and realizing it repeatedly.
3. (optional) Same as 2), but calling compile_jit() outside the loop, saying what it does, and saying why the time isn't actually different from case 2 (benchmark() runs multiple times and takes a min, and realize() only compiles on the first run)
4. Compiling to a callable outside the benchmarking loop and showing that it has lower overhead than case 3 (if indeed it does; if not, we may need to change the example so that it does, e.g. by adding a real input buffer)

* Addressing comments from review for style nits, and typos in comments.

---------

Co-authored-by: Derek Gerstmann <dgerstmann@adobe.com>
Co-authored-by: Steven Johnson <srj@google.com>
3 people authored and ardier committed Mar 3, 2024
1 parent ead72ad commit c177e15
Showing 2 changed files with 262 additions and 0 deletions.
3 changes: 3 additions & 0 deletions tutorial/CMakeLists.txt
@@ -209,3 +209,6 @@ if (TARGET Halide::Mullapudi2016)
add_test(NAME tutorial_lesson_21_auto_scheduler_run COMMAND lesson_21_auto_scheduler_run)
set_tests_properties(tutorial_lesson_21_auto_scheduler_run PROPERTIES LABELS "tutorial;multithreaded")
endif ()

# Lesson 22
add_tutorial(lesson_22_jit_performance.cpp)
259 changes: 259 additions & 0 deletions tutorial/lesson_22_jit_performance.cpp
@@ -0,0 +1,259 @@
// Halide tutorial lesson 22: JIT compilation performance

// This lesson demonstrates the performance implications of the various
// Halide methods of doing "Just-In-Time" compilation.

// On linux, you can compile and run it like so:
// g++ lesson_22*.cpp -g -I <path/to/Halide.h> -I <path/to/tools/halide_benchmark.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_22 -std=c++17
// LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_22

// On os x:
// g++ lesson_22*.cpp -g -I <path/to/Halide.h> -I <path/to/tools/halide_benchmark.h> -L <path/to/libHalide.so> -lHalide -o lesson_22 -std=c++17
// DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_22

// If you have the entire Halide source tree, you can also build it by
// running:
// make tutorial_lesson_22_jit_performance
// in a shell at the top of the halide source tree.

#include "Halide.h"
#include "halide_benchmark.h"
#include <stdio.h>

using namespace Halide;
using namespace Halide::Tools; // for benchmark()

// Let's define a helper function to construct a simple pipeline that we'll use for our performance tests.
Pipeline make_pipeline() {
// We'll start with a simple transpose operation...
Func input("input"), output("output");
Var x("x"), y("y");

// Fill the input with a linear combination of the coordinate values...
input(x, y) = cast<uint16_t>(x + y);
input.compute_root();

// Transpose the rows and cols
output(x, y) = input(y, x);

// Schedule it ... there are a number of possibilities here for an efficient block-wise transpose.
Var xi("xi"), yi("yi");

// Let's focus on 8x8 subtiles, and then vectorize across X, and unroll across Y.
output.tile(x, y, xi, yi, 8, 8).vectorize(xi).unroll(yi);

// For more advanced scheduling:
//
// We can improve this even more by using the .in() directive (see Tutorial 19),
// which allows us to interpose new Funcs in between input and output.
//
// Here we can inject a block_transpose function to allow us to do 8 vectorized loads from the input.
Func block_transpose("block_transpose"), block("block");
block_transpose = input.in(output).compute_at(output, x).vectorize(x).unroll(y);
//
// And now let's reorder and vectorize in X across the block.
block = block_transpose.in(output).reorder_storage(y, x).compute_at(output, x).vectorize(x).unroll(y);

// Return the constructed pipeline
return Pipeline(output);
}

int main(int argc, char **argv) {
// Since we'll be using the same sample and iteration counts for our benchmarking,
// let's define them here in the outermost scope.
constexpr int samples = 100;
constexpr int iterations = 1;
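
// A note on benchmark(): the benchmark(samples, iterations, fn) helper from
// halide_benchmark.h runs fn `iterations` times per sample, repeats that for
// `samples` samples, and reports the best (minimum) time per iteration. Because
// it takes a minimum over many samples, one-time costs (such as the first JIT
// compile) only dominate a measurement if they are paid on every iteration of
// the loop body.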

// Now, let's measure the performance of constructing and executing a simple pipeline from scratch...
{
size_t count = 0;
double t = benchmark(samples, iterations, [&]() {

// First, create an output buffer to hold the results.
Buffer<uint16_t> result(1024, 1024);

// Now, construct our pipeline from scratch.
Pipeline pipeline = make_pipeline();

// And then call realize to execute the pipeline.
pipeline.realize(result);
++count;
});

// On a MacBook Pro M1, we should get around ~1800 times/sec.
std::cout << "Compile & Execute Pipeline (from scratch): " << int(count / t) << " times/sec\n";
}

// This time, let's create the pipeline outside the timing loop and re-use it for each execution...
{
// Create our pipeline, and re-use it in the loop below
Pipeline pipeline = make_pipeline();

size_t count = 0;
double t = benchmark(samples, iterations, [&]() {

// Create our output buffer
Buffer<uint16_t> result(1024, 1024);

// Now, call realize
pipeline.realize(result);
++count;
});

// On a MacBook Pro M1, we should get around ~175000 times/sec (roughly 95-100x faster!).
std::cout << "Compile & Execute Pipeline (re-use pipeline): " << int(count / t) << " times/sec\n";
}

// Let's do the same thing as before, but explicitly JIT compile before we realize...
{
Pipeline pipeline = make_pipeline();

// Let's JIT compile for our target before we realize, and see what happens...
const Target target = get_jit_target_from_environment();
pipeline.compile_jit(target);

size_t count = 0;
double t = benchmark(samples, iterations, [&]() {
Buffer<uint16_t> result(1024, 1024);
pipeline.realize(result);
++count;
});

// On a MacBook Pro M1, this should be about the same as the previous run (about ~175000 times/sec)
//
// This may seem somewhat surprising, since explicitly compiling before realizing doesn't seem to make
// much of a difference compared to the previous case. However, the first call to realize() will implicitly
// JIT-compile and cache the generated code associated with the Pipeline object, which is basically
// what we've done here. Each subsequent call to realize uses the cached version of the native code,
// so there's no additional overhead, and the cost is amortized as we re-use the pipeline.
std::cout << "Execute Pipeline (compile before realize): " << int(count / t) << " times/sec\n";

// Another subtlety is the creation of the result buffer ... the declaration implicitly
// allocates memory which will add overhead to each loop iteration. This time, let's try
// using the realize({1024, 1024}) call which will use the buffer managed by the pipeline
// object for the outputs...
count = 0;
t = benchmark(samples, iterations, [&]() {
Buffer<uint16_t> result = pipeline.realize({1024, 1024});
++count;
});

// On a MacBook Pro M1, this should be about the same as the previous run (about ~175000 times/sec).
std::cout << "Execute Pipeline (same but with realize({})): " << int(count / t) << " times/sec\n";

// Or ... we could move the declaration of the result buffer outside the timing loop, and
// re-use the allocation (with the caveat that we will be stomping over its contents on each
// execution).
Buffer<uint16_t> result(1024, 1024);

count = 0;
t = benchmark(samples, iterations, [&]() {
pipeline.realize(result);
++count;
});

// On a MacBook Pro M1, this should be noticeably more efficient ... ~200000 times/sec (or 10-12% faster).
std::cout << "Execute Pipeline (re-use buffer with realize): " << int(count / t) << " times/sec\n";
}

// Alternatively, we could compile to a Callable object...
{
Pipeline pipeline = make_pipeline();
const Target target = get_jit_target_from_environment();

// Here, we can ask the pipeline for its argument list (these are either Params,
// ImageParams, or Buffers) so that we can construct a Callable object with the same
// calling convention.
auto arguments = pipeline.infer_arguments();

// The Callable object acts as a convenient way of invoking the compiled code like
// a function call, using an argv-like syntax for the argument list. It also caches
// the JIT compiled code, so there's no code generation overhead when invoking the
// callable object and executing the pipeline.
Callable callable = pipeline.compile_to_callable(arguments, target);

// Again, we'll pre-allocate and re-use the result buffer.
Buffer<uint16_t> result(1024, 1024);

size_t count = 0;
double t = benchmark(samples, iterations, [&]() {
callable(result);
++count;
});

// This should be about the same as the previous run (about ~200000 times/sec).
std::cout << "Execute Pipeline (compile to callable): " << int(count / t) << " times/sec\n";

// Perhaps even more convenient, we can create a std::function object from the callable,
// which allows cleaner type checking for the parameters, and slightly less overhead
// for invoking the function. The list used for the template parameters needs to match
// the list for the parameters of the pipeline. Here, we have a single result buffer,
// so we specify Buffer<uint16_t> in our call to .make_std_function<>. If we had other
// scalar parameters, input buffers or output buffers, we'd pass them in the template
// parameter list too.
auto function = callable.make_std_function<Buffer<uint16_t>>();
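
// For illustration only (hypothetical: this pipeline has no other parameters): if the
// pipeline also took, say, a scalar float parameter and an input buffer, those would
// appear in the template parameter list in the same order as the pipeline's arguments,
// e.g. something like:
//
//     auto f = callable.make_std_function<float, Buffer<uint16_t>, Buffer<uint16_t>>();
//     f(2.0f, input_buf, result);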

count = 0;
t = benchmark(samples, iterations, [&]() {
function(result);
++count;
});

// On a MacBook Pro M1, this should be slightly more efficient than the callable (~1% faster).
std::cout << "Execute Pipeline (compile to std::function): " << int(count / t) << " times/sec\n";
}

// Let's see how much time is spent on just compiling...
{
Pipeline pipeline = make_pipeline();

// Only the first call to compile_jit() is expensive ... after the code is generated,
// it gets stored in a cache for later re-use, so repeatedly calling compile_jit has
// very little overhead after it's been cached.

size_t count = 0;
double t = benchmark(samples, iterations, [&]() {
pipeline.compile_jit();
++count;
});

// Only the first call does any work and the rest are essentially free.
// On a MacBook Pro M1, we should expect ~2 billion times/sec.
std::cout << "Compile JIT (using cache): " << int(count / t) << " times/sec\n";

// You can invalidate the cache manually, which will destroy all the compiled state.
count = 0;
t = benchmark(samples, iterations, [&]() {
pipeline.invalidate_cache();
pipeline.compile_jit();
++count;
});

// This is an intentionally expensive loop, and very slow!
// On a MacBook Pro M1, we should see only ~2000 times/sec.
std::cout << "Compile JIT (from scratch): " << int(count / t) << " times/sec\n";
}

// Alternatively we could compile to a Module...
{
Pipeline pipeline = make_pipeline();
auto args = pipeline.infer_arguments();

// Compiling to a module generates a self-contained Module containing an internal representation
// of the lowered code suitable for further compilation. So, it's not directly
// runnable, but it can be used to link/combine Modules and generate object files,
// static libs, bitcode, etc.

size_t count = 0;
double t = benchmark(samples, iterations, [&]() {
Module m = pipeline.compile_to_module(args, "transpose");
++count;
});

// On a MacBook Pro M1, this should be around ~10000 times/sec.
std::cout << "Compile to Module: " << int(count / t) << " times/sec\n";
}

printf("DONE!\n");
return 0;
}
