diff --git a/docs/Project.toml b/docs/Project.toml
index c0ccd2bbcb..3de01d2f95 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,8 +1,11 @@
 [deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterVitepress = "4710194d-e776-4893-9690-8d956a29c365"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+PrettyChairmarks = "aafa11c5-44f9-44a1-b829-427e6ce1ffc2"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
 ReactantCore = "a3311ec8-5e00-46d5-b541-4f83e724a433"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/docs/src/.vitepress/config.mts b/docs/src/.vitepress/config.mts
index 2853e7abd5..792461abb5 100644
--- a/docs/src/.vitepress/config.mts
+++ b/docs/src/.vitepress/config.mts
@@ -101,6 +101,7 @@ export default defineConfig({
           text: "Persistent Compilation Cache",
           link: "/tutorials/persistent_compile_cache",
         },
+        { text: "Raising", link: "/tutorials/raising" }
       ],
     },
     {
@@ -185,6 +186,7 @@ export default defineConfig({
           text: "Persistent Compilation Cache",
           link: "/tutorials/persistent_compile_cache",
         },
+        { text: "Raising", link: "/tutorials/raising" }
       ],
     }
   ],
diff --git a/docs/src/tutorials/raising.md b/docs/src/tutorials/raising.md
new file mode 100644
index 0000000000..ab7ab48486
--- /dev/null
+++ b/docs/src/tutorials/raising.md
@@ -0,0 +1,170 @@
# Raising

## Raising GPU Kernels

Kernel raising refers to Reactant's ability to transform a program written in a GPU kernel
style, that is, kernel functions which are evaluated over a grid of blocks and threads and
where operations are done at the scalar level. The transformation raises the program to a
tensor-style function (in the StableHLO dialect) where operations are broadcasted.

This transformation enables several features:

- Running the raised compute kernel on hardware that the original kernel was not designed
  to run on (_e.g._ running a CUDA kernel on a TPU).
- Enabling further optimizations: since the raised kernel is indistinguishable from the rest
  of the program, it can be optimized together with it. For example, two sequential kernel
  launches, where one operates on the result of the other, can be fused if both are raised,
  resulting in a single kernel launch in the final optimized StableHLO program.
- Lastly, automatic differentiation in Reactant is currently not supported for GPU kernels.
  Raising kernels enables Enzyme to differentiate the raised kernel. For this to work,
  one must use the `raise_first` compilation option to make sure the kernels are raised
  before Enzyme performs automatic differentiation on the program.

!!! note

    Not all classes of kernels are currently raisable to StableHLO. If your kernel
    encounters an error while being raised, please open an issue on
    [the Reactant.jl repository](https://github.com/EnzymeAD/Reactant.jl/issues/new?labels=raising).

### Example

```@example raising_kernelabstractions_to_stablehlo
using Reactant
using KernelAbstractions
using CUDA # needs to be loaded for raising even if CUDA is not functional on your system
```

!!! tip

    We could also have implemented the kernel directly with CUDA.jl instead of
    KernelAbstractions.jl; a rough sketch of what that might look like is shown below.
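For reference, the following sketch shows what a hand-written CUDA.jl version of the
squaring kernel implemented next might look like. It is illustrative only and is not
executed as part of this tutorial; the kernel name, helper function, and launch
configuration are assumptions rather than part of Reactant's API.

```julia
# Illustrative CUDA.jl sketch (not executed here): the same element-wise square,
# written directly against CUDA.jl's kernel programming interface.
using CUDA

function square_kernel_cuda!(y, x)
    # Global thread index computed from the block/thread grid.
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(x)
        @inbounds y[i] = x[i] * x[i]
    end
    return nothing
end

function square_cuda(x)
    y = similar(x)
    threads = 256                         # assumed block size
    blocks = cld(length(x), threads)      # enough blocks to cover the whole vector
    @cuda threads=threads blocks=blocks square_kernel_cuda!(y, x)
    return y
end
```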
We will implement a simple kernel to compute the square of a vector.

```@example raising_kernelabstractions_to_stablehlo
@kernel function square_kernel!(y, @Const(x))
    i = @index(Global)
    @inbounds y[i] = x[i] * x[i]
end

function square(x)
    y = similar(x)
    backend = KernelAbstractions.get_backend(x)
    kernel! = square_kernel!(backend)
    kernel!(y, x; ndrange=length(x))
    return y
end
```

```@example raising_kernelabstractions_to_stablehlo
x = Reactant.to_rarray(collect(1:1:64) ./ 64)
nothing # hide
```

Let's see what the HLO IR looks like for this function. Note that raising is automatically
enabled for backends such as TPUs, which the original kernel was not designed to run on. To
enable raising on other backends, pass the `raise=true` option.

```@example raising_kernelabstractions_to_stablehlo
@code_hlo raise=true square(x)
```

## Raising Scalar Loops to Tensor IR

We will implement a simple N-body simulation code in Reactant. Instead of using
broadcasting or high-level abstractions, we will use loops and scalar operations
to implement this.

```@example raising_stablehlo
using Reactant, PrettyChairmarks

Reactant.allowscalar(true) # generally not recommended to turn on globally
```

We will implement a naive function to compute the attractive force between each
pair of particles in a system.

```@example raising_stablehlo
function compute_attractive_force(
    positions::AbstractMatrix, masses::AbstractVector, G::Number
)
    N = size(positions, 2)
    F = similar(positions, N, N)

    @trace for i in 1:N
        @trace for j in 1:N
            dx = positions[1, i] - positions[1, j]
            dy = positions[2, i] - positions[2, j]
            dz = positions[3, i] - positions[3, j]

            invr² = ifelse(i == j, dx, inv(dx^2 + dy^2 + dz^2))

            Fx = G * masses[i] * masses[j] * invr² * dx
            Fy = G * masses[i] * masses[j] * invr² * dy
            Fz = G * masses[i] * masses[j] * invr² * dz
            F[i, j] = Fx + Fy + Fz
        end
    end

    return F
end
```

```@example raising_stablehlo
positions = randn(Float32, 3, 1024)
masses = rand(Float32, 1024) .* 10

positions_ra = Reactant.to_rarray(positions)
masses_ra = Reactant.to_rarray(masses)
nothing # hide
```

Let's see what the HLO IR looks like for this function (without enabling loop raising).

```@example raising_stablehlo
@code_hlo compile_options = CompileOptions(;
    disable_auto_batching_passes=true
) compute_attractive_force(positions_ra, masses_ra, 2.0f0)
```

This IR has a nested loop, but that won't work nicely on GPUs/TPUs. Even on CPUs, XLA
often doesn't do a great job with loops. By default, Reactant will attempt to raise loops
to a tensor IR.

```@example raising_stablehlo
hlo = @code_hlo compute_attractive_force(positions_ra, masses_ra, 2.0f0)
@assert !contains(repr(hlo), "stablehlo.while") #hide
hlo
```

This IR doesn't contain any loops; instead, it is written in a tensor IR! Let's ensure that
the values are identical.

```@example raising_stablehlo
y_jl = compute_attractive_force(positions, masses, 2.0f0)
y_ra = @jit compute_attractive_force(positions_ra, masses_ra, 2.0f0)
maximum(abs, Array(y_ra) .- y_jl)
```
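To make the "tensor style" target more concrete, here is one way the same force
computation could be written by hand with broadcasting. This is only an illustrative
sketch and is not executed as part of the tutorial; the function name and the diagonal
handling are our own, and the StableHLO that the raising pass produces is generated
automatically and need not match this code.

```julia
# Hand-written broadcasted version of the same pairwise force computation
# (illustrative sketch only).
function compute_attractive_force_broadcasted(
    positions::AbstractMatrix, masses::AbstractVector, G::Number
)
    N = size(positions, 2)

    # Pairwise coordinate differences: each is an N×N matrix with entry [i, j] = pᵢ - pⱼ.
    dx = positions[1, :] .- positions[1, :]'
    dy = positions[2, :] .- positions[2, :]'
    dz = positions[3, :] .- positions[3, :]'

    r² = dx .^ 2 .+ dy .^ 2 .+ dz .^ 2

    # Mirror the scalar code: on the diagonal (i == j) use dx (which is zero there)
    # instead of 1 / r², avoiding a division by zero.
    diagonal = (1:N) .== (1:N)'
    invr² = ifelse.(diagonal, dx, inv.(r²))

    return G .* (masses .* masses') .* invr² .* (dx .+ dy .+ dz)
end
```

Conceptually, this broadcasted form is the kind of program the loop-raising pass recovers
from the scalar nested-loop version above.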
Let's time the execution of the two versions.

```@example raising_stablehlo
fn1 = @compile sync=true compile_options=CompileOptions(;
    disable_auto_batching_passes=true
) compute_attractive_force(positions_ra, masses_ra, 2.0f0)
fn2 = @compile sync=true compute_attractive_force(positions_ra, masses_ra, 2.0f0)
```

Runtime for the non-raised function:

```@example raising_stablehlo
fn1(positions_ra, masses_ra, 2.0f0) #hide
@bs fn1(positions_ra, masses_ra, 2.0f0)
```

Runtime for the raised function:

```@example raising_stablehlo
fn2(positions_ra, masses_ra, 2.0f0) #hide
@bs fn2(positions_ra, masses_ra, 2.0f0)
```
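As a final sanity check, one could also verify that the two compiled variants agree with
each other. The snippet below is a small sketch reusing `fn1`, `fn2`, and the inputs
defined above:

```julia
# Both compiled variants should produce (numerically) the same forces.
F_loops  = Array(fn1(positions_ra, masses_ra, 2.0f0))  # non-raised (loop) version
F_raised = Array(fn2(positions_ra, masses_ra, 2.0f0))  # raised (tensor) version
maximum(abs, F_loops .- F_raised)
```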