From 04a8b97c58499c5b1ec74eab142367eb8dbd4e3c Mon Sep 17 00:00:00 2001
From: "William S. Moses"
Date: Fri, 7 Feb 2025 20:50:38 -0500
Subject: [PATCH 1/3] we can always vendor more things

---
 ext/ReactantCUDAExt.jl | 170 ++++++++++++++++++++++++++++++++++++++++-
 src/Compiler.jl        |   6 +-
 2 files changed, 173 insertions(+), 3 deletions(-)

diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl
index d7e4f80d95..7e62136a54 100644
--- a/ext/ReactantCUDAExt.jl
+++ b/ext/ReactantCUDAExt.jl
@@ -459,6 +459,168 @@ function vendored_optimize_module!(
     end
 end
 
+"""
+    vendored_buildEarlyOptimizerPipeline(mpm, job, opt_level; instcombine=false)
+
+Add the early optimizer passes to the module pass manager `mpm`. The trailing
+function pass manager is only added when `opt_level >= 1`, with a longer pass list
+when `opt_level >= 2`. `instcombine` selects `InstCombinePass` (`true`) or
+`InstSimplifyPass` (`false`) for the instruction simplification step.
+"""
+function vendored_buildEarlyOptimizerPipeline(mpm, @nospecialize(job), opt_level; instcombine::Bool=false)
+    LLVM.add!(mpm, LLVM.NewPMCGSCCPassManager()) do cgpm
+        # TODO invokeCGSCCCallbacks
+        LLVM.add!(cgpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, LLVM.Interop.AllocOptPass())
+            LLVM.add!(fpm, LLVM.Float2IntPass())
+            LLVM.add!(fpm, LLVM.LowerConstantIntrinsicsPass())
+        end
+    end
+    LLVM.add!(mpm, GPULowerCPUFeaturesPass())
+    if opt_level >= 1
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            if opt_level >= 2
+                LLVM.add!(fpm, LLVM.SROAPass())
+                if instcombine
+                    LLVM.add!(fpm, LLVM.InstCombinePass())
+                else
+                    LLVM.add!(fpm, LLVM.InstSimplifyPass())
+                end
+                LLVM.add!(fpm, LLVM.JumpThreadingPass())
+                LLVM.add!(fpm, LLVM.CorrelatedValuePropagationPass())
+                LLVM.add!(fpm, LLVM.ReassociatePass())
+                LLVM.add!(fpm, LLVM.EarlyCSEPass())
+                LLVM.add!(fpm, LLVM.Interop.AllocOptPass())
+            else
+                if instcombine
+                    LLVM.add!(fpm, LLVM.InstCombinePass())
+                else
+                    LLVM.add!(fpm, LLVM.InstSimplifyPass())
+                end
+                LLVM.add!(fpm, LLVM.EarlyCSEPass())
+            end
+        end
+        # TODO invokePeepholeCallbacks
+    end
+end
+
+"""
+    vendored_buildIntrinsicLoweringPipeline(mpm, job, opt_level; instcombine=false)
+
+Add the Julia intrinsic lowering passes (GC frame, kernel state, PTLS, exception
+handling, GC lowering, address space removal) to the module pass manager `mpm`.
+`instcombine` selects `InstCombinePass` (`true`) or `InstSimplifyPass` (`false`) for
+the simplification step that runs when `opt_level >= 1`.
+"""
+function vendored_buildIntrinsicLoweringPipeline(mpm, @nospecialize(job), opt_level; instcombine::Bool=false)
+    LLVM.add!(mpm, LLVM.Interop.RemoveNIPass())
+
+    # lower GC intrinsics
+    if !GPUCompiler.uses_julia_runtime(job)
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, GPULowerGCFramePass())
+        end
+    end
+
+    # lower kernel state intrinsics
+    # NOTE: we can only do so here, as GC lowering can introduce calls to the runtime,
+    #       and thus additional uses of the kernel state intrinsics.
+    if job.config.kernel
+        # TODO: now that all kernel state-related passes are being run here, merge some?
+        LLVM.add!(mpm, AddKernelStatePass())
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, LowerKernelStatePass())
+        end
+        LLVM.add!(mpm, CleanupKernelStatePass())
+    end
+
+    if !GPUCompiler.uses_julia_runtime(job)
+        # remove dead uses of ptls
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, LLVM.ADCEPass())
+        end
+        LLVM.add!(mpm, GPULowerPTLSPass())
+    end
+
+    LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+        # lower exception handling
+        if GPUCompiler.uses_julia_runtime(job)
+            LLVM.add!(fpm, LLVM.Interop.LowerExcHandlersPass())
+        end
+        LLVM.add!(fpm, GPUCompiler.GCInvariantVerifierPass())
+        LLVM.add!(fpm, LLVM.Interop.LateLowerGCPass())
+        if GPUCompiler.uses_julia_runtime(job) && VERSION >= v"1.11.0-DEV.208"
+            LLVM.add!(fpm, LLVM.Interop.FinalLowerGCPass())
+        end
+    end
+    if GPUCompiler.uses_julia_runtime(job) && VERSION < v"1.11.0-DEV.208"
+        LLVM.add!(mpm, LLVM.Interop.FinalLowerGCPass())
+    end
+
+    if opt_level >= 2
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, LLVM.GVNPass())
+            LLVM.add!(fpm, LLVM.SCCPPass())
+            LLVM.add!(fpm, LLVM.DCEPass())
+        end
+    end
+
+    # lower PTLS intrinsics
+    if GPUCompiler.uses_julia_runtime(job)
+        LLVM.add!(mpm, LLVM.Interop.LowerPTLSPass())
+    end
+
+    if opt_level >= 1
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            if instcombine
+                LLVM.add!(fpm, LLVM.InstCombinePass())
+            else
+                LLVM.add!(fpm, LLVM.InstSimplifyPass())
+            end
+            LLVM.add!(fpm, LLVM.SimplifyCFGPass(; GPUCompiler.AggressiveSimplifyCFGOptions...))
+        end
+    end
+
+    # remove Julia address spaces
+    LLVM.add!(mpm, LLVM.Interop.RemoveJuliaAddrspacesPass())
+
+    # Julia's operand bundles confuse the inliner, so repeat here now they are gone.
+    # FIXME: we should fix the inliner so that inlined code gets optimized early-on
+    LLVM.add!(mpm, LLVM.AlwaysInlinerPass())
+end
+
+"""
+    vendored_buildNewPMPipeline!(mpm, job, opt_level)
+
+Populate the module pass manager `mpm` with the full optimization pipeline. The early
+optimizer and intrinsic lowering stages come from the vendored builders above, called
+with their default `instcombine=false`; the other stages use the `GPUCompiler` builders.
+"""
+function vendored_buildNewPMPipeline!(mpm, @nospecialize(job), opt_level)
+    # Doesn't call instcombine
+    GPUCompiler.buildEarlySimplificationPipeline(mpm, job, opt_level)
+    LLVM.add!(mpm, LLVM.AlwaysInlinerPass())
+    vendored_buildEarlyOptimizerPipeline(mpm, job, opt_level)
+    LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+        # Doesn't call instcombine
+        GPUCompiler.buildLoopOptimizerPipeline(fpm, job, opt_level)
+        # Doesn't call instcombine
+        GPUCompiler.buildScalarOptimizerPipeline(fpm, job, opt_level)
+        if GPUCompiler.uses_julia_runtime(job) && opt_level >= 2
+            # XXX: we disable vectorization, as this generally isn't useful for GPU targets
+            #      and actually causes issues with some back-end compilers (like Metal).
+            # TODO: Make this not dependent on `uses_julia_runtime` (likely CPU), but its own control
+            # Doesn't call instcombine
+            GPUCompiler.buildVectorPipeline(fpm, job, opt_level)
+        end
+        # if isdebug(:optim)
+        #     add!(fpm, WarnMissedTransformationsPass())
+        # end
+    end
+    vendored_buildIntrinsicLoweringPipeline(mpm, job, opt_level)
+    GPUCompiler.buildCleanupPipeline(mpm, job, opt_level)
+end
+
 # compile to executable machine code
 function compile(job)
     # lower to PTX
@@ -495,11 +657,17 @@ function compile(job)
             LLVM.register!(pb, CleanupKernelStatePass())
 
             LLVM.add!(pb, LLVM.NewPMModulePassManager()) do mpm
-                GPUCompiler.buildNewPMPipeline!(mpm, job, opt_level)
+                vendored_buildNewPMPipeline!(mpm, job, opt_level)
             end
             LLVM.run!(pb, mod, tm)
         end
+        if Reactant.Compiler.DUMP_LLVMIR[]
+            println("cuda.jl pre vendor IR\n", string(mod))
+        end
         vendored_optimize_module!(job, mod)
+        if Reactant.Compiler.DUMP_LLVMIR[]
+            println("cuda.jl post vendor IR\n", string(mod))
+        end
         LLVM.run!(CUDA.GPUCompiler.DeadArgumentEliminationPass(), mod, tm)
 
         for fname in ("gpu_report_exception", "gpu_signal_exception")
diff --git a/src/Compiler.jl b/src/Compiler.jl
index ae89256d45..b4ac6c3fdc 100644
--- a/src/Compiler.jl
+++ b/src/Compiler.jl
@@ -416,10 +416,12 @@ function optimization_passes(; no_nan::Bool=false, sroa::Bool=false, inline::Boo
     if sroa
         push!(passes, "propagate-constant-bounds")
         if DUMP_LLVMIR[]
-            push!(passes, "sroa-wrappers{dump_prellvm=true dump_postllvm=true}")
+            push!(passes, "sroa-wrappers{dump_prellvm=true dump_postllvm=true instcombine=false instsimplify=true}")
         else
-            push!(passes, "sroa-wrappers")
+            push!(passes, "sroa-wrappers{instcombine=false instsimplify=true}")
         end
+        push!(passes, "canonicalize")
+        push!(passes, "sroa-wrappers{instcombine=false instsimplify=true}")
         push!(passes, "libdevice-funcs-raise")
         push!(passes, "canonicalize")
         push!(passes, "remove-duplicate-func-def")

From 4e39a42e5464314203ef620c5e2aa51375c8d60b Mon Sep 17 00:00:00 2001
From: "William S. Moses"
Date: Fri, 7 Feb 2025 23:32:36 -0500
Subject: [PATCH 2/3] fix

---
 src/Compiler.jl | 74 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 4 deletions(-)

diff --git a/src/Compiler.jl b/src/Compiler.jl
index b4ac6c3fdc..d6d0d602ce 100644
--- a/src/Compiler.jl
+++ b/src/Compiler.jl
@@ -558,6 +558,9 @@ end
 
 const DEBUG_KERNEL = Ref{Bool}(false)
 const DUMP_LLVMIR = Ref{Bool}(false)
+
+const Raise = Ref{Bool}(false)
+
 function compile_mlir!(
     mod,
     f,
@@ -607,16 +610,33 @@ function compile_mlir!(
     end
 
     if backend == "cpu"
-        kern = "lower-kernel{backend=cpu},canonicalize,lower-jit{openmp=true backend=cpu},symbol-dce"
+        kern = "lower-kernel{backend=cpu},canonicalize"
+        jit = "lower-jit{openmp=true backend=cpu},symbol-dce"
     elseif DEBUG_KERNEL[]
         curesulthandler = dlsym(
             Reactant_jll.libReactantExtra_handle, "ReactantHandleCuResult"
         )
         @assert curesulthandler !== nothing
         curesulthandler = Base.reinterpret(UInt, curesulthandler)
-        kern = "lower-kernel,canonicalize,lower-jit{debug=true cuResultHandlerPtr=$curesulthandler cuOptLevel=$(cuOptLevel[]) cubinFormat=$(cubinFormat[]) indexBitWidth=$(cuindexBitWidth[]) cubinChip=$(cubinChip[]) cubinFeatures=$(cubinFeatures()) run_init=true toolkitPath=$toolkit},symbol-dce"
+        kern = if Raise[]
+            "lower-kernel{backend=cpu},canonicalize"
+        else
+            "lower-kernel,canonicalize"
+        end
+        jit = "lower-jit{debug=true cuResultHandlerPtr=$curesulthandler cuOptLevel=$(cuOptLevel[]) cubinFormat=$(cubinFormat[]) indexBitWidth=$(cuindexBitWidth[]) cubinChip=$(cubinChip[]) cubinFeatures=$(cubinFeatures()) run_init=true toolkitPath=$toolkit},symbol-dce"
     else
-        kern = "lower-kernel,canonicalize,lower-jit{cuOptLevel=$(cuOptLevel[]) indexBitWidth=$(cuindexBitWidth[]) cubinFormat=$(cubinFormat[]) cubinChip=$(cubinChip[]) cubinFeatures=$(cubinFeatures()) run_init=true toolkitPath=$toolkit},symbol-dce"
+        kern = if Raise[]
+            "lower-kernel{backend=cpu},canonicalize"
+        else
+            "lower-kernel,canonicalize"
+        end
+        jit = "lower-jit{cuOptLevel=$(cuOptLevel[]) indexBitWidth=$(cuindexBitWidth[]) cubinFormat=$(cubinFormat[]) cubinChip=$(cubinChip[]) cubinFeatures=$(cubinFeatures()) run_init=true toolkitPath=$toolkit},symbol-dce"
+    end
+
+    raise = if Raise[]
+        "convert-llvm-to-cf,canonicalize,enzyme-lift-cf-to-scf,llvm-to-affine-access,canonicalize"
+    else
+        "canonicalize"
     end
 
     opt_passes = optimization_passes(; no_nan, sroa=true)
@@ -636,6 +656,8 @@ function compile_mlir!(
                     "enzyme-simplify-math",
                     opt_passes2,
                     kern,
+                    raise,
+                    jit
                 ],
                 ',',
             ),
@@ -657,6 +679,43 @@ function compile_mlir!(
                 ',',
             ),
         )
+    elseif optimize === :before_jit
+        run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes2], ","))
+        run_pass_pipeline!(
+            mod, "$enzyme_pass,arith-raise{stablehlo=true}"; enable_verifier=false
+        )
+        run_pass_pipeline!(
+            mod,
+            join(
+                [
+                    "canonicalize",
+                    "remove-unnecessary-enzyme-ops",
+                    "enzyme-simplify-math",
+                    opt_passes2,
+                    kern,
+                    raise,
+                ],
+                ',',
+            ),
+        )
+    elseif optimize === :before_raise
+        run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes2], ","))
+        run_pass_pipeline!(
+            mod, "$enzyme_pass,arith-raise{stablehlo=true}"; enable_verifier=false
+        )
+        run_pass_pipeline!(
+            mod,
+            join(
+                [
+                    "canonicalize",
+                    "remove-unnecessary-enzyme-ops",
+                    "enzyme-simplify-math",
+                    opt_passes2,
+                    kern
+                ],
+                ',',
+            ),
+        )
     elseif optimize === :no_enzyme
         run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes2], ","))
         run_pass_pipeline!(mod, "arith-raise{stablehlo=true}"; enable_verifier=false)
@@ -698,6 +757,8 @@ function compile_mlir!(
                     "enzyme-simplify-math",
                     opt_passes2,
                     kern,
+                    raise,
+                    jit
                 ],
                 ',',
             ),
@@ -708,7 +769,12 @@ function compile_mlir!(
             mod, "$enzyme_pass,arith-raise{stablehlo=true}"; enable_verifier=false
         )
         run_pass_pipeline!(
-            mod, "canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math," * kern
+            mod, join([
+                "canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math",
+                kern,
+                raise,
+                jit
+            ], ',')
         )
     elseif optimize === :canonicalize
         run_pass_pipeline!(

From 13d39b0f4a413a160044f9d4a64e663048ade75d Mon Sep 17 00:00:00 2001
From: "William S. Moses"
Date: Fri, 7 Feb 2025 23:37:50 -0500
Subject: [PATCH 3/3] fix

---
 ext/ReactantCUDAExt.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl
index 7e62136a54..4ce7fcfe81 100644
--- a/ext/ReactantCUDAExt.jl
+++ b/ext/ReactantCUDAExt.jl
@@ -297,7 +297,7 @@ function ka_with_reactant(ndrange, workgroupsize, obj, args...)
 
     # figure out the optimal workgroupsize automatically
     if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
-        if !Reactant.Compiler.PartitionKA[]
+        if !Reactant.Compiler.PartitionKA[] || Reactant.Compiler.Raise[]
             threads = prod(ndrange)
         else
             config = CUDA.launch_configuration(kernel.fun; max_threads=prod(ndrange))