From 04a8b97c58499c5b1ec74eab142367eb8dbd4e3c Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Fri, 7 Feb 2025 20:50:38 -0500
Subject: [PATCH 1/3] we can always vendor more things

---
 ext/ReactantCUDAExt.jl | 147 ++++++++++++++++++++++++++++++++++++++++-
 src/Compiler.jl        |   6 +-
 2 files changed, 150 insertions(+), 3 deletions(-)

diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl
index d7e4f80d95..7e62136a54 100644
--- a/ext/ReactantCUDAExt.jl
+++ b/ext/ReactantCUDAExt.jl
@@ -459,6 +459,145 @@ function vendored_optimize_module!(
     end
 end
 
+# Vendored early-optimizer stage. Adds a CGSCC-nested function pipeline and
+# `GPULowerCPUFeaturesPass` unconditionally, then a function pipeline whose contents
+# depend on `opt_level`. `instcombine` chooses `InstCombinePass` over the default
+# `InstSimplifyPass`; `job` is accepted but not read here.
+function vendored_buildEarlyOptimizerPipeline(mpm, @nospecialize(job), opt_level; instcombine::Bool=false)
+    LLVM.add!(mpm, LLVM.NewPMCGSCCPassManager()) do cgpm
+        # TODO invokeCGSCCCallbacks
+        LLVM.add!(cgpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, LLVM.Interop.AllocOptPass())
+            LLVM.add!(fpm, LLVM.Float2IntPass())
+            LLVM.add!(fpm, LLVM.LowerConstantIntrinsicsPass())
+        end
+    end
+    LLVM.add!(mpm, GPULowerCPUFeaturesPass())
+    if opt_level >= 1
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            # Shared simplification step, so the `instcombine` choice is made in one place
+            # rather than being duplicated in each `opt_level` branch.
+            add_simplify!() = LLVM.add!(fpm, instcombine ? LLVM.InstCombinePass() : LLVM.InstSimplifyPass())
+            if opt_level >= 2
+                LLVM.add!(fpm, LLVM.SROAPass())
+                add_simplify!()
+                LLVM.add!(fpm, LLVM.JumpThreadingPass())
+                LLVM.add!(fpm, LLVM.CorrelatedValuePropagationPass())
+                LLVM.add!(fpm, LLVM.ReassociatePass())
+                LLVM.add!(fpm, LLVM.EarlyCSEPass())
+                LLVM.add!(fpm, LLVM.Interop.AllocOptPass())
+            else
+                # below level 2: simplification and early CSE only
+                add_simplify!()
+                LLVM.add!(fpm, LLVM.EarlyCSEPass())
+            end
+        end
+        # TODO invokePeepholeCallbacks
+    end
+end
+
+# Vendored intrinsic-lowering stage: appends the lowering passes below to `mpm`.
+# `GPUCompiler.uses_julia_runtime(job)` and `job.config.kernel` decide which runtime- and
+# kernel-state passes are added; `instcombine` chooses `InstCombinePass` over the default
+# `InstSimplifyPass` for the simplification step that is added when `opt_level >= 1`.
+function vendored_buildIntrinsicLoweringPipeline(mpm, @nospecialize(job), opt_level; instcombine::Bool=false)
+    LLVM.add!(mpm, LLVM.Interop.RemoveNIPass())
+
+    # lower GC intrinsics
+    if !GPUCompiler.uses_julia_runtime(job)
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, GPULowerGCFramePass())
+        end
+    end
+
+    # lower kernel state intrinsics
+    # NOTE: we can only do so here, as GC lowering can introduce calls to the runtime,
+    #       and thus additional uses of the kernel state intrinsics.
+    if job.config.kernel
+        # TODO: now that all kernel state-related passes are being run here, merge some?
+        LLVM.add!(mpm, AddKernelStatePass())
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, LowerKernelStatePass())
+        end
+        LLVM.add!(mpm, CleanupKernelStatePass())
+    end
+
+    if !GPUCompiler.uses_julia_runtime(job)
+        # remove dead uses of ptls
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, LLVM.ADCEPass())
+        end
+        LLVM.add!(mpm, GPULowerPTLSPass())
+    end
+
+    LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+        # lower exception handling
+        if GPUCompiler.uses_julia_runtime(job)
+            LLVM.add!(fpm, LLVM.Interop.LowerExcHandlersPass())
+        end
+        LLVM.add!(fpm, GPUCompiler.GCInvariantVerifierPass())
+        LLVM.add!(fpm, LLVM.Interop.LateLowerGCPass())
+        if GPUCompiler.uses_julia_runtime(job) && VERSION >= v"1.11.0-DEV.208"
+            LLVM.add!(fpm, LLVM.Interop.FinalLowerGCPass())
+        end
+    end
+    if GPUCompiler.uses_julia_runtime(job) && VERSION < v"1.11.0-DEV.208"
+        LLVM.add!(mpm, LLVM.Interop.FinalLowerGCPass())
+    end
+
+    if opt_level >= 2
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, LLVM.GVNPass())
+            LLVM.add!(fpm, LLVM.SCCPPass())
+            LLVM.add!(fpm, LLVM.DCEPass())
+        end
+    end
+
+    # lower PTLS intrinsics
+    if GPUCompiler.uses_julia_runtime(job)
+        LLVM.add!(mpm, LLVM.Interop.LowerPTLSPass())
+    end
+
+    if opt_level >= 1
+        LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+            LLVM.add!(fpm, instcombine ? LLVM.InstCombinePass() : LLVM.InstSimplifyPass())
+            LLVM.add!(fpm, LLVM.SimplifyCFGPass(; GPUCompiler.AggressiveSimplifyCFGOptions...))
+        end
+    end
+
+    # remove Julia address spaces
+    LLVM.add!(mpm, LLVM.Interop.RemoveJuliaAddrspacesPass())
+
+    # Julia's operand bundles confuse the inliner, so repeat here now they are gone.
+    # FIXME: we should fix the inliner so that inlined code gets optimized early-on
+    LLVM.add!(mpm, LLVM.AlwaysInlinerPass())
+end
+
+# Build the module pipeline on `mpm`: GPUCompiler's builders for most stages, and the
+# `vendored_*` builders (left at their `instcombine=false` default) for the other two.
+function vendored_buildNewPMPipeline!(mpm, @nospecialize(job), opt_level)
+    # Doesn't call instcombine
+    GPUCompiler.buildEarlySimplificationPipeline(mpm, job, opt_level)
+    LLVM.add!(mpm, LLVM.AlwaysInlinerPass())
+    vendored_buildEarlyOptimizerPipeline(mpm, job, opt_level)
+    LLVM.add!(mpm, LLVM.NewPMFunctionPassManager()) do fpm
+        # Doesn't call instcombine
+        GPUCompiler.buildLoopOptimizerPipeline(fpm, job, opt_level)
+        # Doesn't call instcombine
+        GPUCompiler.buildScalarOptimizerPipeline(fpm, job, opt_level)
+        if GPUCompiler.uses_julia_runtime(job) && opt_level >= 2
+            # XXX: we disable vectorization, as this generally isn't useful for GPU targets
+            #      and actually causes issues with some back-end compilers (like Metal).
+            # TODO: Make this not dependent on `uses_julia_runtime` (likely CPU), but its own control
+            # Doesn't call instcombine
+            GPUCompiler.buildVectorPipeline(fpm, job, opt_level)
+        end
+        # Disabled here: `isdebug(:optim) && add!(fpm, WarnMissedTransformationsPass())`
+    end
+    vendored_buildIntrinsicLoweringPipeline(mpm, job, opt_level)
+    GPUCompiler.buildCleanupPipeline(mpm, job, opt_level)
+end
+
 # compile to executable machine code
 function compile(job)
     # lower to PTX
@@ -495,11 +634,17 @@ function compile(job)
             LLVM.register!(pb, CleanupKernelStatePass())
 
             LLVM.add!(pb, LLVM.NewPMModulePassManager()) do mpm
-                GPUCompiler.buildNewPMPipeline!(mpm, job, opt_level)
+                vendored_buildNewPMPipeline!(mpm, job, opt_level)
             end
             LLVM.run!(pb, mod, tm)
         end
+        if Reactant.Compiler.DUMP_LLVMIR[]
+            println("cuda.jl pre vendor IR\n", string(mod))
+        end
         vendored_optimize_module!(job, mod)
+        if Reactant.Compiler.DUMP_LLVMIR[]
+            println("cuda.jl post vendor IR\n", string(mod))
+        end
         LLVM.run!(CUDA.GPUCompiler.DeadArgumentEliminationPass(), mod, tm)
 
         for fname in ("gpu_report_exception", "gpu_signal_exception")
diff --git a/src/Compiler.jl b/src/Compiler.jl
index ae89256d45..b4ac6c3fdc 100644
--- a/src/Compiler.jl
+++ b/src/Compiler.jl
@@ -416,10 +416,12 @@ function optimization_passes(; no_nan::Bool=false, sroa::Bool=false, inline::Boo
     if sroa
         push!(passes, "propagate-constant-bounds")
         if DUMP_LLVMIR[]
-            push!(passes, "sroa-wrappers{dump_prellvm=true dump_postllvm=true}")
+            push!(passes, "sroa-wrappers{dump_prellvm=true dump_postllvm=true instcombine=false instsimplify=true}")
         else
-            push!(passes, "sroa-wrappers")
+            push!(passes, "sroa-wrappers{instcombine=false instsimplify=true}")
         end
+        push!(passes, "canonicalize")
+        push!(passes, "sroa-wrappers{instcombine=false instsimplify=true}")
         push!(passes, "libdevice-funcs-raise")
         push!(passes, "canonicalize")
         push!(passes, "remove-duplicate-func-def")

From 4e39a42e5464314203ef620c5e2aa51375c8d60b Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Fri, 7 Feb 2025 23:32:36 -0500
Subject: [PATCH 2/3] Split lower-jit from kernel lowering; add opt-in raising

---
 src/Compiler.jl | 74 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 4 deletions(-)

diff --git a/src/Compiler.jl b/src/Compiler.jl
index b4ac6c3fdc..d6d0d602ce 100644
--- a/src/Compiler.jl
+++ b/src/Compiler.jl
@@ -558,6 +558,9 @@ end
 const DEBUG_KERNEL = Ref{Bool}(false)
 const DUMP_LLVMIR = Ref{Bool}(false)
 
+# When true, `compile_mlir!` runs its raising passes (not just `canonicalize`) before `lower-jit`.
+const Raise = Ref{Bool}(false)
+
 function compile_mlir!(
     mod,
     f,
@@ -607,16 +610,33 @@ function compile_mlir!(
     end
 
     if backend == "cpu"
-        kern = "lower-kernel{backend=cpu},canonicalize,lower-jit{openmp=true backend=cpu},symbol-dce"
+        kern = "lower-kernel{backend=cpu},canonicalize"
+        jit = "lower-jit{openmp=true backend=cpu},symbol-dce"
     elseif DEBUG_KERNEL[]
         curesulthandler = dlsym(
             Reactant_jll.libReactantExtra_handle, "ReactantHandleCuResult"
         )
         @assert curesulthandler !== nothing
         curesulthandler = Base.reinterpret(UInt, curesulthandler)
-        kern = "lower-kernel,canonicalize,lower-jit{debug=true cuResultHandlerPtr=$curesulthandler cuOptLevel=$(cuOptLevel[]) cubinFormat=$(cubinFormat[]) indexBitWidth=$(cuindexBitWidth[])  cubinChip=$(cubinChip[]) cubinFeatures=$(cubinFeatures()) run_init=true toolkitPath=$toolkit},symbol-dce"
+        kern = if Raise[]
+            "lower-kernel{backend=cpu},canonicalize"
+        else
+            "lower-kernel,canonicalize"
+        end
+        jit = "lower-jit{debug=true cuResultHandlerPtr=$curesulthandler cuOptLevel=$(cuOptLevel[]) cubinFormat=$(cubinFormat[]) indexBitWidth=$(cuindexBitWidth[])  cubinChip=$(cubinChip[]) cubinFeatures=$(cubinFeatures()) run_init=true toolkitPath=$toolkit},symbol-dce"
     else
-        kern = "lower-kernel,canonicalize,lower-jit{cuOptLevel=$(cuOptLevel[]) indexBitWidth=$(cuindexBitWidth[]) cubinFormat=$(cubinFormat[]) cubinChip=$(cubinChip[]) cubinFeatures=$(cubinFeatures()) run_init=true toolkitPath=$toolkit},symbol-dce"
+        kern = if Raise[]
+            "lower-kernel{backend=cpu},canonicalize"
+        else
+            "lower-kernel,canonicalize"
+        end
+        jit = "lower-jit{cuOptLevel=$(cuOptLevel[]) indexBitWidth=$(cuindexBitWidth[]) cubinFormat=$(cubinFormat[]) cubinChip=$(cubinChip[]) cubinFeatures=$(cubinFeatures()) run_init=true toolkitPath=$toolkit},symbol-dce"
+    end
+
+    raise = if Raise[]
+        "convert-llvm-to-cf,canonicalize,enzyme-lift-cf-to-scf,llvm-to-affine-access,canonicalize"
+    else
+        "canonicalize"
     end
 
     opt_passes = optimization_passes(; no_nan, sroa=true)
@@ -636,6 +656,8 @@ function compile_mlir!(
                     "enzyme-simplify-math",
                     opt_passes2,
                     kern,
+                    raise,
+                    jit
                 ],
                 ',',
             ),
@@ -657,6 +679,43 @@ function compile_mlir!(
                 ',',
             ),
         )
+    elseif optimize === :before_jit
+        run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes2], ","))
+        run_pass_pipeline!(
+            mod, "$enzyme_pass,arith-raise{stablehlo=true}"; enable_verifier=false
+        )
+        run_pass_pipeline!(
+            mod,
+            join(
+                [
+                    "canonicalize",
+                    "remove-unnecessary-enzyme-ops",
+                    "enzyme-simplify-math",
+                    opt_passes2,
+                    kern,
+                    raise,
+                ],
+                ',',
+            ),
+        )
+    elseif optimize === :before_raise
+        run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes2], ","))
+        run_pass_pipeline!(
+            mod, "$enzyme_pass,arith-raise{stablehlo=true}"; enable_verifier=false
+        )
+        run_pass_pipeline!(
+            mod,
+            join(
+                [
+                    "canonicalize",
+                    "remove-unnecessary-enzyme-ops",
+                    "enzyme-simplify-math",
+                    opt_passes2,
+                    kern
+                ],
+                ',',
+            ),
+        )
     elseif optimize === :no_enzyme
         run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes2], ","))
         run_pass_pipeline!(mod, "arith-raise{stablehlo=true}"; enable_verifier=false)
@@ -698,6 +757,8 @@ function compile_mlir!(
                     "enzyme-simplify-math",
                     opt_passes2,
                     kern,
+                    raise,
+                    jit
                 ],
                 ',',
             ),
@@ -708,7 +769,12 @@ function compile_mlir!(
             mod, "$enzyme_pass,arith-raise{stablehlo=true}"; enable_verifier=false
         )
         run_pass_pipeline!(
-            mod, "canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math," * kern
+            mod, join([
+                "canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math",
+                kern,
+                raise,
+                jit
+               ], ',')
         )
     elseif optimize === :canonicalize
         run_pass_pipeline!(

From 13d39b0f4a413a160044f9d4a64e663048ade75d Mon Sep 17 00:00:00 2001
From: "William S. Moses" <gh@wsmoses.com>
Date: Fri, 7 Feb 2025 23:37:50 -0500
Subject: [PATCH 3/3] Launch KA kernels with prod(ndrange) threads when raising

---
 ext/ReactantCUDAExt.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl
index 7e62136a54..4ce7fcfe81 100644
--- a/ext/ReactantCUDAExt.jl
+++ b/ext/ReactantCUDAExt.jl
@@ -297,7 +297,7 @@ function ka_with_reactant(ndrange, workgroupsize, obj, args...)
 
     # figure out the optimal workgroupsize automatically
     if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
-        if !Reactant.Compiler.PartitionKA[]
+        if !Reactant.Compiler.PartitionKA[] || Reactant.Compiler.Raise[]
             threads = prod(ndrange)
         else
             config = CUDA.launch_configuration(kernel.fun; max_threads=prod(ndrange))