Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

error for scatter with Metal arrays #534

Open
CarloLucibello opened this issue Sep 19, 2023 · 7 comments
Open

error for scatter with Metal arrays #534

CarloLucibello opened this issue Sep 19, 2023 · 7 comments

Comments

@CarloLucibello
Copy link
Member

CarloLucibello commented Sep 19, 2023

`gather` works fine on Apple Silicon, but `scatter` fails with a compilation error.
The scatter kernel works fine with CUDA and AMDGPU arrays.

cc @maleadt @pxl-th

julia> using Metal, NNlib, Flux

julia> Metal.versioninfo()
macOS 14.0.0, Darwin 23.0.0

Toolchain:
- Julia: 1.9.3
- LLVM: 14.0.6

Julia packages:
- Metal.jl: 0.5.1
- Metal_LLVM_Tools_jll: 0.5.1+0

1 device:
- Apple M1 Pro (384.000 KiB allocated)

julia> device = Flux.get_device("Metal")

julia> NNlib.gather([1 2 3; 4 5 6] |> device, [1,3,1,3,1] |> device)
2×5 MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
 1  3  1  3  1
 4  6  4  6  4

julia> NNlib.scatter(+, [1 2 3 4; 5 6 7 8] |> device, [2,1,1,5] |> device)
ERROR: Compilation to native code failed; see below for details.
If you think this is a bug, please file an issue and attach /var/folders/z_/n_d2vxmx4jj95q7hzmwngnyc0000gn/T/jl_A3uHqJIPoH.metallib.
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:35
  [2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:78
  [3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
  [4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:132
  [5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:103
  [6] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
  [7] macro expansion
    @ ./lock.jl:267 [inlined]
  [8] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
  [9] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
 [10] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77 [inlined]
 [11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(NNlib.gpu__scatter!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
    @ Metal.MetalKernels ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:105
 [12] Kernel
    @ ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:101 [inlined]
 [13] scatter!
    @ ~/.julia/packages/NNlib/lOntC/src/scatter.jl:104 [inlined]
 [14] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}; init::Nothing, dstsize::Nothing)
    @ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:177
 [15] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate})
    @ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:168
 [16] top-level scope
    @ REPL[32]:1
 [17] top-level scope
    @ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57

caused by: NSError: Compiler encountered an internal error (AGXMetalG13X, code 3)
Stacktrace:
  [1] MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
    @ Metal.MTL ~/.julia/packages/Metal/lnkVP/lib/mtl/compute_pipeline.jl:60
  [2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:70
  [3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/compilation.jl:65
  [4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:132
  [5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Cp7sE/src/execution.jl:103
  [6] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:162 [inlined]
  [7] macro expansion
    @ ./lock.jl:267 [inlined]
  [8] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:157
  [9] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(+), MtlDeviceMatrix{Int64, 1}, MtlDeviceMatrix{Int64, 1}, MtlDeviceVector{Int64, 1}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Int64}})
    @ Metal ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:155
 [10] macro expansion
    @ ~/.julia/packages/Metal/lnkVP/src/compiler/execution.jl:77 [inlined]
 [11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(NNlib.gpu__scatter!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
    @ Metal.MetalKernels ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:105
 [12] Kernel
    @ ~/.julia/packages/Metal/lnkVP/src/MetalKernels.jl:101 [inlined]
 [13] scatter!
    @ ~/.julia/packages/NNlib/lOntC/src/scatter.jl:104 [inlined]
 [14] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}; init::Nothing, dstsize::Nothing)
    @ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:177
 [15] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}, idx::MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate})
    @ NNlib ~/.julia/packages/NNlib/lOntC/src/scatter.jl:168
 [16] top-level scope
    @ REPL[32]:1
 [17] top-level scope
    @ ~/.julia/packages/Metal/lnkVP/src/initialization.jl:57
@pxl-th
Copy link
Member

pxl-th commented Sep 19, 2023

This is likely because of atomic operations, which rely on Atomix:

@inbounds Atomix.modify!(Atomix.IndexableRef(dst, idx), op, src[i])

IIUC, Metal does not yet support Atomix.

@maleadt
Copy link
Contributor

maleadt commented Sep 19, 2023

IIUC, Metal does not yet support Atomix.

That is correct; I think we have the necessary intrinsics, but nobody has implemented the Atomix.jl interface yet.

The compiler shouldn't crash like that, though, unless of course Atomix.jl is falling back to LLVM atomics, which aren't supported by the back-end.

@CarloLucibello
Copy link
Member Author

I tried this with Atomix 1.0, on top of JuliaGPU/KernelAbstractions.jl#545,
but I still see a failure:

julia> using Metal, NNlib, Flux

julia> Metal.versioninfo()
macOS 15.0.1, Darwin 24.0.0

Toolchain:
- Julia: 1.11.1
- LLVM: 16.0.6

Julia packages: 
- Metal.jl: 1.4.2
- GPUArrays: 10.3.1
- GPUCompiler: 0.27.8
- KernelAbstractions: 0.9.29
- ObjectiveC: 3.1.0
- LLVM: 9.1.3
- LLVMDowngrader_jll: 0.3.0+2

1 device:
- Apple M1 Pro (384.000 KiB allocated)

julia> device = Flux.get_device()
(::MetalDevice) (generic function with 4 methods)

julia> NNlib.gather([1 2 3; 4 5 6] |> device, [1,3,1,3,1] |> device)
2×5 MtlMatrix{Int64, Metal.PrivateStorage}:
 1  3  1  3  1
 4  6  4  6  4

julia> NNlib.scatter(+, [1 2 3 4; 5 6 7 8] |> device, [2,1,1,5] |> device)
ERROR: Compilation to native code failed; see below for details.
If you think this is a bug, please file an issue and attach /var/folders/z_/n_d2vxmx4jj95q7hzmwngnyc0000gn/T/jl_xsBWFuVaFy.metallib
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:35
  [2] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/compilation.jl:195 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/ObjectiveC/C7BVt/src/os.jl:264 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/compilation.jl:178 [inlined]
  [5] (::Metal.var"#171#172"{Bool, GPUCompiler.CompilerJob{}, @NamedTuple{}})()
    @ Metal ~/.julia/packages/ObjectiveC/C7BVt/src/foundation.jl:637
  [6] macro expansion
    @ ~/.julia/packages/ObjectiveC/C7BVt/src/foundation.jl:565 [inlined]
  [7] macro expansion
    @ ./lock.jl:273 [inlined]
  [8] ObjectiveC.Foundation.NSAutoreleasePool(f::Metal.var"#171#172"{Bool, GPUCompiler.CompilerJob{}, @NamedTuple{}})
    @ ObjectiveC.Foundation ~/.julia/packages/ObjectiveC/C7BVt/src/foundation.jl:557
  [9] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String}; return_function::Bool)
    @ Metal ~/.julia/packages/ObjectiveC/C7BVt/src/foundation.jl:636
 [10] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/execution.jl:262
 [11] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/execution.jl:151
 [12] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:189 [inlined]
 [13] macro expansion
    @ ./lock.jl:273 [inlined]
 [14] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{…}}; name::Nothing, kwargs::@Kwargs{})
    @ Metal ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:184
 [15] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{…}})
    @ Metal ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:182
 [16] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:85 [inlined]
 [17] (::KernelAbstractions.Kernel{…})(::Function, ::Vararg{…}; ndrange::Int64, workgroupsize::Nothing)
    @ Metal.MetalKernels ~/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:110
 [18] Kernel
    @ ~/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:106 [inlined]
 [19] scatter!
    @ ~/.julia/dev/NNlib/src/scatter.jl:104 [inlined]
 [20] scatter(op::typeof(+), src::MtlMatrix{…}, idx::MtlVector{…}; init::Nothing, dstsize::Nothing)
    @ NNlib ~/.julia/dev/NNlib/src/scatter.jl:183
 [21] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.PrivateStorage}, idx::MtlVector{Int64, Metal.PrivateStorage})
    @ NNlib ~/.julia/dev/NNlib/src/scatter.jl:174
 [22] top-level scope
    @ REPL[22]:1

caused by: NSError: Failed to materializeAll. (AGXMetalG13X, code 3)
Stacktrace:
  [1] Metal.MTL.MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
    @ Metal.MTL ~/.julia/packages/Metal/JtmpJ/lib/mtl/compute_pipeline.jl:60
  [2] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/compilation.jl:183 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/ObjectiveC/C7BVt/src/os.jl:264 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/compilation.jl:178 [inlined]
  [5] (::Metal.var"#171#172"{Bool, GPUCompiler.CompilerJob{}, @NamedTuple{}})()
    @ Metal ~/.julia/packages/ObjectiveC/C7BVt/src/foundation.jl:637
  [6] macro expansion
    @ ~/.julia/packages/ObjectiveC/C7BVt/src/foundation.jl:565 [inlined]
  [7] macro expansion
    @ ./lock.jl:273 [inlined]
  [8] ObjectiveC.Foundation.NSAutoreleasePool(f::Metal.var"#171#172"{Bool, GPUCompiler.CompilerJob{}, @NamedTuple{}})
    @ ObjectiveC.Foundation ~/.julia/packages/ObjectiveC/C7BVt/src/foundation.jl:557
  [9] link(job::GPUCompiler.CompilerJob, compiled::@NamedTuple{image::Vector{UInt8}, entry::String}; return_function::Bool)
    @ Metal ~/.julia/packages/ObjectiveC/C7BVt/src/foundation.jl:636
 [10] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/execution.jl:262
 [11] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/execution.jl:151
 [12] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:189 [inlined]
 [13] macro expansion
    @ ./lock.jl:273 [inlined]
 [14] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{…}}; name::Nothing, kwargs::@Kwargs{})
    @ Metal ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:184
 [15] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{…}})
    @ Metal ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:182
 [16] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:85 [inlined]
 [17] (::KernelAbstractions.Kernel{…})(::Function, ::Vararg{…}; ndrange::Int64, workgroupsize::Nothing)
    @ Metal.MetalKernels ~/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:110
 [18] Kernel
    @ ~/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:106 [inlined]
 [19] scatter!
    @ ~/.julia/dev/NNlib/src/scatter.jl:104 [inlined]
 [20] scatter(op::typeof(+), src::MtlMatrix{…}, idx::MtlVector{…}; init::Nothing, dstsize::Nothing)
    @ NNlib ~/.julia/dev/NNlib/src/scatter.jl:183
 [21] scatter(op::typeof(+), src::MtlMatrix{Int64, Metal.PrivateStorage}, idx::MtlVector{Int64, Metal.PrivateStorage})
    @ NNlib ~/.julia/dev/NNlib/src/scatter.jl:174
 [22] top-level scope
    @ REPL[22]:1
Some type information was truncated. Use `show(err)` to see complete types.

@maleadt
Copy link
Contributor

maleadt commented Nov 26, 2024

Can you verify in the LLVM IR that this is emitting AIR atomics and not native LLVM atomics? x-ref JuliaConcurrent/Atomix.jl#39 (comment)

@christiangnrd
Copy link
Contributor

christiangnrd commented Nov 26, 2024

@CarloLucibello Does it work if your src array is of type Int32? It's probably related to JuliaGPU/Metal.jl#477

@CarloLucibello
Copy link
Member Author

With Int32 src and idx, maybe we get a bit further:

julia> import Metal, NNlib, Flux

julia> dev = Flux.get_device();

julia> src, idx = Int32[1 2 3 4; 5 6 7 8], Int32[2,1,1,5];

julia> srcd, idxd = dev(x), dev(idx)

julia> y = NNlib.scatter(+, src, idx); # CPU

julia> yd = dev(zero(y));

julia> NNlib.scatter!(+, yd, srcd, idxd)            # GPU
ERROR: InvalidIRError: compiling MethodInstance for NNlib.gpu__scatter!(::KernelAbstractions.CompilerMetadata{…}, ::typeof(+), ::MtlDeviceMatrix{…}, ::MtlDeviceMatrix{…}, ::MtlDeviceVector{…}, ::CartesianIndices{…}, ::Int64) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to gpu_malloc)
Stacktrace:
 [1] malloc
   @ ~/.julia/packages/GPUCompiler/2CW9L/src/runtime.jl:85
 [2] gc_pool_alloc
   @ ~/.julia/packages/GPUCompiler/2CW9L/src/runtime.jl:116
 [3] modify!
   @ ~/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:38
 [4] modify!
   @ ~/.julia/packages/Atomix/g4H61/src/generic.jl:120
 [5] macro expansion
   @ ~/.julia/packages/NNlib/mRRJu/src/scatter.jl:123
 [6] gpu__scatter!
   @ ~/.julia/dev/KernelAbstractions/src/macros.jl:97
 [7] gpu__scatter!
   @ ./none:0
Reason: unsupported call to an unknown function (call to gpu_malloc)
Stacktrace:
  [1] malloc
    @ ~/.julia/packages/GPUCompiler/2CW9L/src/runtime.jl:85
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/2CW9L/src/runtime.jl:180
  [3] macro expansion
    @ ./none:0
  [4] box
    @ ./none:0
  [5] box_int64
    @ ~/.julia/packages/GPUCompiler/2CW9L/src/runtime.jl:209
  [6] modify!
    @ ~/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:38
  [7] modify!
    @ ~/.julia/packages/Atomix/g4H61/src/generic.jl:120
  [8] macro expansion
    @ ~/.julia/packages/NNlib/mRRJu/src/scatter.jl:123
  [9] gpu__scatter!
    @ ~/.julia/dev/KernelAbstractions/src/macros.jl:97
 [10] gpu__scatter!
    @ ./none:0
Reason: unsupported dynamic function invocation (call to atomic_fetch_add_explicit)
Stacktrace:
 [1] modify!
   @ ~/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:38
 [2] modify!
   @ ~/.julia/packages/Atomix/g4H61/src/generic.jl:120
 [3] macro expansion
   @ ~/.julia/packages/NNlib/mRRJu/src/scatter.jl:123
 [4] gpu__scatter!
   @ ~/.julia/dev/KernelAbstractions/src/macros.jl:97
 [5] gpu__scatter!
   @ ./none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/validation.jl:147
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/2CW9L/src/driver.jl:382 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/NRdsv/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/2CW9L/src/driver.jl:381 [inlined]
  [5] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/utils.jl:108
  [6] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/driver.jl:100
  [7] codegen
    @ ~/.julia/packages/GPUCompiler/2CW9L/src/driver.jl:82 [inlined]
  [8] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/driver.jl:79
  [9] compile
    @ ~/.julia/packages/GPUCompiler/2CW9L/src/driver.jl:74 [inlined]
 [10] (::Metal.var"#154#162"{GPUCompiler.CompilerJob{}})(ctx::LLVM.Context)
    @ Metal ~/.julia/packages/Metal/JtmpJ/src/compiler/compilation.jl:108
 [11] JuliaContext(f::Metal.var"#154#162"{GPUCompiler.CompilerJob{}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/driver.jl:34
 [12] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/driver.jl:25
 [13] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/compilation.jl:107 [inlined]
 [14] macro expansion
    @ ~/.julia/packages/ObjectiveC/C7BVt/src/os.jl:264 [inlined]
 [15] compile(job::GPUCompiler.CompilerJob)
    @ Metal ~/.julia/packages/Metal/JtmpJ/src/compiler/compilation.jl:105
 [16] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/execution.jl:237
 [17] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/2CW9L/src/execution.jl:151
 [18] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:189 [inlined]
 [19] macro expansion
    @ ./lock.jl:273 [inlined]
 [20] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{…}}; name::Nothing, kwargs::@Kwargs{})
    @ Metal ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:184
 [21] mtlfunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{…}})
    @ Metal ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:182
 [22] macro expansion
    @ ~/.julia/packages/Metal/JtmpJ/src/compiler/execution.jl:85 [inlined]
 [23] (::KernelAbstractions.Kernel{…})(::Function, ::Vararg{…}; ndrange::Int64, workgroupsize::Nothing)
    @ Metal.MetalKernels ~/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:110
 [24] Kernel
    @ ~/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:106 [inlined]
 [25] scatter!(op::typeof(+), dst::MtlMatrix{…}, src::MtlMatrix{…}, idx::MtlVector{…})
    @ NNlib ~/.julia/packages/NNlib/mRRJu/src/scatter.jl:104
 [26] top-level scope
    @ REPL[47]:1
 [27] top-level scope
    @ ~/.julia/packages/Metal/JtmpJ/src/initialization.jl:72
Some type information was truncated. Use `show(err)` to see complete types.

@CarloLucibello
Copy link
Member Author

@maleadt here is the @device_code_llvm output for the Int32 case:

details ``` julia> Metal.@device_code_llvm NNlib.scatter!(+, yd, srcd, idxd) ; GPUCompiler.CompilerJob{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}(MethodInstance for NNlib.gpu__scatter!(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, ::typeof(+), ::MtlDeviceMatrix{Int32, 1}, ::MtlDeviceMatrix{Float32, 1}, ::MtlDeviceVector{Int32, 1}, ::CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, ::Int64), CompilerConfig for GPUCompiler.MetalCompilerTarget, 0x00000000000068bc) ; @ none within `gpu__scatter!` define void @_Z13gpu__scatter_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl1E5TupleI5OneToI5Int64EEE7NDRangeILl1ES0_S0_S2_ILl1ES3_IS4_IS5_EEES2_ILl1ES3_IS4_IS5_EEEEE1_14MtlDeviceArrayI5Int32Ll2ELl1EES8_I7Float32Ll2ELl1EES8_IS9_Ll1ELl1EES2_ILl1ES3_IS4_IS5_EEES5_({ [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } addrspace(1)* %0, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %2, { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %3, [1 x [1 x [1 x i64]]] addrspace(1)* %4, i64 addrspace(1)* %5, i32 %thread_position_in_threadgroup, i32 %threadgroup_position_in_grid, i32 %thread_position_in_grid) local_unnamed_addr { conversion: %6 = getelementptr inbounds { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] }, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } addrspace(1)* %0, i64 0, i32 0, i64 0, i64 0, i64 0 %.unpack.unpack.unpack.unpack = load i64, i64 addrspace(1)* %6, align 8 %7 = getelementptr inbounds { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] }, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } addrspace(1)* 
%0, i64 0, i32 1, i64 1, i64 0, i64 0, i64 0 %.unpack18.unpack23.unpack.unpack.unpack = load i64, i64 addrspace(1)* %7, align 8 %.elt = getelementptr inbounds { i8 addrspace(1)*, [2 x i64] }, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1, i64 0, i32 0 %.unpack = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.elt, align 8 %.unpack32.elt = getelementptr inbounds { i8 addrspace(1)*, [2 x i64] }, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1, i64 0, i32 1, i64 0 %.unpack32.unpack = load i64, i64 addrspace(1)* %.unpack32.elt, align 8 %.unpack32.elt33 = getelementptr inbounds { i8 addrspace(1)*, [2 x i64] }, { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %1, i64 0, i32 1, i64 1 %.unpack32.unpack34 = load i64, i64 addrspace(1)* %.unpack32.elt33, align 8 %8 = bitcast { i8 addrspace(1)*, [2 x i64] } addrspace(1)* %2 to float addrspace(1)* addrspace(1)* %.unpack3743 = load float addrspace(1)*, float addrspace(1)* addrspace(1)* %8, align 8 %9 = bitcast { i8 addrspace(1)*, [1 x i64] } addrspace(1)* %3 to i32 addrspace(1)* addrspace(1)* %.unpack4549 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %9, align 8 %10 = load i64, i64 addrspace(1)* %5, align 8 ; @ none within `gpu__scatter!` @ /Users/carlo/.julia/dev/KernelAbstractions/src/macros.jl:96 ; ┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:161 within `#__validindex` ; │┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/intrinsics/arguments.jl:49 within `#threadgroup_position_in_grid_1d` ; ││┌ @ int.jl:87 within `+` %11 = add i32 %threadgroup_position_in_grid, 1 ; │└└ ; │┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/intrinsics/arguments.jl:49 within `#thread_position_in_threadgroup_1d` ; ││┌ @ int.jl:87 within `+` %12 = add i32 %thread_position_in_threadgroup, 1 ; │└└ ; │┌ @ /Users/carlo/.julia/dev/KernelAbstractions/src/nditeration.jl:84 within `expand` ; ││┌ @ abstractarray.jl:1312 within `getindex` ; │││┌ @ indices.jl:365 within `to_indices` @ indices.jl:368 ; ││││┌ 
@ indices.jl:292 within `to_index` @ indices.jl:307 ; │││││┌ @ number.jl:7 within `convert` ; ││││││┌ @ boot.jl:892 within `Int64` ; │││││││┌ @ boot.jl:816 within `toInt64` %13 = zext i32 %11 to i64 %14 = zext i32 %12 to i64 ; ││└└└└└└ ; ││ @ /Users/carlo/.julia/dev/KernelAbstractions/src/nditeration.jl:84 within `expand` @ /Users/carlo/.julia/dev/KernelAbstractions/src/nditeration.jl:74 ; ││┌ @ ntuple.jl:48 within `ntuple` ; │││┌ @ /Users/carlo/.julia/dev/KernelAbstractions/src/nditeration.jl:78 within `#1` ; ││││┌ @ int.jl:86 within `-` %15 = add nsw i64 %13, -1 ; ││││└ ; ││││┌ @ int.jl:88 within `*` %16 = mul i64 %.unpack18.unpack23.unpack.unpack.unpack, %15 ; ││││└ ; ││││┌ @ int.jl:87 within `+` %17 = add i64 %16, %14 ; │└└└└ ; │ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:163 within `#__validindex` ; │┌ @ multidimensional.jl:477 within `in` ; ││┌ @ tuple.jl:382 within `map` ; │││┌ @ range.jl:1426 within `in` ; ││││┌ @ int.jl:514 within `<=` %18 = icmp slt i64 %17, 1 %19 = icmp sgt i64 %17, %.unpack.unpack.unpack.unpack ; └└└└└ %.not2 = or i1 %18, %19 br i1 %.not2, label %L288, label %L107

L107: ; preds = %conversion
; @ none within gpu__scatter! @ /Users/carlo/.julia/dev/KernelAbstractions/src/macros.jl:97
; ┌ @ /Users/carlo/.julia/packages/NNlib/mRRJu/src/scatter.jl:121 within macro expansion
; │┌ @ div.jl:181 within divrem @ div.jl:203
; ││┌ @ int.jl:295 within div
%.not = icmp eq i64 %10, 0
br i1 %.not, label %fail, label %pass

L211: ; preds = %pass
; │└└
; │ @ /Users/carlo/.julia/packages/NNlib/mRRJu/src/scatter.jl:123 within macro expansion
; │┌ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:34
; ││┌ @ number.jl:7 within convert
; │││┌ @ float.jl:991 within Int32
; ││││┌ @ floatfuncs.jl:45 within isinteger
; │││││┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/intrinsics/math.jl:274 within #trunc
%20 = call float @air.trunc.f32(float %46)
; │││││└
; │││││┌ @ float.jl:492 within -
%21 = fsub float %46, %20
; │││││└
; │││││┌ @ float.jl:654 within == @ float.jl:616
%22 = fcmp une float %21, 0.000000e+00
; ││││└└
br i1 %22, label %L219, label %L217

L217: ; preds = %L211
; ││└└
; ││ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:35
; ││┌ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/references.jl:102 within pointer
; │││┌ @ abstractarray.jl:1312 within getindex
; ││││┌ @ abstractarray.jl:1340 within _getindex
; │││││┌ @ abstractarray.jl:699 within checkbounds @ abstractarray.jl:681
; ││││││┌ @ abstractarray.jl:725 within checkbounds_indices
; │││││││┌ @ abstractarray.jl:754 within checkindex
; ││││││││┌ @ int.jl:513 within <
%23 = icmp uge i64 %40, %.unpack32.unpack
; │││││││└└
; │││││││ @ abstractarray.jl:725 within checkbounds_indices @ abstractarray.jl:725
; │││││││┌ @ abstractarray.jl:754 within checkindex
; ││││││││┌ @ int.jl:86 within -
%24 = add nsw i64 %43, -1
; ││││││││└
; ││││││││┌ @ int.jl:513 within <
%25 = icmp uge i64 %24, %.unpack32.unpack34
; ││││││└└└
; ││││││ @ abstractarray.jl:699 within checkbounds
%.not4 = or i1 %23, %25
br i1 %.not4, label %L247, label %L250

L219: ; preds = %pass, %L211
; ││└└└└
; ││ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:34
; ││┌ @ number.jl:7 within convert
; │││┌ @ float.jl:994 within Int32
call fastcc void @ijl_box_float32(float %46)
call fastcc void @gpu_report_exception()
call fastcc void @gpu_signal_exception()
call void @llvm.trap()
unreachable

L247: ; preds = %L217
; ││└└
; ││ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:35
; ││┌ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/references.jl:102 within pointer
; │││┌ @ abstractarray.jl:1312 within getindex
; ││││┌ @ abstractarray.jl:1340 within _getindex
; │││││┌ @ abstractarray.jl:699 within checkbounds
; ││││││┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/quirks.jl:4 within #throw_boundserror
call fastcc void @gpu_report_exception()
call fastcc void @gpu_signal_exception()
call void @llvm.trap()
unreachable

L250: ; preds = %L217
; ││└└└└└
; ││ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:34
; ││┌ @ number.jl:7 within convert
; │││┌ @ float.jl:992 within Int32
; ││││┌ @ float.jl:416 within unsafe_trunc
%26 = fptosi float %46 to i32
%27 = freeze i32 %26
; ││└└└
; ││ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:35
; ││┌ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/references.jl:102 within pointer
; │││┌ @ abstractarray.jl:1312 within getindex
; ││││┌ @ abstractarray.jl:1341 within _getindex
; │││││┌ @ abstractarray.jl:1347 within _to_linear_index
; ││││││┌ @ abstractarray.jl:3048 within _sub2ind @ abstractarray.jl:3064
; │││││││┌ @ abstractarray.jl:3080 within _sub2ind_recurse @ abstractarray.jl:3080
; ││││││││┌ @ abstractarray.jl:3087 within offsetin
; │││││││││┌ @ int.jl:86 within -
%28 = add nsw i64 %43, 4611686018427387903
; ││││││││└└
; ││││││││┌ @ int.jl:88 within *
%29 = mul i64 %.unpack32.unpack, %28
; │││└└└└└└
; │││ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/references.jl:103 within pointer @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/array.jl:64
; │││┌ @ abstractarray.jl:1236 within _memory_offset
; ││││┌ @ int.jl:86 within -
%30 = add i64 %29, %40
; ││││└
; ││││┌ @ int.jl:88 within *
%31 = shl i64 %30, 2
; │││└└
; │││┌ @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/pointer.jl:147 within +
; ││││┌ @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/pointer.jl:114 within add_ptr
; │││││┌ @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/pointer.jl:114 within macro expansion @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/base.jl:39
%32 = getelementptr i8, i8 addrspace(1)* %.unpack, i64 %31
; ││└└└└
; ││ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:38
; ││┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/intrinsics/atomics.jl:84 within atomic_fetch_add_explicit
; │││┌ @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/pointer.jl:344 within macro expansion
; ││││┌ @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/pointer.jl:182 within _typed_llvmcall
; │││││┌ @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/pointer.jl:182 within macro expansion @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/base.jl:39
%33 = bitcast i8 addrspace(1)* %32 to i32 addrspace(1)*
%34 = call i32 @air.atomic.global.add.s.i32(i32 addrspace(1)* %33, i32 %27, i32 0, i32 2, i1 true)
; ││└└└└
; ││ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:55
br label %L288

L288: ; preds = %L250, %conversion
; └└
; @ none within gpu__scatter! @ /Users/carlo/.julia/dev/KernelAbstractions/src/macros.jl:99
ret void

fail: ; preds = %L107
; @ none within gpu__scatter! @ /Users/carlo/.julia/dev/KernelAbstractions/src/macros.jl:97
; ┌ @ /Users/carlo/.julia/packages/NNlib/mRRJu/src/scatter.jl:121 within macro expansion
; │┌ @ div.jl:181 within divrem @ div.jl:203
; ││┌ @ int.jl:295 within div
call fastcc void @gpu_report_exception()
call fastcc void @gpu_signal_exception()
call void @llvm.trap()
unreachable

pass: ; preds = %L107
; │└└
; │ @ /Users/carlo/.julia/packages/NNlib/mRRJu/src/scatter.jl:120 within macro expansion
; │┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/MetalKernels.jl:143 within #__index_Global_Linear
; ││┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/intrinsics/arguments.jl:49 within #thread_position_in_grid_1d
; │││┌ @ int.jl:87 within +
%35 = add i32 %thread_position_in_grid, 1
; │└└└
; │ @ /Users/carlo/.julia/packages/NNlib/mRRJu/src/scatter.jl:121 within macro expansion
; │┌ @ int.jl:1011 within -
; ││┌ @ int.jl:551 within rem
; │││┌ @ number.jl:7 within convert
; ││││┌ @ boot.jl:892 within Int64
; │││││┌ @ boot.jl:816 within toInt64
%36 = zext i32 %35 to i64
; ││└└└└
; ││ @ int.jl:1013 within - @ int.jl:86
%37 = add nsw i64 %36, -1
; │└
; │┌ @ div.jl:181 within divrem @ div.jl:203
; ││┌ @ int.jl:295 within div
%38 = sdiv i64 %37, %10
; ││└
; ││ @ div.jl:181 within divrem @ div.jl:204
; ││┌ @ int.jl:88 within *
%39 = mul i64 %38, %10
; ││└
; ││┌ @ int.jl:86 within -
%40 = sub i64 %37, %39
; │└└
; │ @ /Users/carlo/.julia/packages/NNlib/mRRJu/src/scatter.jl:122 within macro expansion
; │┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/array.jl:103 within getindex
; ││┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/array.jl:82 within arrayref
; │││┌ @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/pointer.jl:85 within unsafe_load
; ││││┌ @ none within pointerref
; │││││┌ @ none within macro expansion @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/base.jl:39
%41 = getelementptr inbounds i32, i32 addrspace(1)* %.unpack4549, i64 %38
%42 = load i32, i32 addrspace(1)* %41, align 4
; │└└└└└
; │┌ @ /Users/carlo/.julia/packages/NNlib/mRRJu/src/scatter.jl:133 within _convert_i64
; ││┌ @ boot.jl:892 within Int64
; │││┌ @ boot.jl:811 within toInt64
%43 = sext i32 %42 to i64
; │└└└
; │ @ /Users/carlo/.julia/packages/NNlib/mRRJu/src/scatter.jl:123 within macro expansion
; │┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/array.jl:103 within getindex
; ││┌ @ /Users/carlo/.julia/packages/Metal/JtmpJ/src/device/array.jl:82 within arrayref
; │││┌ @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/pointer.jl:85 within unsafe_load
; ││││┌ @ none within pointerref
; │││││┌ @ none within macro expansion @ /Users/carlo/.julia/packages/LLVM/wMjUU/src/interop/base.jl:39
%44 = sext i32 %thread_position_in_grid to i64
%45 = getelementptr inbounds float, float addrspace(1)* %.unpack3743, i64 %44
%46 = load float, float addrspace(1)* %45, align 4
; │└└└└└
; │┌ @ /Users/carlo/.julia/packages/Atomix/g4H61/src/generic.jl:120 within modify! @ /Users/carlo/.julia/packages/Atomix/g4H61/ext/AtomixMetalExt.jl:34
; ││┌ @ number.jl:7 within convert
; │││┌ @ float.jl:991 within Int32
; ││││┌ @ float.jl:619 within <=
%47 = fcmp ult float %46, 0xC1E0000000000000
%48 = fcmp uge float %46, 0x41E0000000000000
; ││││└
%or.cond = or i1 %47, %48
br i1 %or.cond, label %L219, label %L211
; └└└└
}

</details>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

4 participants