Closed
Description
The following simple kernel
# Reproducer kernel from this bug report: a lattice-Boltzmann-style collision
# step on a 2D grid (inferred from the moment sums and equilibrium formula —
# not confirmed here). Relies on names NOT visible in this snippet:
# `@loop`/`@synchronize` (GPUifyLoops), `MVector` (presumably StaticArrays),
# and global lookup tables `dx`, `dy`, `ex`, `ey`, `w`, `ws` — TODO confirm
# their definitions against the linked MWE gist.
#
# Arguments (as declared):
#   f0       — rest-population distribution, one value per grid node
#   f1, f2   — 8 moving populations per node, read (f1) and written (f2)
#   ρ, u, v  — output macroscopic density and velocity fields
#   ω        — relaxation rate
function kernel1!(f0::AbstractMatrix{TF}, f1::AbstractArray{TF,3}, f2::AbstractArray{TF,3},
ρ::AbstractMatrix{TF}, u::AbstractMatrix{TF}, v::AbstractMatrix{TF}, ω::TF)
@inbounds begin
# Grid extent taken from the first dimension of ρ (square grid assumed —
# both x and y iterate 1:n; TODO confirm against the MWE).
n::Int64 = size(ρ,1)
# GPUifyLoops dual syntax: the first expression (the range) is used on CPU,
# the second (block/thread arithmetic) derives the index on GPU.
@loop for y in (1:n; (blockIdx().y-1)*blockDim().y+threadIdx().y)
if y<2 || y>n-1 continue end  # skip boundary rows
@loop for x in (1:n; (blockIdx().x-1)*blockDim().x+threadIdx().x)
if x<2 || x>n-1 continue end  # skip boundary columns
fs::TF = f0[x,y]  # rest population at this node
# Scratch buffer for the 8 streamed-in populations. The report notes
# that on Julia 1.1.1 this MVector is repeatedly heap-allocated at
# runtime, eventually exhausting memory.
fm = MVector{8,TF}(undef)
ρ0::TF, u0::TF, v0::TF = fs, zero(TF), zero(TF)
# Accumulate zeroth and first moments (density and momentum) from the
# 8 neighbor populations, streamed in via the dx/dy offset tables.
for q ∈ 1:8
fq::TF = f1[q,x-dx[q],y-dy[q]]
# NOTE(review): the repeated `::TF` re-annotations below (and on
# lines further down) are redundant once the variables are declared;
# whether they interact with the reported IR failure is unverified.
ρ0::TF += fq
u0::TF += fq * ex[q]
v0::TF += fq * ey[q]
fm[q] = fq
end
ρ0_inv::TF = one(TF) / ρ0  # convert momentum to velocity via 1/ρ
u0::TF *= ρ0_inv
v0::TF *= ρ0_inv
# Store the macroscopic fields for this node.
ρ[x,y] = ρ0
u[x,y] = u0
v[x,y] = v0
uv::TF = u0*u0 + v0*v0  # squared speed, reused in the equilibria
# Relax the rest population toward its equilibrium (weight `ws`).
f0[x,y] = fs - ω * (fs - ws*ρ0*(one(TF) - TF(1.5)*uv))
# Relax each moving population toward its discrete equilibrium
# (standard second-order expansion with weights w[q]).
for q ∈ 1:8
eu::TF = ex[q]*u0 + ey[q]*v0
eq::TF = w[q] * ρ0 * (one(TF) + TF(3.0)*eu + TF(4.5)*eu*eu - TF(1.5)*uv)
f2[q,x,y] = fm[q] - ω*(fm[q] - eq)
end
end
end
end
@synchronize
nothing
end
produces a series of duplicate warnings and partially duplicate errors, which in abbreviated form read:
julia> @time kernel1!(f0g, f1g, f2g, ρg, ug, vg, 0.9f0)
┌ Warning: Decoding arguments to jl_apply_generic failed, please file a bug with a reproducer.
│ inst = %5 = call nonnull %jl_value_t addrspace(10)* @jl_apply_generic(%jl_value_t addrspace(10)* nonnull %4, %jl_value_t addrspace(10)** null, i32 0), !dbg !87
│ bb =
│
│ top:
│ %1 = alloca [2 x %jl_value_t addrspace(10)*], align 8
│ call void @llvm.dbg.declare(metadata %jl_value_t addrspace(10)* null, metadata !85, metadata !DIExpression(DW_OP_deref)), !dbg !87
│ call void @llvm.dbg.value(metadata !4, metadata !86, metadata !DIExpression()), !dbg !87
│ %.sub = getelementptr inbounds [2 x %jl_value_t addrspace(10)*], [2 x %jl_value_t addrspace(10)*]* %1, i64 0, i64 0
│ %2 = call fastcc %jl_value_t addrspace(10)* @jl_box_uint16(i16 zeroext %0), !dbg !87
│ store %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140196326644416 to %jl_value_t*) to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** %.sub, align 8, !dbg !87
│ %3 = getelementptr inbounds [2 x %jl_value_t addrspace(10)*], [2 x %jl_value_t addrspace(10)*]* %1, i64 0, i64 1, !dbg !87
│ store %jl_value_t addrspace(10)* %2, %jl_value_t addrspace(10)** %3, align 8, !dbg !87
│ %4 = call nonnull %jl_value_t addrspace(10)* @jl_f_apply_type(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* null to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** nonnull %.sub, i32 2), !dbg !87
│ %5 = call nonnull %jl_value_t addrspace(10)* @jl_apply_generic(%jl_value_t addrspace(10)* nonnull %4, %jl_value_t addrspace(10)** null, i32 0), !dbg !87
│ ret void
│
└ @ CUDAnative ~/.julia/packages/CUDAnative/gJDZI/src/compiler/validation.jl:222
ERROR: InvalidIRError: compiling kernel1!(Cassette.Context{nametype(Ctx),Nothing,Nothing,getfield(GPUifyLoops, Symbol("##PassType#399")),Nothing,Cassette.DisableHooks}, typeof(kernel1!), CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,3,CUDAnative.AS.Global}, CuDeviceArray{Float32,3,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, Float32) resulted in invalid LLVM IR
Reason: unsupported call to the Julia runtime (call to jl_f_apply_type)
Stacktrace:
[1] Val at essentials.jl:694 (repeats 2 times)
[2] getindex at /home/dextorious/.julia/packages/CUDAnative/gJDZI/src/device/array.jl:78
[3] _getindex at abstractarray.jl:1004
[4] getindex at abstractarray.jl:981
[5] kernel1! at /home/dextorious/Documents/code/julia/simplelb.jl:66
[6] overdub at /home/dextorious/.julia/packages/Cassette/N8DMt/src/overdub.jl:0
The full stacktrace is given in https://gist.github.com/dextorious/67337310913e04e708db04095d82e311
The condensed MWE to reproduce this: https://gist.github.com/dextorious/7fe847f2996376125e5b59ebfeba3737
This is on Julia 1.3.0-DEV.514 (2019-07-05), commit 8792eb2c76, using the latest released versions of the relevant Julia packages on 64-bit Manjaro Linux with CUDA 10.1. Notably, when running the same code on Julia 1.1.1, the kernel compiles but then crashes at runtime by running out of memory, because the MVector is repeatedly heap-allocated inside
the kernel. I am not sure how to proceed with debugging this further @vchuravy.