Closed
Description
The following simple kernel
# Reproducer kernel from this bug report: a lattice-Boltzmann-style collision
# step on a 2D grid (inferred from the moment sums and equilibrium formula —
# not confirmed here). Relies on names NOT visible in this snippet:
# `@loop`/`@synchronize` (GPUifyLoops), `MVector` (presumably StaticArrays),
# and global lookup tables `dx`, `dy`, `ex`, `ey`, `w`, `ws` — TODO confirm
# their definitions against the linked MWE gist.
#
# Arguments (as declared):
#   f0       — rest-population distribution, one value per grid node
#   f1, f2   — 8 moving populations per node, read (f1) and written (f2)
#   ρ, u, v  — output macroscopic density and velocity fields
#   ω        — relaxation rate
function kernel1!(f0::AbstractMatrix{TF}, f1::AbstractArray{TF,3}, f2::AbstractArray{TF,3},
ρ::AbstractMatrix{TF}, u::AbstractMatrix{TF}, v::AbstractMatrix{TF}, ω::TF)
@inbounds begin
# Grid extent taken from the first dimension of ρ (square grid assumed —
# both x and y iterate 1:n; TODO confirm against the MWE).
n::Int64 = size(ρ,1)
# GPUifyLoops dual syntax: the first expression (the range) is used on CPU,
# the second (block/thread arithmetic) derives the index on GPU.
@loop for y in (1:n; (blockIdx().y-1)*blockDim().y+threadIdx().y)
if y<2 || y>n-1 continue end  # skip boundary rows
@loop for x in (1:n; (blockIdx().x-1)*blockDim().x+threadIdx().x)
if x<2 || x>n-1 continue end  # skip boundary columns
fs::TF = f0[x,y]  # rest population at this node
# Scratch buffer for the 8 streamed-in populations. The report notes
# that on Julia 1.1.1 this MVector is repeatedly heap-allocated at
# runtime, eventually exhausting memory.
fm = MVector{8,TF}(undef)
ρ0::TF, u0::TF, v0::TF = fs, zero(TF), zero(TF)
# Accumulate zeroth and first moments (density and momentum) from the
# 8 neighbor populations, streamed in via the dx/dy offset tables.
for q ∈ 1:8
fq::TF = f1[q,x-dx[q],y-dy[q]]
# NOTE(review): the repeated `::TF` re-annotations below (and on
# lines further down) are redundant once the variables are declared;
# whether they interact with the reported IR failure is unverified.
ρ0::TF += fq
u0::TF += fq * ex[q]
v0::TF += fq * ey[q]
fm[q] = fq
end
ρ0_inv::TF = one(TF) / ρ0  # convert momentum to velocity via 1/ρ
u0::TF *= ρ0_inv
v0::TF *= ρ0_inv
# Store the macroscopic fields for this node.
ρ[x,y] = ρ0
u[x,y] = u0
v[x,y] = v0
uv::TF = u0*u0 + v0*v0  # squared speed, reused in the equilibria
# Relax the rest population toward its equilibrium (weight `ws`).
f0[x,y] = fs - ω * (fs - ws*ρ0*(one(TF) - TF(1.5)*uv))
# Relax each moving population toward its discrete equilibrium
# (standard second-order expansion with weights w[q]).
for q ∈ 1:8
eu::TF = ex[q]*u0 + ey[q]*v0
eq::TF = w[q] * ρ0 * (one(TF) + TF(3.0)*eu + TF(4.5)*eu*eu - TF(1.5)*uv)
f2[q,x,y] = fm[q] - ω*(fm[q] - eq)
end
end
end
end
@synchronize
nothing
end
produces a series of duplicate warnings and partially duplicate errors, which in abbreviated form read:
julia> @time kernel1!(f0g, f1g, f2g, ρg, ug, vg, 0.9f0)
┌ Warning: Decoding arguments to jl_apply_generic failed, please file a bug with a reproducer.
│ inst = %5 = call nonnull %jl_value_t addrspace(10)* @jl_apply_generic(%jl_value_t addrspace(10)* nonnull %4, %jl_value_t addrspace(10)** null, i32 0), !dbg !87
│ bb =
│
│ top:
│ %1 = alloca [2 x %jl_value_t addrspace(10)*], align 8
│ call void @llvm.dbg.declare(metadata %jl_value_t addrspace(10)* null, metadata !85, metadata !DIExpression(DW_OP_deref)), !dbg !87
│ call void @llvm.dbg.value(metadata !4, metadata !86, metadata !DIExpression()), !dbg !87
│ %.sub = getelementptr inbounds [2 x %jl_value_t addrspace(10)*], [2 x %jl_value_t addrspace(10)*]* %1, i64 0, i64 0
│ %2 = call fastcc %jl_value_t addrspace(10)* @jl_box_uint16(i16 zeroext %0), !dbg !87
│ store %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140196326644416 to %jl_value_t*) to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** %.sub, align 8, !dbg !87
│ %3 = getelementptr inbounds [2 x %jl_value_t addrspace(10)*], [2 x %jl_value_t addrspace(10)*]* %1, i64 0, i64 1, !dbg !87
│ store %jl_value_t addrspace(10)* %2, %jl_value_t addrspace(10)** %3, align 8, !dbg !87
│ %4 = call nonnull %jl_value_t addrspace(10)* @jl_f_apply_type(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* null to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** nonnull %.sub, i32 2), !dbg !87
│ %5 = call nonnull %jl_value_t addrspace(10)* @jl_apply_generic(%jl_value_t addrspace(10)* nonnull %4, %jl_value_t addrspace(10)** null, i32 0), !dbg !87
│ ret void
│
└ @ CUDAnative ~/.julia/packages/CUDAnative/gJDZI/src/compiler/validation.jl:222
ERROR: InvalidIRError: compiling kernel1!(Cassette.Context{nametype(Ctx),Nothing,Nothing,getfield(GPUifyLoops, Symbol("##PassType#399")),Nothing,Cassette.DisableHooks}, typeof(kernel1!), CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,3,CUDAnative.AS.Global}, CuDeviceArray{Float32,3,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, Float32) resulted in invalid LLVM IR
Reason: unsupported call to the Julia runtime (call to jl_f_apply_type)
Stacktrace:
[1] Val at essentials.jl:694 (repeats 2 times)
[2] getindex at /home/dextorious/.julia/packages/CUDAnative/gJDZI/src/device/array.jl:78
[3] _getindex at abstractarray.jl:1004
[4] getindex at abstractarray.jl:981
[5] kernel1! at /home/dextorious/Documents/code/julia/simplelb.jl:66
[6] overdub at /home/dextorious/.julia/packages/Cassette/N8DMt/src/overdub.jl:0
The full stacktrace is given in https://gist.github.com/dextorious/67337310913e04e708db04095d82e311
The condensed MWE to reproduce this: https://gist.github.com/dextorious/7fe847f2996376125e5b59ebfeba3737
This is on Julia 1.3.0-DEV.514 (2019-07-05), commit 8792eb2c76, using the latest released versions of the relevant Julia packages on 64-bit Manjaro Linux with CUDA 10.1. Notably, when running the same code on Julia 1.1.1, the kernel compiles but then crashes at runtime by running out of memory, because the MVector is repeatedly heap-allocated inside
the kernel. I am not sure how to proceed with debugging this further @vchuravy.