Skip to content

Kernel using MVector fails to compile or crashes at runtime due to heap allocation #45

Closed
@dextorious

Description

@dextorious

The following simple kernel

# kernel1!(f0, f1, f2, ρ, u, v, ω)
#
# Reproducer kernel for this report. For every interior index pair (x, y), with both
# x and y running over 1:n where n = size(ρ, 1), it:
#   1. reads the scalar f0[x, y] and the eight values f1[q, x-dx[q], y-dy[q]], q = 1:8;
#   2. accumulates their plain sum and their ex-/ey-weighted sums, then writes the sum
#      to ρ[x, y] and the two weighted sums divided by that sum to u[x, y] and v[x, y];
#   3. overwrites f0[x, y] and writes f2[q, x, y] with values relaxed by the factor ω.
# Returns `nothing`.
#
# NOTE(review): `TF`, `dx`, `dy`, `ex`, `ey`, `w` and `ws` are not defined in this block;
# presumably they are globals in the linked MWE — confirm there.
# NOTE(review): this looks like a D2Q9 lattice-Boltzmann update (the stack trace names
# the file simplelb.jl) — confirm before relying on that reading.
function kernel1!(f0::AbstractMatrix{TF}, f1::AbstractArray{TF,3}, f2::AbstractArray{TF,3},
                  ρ::AbstractMatrix{TF}, u::AbstractMatrix{TF}, v::AbstractMatrix{TF}, ω::TF)
    # Bounds checks are disabled for everything inside this begin…end block.
    @inbounds begin
    n::Int64 = size(ρ,1)
    # `@loop` is given `(range; index-expression)`.
    # NOTE(review): presumably GPUifyLoops' @loop (GPUifyLoops appears in the error
    # trace), with the second expression being the per-thread index on the GPU — confirm.
    @loop for y in (1:n; (blockIdx().y-1)*blockDim().y+threadIdx().y)
        # Skip the first and last y index.
        if y<2 || y>n-1 continue end
        @loop for x in (1:n; (blockIdx().x-1)*blockDim().x+threadIdx().x)
            # Skip the first and last x index.
            if x<2 || x>n-1 continue end
            fs::TF = f0[x,y]
            # Uninitialized 8-element scratch vector: filled in the first q loop and
            # read back in the second one. According to this report, this MVector is
            # what ends up being heap-allocated inside the kernel.
            fm = MVector{8,TF}(undef)
            # Accumulators; the plain sum starts from fs, the weighted sums from zero.
            ρ0::TF, u0::TF, v0::TF = fs, zero(TF), zero(TF)
            for q ∈ 1:8
                # Read from the neighbour offset by (-dx[q], -dy[q]).
                # NOTE(review): with @inbounds active this is only safe if the offsets
                # stay within the skipped one-cell border — dx/dy are not visible here.
                fq::TF = f1[q,x-dx[q],y-dy[q]]
                ρ0::TF += fq
                u0::TF += fq * ex[q]
                v0::TF += fq * ey[q]
                # Keep the loaded value for the second q loop.
                fm[q] = fq
            end
            # Normalize the weighted sums by the plain sum.
            ρ0_inv::TF = one(TF) / ρ0
            u0::TF *= ρ0_inv
            v0::TF *= ρ0_inv
            ρ[x,y] = ρ0
            u[x,y] = u0
            v[x,y] = v0
            uv::TF = u0*u0 + v0*v0
            # Relax fs toward ws*ρ0*(1 - 1.5*uv) by the factor ω.
            f0[x,y] = fs - ω * (fs - ws*ρ0*(one(TF) - TF(1.5)*uv))
            for q ∈ 1:8
                eu::TF = ex[q]*u0 + ey[q]*v0
                # `eq` is the value fm[q] is relaxed toward on the next line.
                eq::TF = w[q] * ρ0 * (one(TF) + TF(3.0)*eu + TF(4.5)*eu*eu - TF(1.5)*uv)
                f2[q,x,y] = fm[q] - ω*(fm[q] - eq)
            end
        end
    end
    end
    # NOTE(review): presumably GPUifyLoops' @synchronize — confirm its semantics; it is
    # invoked once, after both loops and outside the @inbounds block.
    @synchronize
    nothing
end

produces a series of duplicate warnings and partly duplicate errors that, in abbreviated form, read:

julia> @time kernel1!(f0g, f1g, f2g, ρg, ug, vg, 0.9f0)
┌ Warning: Decoding arguments to jl_apply_generic failed, please file a bug with a reproducer.
│   inst =   %5 = call nonnull %jl_value_t addrspace(10)* @jl_apply_generic(%jl_value_t addrspace(10)* nonnull %4, %jl_value_t addrspace(10)** null, i32 0), !dbg !87
│   bb =
│    
│    top:
│      %1 = alloca [2 x %jl_value_t addrspace(10)*], align 8
│      call void @llvm.dbg.declare(metadata %jl_value_t addrspace(10)* null, metadata !85, metadata !DIExpression(DW_OP_deref)), !dbg !87
│      call void @llvm.dbg.value(metadata !4, metadata !86, metadata !DIExpression()), !dbg !87
│      %.sub = getelementptr inbounds [2 x %jl_value_t addrspace(10)*], [2 x %jl_value_t addrspace(10)*]* %1, i64 0, i64 0
│      %2 = call fastcc %jl_value_t addrspace(10)* @jl_box_uint16(i16 zeroext %0), !dbg !87
│      store %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140196326644416 to %jl_value_t*) to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** %.sub, align 8, !dbg !87
│      %3 = getelementptr inbounds [2 x %jl_value_t addrspace(10)*], [2 x %jl_value_t addrspace(10)*]* %1, i64 0, i64 1, !dbg !87
│      store %jl_value_t addrspace(10)* %2, %jl_value_t addrspace(10)** %3, align 8, !dbg !87
│      %4 = call nonnull %jl_value_t addrspace(10)* @jl_f_apply_type(%jl_value_t addrspace(10)* addrspacecast (%jl_value_t* null to %jl_value_t addrspace(10)*), %jl_value_t addrspace(10)** nonnull %.sub, i32 2), !dbg !87
│      %5 = call nonnull %jl_value_t addrspace(10)* @jl_apply_generic(%jl_value_t addrspace(10)* nonnull %4, %jl_value_t addrspace(10)** null, i32 0), !dbg !87
│      ret void
│    
└ @ CUDAnative ~/.julia/packages/CUDAnative/gJDZI/src/compiler/validation.jl:222

ERROR: InvalidIRError: compiling kernel1!(Cassette.Context{nametype(Ctx),Nothing,Nothing,getfield(GPUifyLoops, Symbol("##PassType#399")),Nothing,Cassette.DisableHooks}, typeof(kernel1!), CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,3,CUDAnative.AS.Global}, CuDeviceArray{Float32,3,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, Float32) resulted in invalid LLVM IR                                                              
Reason: unsupported call to the Julia runtime (call to jl_f_apply_type)
Stacktrace:
 [1] Val at essentials.jl:694 (repeats 2 times)
 [2] getindex at /home/dextorious/.julia/packages/CUDAnative/gJDZI/src/device/array.jl:78
 [3] _getindex at abstractarray.jl:1004
 [4] getindex at abstractarray.jl:981
 [5] kernel1! at /home/dextorious/Documents/code/julia/simplelb.jl:66
 [6] overdub at /home/dextorious/.julia/packages/Cassette/N8DMt/src/overdub.jl:0

The full stacktrace is given in https://gist.github.com/dextorious/67337310913e04e708db04095d82e311
The condensed MWE to reproduce this: https://gist.github.com/dextorious/7fe847f2996376125e5b59ebfeba3737

This is on Julia 1.3.0-DEV.514 (2019-07-05), commit 8792eb2c76, using the latest released versions of the relevant Julia packages on 64-bit Manjaro Linux with CUDA 10.1. Notably, when the same code is run on Julia 1.1.1, the kernel compiles but then crashes at runtime, running out of memory because the MVector is repeatedly heap-allocated in the kernel. I'm not sure how to proceed with debugging this further, @vchuravy.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working), cuda kernels (Stuff about writing CUDA kernels.)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions