This repository has been archived by the owner on May 27, 2021. It is now read-only.

Don't eagerly lower intrinsics. #349

Merged: 5 commits into master from tb/skip_lower_intrinsics on Mar 5, 2019

Conversation

maleadt
Member

@maleadt maleadt commented Feb 18, 2019

Fix #340
Depends on JuliaLang/julia#31101

@maleadt maleadt force-pushed the tb/skip_lower_intrinsics branch from 13afc7c to 40d2628 on February 18, 2019 16:00
@maleadt maleadt changed the title from "Don't eagerly lower intrinsics." to "WIP: Don't eagerly lower intrinsics." on Mar 1, 2019
@maleadt
Member Author

maleadt commented Mar 1, 2019

Looking into #340 (comment), there's another quirk... For some weird reason, after promoting the entry point to a kernel and rewriting the IR a little, passes start to run out of order: our LowerGCFrame pass first rewrites the gc_alloc_obj call to ptx_gc_pool_alloc, and only then does Julia's alloc-opt pass run...

julia> using LLVM
julia> LLVM.clopts("-print-before=AllocOpt")
julia> LLVM.clopts("-print-after=AllocOpt")
julia> CUDAnative.code_llvm(knl!, Tuple{})

*** IR Dump Before Promote heap allocation to stack ***
define void @julia_knl__92() !dbg !8 {
top:
  %0 = call %jl_value_t*** @julia.ptls_states()
  %1 = bitcast %jl_value_t*** %0 to i8*, !dbg !10
  %2 = call noalias nonnull %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %1, i64 16, %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140194257904848 to %jl_value_t*) to %jl_value_t addrspace(10)*)) #0, !dbg !10
  %3 = addrspacecast %jl_value_t addrspace(10)* %2 to %jl_value_t addrspace(11)*
  br label %L15, !dbg !17

L15:                                              ; preds = %top, %L15
  %value_phi = phi i64 [ 1, %top ], [ %9, %L15 ]
  %value_phi.off = add nsw i64 %value_phi, -1, !dbg !18
  %4 = call token (...) @llvm.julia.gc_preserve_begin(%jl_value_t addrspace(10)* nonnull %2), !dbg !29
  %5 = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %3) #6, !dbg !33
  %6 = bitcast %jl_value_t* %5 to float*, !dbg !37
  %7 = getelementptr inbounds float, float* %6, i64 %value_phi.off, !dbg !37
  store float 0.000000e+00, float* %7, align 1, !dbg !37, !tbaa !39
  call void @llvm.julia.gc_preserve_end(token %4), !dbg !42
  %8 = icmp eq i64 %value_phi, 4, !dbg !43
  %9 = add nuw nsw i64 %value_phi, 1, !dbg !49
  br i1 %8, label %L32, label %L15, !dbg !28

L32:                                              ; preds = %L15
  ret void, !dbg !53
}

*** IR Dump After Promote heap allocation to stack ***
define void @julia_knl__92() !dbg !8 {
top:
  %0 = alloca i128, align 16
  %1 = bitcast i128* %0 to i8*
  %2 = bitcast i8* %1 to %jl_value_t*
  %3 = call %jl_value_t*** @julia.ptls_states()
  %4 = bitcast %jl_value_t*** %3 to i8*, !dbg !10
  call void @llvm.lifetime.start.p0i8(i64 16, i8* %1)
  br label %L15, !dbg !17

L15:                                              ; preds = %top, %L15
  %value_phi = phi i64 [ 1, %top ], [ %8, %L15 ]
  %value_phi.off = add nsw i64 %value_phi, -1, !dbg !18
  %5 = bitcast %jl_value_t* %2 to float*, !dbg !29
  %6 = getelementptr inbounds float, float* %5, i64 %value_phi.off, !dbg !29
  store float 0.000000e+00, float* %6, align 1, !dbg !29, !tbaa !36
  %7 = icmp eq i64 %value_phi, 4, !dbg !39
  %8 = add nuw nsw i64 %value_phi, 1, !dbg !45
  br i1 %7, label %L32, label %L15, !dbg !28

L32:                                              ; preds = %L15
  ret void, !dbg !49
}
julia> CUDAnative.code_llvm(knl!, Tuple{}; kernel=true)

*** IR Dump Before Promote heap allocation to stack ***
define void @ptxcall_knl__93() {
entry:
  %0 = call %jl_value_t*** @julia.ptls_states()
  %1 = call %jl_value_t addrspace(10)* @ptx_gc_pool_alloc(i64 16), !dbg !21
  %2 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)*
  br label %L11.i, !dbg !29

L11.i:                                            ; preds = %entry, %L11.i
  %value_phi.i = phi i64 [ 1, %entry ], [ %8, %L11.i ]
  %value_phi.i.off = add nsw i64 %value_phi.i, -1, !dbg !30
  %3 = call token (...) @llvm.julia.gc_preserve_begin(%jl_value_t addrspace(10)* nonnull %1), !dbg !41
  %4 = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %2) #6, !dbg !45
  %5 = bitcast %jl_value_t* %4 to float*, !dbg !49
  %6 = getelementptr inbounds float, float* %5, i64 %value_phi.i.off, !dbg !49
  store float 0.000000e+00, float* %6, align 1, !dbg !49, !tbaa !51
  call void @llvm.julia.gc_preserve_end(token %3), !dbg !52
  %7 = icmp eq i64 %value_phi.i, 4, !dbg !53
  %8 = add nuw nsw i64 %value_phi.i, 1, !dbg !59
  br i1 %7, label %julia_knl__93.exit, label %L11.i, !dbg !40

julia_knl__93.exit:                               ; preds = %L11.i
  ret void
}

*** IR Dump After Promote heap allocation to stack ***
define void @ptxcall_knl__93() {
entry:
  %0 = call %jl_value_t*** @julia.ptls_states()
  %1 = call %jl_value_t addrspace(10)* @ptx_gc_pool_alloc(i64 16), !dbg !21
  %2 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)*
  br label %L11.i, !dbg !29

L11.i:                                            ; preds = %entry, %L11.i
  %value_phi.i = phi i64 [ 1, %entry ], [ %8, %L11.i ]
  %value_phi.i.off = add nsw i64 %value_phi.i, -1, !dbg !30
  %3 = call token (...) @llvm.julia.gc_preserve_begin(%jl_value_t addrspace(10)* nonnull %1), !dbg !41
  %4 = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %2) #6, !dbg !45
  %5 = bitcast %jl_value_t* %4 to float*, !dbg !49
  %6 = getelementptr inbounds float, float* %5, i64 %value_phi.i.off, !dbg !49
  store float 0.000000e+00, float* %6, align 1, !dbg !49, !tbaa !51
  call void @llvm.julia.gc_preserve_end(token %3), !dbg !52
  %7 = icmp eq i64 %value_phi.i, 4, !dbg !53
  %8 = add nuw nsw i64 %value_phi.i, 1, !dbg !59
  br i1 %7, label %julia_knl__93.exit, label %L11.i, !dbg !40

julia_knl__93.exit:                               ; preds = %L11.i
  ret void
}

Note the ptx_gc_pool_alloc in the second listing, already present before alloc-opt has processed the function, even though we queue the passes like this:

ccall(:jl_add_optimization_passes, Cvoid,
      (LLVM.API.LLVMPassManagerRef, Cint, Cint),
      LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0)

# custom intrinsic lowering
add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
aggressive_dce!(pm) # remove dead uses of the ptls
add!(pm, ModulePass("LowerPTLS", lower_ptls!))

@maleadt
Member Author

maleadt commented Mar 1, 2019

I've split the optimization into multiple pass manager runs -- that probably adds some overhead, but seems like the only true way to force pass ordering (@vchuravy?).
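Roughly, the split looks like this (a hypothetical sketch, not the actual commit: `ModulePassManager`, `add!`, and `run!` follow the LLVM.jl API used elsewhere in this thread, and `mod` stands in for the module being compiled). The key point is that the first pass manager is populated and run to completion before the second one is constructed, which pins the relative ordering of the two groups of passes:

```julia
# First run: Julia's standard optimization pipeline, without intrinsic lowering.
ModulePassManager() do pm
    ccall(:jl_add_optimization_passes, Cvoid,
          (LLVM.API.LLVMPassManagerRef, Cint, Cint),
          LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0)
    run!(pm, mod)  # alloc-opt et al. execute here, before any custom lowering
end

# Second run: custom intrinsic lowering, now guaranteed to come afterwards.
ModulePassManager() do pm
    add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
    aggressive_dce!(pm)  # remove dead uses of the ptls
    add!(pm, ModulePass("LowerPTLS", lower_ptls!))
    run!(pm, mod)
end
```

Within a single legacy pass manager, interleaving function and module passes does not guarantee sequential execution, so splitting the runs is the blunt but reliable fix.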

@vchuravy
Member

vchuravy commented Mar 1, 2019

There is a Barrier pass that will stop LLVM from doing reruns. I don't totally understand what happens, but I think the pass pipeline is not linear...

@maleadt
Member Author

maleadt commented Mar 1, 2019

Yeah, I saw that, but the comments in the pass were confusing, and it's not wrapped by the C API.

@vchuravy
Member

vchuravy commented Mar 1, 2019

I hope we can switch to the new pass manager soon...

@vchuravy
Member

vchuravy commented Mar 1, 2019

This patch introduced the Barrier pass (http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20121015/153541.html) and is the clearest explanation of its purpose.

@lcw

lcw commented Mar 5, 2019

Any hope this change will get merged to master soon?

@vchuravy
Member

vchuravy commented Mar 5, 2019

bors try

I think the only thing missing here is a test. Let's see if that works everywhere (I suspect not quite yet).

bors bot added a commit that referenced this pull request Mar 5, 2019
@bors
Contributor

bors bot commented Mar 5, 2019

try

Build failed

@vchuravy vchuravy force-pushed the tb/skip_lower_intrinsics branch from 7551edb to 5ebe8bd on March 5, 2019 02:13
@vchuravy vchuravy changed the title from "WIP: Don't eagerly lower intrinsics." to "Don't eagerly lower intrinsics." on Mar 5, 2019
@vchuravy
Member

vchuravy commented Mar 5, 2019

I will leave this to @maleadt to merge.

bors try

bors bot added a commit that referenced this pull request Mar 5, 2019
@bors
Contributor

bors bot commented Mar 5, 2019

try

Build succeeded

@maleadt
Member Author

maleadt commented Mar 5, 2019

I had hoped to use a different pass order mechanism, but this should be fine for now.

@maleadt maleadt merged commit e28c5f0 into master Mar 5, 2019
@bors bors bot deleted the tb/skip_lower_intrinsics branch March 5, 2019 20:59
@lcw

lcw commented Mar 5, 2019

@vchuravy and @maleadt, thanks for all your work pushing this through!
