-
-
Notifications
You must be signed in to change notification settings - Fork 55
Conversation
13afc7c
to
40d2628
Compare
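Looking into #340 (comment), there's another quirk: for some reason, after promoting the entry point to a kernel and rewriting the IR a little, passes start to run out of order. Our LowerGCFrame pass first changes alloc_obj to ptx_alloc_obj, and only then does Julia's alloc-opt pass run. Compare the two dumps below: without kernel=true the allocation is promoted to an alloca, whereas with kernel=true the already-lowered allocation call is left untouched.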
julia> CUDAnative.code_llvm(knl!, Tuple{})
*** IR Dump Before Promote heap allocation to stack ***
define void @julia_knl__92() !dbg !8 {
top:
%0 = call %jl_value_t*** @julia.ptls_states()
%1 = bitcast %jl_value_t*** %0 to i8*, !dbg !10
%2 = call noalias nonnull %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %1, i64 16, %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140194257904848 to %jl_value_t*) to %jl_value_t addrspace(10)*)) #0, !dbg !10
%3 = addrspacecast %jl_value_t addrspace(10)* %2 to %jl_value_t addrspace(11)*
br label %L15, !dbg !17
L15: ; preds = %top, %L15
%value_phi = phi i64 [ 1, %top ], [ %9, %L15 ]
%value_phi.off = add nsw i64 %value_phi, -1, !dbg !18
%4 = call token (...) @llvm.julia.gc_preserve_begin(%jl_value_t addrspace(10)* nonnull %2), !dbg !29
%5 = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %3) #6, !dbg !33
%6 = bitcast %jl_value_t* %5 to float*, !dbg !37
%7 = getelementptr inbounds float, float* %6, i64 %value_phi.off, !dbg !37
store float 0.000000e+00, float* %7, align 1, !dbg !37, !tbaa !39
call void @llvm.julia.gc_preserve_end(token %4), !dbg !42
%8 = icmp eq i64 %value_phi, 4, !dbg !43
%9 = add nuw nsw i64 %value_phi, 1, !dbg !49
br i1 %8, label %L32, label %L15, !dbg !28
L32: ; preds = %L15
ret void, !dbg !53
}
*** IR Dump After Promote heap allocation to stack ***
define void @julia_knl__92() !dbg !8 {
top:
%0 = alloca i128, align 16
%1 = bitcast i128* %0 to i8*
%2 = bitcast i8* %1 to %jl_value_t*
%3 = call %jl_value_t*** @julia.ptls_states()
%4 = bitcast %jl_value_t*** %3 to i8*, !dbg !10
call void @llvm.lifetime.start.p0i8(i64 16, i8* %1)
br label %L15, !dbg !17
L15: ; preds = %top, %L15
%value_phi = phi i64 [ 1, %top ], [ %8, %L15 ]
%value_phi.off = add nsw i64 %value_phi, -1, !dbg !18
%5 = bitcast %jl_value_t* %2 to float*, !dbg !29
%6 = getelementptr inbounds float, float* %5, i64 %value_phi.off, !dbg !29
store float 0.000000e+00, float* %6, align 1, !dbg !29, !tbaa !36
%7 = icmp eq i64 %value_phi, 4, !dbg !39
%8 = add nuw nsw i64 %value_phi, 1, !dbg !45
br i1 %7, label %L32, label %L15, !dbg !28
L32: ; preds = %L15
ret void, !dbg !49
}
julia> CUDAnative.code_llvm(knl!, Tuple{}; kernel=true)
*** IR Dump Before Promote heap allocation to stack ***
define void @ptxcall_knl__93() {
entry:
%0 = call %jl_value_t*** @julia.ptls_states()
%1 = call %jl_value_t addrspace(10)* @ptx_gc_pool_alloc(i64 16), !dbg !21
%2 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)*
br label %L11.i, !dbg !29
L11.i: ; preds = %entry, %L11.i
%value_phi.i = phi i64 [ 1, %entry ], [ %8, %L11.i ]
%value_phi.i.off = add nsw i64 %value_phi.i, -1, !dbg !30
%3 = call token (...) @llvm.julia.gc_preserve_begin(%jl_value_t addrspace(10)* nonnull %1), !dbg !41
%4 = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %2) #6, !dbg !45
%5 = bitcast %jl_value_t* %4 to float*, !dbg !49
%6 = getelementptr inbounds float, float* %5, i64 %value_phi.i.off, !dbg !49
store float 0.000000e+00, float* %6, align 1, !dbg !49, !tbaa !51
call void @llvm.julia.gc_preserve_end(token %3), !dbg !52
%7 = icmp eq i64 %value_phi.i, 4, !dbg !53
%8 = add nuw nsw i64 %value_phi.i, 1, !dbg !59
br i1 %7, label %julia_knl__93.exit, label %L11.i, !dbg !40
julia_knl__93.exit: ; preds = %L11.i
ret void
}
*** IR Dump After Promote heap allocation to stack ***
define void @ptxcall_knl__93() {
entry:
%0 = call %jl_value_t*** @julia.ptls_states()
%1 = call %jl_value_t addrspace(10)* @ptx_gc_pool_alloc(i64 16), !dbg !21
%2 = addrspacecast %jl_value_t addrspace(10)* %1 to %jl_value_t addrspace(11)*
br label %L11.i, !dbg !29
L11.i: ; preds = %entry, %L11.i
%value_phi.i = phi i64 [ 1, %entry ], [ %8, %L11.i ]
%value_phi.i.off = add nsw i64 %value_phi.i, -1, !dbg !30
%3 = call token (...) @llvm.julia.gc_preserve_begin(%jl_value_t addrspace(10)* nonnull %1), !dbg !41
%4 = call %jl_value_t* @julia.pointer_from_objref(%jl_value_t addrspace(11)* %2) #6, !dbg !45
%5 = bitcast %jl_value_t* %4 to float*, !dbg !49
%6 = getelementptr inbounds float, float* %5, i64 %value_phi.i.off, !dbg !49
store float 0.000000e+00, float* %6, align 1, !dbg !49, !tbaa !51
call void @llvm.julia.gc_preserve_end(token %3), !dbg !52
%7 = icmp eq i64 %value_phi.i, 4, !dbg !53
%8 = add nuw nsw i64 %value_phi.i, 1, !dbg !59
br i1 %7, label %julia_knl__93.exit, label %L11.i, !dbg !40
julia_knl__93.exit: ; preds = %L11.i
ret void
}
Note the CUDAnative.jl/src/compiler/optim.jl Lines 19 to 26 in e6b9b37
|
I've split the optimization into multiple pass manager runs -- that probably adds some overhead, but it seems like the only reliable way to force pass ordering (@vchuravy?).
There is a Barrier pass that will stop LLVM from doing reruns. I don't totally understand what happens, but I think the pass pipeline is not linear... |
Yeah, I saw that but the comments in the pass were confusing and it's not wrapped by the C API. |
I hope we can switch to the new pass manager soon... |
The patch that introduced the Barrier pass is the clearest explanation of its purpose: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20121015/153541.html
Any hope this change will get merged to master soon? |
bors try
I think the only thing missing here is a test. Let's see if that works everywhere (I suspect not quite yet).
try: Build failed
7551edb
to
5ebe8bd
Compare
I will leave this to @maleadt to merge.
bors try
try: Build succeeded
I had hoped to use a different pass order mechanism, but this should be fine for now. |
Fix #340
Depends on JuliaLang/julia#31101