
Codegen unboxes precise values too eagerly #51410

Open
topolarity opened this issue Sep 20, 2023 · 0 comments

bar(@nospecialize(x)) = println(x)
function foo(a, b)
    x = rand(Bool) ? a : b
    if x isa Int64
        return @noinline bar(x)
    end
end

This function has better codegen when its arguments are despecialized:

julia> code_llvm(foo, Tuple{Any, Any})
; WARNING: This code may not match what actually runs.
;  @ REPL[13]:1 within `foo`
define void @julia_foo_509({}* noundef nonnull readonly %0, {}* noundef nonnull readonly %1) #0 {
top:
;  @ REPL[13]:2 within `foo`
  %2 = call i8 @j_rand_511() #0
  %3 = and i8 %2, 1
  %.not = icmp eq i8 %3, 0
  %value_phi = select i1 %.not, {}* %1, {}* %0
;  @ REPL[13]:3 within `foo`
  %4 = bitcast {}* %value_phi to i64*
  %5 = getelementptr inbounds i64, i64* %4, i64 -1
  %6 = load atomic i64, i64* %5 unordered, align 8
  %7 = and i64 %6, -16
  %8 = inttoptr i64 %7 to {}*
  %.not1 = icmp eq {}* %8, inttoptr (i64 140457879333328 to {}*)
  br i1 %.not1, label %L8, label %common.ret

common.ret:                                       ; preds = %L8, %top
;  @ REPL[13]:4 within `foo`
  ret void

L8:                                               ; preds = %top
  call void @j_bar_512({}* nonnull readonly %value_phi) #0
  br label %common.ret
}

versus when they are fully specialized:

julia> code_llvm(foo, Tuple{Int64, Int64})
;  @ REPL[13]:1 within `foo`
define void @julia_foo_505(i64 signext %0, i64 signext %1) #0 {
top:
  %gcframe2 = alloca [3 x {}*], align 16
  %gcframe2.sub = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe2, i64 0, i64 0
  %2 = bitcast [3 x {}*]* %gcframe2 to i8*
  call void @llvm.memset.p0i8.i32(i8* noundef nonnull align 16 dereferenceable(24) %2, i8 0, i32 24, i1 false)
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #4
  %ppgcstack_i8 = getelementptr i8, i8* %thread_ptr, i64 -8
  %ppgcstack = bitcast i8* %ppgcstack_i8 to {}****
  %pgcstack = load {}***, {}**** %ppgcstack, align 8
;  @ REPL[13]:2 within `foo`
  %3 = bitcast [3 x {}*]* %gcframe2 to i64*
  store i64 4, i64* %3, align 16
  %4 = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe2, i64 0, i64 1
  %5 = bitcast {}** %4 to {}***
  %6 = load {}**, {}*** %pgcstack, align 8
  store {}** %6, {}*** %5, align 8
  %7 = bitcast {}*** %pgcstack to {}***
  store {}** %gcframe2.sub, {}*** %7, align 8
  %8 = call i8 @j_rand_507() #0
  %9 = and i8 %8, 1
  %.not = icmp eq i8 %9, 0
;  @ REPL[13] within `foo`
  %. = select i1 %.not, i64 %1, i64 %0
;  @ REPL[13]:4 within `foo`
  %10 = call nonnull {}* @ijl_box_int64(i64 signext %.)
  %11 = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe2, i64 0, i64 2
  store {}* %10, {}** %11, align 16
  call void @j_bar_508({}* nonnull readonly %10) #0
  %12 = load {}*, {}** %4, align 8
  %13 = bitcast {}*** %pgcstack to {}**
  store {}* %12, {}** %13, align 8
  ret void
}

Notice in particular the introduction of ijl_box_int64 (and the accompanying GC frame) in the specialized version.

The key difference in the Julia Typed IR is:

%5 = φ (#2 => a, #3 => b)::Any
...
%8 = π (%5, Int64)
%9 = invoke Main.bar(%8::Any)::Nothing

versus

%5 = φ (#2 => a, #3 => b)::Int64
%6 = invoke Main.bar(%5::Any)::Nothing
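
The typed IR above can be reproduced with code_typed (a sketch; the exact SSA numbering and printed form vary across Julia versions):

```julia
# Sketch: reproduce the typed IR shown above.
bar(@nospecialize(x)) = println(x)
function foo(a, b)
    x = rand(Bool) ? a : b
    if x isa Int64
        return @noinline bar(x)
    end
end

# Despecialized signature: the phi stays ::Any and a pi-node narrows it to Int64.
code_typed(foo, Tuple{Any, Any}; optimize=true)

# Fully-specialized signature: the phi itself is typed ::Int64.
code_typed(foo, Tuple{Int64, Int64}; optimize=true)
```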

The problem is that codegen too eagerly unboxes the precisely-typed PhiNode value and is then forced to re-box it before the @nospecialize call.

This is the cause of some allocation regressions introduced in #50943 (comment).
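A minimal way to observe the extra allocation (a hypothetical repro sketch; the measured count depends on the Julia version and on whether this issue has been fixed):

```julia
# Hypothetical repro sketch: on affected versions, the specialized method
# re-boxes the Int64 before the @nospecialize call, which heap-allocates.
bar(@nospecialize(x)) = nothing   # no-op stand-in for println to keep output quiet
function foo(a, b)
    x = rand(Bool) ? a : b
    if x isa Int64
        return @noinline bar(x)
    end
end

foo(1, 2)              # compile first, so @allocated measures only the call
@allocated foo(1, 2)   # a nonzero result indicates the re-boxing described above
```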

topolarity added the performance (Must go faster) and compiler:codegen (Generation of LLVM IR and native code) labels on Sep 20, 2023