isbits Union Arrays can't simd #23336

quinnj · 2017-08-18T21:09:10Z

julia> function f(v::Vector{Int8}, n)
         s = Int8(0)
         @inbounds for i = 1:n
           s += v[i]
         end
         return s
       end
f (generic function with 2 methods)

julia> function f(v::Vector{Union{Void, Int8}}, n)
         s = Int8(0)
         @inbounds for i = 1:n
           t = Base.arrayref(v, i)
           s += t isa Void ? Int8(0) : t
         end
         return s
       end
f (generic function with 2 methods)

julia> using BenchmarkTools

julia> n = 100
100

julia> b = @benchmarkable f($(Vector{Int8}(n)), $n)
Benchmark(evals=1, seconds=5.0, samples=10000)

julia> tune!(b)
Benchmark(evals=999, seconds=5.0, samples=10000)

julia> run(b)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     5.608 ns (0.00% GC)
  median time:      6.762 ns (0.00% GC)
  mean time:        7.384 ns (0.00% GC)
  maximum time:     72.746 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> V = Vector{Union{Void, Int8}}(n);

julia> b2 = @benchmarkable f($(V), $n)
Benchmark(evals=1, seconds=5.0, samples=10000)

julia> tune!(b2)
Benchmark(evals=944, seconds=5.0, samples=10000)

julia> run(b2)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     96.484 ns (0.00% GC)
  median time:      108.358 ns (0.00% GC)
  mean time:        115.729 ns (0.00% GC)
  maximum time:     346.938 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     944

julia> @code_llvm f(Vector{Int8}(10), 10)

; Function f
; Location: REPL[77]
define i8 @julia_f_62457(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 {
L14:
; Location: REPL[77]:3
  %2 = icmp slt i64 %1, 1
  br i1 %2, label %L38, label %if1.lr.ph

if1.lr.ph:                                        ; preds = %L14
; Location: REPL[77]:4
  %3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
  %4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)*
  %5 = load i8*, i8* addrspace(11)* %4, align 8
; Location: REPL[77]:3
  %min.iters.check = icmp ult i64 %1, 32
  br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked

min.iters.checked:                                ; preds = %if1.lr.ph
  %n.vec = and i64 %1, -32
  %cmp.zero = icmp eq i64 %n.vec, 0
  %ind.end = or i64 %n.vec, 1
  br i1 %cmp.zero, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %min.iters.checked
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <32 x i8> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
  %6 = phi i64 [ 1, %vector.ph ], [ %7, %vector.body ]
  %7 = add i64 %6, 32
; Location: REPL[77]:4
  %8 = add i64 %6, -1
  %9 = getelementptr i8, i8* %5, i64 %8
  %10 = bitcast i8* %9 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %10, align 1
  %11 = add <32 x i8> %wide.load, %vec.phi
; Location: REPL[77]:3
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
; Location: REPL[77]:4
  %rdx.shuf = shufflevector <32 x i8> %11, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i8> %11, %rdx.shuf
  %rdx.shuf7 = shufflevector <32 x i8> %bin.rdx, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <32 x i8> %bin.rdx, %rdx.shuf7
  %rdx.shuf9 = shufflevector <32 x i8> %bin.rdx8, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx10 = add <32 x i8> %bin.rdx8, %rdx.shuf9
  %rdx.shuf11 = shufflevector <32 x i8> %bin.rdx10, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx12 = add <32 x i8> %bin.rdx10, %rdx.shuf11
  %rdx.shuf13 = shufflevector <32 x i8> %bin.rdx12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx14 = add <32 x i8> %bin.rdx12, %rdx.shuf13
  %13 = extractelement <32 x i8> %bin.rdx14, i32 0
  %cmp.n = icmp eq i64 %n.vec, %1
; Location: REPL[77]:3
  br i1 %cmp.n, label %L38.loopexit, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %min.iters.checked, %if1.lr.ph
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %if1.lr.ph ], [ 1, %min.iters.checked ]
  %bc.merge.rdx = phi i8 [ %13, %middle.block ], [ 0, %if1.lr.ph ], [ 0, %min.iters.checked ]
  br label %if1

if1:                                              ; preds = %scalar.ph, %if1
  %s.06 = phi i8 [ %bc.merge.rdx, %scalar.ph ], [ %18, %if1 ]
  %"#temp#.05" = phi i64 [ %bc.resume.val, %scalar.ph ], [ %14, %if1 ]
  %14 = add i64 %"#temp#.05", 1
; Location: REPL[77]:4
  %15 = add i64 %"#temp#.05", -1
  %16 = getelementptr i8, i8* %5, i64 %15
  %17 = load i8, i8* %16, align 1
  %18 = add i8 %17, %s.06
; Location: REPL[77]:3
  %19 = icmp eq i64 %"#temp#.05", %1
  br i1 %19, label %L38.loopexit, label %if1

L38.loopexit:                                     ; preds = %middle.block, %if1
  %.lcssa = phi i8 [ %18, %if1 ], [ %13, %middle.block ]
; Location: REPL[77]:6
  br label %L38

L38:                                              ; preds = %L38.loopexit, %L14
  %s.0.lcssa = phi i8 [ 0, %L14 ], [ %.lcssa, %L38.loopexit ]
  ret i8 %s.0.lcssa
}

julia> @code_llvm f(Vector{Union{Void, Int8}}(10), 10)

; Function f
; Location: REPL[78]
define i8 @julia_f_62451(%jl_value_t addrspace(10)* dereferenceable(40), i64) #0 {
L14:
; Location: REPL[78]:3
  %2 = icmp slt i64 %1, 1
  br i1 %2, label %L66, label %if4.lr.ph

if4.lr.ph:                                        ; preds = %L14
; Location: REPL[78]:4
  %3 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
  %4 = bitcast %jl_value_t addrspace(11)* %3 to i8* addrspace(11)*
  %5 = load i8*, i8* addrspace(11)* %4, align 8
  %6 = bitcast %jl_value_t addrspace(11)* %3 to %jl_array_t addrspace(11)*
  %7 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %6, i64 0, i32 1
  %8 = load i64, i64 addrspace(11)* %7, align 8
; Location: REPL[78]:3
  br label %if4

if4:                                              ; preds = %if4.lr.ph, %post_union_move
  %t.sroa.0.015 = phi i8 [ undef, %if4.lr.ph ], [ %t.sroa.0.1, %post_union_move ]
  %s.014 = phi i8 [ 0, %if4.lr.ph ], [ %17, %post_union_move ]
  %"#temp#.013" = phi i64 [ 1, %if4.lr.ph ], [ %16, %post_union_move ]
; Location: REPL[78]:4
  %9 = add i64 %"#temp#.013", -1
  %10 = add i64 %8, %9
  %11 = getelementptr i8, i8* %5, i64 %10
  %12 = load i8, i8* %11, align 1
  %13 = add nuw i8 %12, 1
  %trunc = trunc i8 %13 to i7
  switch i7 %trunc, label %union_move_skip [
    i7 1, label %post_union_move
    i7 2, label %union_move5
  ]

L66.loopexit:                                     ; preds = %post_union_move
; Location: REPL[78]:7
  br label %L66

L66:                                              ; preds = %L66.loopexit, %L14
  %s.0.lcssa = phi i8 [ 0, %L14 ], [ %17, %L66.loopexit ]
  ret i8 %s.0.lcssa

union_move_skip:                                  ; preds = %if4
; Location: REPL[78]:4
  call void @llvm.trap()
  unreachable

post_union_move:                                  ; preds = %if4, %union_move5
  %t.sroa.0.1 = phi i8 [ %20, %union_move5 ], [ %t.sroa.0.015, %if4 ]
; Location: REPL[78]:5
  %14 = and i8 %13, 127
  %15 = icmp eq i8 %14, 1
  %.t.sroa.0.1 = select i1 %15, i8 0, i8 %t.sroa.0.1
; Location: REPL[78]:3
  %16 = add i64 %"#temp#.013", 1
; Location: REPL[78]:5
  %17 = add i8 %.t.sroa.0.1, %s.014
; Location: REPL[78]:3
  %18 = icmp eq i64 %"#temp#.013", %1
  br i1 %18, label %L66.loopexit, label %if4

union_move5:                                      ; preds = %if4
; Location: REPL[78]:4
  %19 = getelementptr i8, i8* %5, i64 %9
  %20 = load i8, i8* %19, align 1
  br label %post_union_move
}

xiaodaigh · 2018-01-15T10:11:54Z

Based on my benchmark R is faster at counting uniques in a Vector{Bool, Missing} equivalent. Hopefully, Julia can be faster soon. Although for 2 billion records, 10 seconds or so isn't that bad.

code here

vtjnash · 2018-07-28T13:32:18Z

I didn't realize we had an issue for this. It's fixed.

quinnj mentioned this issue Aug 18, 2017

Performance of Union{T1, T2} #22097

Closed

kshyatt added the arrays [a, r, r, a, y, s] label Aug 22, 2017

StefanKarpinski assigned vtjnash Sep 19, 2017

nalimilan mentioned this issue Nov 24, 2017

Replace Nullable{T} with Union{Some{T}, Void} #23642

Merged

nalimilan added the missing data Base.missing and related functionality label Mar 24, 2018

mbauman added the compiler:simd instruction-level vectorization label Apr 24, 2018

vtjnash closed this as completed Jul 28, 2018

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

isbits Union Arrays can't simd #23336

isbits Union Arrays can't simd #23336

quinnj commented Aug 18, 2017

xiaodaigh commented Jan 15, 2018

vtjnash commented Jul 28, 2018

isbits Union Arrays can't simd #23336

isbits Union Arrays can't simd #23336

Comments

quinnj commented Aug 18, 2017

xiaodaigh commented Jan 15, 2018

vtjnash commented Jul 28, 2018