[WIP] Stack allocate some genericmemory #52382
base: master
Conversation
Example 2 (after some pipeline changes that may go behind O3)

julia> function xinc(x)
return [x, x+1, x+2]
end;
julia> function loopinc()
y = 0
for i = 1:10^7
ret = xinc(i)
y += ret[2]
end
return y
end;
julia> @code_llvm loopinc()

Before:
; Function Signature: loopinc()
; @ REPL[2]:1 within `loopinc`
define i64 @julia_loopinc_855() #0 {
top:
br label %L44
L44: ; preds = %L44, %top
%value_phi = phi i64 [ 1, %top ], [ %0, %L44 ]
; @ REPL[2]:4 within `loopinc`
; ┌ @ REPL[1]:2 within `xinc`
; │┌ @ int.jl:87 within `+`
%0 = add nuw nsw i64 %value_phi, 1
%1 = add nuw nsw i64 %value_phi, 2
; │└
; │┌ @ array.jl:161 within `vect`
; ││┌ @ boot.jl:573 within `Array`
; │││┌ @ boot.jl:513 within `GenericMemory`
%"Memory{Int64}[]" = call {}* @jl_alloc_genericmemory({}* @"+Core.GenericMemory#859.jit", i64 3)
; │││└
; │││ @ boot.jl:574 within `Array`
%2 = bitcast {}* %"Memory{Int64}[]" to { i64, {}** }*
%.data_ptr = getelementptr inbounds { i64, {}** }, { i64, {}** }* %2, i64 0, i32 1
%3 = load {}**, {}*** %.data_ptr, align 8
; ││└
; ││ @ array.jl:163 within `vect`
; ││┌ @ array.jl:983 within `__safe_setindex!`
%4 = bitcast {}** %3 to i64*
store i64 %value_phi, i64* %4, align 8
%"new::Tuple.sroa.2.0..sroa_idx4144" = getelementptr inbounds {}*, {}** %3, i64 1
%5 = bitcast {}** %"new::Tuple.sroa.2.0..sroa_idx4144" to i64*
store i64 %0, i64* %5, align 8
%"new::Tuple.sroa.3.0..sroa_idx4245" = getelementptr inbounds {}*, {}** %3, i64 2
%6 = bitcast {}** %"new::Tuple.sroa.3.0..sroa_idx4245" to i64*
store i64 %1, i64* %6, align 8
; └└└
; @ REPL[2]:6 within `loopinc`
; ┌ @ range.jl:902 within `iterate`
; │┌ @ promotion.jl:620 within `==`
%.not = icmp eq i64 %value_phi, 10000000
; └└
br i1 %.not, label %L60, label %L44
L60: ; preds = %L44
; @ REPL[2]:7 within `loopinc`
ret i64 50000015000000
}

After:
; Function Signature: loopinc()
; @ REPL[2]:1 within `loopinc`
define i64 @julia_loopinc_692() #0 {
top:
; @ REPL[2]:7 within `loopinc`
ret i64 50000015000000
}
Exciting work!
IRBuilder<> builder(&*F.getEntryBlock().getFirstInsertionPt());
StringRef origName = orig->getName();
auto T_size = pass.DL->getIntPtrType(builder.getContext());
auto data = builder.CreateAlloca(Type::getInt8Ty(builder.getContext()), ConstantInt::get(T_size, maxStackAlloc));
Can we use a dynamically sized alloca here instead of maxStackAlloc? Not sure if it makes a difference.
So there are a few reasons I don't want a dynamic alloca:
- SROA won't look at any alloca that's outside of the entry block. To be fair, it's unlikely that SROA can get rid of the array anyway, but it's still an opportunity I'd like to leave open.
- We can't just stick the dynamically sized alloca in the entry block, since we don't know the chain of computations leading to the length of the array and whether or not that can be entirely hoisted.
- On a related note, alloca isn't freed until the end of the function. If you don't put your allocas in the entry block, you are very liable to put one in a loop, which will almost certainly cause a stack overflow in a nigh undebuggable manner.
- The above related note is actually why gcc and clang will not inline functions that have dynamically sized allocas, or perhaps even any explicit allocas at all (I forget the details).
Is LLVM able to reduce the size of the alloca if it sees we don't use the whole thing? Not sure if that's an optimization that may happen.
My concern is paying the cost of the big alloca for something that may be only 3 or 4 elements long. Though I guess if it's dynamic then there isn't much we can do.
The fallback size is probably up for debate (I chose 4096 to make it consistent with sized allocations, and I wanted most sized allocations to be stack allocated), but yeah, there's not much else we can do about it. Incidentally, this basically turns every genericmemory into a SmallVector, so maybe we can take some size hints from that? I would at least like an 8x8 array of 8-byte elements to be stack allocated, though, so my personal minimum is 512 bytes for unsized stack allocations.
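For reference, that minimum works out as follows (a quick REPL check, nothing specific to this PR):

julia> sizeof(zeros(8, 8))  # 8x8 Float64 array holds 8*8*8 bytes of data
512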
I think the size we're willing to do this for should vary depending on whether we know the size. When we know the size, doing this for moderately sized arrays makes sense, but fragmenting the stack has a cost, so IMO we shouldn't be as willing to alloca 4k for a list that might have 2 elements.
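A minimal sketch of that kind of policy (the function name and limits here are illustrative assumptions, not the pass's actual code):

# Illustrative only: pick a stack-allocation budget depending on whether the
# number of bytes needed is known at compile time.
function should_stack_allocate(size_known::Bool, bytes_needed::Integer)
    known_size_limit   = 4096  # generous when the exact size is known
    unknown_size_limit = 512   # conservative fallback for unsized allocations
    return bytes_needed <= (size_known ? known_size_limit : unknown_size_limit)
end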
@vtjnash I'm doing a few experiments with partial escape analysis, to stack allocate even when we haven't eliminated the bounds error, and I'm running into the issue of the data pointer being untrackable in address space 0 (so I can't replace it with a sunken allocation's data pointer in just the error blocks). What was the reason we made that pointer be in address space 0, and is there any way for me to track the pointer?
What are you trying to track about it? Unless there is a load from it (which will cast it to addrspace 13 first), the only valid uses of it are typically as a sort of token.
So my example is here: https://godbolt.org/z/ffnxhe5TG. Essentially, the important part is that the genericmemory allocation has its data pointer loaded in %10 as an address space 0 pointer. The problem right now is that I'm doing partial escape analysis, so my transformation essentially needs to turn any stack pointers in the error block into heap pointers, and I can't do that without full knowledge of what is and isn't a pointer that's going to be on the stack. That being said, for now I think I can get around the problem by rerunning escape analysis on every load of the data pointer and bailing out if it ever becomes untrackable outside of an error block, but that's kind of expensive and conservative.
So with the latest changes I've added partial escape analysis, to be able to stack allocate arrays even when bounds errors exist. That, plus the addition of an extra instcombine and loopfullunroll pass (vs. the standard pipeline), has optimized the examples I've tried to ~optimal as far as I can tell, so I don't think I have much left to do on the bits-array allocation front. I also don't personally plan to extend this to pointer arrays (there's no reason it can't be done; I just don't know enough about late-gc-lowering/garbage collection to do it myself). In terms of results, I saw ~10 arrays affected by escape analysis while building the system image (and another ~10 during Pkg precompilation), which isn't super great. I would like to see if it has an impact on more numerical code, if anyone wants to try that out; a quick way to check is sketched below.
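One quick way to check a candidate function (the helper names are illustrative, not from the PR; whether the allocation actually disappears depends on this branch):

# Compare the allocation count of a function whose temporary array should now
# be stack allocated; second_of_xinc is just an example wrapper.
xinc(x) = [x, x + 1, x + 2]
second_of_xinc(x) = xinc(x)[2]

second_of_xinc(1)             # compile first
@allocated second_of_xinc(1)  # should report 0 if the temporary array was elided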
@chriselrod you have some examples to test here, right?
Yeah, it would be good to compare against the task-local storage version here: https://spmd.org/posts/multithreadedallocations/
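For context, that post's approach boils down to a reusable per-task workspace, roughly like this sketch (names are illustrative, not the post's exact code):

# Reuse one buffer per task instead of allocating inside the hot loop; this is
# the hand-written pattern that stack allocation of genericmemory could replace.
function task_workspace(n)
    buf = get!(task_local_storage(), :workspace) do
        Vector{Float64}(undef, n)
    end::Vector{Float64}
    length(buf) < n && resize!(buf, n)
    return buf
end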
I just tested this and saw no difference from Julia's master branch, except that while building Julia I saw the occasional printing of the sized/unsized allocation messages (e.g. "allocation was unsized").
Mind pinging me when I should expect to see a difference?

Master:
julia> GC.gc(); @time do_multithreaded_work!(gccexpm!, Bs, As, testrange);
0.860287 seconds (8.41 k allocations: 4.532 MiB)
julia> GC.gc(); @time do_multithreaded_work!(gccexpm!, Bs, As, testrange);
0.854966 seconds (8.41 k allocations: 4.532 MiB)
julia> GC.gc(); @time do_multithreaded_work!(clangexpm!, Bs, As, testrange);
0.776054 seconds (8.41 k allocations: 4.532 MiB)
julia> GC.gc(); @time do_multithreaded_work!(clangexpm!, Bs, As, testrange);
0.761762 seconds (8.41 k allocations: 4.532 MiB)
julia> GC.gc(); @time do_multithreaded_work!(expm_tls!, Bs, As, testrange);
1.223061 seconds (21.29 M allocations: 728.865 MiB, 4.70% gc time)
julia> GC.gc(); @time do_multithreaded_work!(expm_tls!, Bs, As, testrange);
1.255723 seconds (21.29 M allocations: 728.853 MiB, 4.51% gc time)
julia> GC.gc(); @time do_multithreaded_work!(expm_custommul!, Bs, As, testrange);
4.376751 seconds (84.66 M allocations: 36.832 GiB, 63.57% gc time)
julia> GC.gc(); @time do_multithreaded_work!(expm_custommul!, Bs, As, testrange);
4.446788 seconds (84.66 M allocations: 36.832 GiB, 64.02% gc time)

PR:
julia> GC.gc(); @time do_multithreaded_work!(gccexpm!, Bs, As, testrange);
0.892432 seconds (8.41 k allocations: 4.532 MiB)
julia> GC.gc(); @time do_multithreaded_work!(gccexpm!, Bs, As, testrange);
0.854607 seconds (8.41 k allocations: 4.532 MiB)
julia> GC.gc(); @time do_multithreaded_work!(clangexpm!, Bs, As, testrange);
0.766006 seconds (8.41 k allocations: 4.532 MiB)
julia> GC.gc(); @time do_multithreaded_work!(clangexpm!, Bs, As, testrange);
0.762248 seconds (8.41 k allocations: 4.532 MiB)
julia> GC.gc(); @time do_multithreaded_work!(expm_tls!, Bs, As, testrange);
1.248361 seconds (21.29 M allocations: 728.869 MiB, 5.36% gc time)
julia> GC.gc(); @time do_multithreaded_work!(expm_tls!, Bs, As, testrange);
1.250094 seconds (21.29 M allocations: 728.877 MiB, 5.42% gc time)
julia> GC.gc(); @time do_multithreaded_work!(expm_custommul!, Bs, As, testrange);
4.388915 seconds (84.71 M allocations: 36.950 GiB, 63.50% gc time)
julia> GC.gc(); @time do_multithreaded_work!(expm_custommul!, Bs, As, testrange);
4.421541 seconds (84.71 M allocations: 36.950 GiB, 63.47% gc time)
julia> rmap(sizeof, Bs)
(((32, 72, 128, 200), (64, 144, 256, 400), (96, 216, 384, 600), (128, 288, 512, 800), (160, 360, 640, 1000), (192, 432, 768, 1200), (224, 504, 896, 1400), (256, 576, 1024, 1600), (288, 648, 1152, 1800)), ((32, 72, 128, 200), (128, 288, 512, 800), (192, 432, 768, 1200), (256, 576, 1024, 1600), (320, 720, 1280, 2000), (384, 864, 1536, 2400), (448, 1008, 1792, 2800), (512, 1152, 2048, 3200), (576, 1296, 2304, 3600)), ((32, 72, 128, 200), (192, 432, 768, 1200), (288, 648, 1152, 1800), (384, 864, 1536, 2400), (480, 1080, 1920, 3000), (576, 1296, 2304, 3600), (672, 1512, 2688, 4200), (768, 1728, 3072, 4800), (864, 1944, 3456, 5400)))

This was done using the SIMD.jl ForwardDiff PR: JuliaDiff/ForwardDiff.jl#570

Also, on this PR, I get

julia> using StaticArrays
allocation was unsized
[ Info: Precompiling StaticArrays [90137ffa-7385-5640-81b9-e52037218182]
[166847] signal 11 (1): Segmentation fault
in expression starting at none:0
Allocations: 9910646 (Pool: 9910136; Big: 510); GC: 9
ERROR: Failed to precompile StaticArrays [90137ffa-7385-5640-81b9-e52037218182] to "/home/chriselrod/.julia/compiled/v1.11/StaticArrays/jl_u4booz".
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] compilecache(pkg::Base.PkgId, path::String, internal_stderr::IO, internal_stdout::IO, keep_loaded_modules::Bool)
@ Base ./loading.jl:2601
[3] compilecache(pkg::Base.PkgId, path::String, internal_stderr::IO, internal_stdout::IO, keep_loaded_modules::Bool)
@ Base ./loading.jl:2473 [inlined]
[4] (::Base.var"#1034#1035"{Base.PkgId})()
@ Base ./loading.jl:2075
[5] mkpidlock(f::Base.var"#1034#1035"{Base.PkgId}, at::String, pid::Int32; kwopts::@Kwargs{stale_age::Int64, wait::Bool})
@ FileWatching.Pidfile ~/Documents/languages/juliadev/usr/share/julia/stdlib/v1.11/FileWatching/src/pidfile.jl:95
[6] #mkpidlock#6
@ FileWatching.Pidfile ~/Documents/languages/juliadev/usr/share/julia/stdlib/v1.11/FileWatching/src/pidfile.jl:90 [inlined]
[7] #trymkpidlock#11
@ ~/Documents/languages/juliadev/usr/share/julia/stdlib/v1.11/FileWatching/src/pidfile.jl:116
[8] #invokelatest#2
@ Base ./essentials.jl:957 [inlined]
[9] invokelatest
@ Base ./essentials.jl:952 [inlined]
[10] maybe_cachefile_lock(f::Base.var"#1034#1035"{Base.PkgId}, pkg::Base.PkgId, srcpath::String; stale_age::Int64)
@ Base ./loading.jl:3181
[11] maybe_cachefile_lock
@ Base ./loading.jl:3178 [inlined]
[12] _require(pkg::Base.PkgId, env::String)
@ Base ./loading.jl:2071
[13] __require_prelocked(uuidkey::Base.PkgId, env::String)
@ Base ./loading.jl:1916
[14] #invoke_in_world#3
@ Base ./essentials.jl:989 [inlined]
[15] invoke_in_world
@ Base ./essentials.jl:986 [inlined]
[16] _require_prelocked(uuidkey::Base.PkgId, env::String)
@ Base ./loading.jl:1907
[17] macro expansion
@ Base ./loading.jl:1845 [inlined]
[18] macro expansion
@ Base ./lock.jl:267 [inlined]
[19] __require(into::Module, mod::Symbol)
@ Base ./loading.jl:1806
[20] #invoke_in_world#3
@ Base ./essentials.jl:989 [inlined]
[21] invoke_in_world
@ Base ./essentials.jl:986 [inlined]
[22] require(into::Module, mod::Symbol)
@ Base ./loading.jl:1799

Which is odd, because ForwardDiff depends on StaticArrays.
So those sized/unsized messages are arrays being stack allocated. It's possible that the arrays being stack allocated are not the ones that contribute most to GC time (which is unfortunate). I can probably look into why those are escaping sometime next weekend.
Just trying this out.
Examples:
Example 1
julia> function my_cumsum(arr)
           length(arr) < 1 && return zero(eltype(arr))
           temp = zeros(eltype(arr), length(arr))
           @inbounds temp[1] = arr[1]
           for i in 2:length(temp)
               @inbounds temp[i] = temp[i-1] + arr[i]
           end
           @inbounds temp[length(arr)]
       end
my_cumsum (generic function with 1 method)

julia> @code_llvm my_cumsum([0, 1, 2, 3])
Before:
After: