[Mono] Pass Vector128 in SIMD registers, when LLVM is enabled #68991

fanyang-mono · 2022-05-06T22:08:34Z

This PR enables passing Vector128 in SIMD registers between managed function calls for both Amd64 and Arm64, when LLVM is enabled.

Contributes to #60068

With this PR, the unnecessary moves have been removed from both the caller and the callee. Below, I use the caller function from the same example as in #60068 (comment) to demonstrate the difference.

AMD64
Before

Generated unoptimized LLVM IR

*** Unoptimized LLVM IR for HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) ***
; ModuleID = 'jit-module-test'
source_filename = "jit-module-test"

; Function Attrs: noinline uwtable
define dso_local monocc void @"HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>)"(i64 %vret, i64 %arg_a, i64 %0, i64 %arg_b, i64 %1, <4 x i32>* byval(<4 x i32>) %arg_c) #0 {
BB0:
  %2 = alloca <4 x i32>, align 16
  %3 = alloca <4 x i32>, align 16
  %4 = bitcast <4 x i32>* %3 to i8*
  %5 = getelementptr i8, i8* %4, i32 0
  %6 = bitcast i8* %5 to i64*
  store i64 %arg_a, i64* %6, align 4
  %7 = getelementptr i8, i8* %4, i32 8
  %8 = bitcast i8* %7 to i64*
  store i64 %0, i64* %8, align 4
  %simd_vtype = load <4 x i32>, <4 x i32>* %3, align 16
  %9 = bitcast <4 x i32>* %2 to i8*
  %10 = getelementptr i8, i8* %9, i32 0
  %11 = bitcast i8* %10 to i64*
  store i64 %arg_b, i64* %11, align 4
  %12 = getelementptr i8, i8* %9, i32 8
  %13 = bitcast i8* %12 to i64*
  store i64 %1, i64* %13, align 4
  %simd_vtype1 = load <4 x i32>, <4 x i32>* %2, align 16
  %simd_vtype2 = load <4 x i32>, <4 x i32>* %arg_c, align 16
  br label %BB2

BB2:                                              ; preds = %BB0
  br label %BB3

BB3:                                              ; preds = %BB2
  %14 = icmp slt <4 x i32> %simd_vtype, %simd_vtype1
  %15 = select <4 x i1> %14, <4 x i32> %simd_vtype, <4 x i32> %simd_vtype1
  br label %BB4

BB4:                                              ; preds = %BB3
  %16 = add i64 %vret, 0
  %17 = inttoptr i64 %16 to <4 x i32>*
  store <4 x i32> %15, <4 x i32>* %17, align 1
  br label %BB1

BB1:                                              ; preds = %BB4
  ret void
}

Assembly

*** ASM for HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) ***
/var/folders/5w/jj_7d0cn7qjc2hq2xb1tz2wm0000gn/T/.0QhwIZ:
(__TEXT,__text) section
loWorld_Program_test__System_Runtime_Intrinsics_Vector128_1_int__System_Runtime_Intrinsics_Vector128_1_int__System_Runtime_Intrinsics_Vector128_1_int__:
0000000000000000	vmovq	%rdx, %xmm0
0000000000000005	vmovq	%rsi, %xmm1
000000000000000a	vpunpcklqdq	%xmm0, %xmm1, %xmm0
000000000000000e	vmovq	%r8, %xmm1
0000000000000013	vmovq	%rcx, %xmm2
0000000000000018	vpunpcklqdq	%xmm1, %xmm2, %xmm1
000000000000001c	vpminsd	%xmm1, %xmm0, %xmm0
0000000000000021	vmovdqu	%xmm0, (%rdi)
0000000000000025	retq

After

Generated unoptimized LLVM IR

*** Unoptimized LLVM IR for HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) ***
; ModuleID = 'jit-module-test'
source_filename = "jit-module-test"

; Function Attrs: noinline uwtable
define dso_local monocc void @"HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>)"(i64 %vret, <4 x i32> %arg_a, <4 x i32> %arg_b, <4 x i32>* byval(<4 x i32>) %arg_c) #0 {
BB0:
  %0 = alloca <4 x i32>, align 16
  %1 = alloca <4 x i32>, align 16
  store <4 x i32> %arg_a, <4 x i32>* %1, align 16
  %simd_vtype = load <4 x i32>, <4 x i32>* %1, align 16
  store <4 x i32> %arg_b, <4 x i32>* %0, align 16
  %simd_vtype1 = load <4 x i32>, <4 x i32>* %0, align 16
  %simd_vtype2 = load <4 x i32>, <4 x i32>* %arg_c, align 16
  br label %BB2

BB2:                                              ; preds = %BB0
  br label %BB3

BB3:                                              ; preds = %BB2
  %2 = icmp slt <4 x i32> %simd_vtype, %simd_vtype1
  %3 = select <4 x i1> %2, <4 x i32> %simd_vtype, <4 x i32> %simd_vtype1
  br label %BB4

BB4:                                              ; preds = %BB3
  %4 = add i64 %vret, 0
  %5 = inttoptr i64 %4 to <4 x i32>*
  store <4 x i32> %3, <4 x i32>* %5, align 1
  br label %BB1

BB1:                                              ; preds = %BB4
  ret void
}

Assembly

*** ASM for HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) ***
/var/folders/5w/jj_7d0cn7qjc2hq2xb1tz2wm0000gn/T/.MhJRvN:
(__TEXT,__text) section
loWorld_Program_test__System_Runtime_Intrinsics_Vector128_1_int__System_Runtime_Intrinsics_Vector128_1_int__System_Runtime_Intrinsics_Vector128_1_int__:
0000000000000000	vpminsd	%xmm1, %xmm0, %xmm0
0000000000000005	vmovdqu	%xmm0, (%rdi)
0000000000000009	retq

ARM64
Before

Generated unoptimized LLVM IR

*** Unoptimized LLVM IR for HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) ***
; ModuleID = 'jit-module-test'
source_filename = "jit-module-test"

; Function Attrs: noinline uwtable
define dso_local monocc i128 @"HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>)"([2 x i64] %arg_a, [2 x i64] %arg_b, [2 x i64] %arg_c) #0 {
BB0:
  %0 = alloca <4 x i32>, align 16
  %1 = alloca <4 x i32>, align 16
  %2 = alloca <4 x i32>, align 16
  %3 = bitcast <4 x i32>* %2 to [2 x i64]*
  store [2 x i64] %arg_a, [2 x i64]* %3, align 4
  %simd_vtype = load <4 x i32>, <4 x i32>* %2, align 16
  %4 = bitcast <4 x i32>* %1 to [2 x i64]*
  store [2 x i64] %arg_b, [2 x i64]* %4, align 4
  %simd_vtype1 = load <4 x i32>, <4 x i32>* %1, align 16
  %5 = bitcast <4 x i32>* %0 to [2 x i64]*
  store [2 x i64] %arg_c, [2 x i64]* %5, align 4
  %simd_vtype2 = load <4 x i32>, <4 x i32>* %0, align 16
  br label %BB2

BB2:                                              ; preds = %BB0
  br label %BB3

BB3:                                              ; preds = %BB2
  %6 = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %simd_vtype, <4 x i32> %simd_vtype1)
  br label %BB4

BB4:                                              ; preds = %BB3
  br label %BB1

BB1:                                              ; preds = %BB4
  %setret_simd_vtype_as_scalar = bitcast <4 x i32> %6 to i128
  ret i128 %setret_simd_vtype_as_scalar
}

Assembly

LLVM Method System.Runtime.Intrinsics.Vector128`1<int> HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) emitted at 0xffff812c2ef8 to 0xffff812c2f20 (code length 40)

*** ASM for HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) ***
Disassembly of section .text:
0000000000000000 <.text>:
   0:	9e670000 	fmov	d0, x0
   4:	9e670021 	fmov	d1, x1
   8:	9e670042 	fmov	d2, x2
   c:	9e670063 	fmov	d3, x3
  10:	6e180420 	mov	v0.d[1], v1.d[0]
  14:	6e180462 	mov	v2.d[1], v3.d[0]
  18:	4ea26c00 	smin	v0.4s, v0.4s, v2.4s
  1c:	4e183c01 	mov	x1, v0.d[1]
  20:	9e660000 	fmov	x0, d0
  24:	d65f03c0 	ret

After

Generated unoptimized LLVM IR

*** Unoptimized LLVM IR for HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) ***
; ModuleID = 'jit-module-test'
source_filename = "jit-module-test"

; Function Attrs: noinline uwtable
define dso_local monocc i128 @"HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>)"(<4 x i32> %arg_a, <4 x i32> %arg_b, <4 x i32> %arg_c) #0 {
BB0:
  %0 = alloca <4 x i32>, align 16
  %1 = alloca <4 x i32>, align 16
  %2 = alloca <4 x i32>, align 16
  store <4 x i32> %arg_a, <4 x i32>* %2, align 16
  %simd_vtype = load <4 x i32>, <4 x i32>* %2, align 16
  store <4 x i32> %arg_b, <4 x i32>* %1, align 16
  %simd_vtype1 = load <4 x i32>, <4 x i32>* %1, align 16
  store <4 x i32> %arg_c, <4 x i32>* %0, align 16
  %simd_vtype2 = load <4 x i32>, <4 x i32>* %0, align 16
  br label %BB2

BB2:                                              ; preds = %BB0
  br label %BB3

BB3:                                              ; preds = %BB2
  %3 = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %simd_vtype, <4 x i32> %simd_vtype1)
  br label %BB4

BB4:                                              ; preds = %BB3
  br label %BB1

BB1:                                              ; preds = %BB4
  %setret_simd_vtype_as_scalar = bitcast <4 x i32> %3 to i128
  ret i128 %setret_simd_vtype_as_scalar
}

Assembly

*** ASM for HelloWorld.Program:test (System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>,System.Runtime.Intrinsics.Vector128`1<int>) ***
Disassembly of section .text:
0000000000000000 <.text>:
   0:	4ea16c00 	smin	v0.4s, v0.4s, v1.4s
   4:	4e183c01 	mov	x1, v0.d[1]
   8:	9e660000 	fmov	x0, d0
   c:	d65f03c0 	ret

dotnet-issue-labeler · 2022-05-06T22:08:38Z

I couldn't figure out the best area label to add to this PR. If you have write-permissions please help me learn by adding exactly one area label.

tannergooding · 2022-05-09T21:14:34Z

> This PR enables passing Vector128 in SIMD registers for both Amd64 and Arm64, when LLVM is enabled.

Always, or only when it is required by the ABI? For example, the Windows x64 calling convention would not pass these in registers by default; only Windows __vectorcall would.

fanyang-mono · 2022-05-09T22:11:40Z

@tannergooding It will always use SIMD registers.

tannergooding · 2022-05-16T14:58:15Z

> It will always using SIMD register.

It might be good to ensure that the relevant interop tests all pass with this feature. There are a number of tests under https://github.com/dotnet/runtime/tree/main/src/tests/Interop/PInvoke/Generics but I believe they aren't being run for any platforms since RyuJIT currently blocks P/Invoke for these types.

fanyang-mono · 2022-05-25T14:40:46Z

/azp run runtime-extra-platforms

azure-pipelines · 2022-05-25T14:41:08Z

Azure Pipelines successfully started running 1 pipeline(s).

fanyang-mono · 2022-05-26T12:19:07Z

The test failures on CI are unrelated to this PR.

Build windows x64 Release AllSubsets_Mono - the rolling build shows the same failure.
Build Browser wasm windows Release WasmBuildTests - the same test failures can be found on other PRs.

src/mono/mono/mini/mini-generic-sharing.c

src/mono/mono/mini/mini-llvm.c

src/mono/mono/mini/mini.h

src/mono/mono/mini/mini-amd64.c

src/mono/mono/mini/mini-arm64.c

fanyang-mono · 2022-05-26T18:01:54Z

> It will always using SIMD register.

> It might be good to ensure that the relevant interop tests all pass with this feature. There are a number of tests under https://github.com/dotnet/runtime/tree/main/src/tests/Interop/PInvoke/Generics but I believe they aren't being run for any platforms since RyuJIT currently blocks P/Invoke for these types.

This PR only handles the managed-to-managed calls. To enable those PInvoke tests, more work is needed to make sure the ABI/calling conventions are handled correctly, as @lambdageek mentioned earlier.

vargaz · 2022-05-26T18:08:23Z

/azp run runtime-extra-platforms

azure-pipelines · 2022-05-26T18:08:46Z

Azure Pipelines successfully started running 1 pipeline(s).

fanyang-mono · 2022-05-31T16:34:29Z

The test failures were unrelated to this PR.

fanyang-mono added 5 commits April 30, 2022 09:06

Add a new LLVMArgStorage type

15badfe

Fix llvm signature

a069e0e

Fix emit_call

ffdcc2a

Enable passing SIMD register on arm64 and clean up debugging stuff

6e42d91

Clean up unnecessary format change

93da354

ghost assigned fanyang-mono May 6, 2022

fanyang-mono added the area-Codegen-LLVM-mono label May 6, 2022

fanyang-mono added 2 commits May 7, 2022 09:18

Merge branch 'main' into simd_register

bc396db

Update code after rebase

6d2c7d5

runfoapp bot mentioned this pull request May 9, 2022

EMSDK-.WasmNativeDefaultsTests work item failing #67887

Closed

fanyang-mono added 2 commits May 9, 2022 16:38

Only use SIMD register when element type is primitive type

3086bfa

Merge branch 'main' into simd_register

d56bd1f

fanyang-mono added 2 commits May 16, 2022 09:52

Disable SIMD intrinsics for generic sharing, since it doesn't work at all.

5383b55

Check if element type is primitive on arm64

91d196b

fanyang-mono added 2 commits May 23, 2022 15:13

Merge branch 'main' into simd_register

fda3a73

Not using SIMD register when it is full aot

b9b1767

fanyang-mono marked this pull request as ready for review May 25, 2022 14:41

fanyang-mono requested review from vargaz, lambdageek and SamMonoRT as code owners May 25, 2022 14:41

fanyang-mono changed the title ~~Pass Vector128 in SIMD registers~~ [Mono] Pass Vector128 in SIMD registers, when LLVM is enabled May 25, 2022