[windows] Always pass fp128 arguments indirectly #128848
Conversation
@llvm/pr-subscribers-backend-x86

Author: Trevor Gross (tgross35)

Changes

LLVM expects `__float128` to be both passed and returned in xmm registers on Windows. However, this disagrees with the Windows x86-64 calling convention [1], which indicates that values larger than 64 bits should be passed indirectly.

Update LLVM's libcall calling convention to pass `fp128` indirectly. Returning in xmm0 is unchanged, since this seems like a reasonable extrapolation of the ABI. With this change, the calling convention for `i128` and `f128` is the same.

This corresponds to the frontend change in [2], see more details there.

[1]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: llvm/llvm-project#115052

Patch is 121.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128848.diff

3 Files Affected:
 diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index cf164acba9ec0..0d087e057a2bd 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -639,8 +639,9 @@ def CC_X86_Win64_C : CallingConv<[
   // 512 bit vectors are passed by pointer
   CCIfType<[v64i8, v32i16, v16i32, v32f16, v32bf16, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
 
-  // Long doubles are passed by pointer
+  // Float types larger than 64-bits (long double and fp128) are passed by pointer
   CCIfType<[f80], CCPassIndirect<i64>>,
+  CCIfType<[f128], CCPassIndirect<i64>>,
 
   // If SSE was disabled, pass FP values smaller than 64-bits as integers in
   // GPRs or on the stack.
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index 5263e0d4f6f39..a85b53ea62ac7 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -8,6 +8,12 @@
 ; RUN: llc < %s -O2 -mtriple=i686-linux-gnu -mattr=+sse2 \
 ; RUN:     -enable-legalize-types-checking \
 ; RUN:     | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -O2 -mtriple=x86_64-pc-windows-msvc \
+; RUN:     -enable-legalize-types-checking \
+; RUN:     | FileCheck %s --check-prefix=WIN
+; RUN: llc < %s -O2 -mtriple=i686-pc-windows-msvc \
+; RUN:     -enable-legalize-types-checking \
+; RUN:     | FileCheck %s --check-prefix=WIN-X86
 
 ; Check all soft floating point library function calls.
 
@@ -57,6 +63,55 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: add:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq __addtf3
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: add:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll ___addtf3
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %add = call fp128 @llvm.experimental.constrained.fadd.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %add
@@ -108,6 +163,55 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: sub:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq __subtf3
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: sub:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll ___subtf3
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %sub = call fp128 @llvm.experimental.constrained.fsub.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %sub
@@ -159,6 +263,55 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: mul:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq __multf3
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: mul:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll ___multf3
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %mul = call fp128 @llvm.experimental.constrained.fmul.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %mul
@@ -210,6 +363,55 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: div:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq __divtf3
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: div:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll ___divtf3
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %div = call fp128 @llvm.experimental.constrained.fdiv.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %div
@@ -258,6 +460,62 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: fma:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $88, %rsp
+; WIN-NEXT:    movaps (%r8), %xmm0
+; WIN-NEXT:    movaps (%rcx), %xmm1
+; WIN-NEXT:    movaps (%rdx), %xmm2
+; WIN-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
+; WIN-NEXT:    callq fmal
+; WIN-NEXT:    addq $88, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: fma:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 56(%ebp)
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _fmal
+; WIN-X86-NEXT:    addl $52, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %fma = call fp128 @llvm.experimental.constrained.fma.f128(fp128 %x, fp128 %y,  fp128 %z, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %fma
@@ -302,6 +560,55 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: frem:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $72, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps (%rdx), %xmm1
+; WIN-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN-NEXT:    callq fmodl
+; WIN-NEXT:    addq $72, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: frem:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _fmodl
+; WIN-X86-NEXT:    addl $36, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %div = call fp128 @llvm.experimental.constrained.frem.f128(fp128 %x, fp128 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %div
@@ -342,6 +649,48 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: ceil:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    callq ceill
+; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: ceil:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _ceill
+; WIN-X86-NEXT:    addl $20, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %ceil = call fp128 @llvm.experimental.constrained.ceil.f128(fp128 %x, metadata !"fpexcept.strict") #0
   ret fp128 %ceil
@@ -382,6 +731,48 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: acos:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    callq acosl
+; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: acos:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _acosl
+; WIN-X86-NEXT:    addl $20, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %acos = call fp128 @llvm.experimental.constrained.acos.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %acos
@@ -422,6 +813,48 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: cos:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    callq cosl
+; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: cos:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _cosl
+; WIN-X86-NEXT:    addl $20, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %cos = call fp128 @llvm.experimental.constrained.cos.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
   ret fp128 %cos
@@ -462,6 +895,48 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
+;
+; WIN-LABEL: cosh:
+; WIN:       # %bb.0: # %entry
+; WIN-NEXT:    subq $56, %rsp
+; WIN-NEXT:    movaps (%rcx), %xmm0
+; WIN-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN-NEXT:    callq coshl
+; WIN-NEXT:    addq $56, %rsp
+; WIN-NEXT:    retq
+;
+; WIN-X86-LABEL: cosh:
+; WIN-X86:       # %bb.0: # %entry
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esp, %eax
+; WIN-X86-NEXT:    pushl 24(%ebp)
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    calll _coshl
+; WIN-X86-NEXT:    addl $20, %esp
+; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN-X86-NEXT:    movl %edi, 8(%esi)
+; WIN-X86-NEXT:    movl %edx, 12(%esi)
+; WIN-X86-NEXT:    movl %eax, (%esi)
+; WIN-X86-NEXT:    movl %ecx, 4(%esi)
+; WIN-X86-NEXT:    movl %esi, %eax
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebp
+; WIN-X86-NEXT:    retl
 entry:
   %cosh = call fp128 @llvm.experim...
[truncated]
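As a standalone illustration of the convention the diff above exercises (a sketch, not part of the patch; the file and function names are made up), legalization turns an `fp128` add into a libcall, and the new WIN check lines show each operand being spilled to a 16-byte-aligned slot whose address goes in rcx/rdx:

    ; demo.ll -- try: llc -mtriple=x86_64-pc-windows-msvc -O2 demo.ll -o -
    ; x86-64 has no native f128 arithmetic, so the fadd is legalized into a
    ; call to __addtf3. With CCPassIndirect<i64>, each fp128 operand is
    ; stored to a 16-byte-aligned stack slot and its address is passed in
    ; rcx/rdx; the result is still returned in xmm0.
    define fp128 @demo_add(fp128 %x, fp128 %y) nounwind {
      %sum = fadd fp128 %x, %y
      ret fp128 %sum
    }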
Force-pushed f6d8e8b to eea0a28
@rnk in #115052 (comment) you mentioned that changes in …
I went looking at …
Force-pushed f57b8a2 to 71e42dc
Force-pushed 71e42dc to b6c7942
    …, r=<try> [do not merge] Windows f128 abi experiment Running tests with llvm/llvm-project#115052 and llvm/llvm-project#128848. r? `@ghost` try-job: dist-x86_64-msvc try-job: dist-x86_64-mingw try-job: x86_64-msvc-1 try-job: x86_64-msvc-2 try-job: x86_64-mingw-1 try-job: x86_64-mingw-2 try-job: x86_64-mingw-3
Thanks for looking into that. I think this should be all set then, along with #115052.
Thanks for reviewing. I need somebody to land this for me; the first commit for both PRs is NFC.
Clang currently passes and returns `__float128` in vector registers on
MinGW targets, which is LLVM's default ABI for `fp128`. However, the
Windows x86-64 calling convention [1] states the following:
    __m128 types, arrays, and strings are never passed by immediate
    value. Instead, a pointer is passed to memory allocated by the
    caller. Structs and unions of size 8, 16, 32, or 64 bits, and __m64
    types, are passed as if they were integers of the same size. Structs
    or unions of other sizes are passed as a pointer to memory allocated
    by the caller. For these aggregate types passed as a pointer,
    including __m128, the caller-allocated temporary memory must be
    16-byte aligned.
Based on the above, it sounds like `__float128` should be passed
indirectly. Thus, change `f128` passing to use the stack and make the
return in xmm0 explicit. This is identical to `i128`, and passing is
the same as GCC.
Regarding return values, the documentation states:
    A scalar return value that can fit into 64 bits, including the __m64
    type, is returned through RAX. Non-scalar types including floats,
    doubles, and vector types such as __m128, __m128i, __m128d are
    returned in XMM0.
This makes it sound like it should be acceptable to return `__float128`
in xmm0; however, GCC returns `__float128` on the stack. The ABI
statement above, as well as consistency with `i128` (which is returned
in xmm0), means it would likely be better for GCC to change its return
ABI to match Clang rather than the other way around, so that portion is
left as-is.
Clang's MSVC targets do not support `__float128` or `_Float128`, but
these changes would also apply there if it is eventually enabled.
With [2], which should land around the same time, LLVM will also
implement this ABI, so it is not technically necessary for Clang to make
a change here as well. It is still done in order to be consistent with
other types, and to allow calling-convention-aware optimizations at all
available optimization layers (@rnk mentioned possible reuse of stack
arguments). An added benefit is readability of the LLVM IR, since it
more accurately reflects what the lowered assembly does.
[1]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: llvm#128848
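A hedged sketch of what this Clang-side change implies at the IR level (illustrative only; the parameter name and the exact attributes Clang emits are assumptions, not taken from the patch): a `__float128` argument on a MinGW target arrives as a pointer to caller-allocated, 16-byte-aligned memory, while the return stays direct so it can use xmm0.

    ; Illustrative lowering of `__float128 f(__float128 x)` for
    ; x86_64-pc-windows-gnu after this change (sketch, not Clang's
    ; verbatim output): the argument is passed indirectly, the result
    ; directly.
    define fp128 @f(ptr %x.indirect) {
      %x = load fp128, ptr %x.indirect, align 16
      ret fp128 %x
    }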
@phoebewang would you mind getting this one too? The first commit is NFC and should land separately.
I see some problems here: …
Add Windows invocations to existing fp128 libcall tests and create a new ABI test for x86.
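For flavor, a minimal ABI test in the same style (a sketch, not one of the actual files in the patch; the CHECK line follows the pattern visible in the diff above):

    ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefix=WIN

    ; With fp128 passed indirectly, the argument arrives as a pointer in
    ; rcx and the value is loaded into xmm0 for the return.
    define fp128 @abi(fp128 %x) nounwind {
    ; WIN-LABEL: abi:
    ; WIN: movaps (%rcx), %xmm0
      ret fp128 %x
    }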
Force-pushed b6c7942 to 5187cfd
LLVM currently expects `__float128` to be both passed and returned in xmm registers on Windows. However, this disagrees with the Windows x86-64 calling convention [1], which indicates values larger than 64 bits should be passed indirectly.

Update LLVM's default Windows calling convention to pass `fp128` indirectly. Returning in xmm0 is unchanged, since this seems like a reasonable extrapolation of the ABI. With this patch, the calling convention for `i128` and `f128` is the same.

GCC passes `__float128` indirectly, which this also matches. However, it also returns indirectly, which is not done here. I intend to attempt a GCC change to also return in `xmm0` rather than making that change here, given the consistency with `i128`.

This corresponds to the frontend change in [2], see more details there.

[1]: https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention?view=msvc-170
[2]: llvm#115052
Force-pushed 5187cfd to e0a3738
This is Windows-only. Were you referring to "Update LLVM's default calling convention" in the description? I meant this to refer to the default Windows CC used for libcalls; I updated the wording to be clearer. The SysV ABI actually specifies __float128 as SSE+SSEUP, and LLVM already does the right thing there.

That test only runs on Linux, should I add the Windows targets? Some builtin calls are covered in …

Done. Also rebased so this is on top of #115052.
          
No, I had just read the patch as affecting both Linux and Windows. It looks good to me if it's Windows-only.
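To make the Windows-only scope concrete (a sketch under the stated assumption of a SysV target versus the Windows target, not taken from the thread): compiling the same IR for a Linux triple keeps `fp128` in SSE registers, while the Windows triple now uses indirection.

    ; pass_through.ll -- compare:
    ;   llc -mtriple=x86_64-linux-gnu       pass_through.ll -o -
    ;   llc -mtriple=x86_64-pc-windows-msvc pass_through.ll -o -
    ; SysV classifies __float128 as SSE+SSEUP, so %x arrives in xmm0;
    ; on Win64 it now arrives as a pointer in rcx, with the return
    ; still in xmm0 in both cases.
    define fp128 @pass_through(fp128 %x) nounwind {
      ret fp128 %x
    }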