Skip to content

Commit

Permalink
arm: Stop vadcq, vsbcq intrinsics from overwriting the FPSCR NZ flags
Browse files Browse the repository at this point in the history
Hi all,

We noticed that calls to the vadcq and vsbcq intrinsics, both of
which use __builtin_arm_set_fpscr_nzcvqc to set the Carry flag in
the FPSCR, would produce the following code:

```
< r2 is the *carry input >
vmrs	r3, FPSCR_nzcvqc
bic	r3, r3, #536870912
orr	r3, r3, r2, lsl #29
vmsr	FPSCR_nzcvqc, r3
```

when the MVE ACLE instead gives a different instruction sequence of:
```
< Rt is the *carry input >
VMRS Rs,FPSCR_nzcvqc
BFI Rs,Rt,#29,#1
VMSR FPSCR_nzcvqc,Rs
```

the bic + orr pair is slower and it's also wrong, because, if the
*carry input is greater than 1, then we risk overwriting the top two
bits of the FPSCR register (the N and Z flags).

This turned out to be a problem in the header file and the solution was
to simply add a `& 0x1u` to the `*carry` input: then the compiler knows
that we only care about the lowest bit and can optimise to a BFI.

Ok for trunk?

Thanks,
Stam Markianos-Wright

gcc/ChangeLog:

	* config/arm/arm_mve.h (__arm_vadcq_s32): Fix arithmetic.
	(__arm_vadcq_u32): Likewise.
	(__arm_vadcq_m_s32): Likewise.
	(__arm_vadcq_m_u32): Likewise.
	(__arm_vsbcq_s32): Likewise.
	(__arm_vsbcq_u32): Likewise.
	(__arm_vsbcq_m_s32): Likewise.
	(__arm_vsbcq_m_u32): Likewise.
	* config/arm/mve.md (get_fpscr_nzcvqc): Make unspec_volatile.

gcc/testsuite/ChangeLog:
	* gcc.target/arm/mve/mve_vadcq_vsbcq_fpscr_overwrite.c: New.
  • Loading branch information
Stammark committed May 18, 2023
1 parent a3010f7 commit 1ad81fc
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 9 deletions.
16 changes: 8 additions & 8 deletions gcc/config/arm/arm_mve.h
Original file line number Diff line number Diff line change
Expand Up @@ -16055,7 +16055,7 @@ __extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vadcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry)
{
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | (*__carry << 29));
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
int32x4_t __res = __builtin_mve_vadcq_sv4si (__a, __b);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
Expand All @@ -16065,7 +16065,7 @@ __extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vadcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry)
{
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | (*__carry << 29));
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
uint32x4_t __res = __builtin_mve_vadcq_uv4si (__a, __b);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
Expand All @@ -16075,7 +16075,7 @@ __extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vadcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p)
{
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | (*__carry << 29));
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
int32x4_t __res = __builtin_mve_vadcq_m_sv4si (__inactive, __a, __b, __p);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
Expand All @@ -16085,7 +16085,7 @@ __extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vadcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p)
{
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | (*__carry << 29));
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
uint32x4_t __res = __builtin_mve_vadcq_m_uv4si (__inactive, __a, __b, __p);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
Expand Down Expand Up @@ -16131,7 +16131,7 @@ __extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vsbcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry)
{
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | (*__carry << 29));
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
int32x4_t __res = __builtin_mve_vsbcq_sv4si (__a, __b);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
Expand All @@ -16141,7 +16141,7 @@ __extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vsbcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry)
{
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | (*__carry << 29));
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
uint32x4_t __res = __builtin_mve_vsbcq_uv4si (__a, __b);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
Expand All @@ -16151,7 +16151,7 @@ __extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vsbcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p)
{
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | (*__carry << 29));
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
int32x4_t __res = __builtin_mve_vsbcq_m_sv4si (__inactive, __a, __b, __p);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
Expand All @@ -16161,7 +16161,7 @@ __extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__arm_vsbcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p)
{
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | (*__carry << 29));
__builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29));
uint32x4_t __res = __builtin_mve_vsbcq_m_uv4si (__inactive, __a, __b, __p);
*__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u;
return __res;
Expand Down
2 changes: 1 addition & 1 deletion gcc/config/arm/mve.md
Original file line number Diff line number Diff line change
Expand Up @@ -9782,7 +9782,7 @@

(define_insn "get_fpscr_nzcvqc"
[(set (match_operand:SI 0 "register_operand" "=r")
(unspec:SI [(reg:SI VFPCC_REGNUM)] UNSPEC_GET_FPSCR_NZCVQC))]
(unspec_volatile:SI [(reg:SI VFPCC_REGNUM)] UNSPEC_GET_FPSCR_NZCVQC))]
"TARGET_HAVE_MVE"
"vmrs\\t%0, FPSCR_nzcvqc"
[(set_attr "type" "mve_move")])
Expand Down
67 changes: 67 additions & 0 deletions gcc/testsuite/gcc.target/arm/mve/mve_vadcq_vsbcq_fpscr_overwrite.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/* { dg-do run } */
/* { dg-require-effective-target arm_mve_hw } */
/* { dg-options "-O2" } */
/* { dg-add-options arm_v8_1m_mve } */

#include <arm_mve.h>

volatile int32x4_t c1;
volatile uint32x4_t c2;
/* Carry word handed to the intrinsics.  It has to be a real object (not a
   null pointer): each intrinsic reads the carry-in through the pointer and
   writes the carry-out back through it.  */
unsigned carry;

/* Abort if any FPSCR_nzcvqc bit other than the carry flag (bit 29) is set.
   Note the bitwise '~': a logical '!' would turn the mask into 0 and make
   the check vacuous.  */
static void
check_only_carry_touched (void)
{
  if (__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000)
    __builtin_abort ();
}

/* For every vadcq/vsbcq variant: clear FPSCR_nzcvqc, pass a carry-in with
   all bits set, run the intrinsic, and verify that no flag other than the
   carry flag ended up set.  */
int
main ()
{
  int32x4_t a1 = vcreateq_s32 (0, 0);
  int32x4_t b1 = vcreateq_s32 (0, 0);
  int32x4_t inactive1 = vcreateq_s32 (0, 0);

  uint32x4_t a2 = vcreateq_u32 (0, 0);
  uint32x4_t b2 = vcreateq_u32 (0, 0);
  uint32x4_t inactive2 = vcreateq_u32 (0, 0);

  mve_pred16_t p = 0xFFFF;

  /* Unpredicated add-with-carry.  */
  carry = 0xFFFFFFFFu;
  __builtin_arm_set_fpscr_nzcvqc (0);
  c1 = vadcq (a1, b1, &carry);
  check_only_carry_touched ();

  carry = 0xFFFFFFFFu;
  __builtin_arm_set_fpscr_nzcvqc (0);
  c2 = vadcq (a2, b2, &carry);
  check_only_carry_touched ();

  /* Unpredicated subtract-with-carry.  */
  carry = 0xFFFFFFFFu;
  __builtin_arm_set_fpscr_nzcvqc (0);
  c1 = vsbcq (a1, b1, &carry);
  check_only_carry_touched ();

  carry = 0xFFFFFFFFu;
  __builtin_arm_set_fpscr_nzcvqc (0);
  c2 = vsbcq (a2, b2, &carry);
  check_only_carry_touched ();

  /* Predicated add-with-carry.  */
  carry = 0xFFFFFFFFu;
  __builtin_arm_set_fpscr_nzcvqc (0);
  c1 = vadcq_m (inactive1, a1, b1, &carry, p);
  check_only_carry_touched ();

  carry = 0xFFFFFFFFu;
  __builtin_arm_set_fpscr_nzcvqc (0);
  c2 = vadcq_m (inactive2, a2, b2, &carry, p);
  check_only_carry_touched ();

  /* Predicated subtract-with-carry.  */
  carry = 0xFFFFFFFFu;
  __builtin_arm_set_fpscr_nzcvqc (0);
  c1 = vsbcq_m (inactive1, a1, b1, &carry, p);
  check_only_carry_touched ();

  carry = 0xFFFFFFFFu;
  __builtin_arm_set_fpscr_nzcvqc (0);
  c2 = vsbcq_m (inactive2, a2, b2, &carry, p);
  check_only_carry_touched ();

  return 0;
}

0 comments on commit 1ad81fc

Please sign in to comment.