
[x86] default codegen should be more branchy #32360

Open
rotateright opened this issue May 11, 2017 · 18 comments
Labels
backend:X86, bugzilla (Issues migrated from bugzilla)

Comments

@rotateright
Contributor

Bugzilla Link 33013
Version trunk
OS All
CC @RKSimon,@michaeljclark,@TNorthover,@ZviRackover

Extended Description

The default x86 target is something like an Intel big core (ie, it's good at absorbing the cost of correctly predicted branches, good at predicting those branches, and good at speculating execution past those branches).

Therefore, we shouldn't favor cmov codegen for IR select as much as we currently do. Example:

int foo(float x) {
  if (x < 42.0f)
    return x;
  return 12;
}

define i32 @foo(float %x) {
  %cmp = fcmp olt float %x, 4.200000e+01
  %conv = fptosi float %x to i32
  %ret = select i1 %cmp, i32 %conv, i32 12
  ret i32 %ret
}

$ clang -O2 cmovfp.c -S -o -
movss LCPI0_0(%rip), %xmm1 ## xmm1 = mem[0],zero,zero,zero
ucomiss %xmm0, %xmm1
cvttss2si %xmm0, %ecx
movl $12, %eax
cmoval %ecx, %eax
retq

Note that gcc and icc will use compare and branch on this example.

@rotateright
Contributor Author

An example from https://groups.google.com/forum/#!topic/llvm-dev/Tl9SD-edUJI where the select is not explicit in IR:

define i64 @cvtf(float %f) {
  %cvt = fptoui float %f to i64
  ret i64 %cvt
}

movss	LCPI0_0(%rip), %xmm1    ## xmm1 = mem[0],zero,zero,zero
movaps	%xmm0, %xmm2
subss	%xmm1, %xmm2
cvttss2si	%xmm2, %rax
movabsq	$-9223372036854775808, %rcx ## imm = 0x8000000000000000
xorq	%rax, %rcx
cvttss2si	%xmm0, %rax
ucomiss	%xmm1, %xmm0
cmovaeq	%rcx, %rax
retq

@michaeljclark

michaeljclark mannequin commented May 12, 2017

Another issue related to the use of predicate logic versus a branch is the unconditional execution of ISD::FSUB, which causes FE_INEXACT to be raised for float to u64 and double to u64 conversions. See the test code and workaround below.

See SelectionDAGLegalize::ExpandNode()

case ISD::FP_TO_UINT: {
  SDValue True, False;
  EVT VT = Node->getOperand(0).getValueType();
  EVT NVT = Node->getValueType(0);
  APFloat apf(DAG.EVTToAPFloatSemantics(VT),
              APInt::getNullValue(VT.getSizeInBits()));
  APInt x = APInt::getSignMask(NVT.getSizeInBits());
  (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
  Tmp1 = DAG.getConstantFP(apf, dl, VT);
  Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
                      Node->getOperand(0),
                      Tmp1, ISD::SETLT);
  True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
  // TODO: Should any fast-math-flags be set for the FSUB?
  False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
                      DAG.getNode(ISD::FSUB, dl, VT,
                                  Node->getOperand(0), Tmp1));
  False = DAG.getNode(ISD::XOR, dl, NVT, False,
                      DAG.getConstant(x, dl, NVT));
  Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
  Results.push_back(Tmp1);
  break;
}
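
For readers less familiar with the DAG nodes, here is a rough C-level sketch of what the expansion above computes for a float to u64 conversion (my own approximation, not code from LLVM). The key point is that both arms of the select are evaluated unconditionally, so the FSUB executes even when the input is below 2^63 and can raise FE_INEXACT spuriously:

#include <stdint.h>

/* Rough C equivalent of the branchless fptoui expansion above (illustrative
 * only; the out-of-range float-to-int casts are UB in ISO C but correspond
 * to the machine-level FP_TO_SINT). The subtraction is evaluated even when
 * f < 2^63, which is what sets FE_INEXACT spuriously for small inputs. */
static uint64_t fptoui_branchless_sketch(float f)
{
    const float two63 = 9223372036854775808.0f;        /* 2^63, exactly representable */
    uint64_t small = (uint64_t)(int64_t)f;              /* FP_TO_SINT(f)               */
    uint64_t big   = ((uint64_t)(int64_t)(f - two63))   /* FSUB always executes        */
                     ^ 0x8000000000000000ULL;           /* then flip the sign bit      */
    return f < two63 ? small : big;                     /* the ISD::SELECT / cmov      */
}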

Expected output:

$ clang fcvt-bug.cc
$ ./a.out
1 exact
1 inexact
1 exact
1 inexact
1 exact
1 inexact
1 exact
1 inexact

Erroneous output:

$ clang -DDISABLE_WORKAROUND fcvt-bug.cc
$ ./a.out
1 exact
1 inexact
1 inexact
1 inexact
1 exact
1 inexact
1 inexact
1 inexact

$ cat fcvt-bug.cc
#include <stdio.h>
#include <fenv.h>

typedef float f32;
typedef double f64;
typedef signed int s32;
typedef unsigned int u32;
typedef signed long long s64;
typedef unsigned long long u64;

#if defined __GNUC__ && defined __x86_64__ && !defined DISABLE_WORKAROUND

__attribute__ ((noinline)) s64 fcvt_s_lu(f32 f)
{
    s64 res;
    static const union { f32 f; u32 u; } val = { .u = 0x5f000000 };
    if (f > val.f) {
        asm(
            "subss %[sub], %[input]\n "
            "movabsq $0x8000000000000000, %%rdx\n "
            "cvttss2siq %[input], %[result]\n "
            "xorq %%rdx, %[result]"
            : [result] "=r"(res)
            : [sub] "m"(val.f), [input] "x"(f)
            : "rdx"
        );
    } else {
        asm(
            "cvttss2siq %[input], %[result]"
            : [result] "=r"(res)
            : [input] "x"(f)
        );
    }
    return res;
}

__attribute__ ((noinline)) s64 fcvt_d_lu(f64 f)
{
    s64 res;
    static const union f64_bits { f64 f; u64 u; } val = { .u = 0x43E0000000000000ULL };
    if (f > val.f) {
        asm(
            "subsd %[sub], %[input]\n "
            "movabsq $0x8000000000000000, %%rdx\n "
            "cvttsd2siq %[input], %[result]\n "
            "xorq %%rdx, %[result]"
            : [result] "=r"(res)
            : [sub] "m"(val.f), [input] "x"(f)
            : "rdx"
        );
    } else {
        asm(
            "cvttsd2siq %[input], %[result]"
            : [result] "=r"(res)
            : [input] "x"(f)
        );
    }
    return res;
}

#else

__attribute__ ((noinline)) s64 fcvt_s_lu(f32 f) { return s64(u64(f)); }
__attribute__ ((noinline)) s64 fcvt_d_lu(f64 f) { return s64(u64(f)); }

#endif

__attribute__ ((noinline)) s32 fcvt_s_wu(f32 f) { return s32(u32(f)); }
__attribute__ ((noinline)) s32 fcvt_d_wu(f64 f) { return s32(u32(f)); }

void test_fcvt_s_wu(f32 a)
{
    feclearexcept(FE_ALL_EXCEPT);
    printf("%d ", fcvt_s_wu(a));
    printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_s_lu(f32 a)
{
    feclearexcept(FE_ALL_EXCEPT);
    printf("%lld ", fcvt_s_lu(a));
    printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_d_wu(f64 a)
{
    feclearexcept(FE_ALL_EXCEPT);
    printf("%d ", fcvt_d_wu(a));
    printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

void test_fcvt_d_lu(f64 a)
{
    feclearexcept(FE_ALL_EXCEPT);
    printf("%lld ", fcvt_d_lu(a));
    printf("%s\n", fetestexcept(FE_INEXACT) ? "inexact" : "exact");
}

int main()
{
    fesetround(FE_TONEAREST);

    test_fcvt_s_wu(1.0f);
    test_fcvt_s_wu(1.1f);
    test_fcvt_s_lu(1.0f);
    test_fcvt_s_lu(1.1f);
    test_fcvt_d_wu(1.0f);
    test_fcvt_d_wu(1.1f);
    test_fcvt_d_lu(1.0f);
    test_fcvt_d_lu(1.1f);

}

@michaeljclark

michaeljclark mannequin commented May 12, 2017

Given that the primary issue is the speed of branch vs predicate logic, here is a microbenchmark of the two approaches:

$ time fcvt-branch

real 0m0.208s
user 0m0.201s
sys 0m0.002s

$ time fcvt-cmov

real 0m0.241s
user 0m0.235s
sys 0m0.002s

int main() {
    static const union { float f; unsigned u; } val = { .u = 0x5f000000 };
    for (unsigned long i = 0; i < 100000000; i++) {
        unsigned long res;
        float f = 1.0f;
#ifndef USE_CMOV
        asm volatile (
            "ucomiss %[sub], %[input]\n "
            "ja 1f\n "
            "cvttss2siq %[input], %[result]\n "
            "jmp 2f\n "
            "1:\n "
            "subss %[sub], %[input]\n "
            "movabsq $0x8000000000000000, %%rdx\n "
            "cvttss2siq %[input], %[result]\n "
            "xorq %%rdx, %[result]\n "
            "2: "
            : [result] "=r"(res)
            : [sub] "m"(val.f), [input] "x"(f)
            : "rdx"
        );
#else
        asm volatile (
            "movss %[sub], %%xmm1\n "
            "movaps %[input], %%xmm2\n "
            "subss %%xmm1, %%xmm2\n "
            "cvttss2si %%xmm2, %[result]\n "
            "movabsq $0x8000000000000000, %%rdx\n "
            "xorq %[result], %%rdx\n "
            "cvttss2si %[input], %[result]\n "
            "ucomiss %%xmm1, %[input]\n "
            "cmovaeq %%rdx, %[result]\n "
            : [result] "=r"(res)
            : [sub] "m"(val.f), [input] "x"(f)
            : "rdx", "xmm1", "xmm2"
        );
#endif
    }
}

@rotateright
Contributor Author

Given that the primary issue is the speed of branch vs predicate logic, here
is a microbenchmark of the two approaches:

As written, this is the absolute best case for the branching version, right? We should vary the input values to exercise the predictor, so we know when/if the cmov version is ever better.

If you can provide some info about your test machine, that would be good too. It's quite likely that the data and our conclusions are going to be different for say Intel Skylake vs. AMD Jaguar.
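
Something along these lines would exercise the predictor; this is a hypothetical driver I'm sketching, not the original benchmark, and it assumes the fcvt_s_lu conversion routine from the earlier test case is linked in:

#include <stdlib.h>

/* Hypothetical benchmark driver: pre-generate a mix of in-range and
 * out-of-range inputs so the f >= 2^63 branch mispredicts some of the time,
 * instead of being 100% predictable as it is with one constant input. */
extern long long fcvt_s_lu(float f);   /* conversion under test (see above) */

int main(void)
{
    enum { N = 4096 };
    static float inputs[N];
    const float two63 = 9223372036854775808.0f;   /* 2^63 */
    volatile long long sink = 0;

    srand(1);                                     /* fixed seed, repeatable runs */
    for (int i = 0; i < N; i++)
        inputs[i] = (rand() & 15) ? 1.5f : 1.5f * two63;   /* ~6% take the slow path */

    for (unsigned long i = 0; i < 100000000UL; i++)
        sink += fcvt_s_lu(inputs[i & (N - 1)]);

    return (int)(sink & 1);
}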

For now, I'm assuming that x86 needs to do something like this:
https://reviews.llvm.org/rL292154

This is for 2 reasons:

  1. Converting selects to branches early could have a negative impact on DAGCombiner/isel because that's limited to a basic block. We actually do convert some selects early in CodeGenPrepare::optimizeSelectInst(), but that's limited to the case where we have a known expensive op (division for example) and/or metadata (profile info) to guide that decision.

  2. As shown in the fptoui example, the select may not exist in IR. In that case, there's not much choice. We have to do this transform later unless we're willing to expand fptoui into the equivalent IR that contains a select and/or a compare+branch in CGP.

@llvmbot
Collaborator

llvmbot commented May 12, 2017

There are certainly cases where cmov's data dependency is a bottleneck, instead of a control dependency that can be speculated and that avoids doing some work. e.g. http://stackoverflow.com/questions/28875325/gcc-optimization-flag-o3-makes-code-slower-then-o2 shows a case where a cmov data dependency (from gcc -O3) is slower than a conditional branch (from gcc -O2), for sorted input that leads to perfect branch prediction.

But caution is required here. More branchy code could be slower, even with predictable branches.

I'm not sure what heuristics would be most useful in guiding the choice between branchy vs. branchless. Obviously saving a lot of work in one side of the branch is a big one, but also creating loop-carried dep chains that would be the biggest bottleneck is another. (In that SO question, unrolling with multiple accumulators would have avoided a latency bottleneck.)
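
For reference, the loop in that SO question is essentially the following (my paraphrase in C; variable names are mine). Compiled with cmov, the conditional add becomes part of the loop-carried dependency chain through sum; compiled as a branch over sorted data, the add is perfectly predicted and can be speculated, so the chain is shorter:

#include <stddef.h>

/* Essentially the loop from the linked SO question (paraphrased). */
long long sum_large_elements(const int *data, size_t n)
{
    long long sum = 0;
    for (size_t i = 0; i < n; i++) {
        if (data[i] >= 128)     /* branchy form: predictable on sorted input */
            sum += data[i];
    }
    return sum;
}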


More total branches would take up more branch-predictor entries, which would likely mean that on average branches are predicted less well. This is another effect that microbenchmarks won't reveal.

On some CPUs, never-taken branches don't use any branch-prediction resources (since fall-through is the default prediction), but on at least some Intel CPUs, they probably do affect and are affected by other branches. http://agner.org/optimize/ says that recent Intel CPUs don't use static branch prediction, but instead

Also, taken branches always have the potential to reduce front-end throughput, even when predicted perfectly. CPUs have buffers between fetch and decode to hide fetch bubbles created by branches, but too many could expose them. uop-caches do help with this, but they aren't trace caches (so a taken branch usually means that the next uop comes from a different line of the uop cache).


Those inline-asm sequences have a flaw which hurts the branching version: The fast-path has a taken JMP, because it has to jump over the slow-path instead of putting it out-of-line at the end of the function. (This is only an inline-asm problem; gcc and clang of course already put unlikely branch targets out-of-line and have them jump back, so this is only a problem with the test methodology).

Taken-branch throughput can be an issue inside a tight loop like the one you're testing. (e.g. Haswell/Skylake can execute/retire 2 not-taken branches per clock, but only one taken branch per 2 clocks. Loop-branches are a special case, so tiny loops can run at 1 iteration per clock.)

@llvmbot
Collaborator

llvmbot commented May 12, 2017

Oops, forgot to finish writing the paragraph about static branch prediction, and whether modern Intel CPUs use it or not. Matt Godbolt did some investigation of Nehalem vs. Haswell: https://xania.org/201602/bpu-part-two.

Also forgot to mention: on Intel Pentium-M/Core2 through Haswell, cmov is 2 uops with 2c latency. On Broadwell/SKL, it's 1 uop (like ADC/SBB and some other 3-input instructions, taking advantage of the 3-input uop support introduced for FMA in Haswell).

CMOV is only 1 uop / m-op on AMD, and on Silvermont (but it is 2c latency on Silvermont).

Anyway, on much of the current installed-base of x86 CPUs, CMOV is 2 uops / 2c latency, but that will change as more older Intel CPUs are replaced.

Of course, CMOV isn't the only option for branchless code. Especially on CPUs where it's 2 uops, an alternative like SAR/AND (or BMI2 ANDN) can be good vs. CMP/CMOV. See this interesting case of compiler hand-holding for example: http://stackoverflow.com/questions/34407437/what-is-the-efficient-way-to-count-set-bits-at-a-position-or-lower/34410357#34410357, where we couldn't avoid a CMP or TEST because POPCNT clobbers flags, so SAR/AND is no worse than TEST/CMOV even on CPUs with efficient CMOV.
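
As a concrete illustration of the SAR/AND idea (my own sketch, not the code from the linked answer): selecting between x and 0 based on the sign can be done with an arithmetic shift and an AND, with no flags and no CMOV involved:

#include <stdint.h>

/* Sketch of the SAR/AND branchless idiom: y = (x < 0) ? 0 : x.
 * x >> 31 is an arithmetic shift on the usual x86 compilers, producing
 * all-ones for negative x and zero otherwise; the AND (or BMI ANDN with
 * the uninverted mask) then performs the select. */
int32_t clamp_to_nonnegative(int32_t x)
{
    int32_t mask = x >> 31;     /* 0xFFFFFFFF if x < 0, else 0 (SAR) */
    return x & ~mask;           /* 0 if x < 0, else x                */
}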

@rotateright
Contributor Author

Thanks, Peter.

One more variable to consider here: what is the size difference between the cmov and branch versions? I haven't counted the bytes, but that may tip the scale one way or the other when optimizing for size. And even nastier, are we supposed to be aligning the taken target block with nops? I think we do that for loop start blocks, but I haven't seen it with if/then.

@llvmbot
Collaborator

llvmbot commented May 12, 2017

what is the size difference between the cmov and branch versions?

for gcc's versions:

branchless: (sub-optimal, should avoid the MOV with an LEA)
4008d0: 48 63 10 movsxd rdx,DWORD PTR [rax]
4008d3: 48 89 d1 mov rcx,rdx
4008d6: 48 01 da add rdx,rbx
4008d9: 83 c1 80 add ecx,0xffffff80
4008dc: 48 0f 4d da cmovge rbx,rdx
4008e0: 48 83 c0 04 add rax,0x4
4008e4: 4c 39 e0 cmp rax,r12
4008e7: 75 e7 jne 4008d0 <main+0xc0>
0x18 bytes of code, could be 0x16 I think

branchy:
400a20: 4d 63 1a movsxd r11,DWORD PTR [r10]
400a23: 41 83 fb 7f cmp r11d,0x7f
400a27: 7e 7d jle 400aa6 <main+0x116>
400a29: 49 83 c2 04 add r10,0x4
400a2d: 4c 01 db add rbx,r11
400a30: 4d 39 ca cmp r10,r9
400a33: 75 eb jne 400a20 <main+0x90>
0x14 bytes of code. (Could be 0x13 if gcc picked a reg that didn't need a REX prefix for the 32-bit cmp).

Sorry for using gcc output in this clang bug, but I already had the compiled binaries there to disassemble. I think it's going to be pretty common for the branchless version to be a little bit larger, especially when it requires constants in registers (because CMOV doesn't take immediate operands).

are we supposed to be aligning the taken target block with nops?

Definitely not, since the NOP would run inside the loop. Branch-target alignment is not very important in CPUs with uop-caches, and completely irrelevant when running from the loop buffer. (The loop buffer does act like a trace cache for tiny loops, I think, so the uop following a taken branch can issue as part of the same group as the branch.)

Putting even a single long-NOP inside the loop to align a non-loop branch target would hurt the front-end more than it helped, because that NOP would take up issue/retire bandwidth.

The alignment of branches (the jcc, not the target) can affect performance because it determines which other branches it aliases, so it can be hard to measure just the effect of branch alignment. Changing the alignment of anything tends to change the alignment of everything else in the program.

It could possibly help some to stretch out the real instructions in the loop by using redundant prefixes or longer addressing modes / immediates than necessary, if that avoids having the branch target be very near the end of a 16B-aligned block, but I'm not sure about that. (And unfortunately there's no mechanism in gas or NASM syntax, or any other asm syntax I'm aware of for requesting that.)

Decreasing code-density is a risky bet, especially when you don't have profiling data to show which loops are truly hot and actually bottlenecked on front-end throughput, not an execution unit, latency, or memory.

@michaeljclark

michaeljclark mannequin commented May 12, 2017

With respect to the microbenchmark using static values (i.e., always-predicted branches) not being representative of a real workload, one would then ask what would be representative of an actual workload.

  1. Values vary but are less than LLONG_MAX, 100% predicted branches
  2. Values vary but a few outliers are out of range, >99% predicted branches
  3. Pathological case. i.e. floating point noise values

Use case number 3 seems unusual: someone developing software would typically convert values they expect to be in range, rather than perform what would essentially be a binary conversion to 0 or ULLONG_MAX for random values with random exponents that mostly fall outside the range of u64. Actual code would more likely calculate a max value and scale the floating point values before a conversion (having written such code many times).

In essence, I am saying it would be fair to say that in the majority of code the branch prediction rate is likely to be on the order of >99%, perhaps closer to 100% in most cases.

@michaeljclark

michaeljclark mannequin commented May 12, 2017

Adding to that, the choice to use size_t / unsigned is often made because the value is intended to be positive, rather than to gain the additional range between LLONG_MAX and ULLONG_MAX.

Values from 2^53 to 2^64, of course, will all be inexact conversions, and the signed case of +/-2^53 to +/-2^63 is already handled in hardware, with the MXCSR INEXACT bit being set by cvttss2si/cvttsd2si.

@llvmbot
Collaborator

llvmbot commented May 13, 2017

always predicted branches not being representative of a real workload

Agreed that a branchy float->u64 conversion will predict extremely well in most code. But it's not necessarily representative of other cases that are only say 98% predictable, which is the big-picture for this bug.

(Getting slightly off-topic here; I know you brought this up as a silly example of what not to optimize for.)

  3. Pathological case. i.e. floating point noise values

essentially be a binary conversion of 0 or ULLONG_MAX for random values with
random exponents that mostly fall outside of the range of u64.

Fun fact about IEEE754: Half of all representable values have magnitude less than 1.0. Most have a magnitude below LLONG_MAX, so cvtss2si %xmm, %r64 will produce something other than the "integer indefinite" value (1ULL<<63) that Intel documents for out-of-range conversions: http://felixcloutier.com/x86/CVTSS2SI.html.

The range of representable values is huge, but they get sparser the farther away from 0 you get.

For negative floats, your conversion code appears to implement the semantics of (uint64_t)(int64_t)f.

That's what C programmers should write if they want an unsigned result but don't need the extra input range, and want efficiency on x86 without AVX512 (and probably some other architectures which only provide float<->signed conversions). Unfortunately C makes it hard to tell the compiler what you do/don't care about without making the code really ugly.
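
In source form that's simply (my illustration):

#include <stdint.h>

/* The form described above: an unsigned result without the extra input
 * range. On x86-64 without AVX-512 this can lower to a single cvttss2si.
 * Inputs at or above 2^63 (and NaN) are outside the signed range, so this
 * is only appropriate when such inputs cannot occur. */
uint64_t f32_to_u64_narrow(float f)
{
    return (uint64_t)(int64_t)f;
}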

As far as the predictability of the branch: only bit patterns from 0x5f000000 to 0x7f7fffff and +Inf (0x7f800000) compare >= (float)(1ULL<<63). (Exponent=255 and non-zero mantissa is a NaN, which is unordered.) Check it out on https://www.h-schmidt.net/FloatConverter/IEEE754.html.


BTW, you can use LEA instead of XORQ so you can hoist the movabsq of the constant out of the loop without needing an extra MOV. XORing to flip the MSB is the same as adding to flip the MSB: with nowhere for the carry to go, add and add-without-carry are the same. (And so is subtract).
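
To spell out the equivalence (my illustration): with the addend 2^63, the only possible carry is out of bit 63, where it is discarded, so XOR and ADD of that constant agree for every input, and the ADD form can be folded into an LEA with the constant already in a register:

#include <stdint.h>

/* Flipping the MSB: XOR with 2^63 and ADD of 2^63 are the same operation
 * modulo 2^64, because a carry out of bit 63 has nowhere to go. */
uint64_t flip_msb_xor(uint64_t x) { return x ^ 0x8000000000000000ULL; }
uint64_t flip_msb_add(uint64_t x) { return x + 0x8000000000000000ULL; }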

Unfortunately I don't see a way to use CMPLTPS/ANDPS to do the branchless stuff with only one cvttss2si (and avoid integer CMOV), unless we do the conditional sign-bit flip with vectors, too (e.g. integer left-shift of a compare result and XORPS, or maybe PSIGND). (And that would still change the result in the positive out-of-range because we'd get the "integer indefinite" value and not flip the MSB to make it 0 instead of 1ULL<<63.)

@llvmbot
Collaborator

llvmbot commented May 13, 2017

Values from 2^53 to 2^64, of course, will all be inexact conversions, and
the signed case of +/-2^53 to +/-2^63 is already handled in hardware,
with the MXCSR INEXACT bit being set by cvttss2si/cvttsd2si

Are you sure about that?

Not all integers are exactly representable as float/double (beyond 2^24 or 2^53), but all floats/doubles in that range represent integers. e.g. all the even integers from 2^24 to 2^25 are exactly representable. (2^24+1 is the first number that can't be exactly represented by IEEE float32 http://stackoverflow.com/questions/3793838/which-is-the-first-integer-that-an-ieee-754-float-is-incapable-of-representing-e).

Put another way, when the bias-corrected exponent is greater than the number of mantissa bits, multiplying the mantissa by 2^exponent shifts the decimal point (err, base-2 radix point?) to the right all the way out of the mantissa, leaving some zeros in the LSB of the base2 integer representation.
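
A quick self-contained check of that claim (my own test, assuming IEEE-754 binary32 float): stepping through consecutive representable floats in [2^24, 2^25) shows they are exactly the even integers in that range, i.e. every float of that magnitude represents an integer.

#include <assert.h>
#include <math.h>
#include <stdio.h>

/* Walk consecutive representable binary32 values starting at 2^24: the
 * spacing (ulp) in [2^24, 2^25) is 2.0, so every value is an even integer. */
int main(void)
{
    float f = 16777216.0f;                 /* 2^24 */
    for (int i = 0; i < 1000; i++) {
        float next = nextafterf(f, INFINITY);
        assert(next - f == 2.0f);          /* ulp is 2 in this binade */
        assert(f == truncf(f));            /* so f is an integer      */
        f = next;
    }
    printf("all checked values are even integers\n");
    return 0;
}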

I haven't tested, but why would hardware raise the inexact exception in that case?

@michaeljclark

michaeljclark mannequin commented May 13, 2017

Sure, there are some representable values, but there is a loss of precision.

I would have to read the IEEE-754 spec to be sure of the exact rules regarding exact vs. inexact conversions, but there is a loss of precision above 2^53 such that a precise countable integer cannot be distinguished, because the rightmost bits of the significand cannot be represented. By definition precision has to be lost: while a magnitude of 2^64 can be represented, there are only 52 and 23 bits of fraction precision for double and single respectively.

At magnitudes around 2^63 there are 10 bits of double precision lost and 39 bits of single precision lost, so the conversion will alias, with representable values increasing in magnitude by steps of 2^10 for double to u64 and 2^39 for single precision to u64. You can do an exact conversion of specific values to float or double, but not the other way.

This is I believe why the subtraction of 2^63 causes the inexact flag to be set (which is done to allow the use of the signed hardware conversion to emulate an unsigned conversion).

There can only be an exact conversion for positive 0 to < 2^53 for double precision values and 0 to < 2^24 for single precision values.

In my particular use case I am emulating a RISC-V processor that has an unsigned conversions and I am running the RISC-V compliance tests on the results of the conversion intrinsics generated by the compiler.

I understand AVX-512 implements native unsigned conversions, in which case its handled by the hardware.

@llvmbot
Collaborator

llvmbot commented May 15, 2017

Sorry for continuing the off-topic discussion in this thread, will keep this short.

I would have to read the IEEE-754 spec to be sure on the exact rules
regarding exact vs inexact conversions

If the rounding step changes the value, it's inexact. If not, it's exact.

For a fp->int conversion, it's a round to integer of the fp value. For a subtraction, it's a round to float or double of the result.

In the float->u64 case, it should be possible for both those operations to be exact, e.g. 1.5*2^63 - 2^63 = 2^62 exactly. (1.5 is a possible mantissa value).
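
A small fenv check of that example (my own test, in the same style as the earlier test programs):

#include <fenv.h>
#include <stdio.h>

/* Check the example above: with f = 1.5 * 2^63, both the subtraction of
 * 2^63 and the subsequent float->int conversion are exact, so FE_INEXACT
 * stays clear even though the branchless expansion takes the subtract path.
 * (volatile is used instead of #pragma STDC FENV_ACCESS, which not all
 * compilers honor.) */
int main(void)
{
    volatile float f = 13835058055282163712.0f;      /* 1.5 * 2^63 */
    volatile float two63 = 9223372036854775808.0f;   /* 2^63       */

    feclearexcept(FE_ALL_EXCEPT);
    volatile float diff = f - two63;                 /* == 2^62 exactly */
    volatile long long i = (long long)diff;          /* == 1LL << 62    */
    printf("%f -> %lld, %s\n", diff, i,
           fetestexcept(FE_INEXACT) ? "inexact" : "exact");
    return 0;
}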

I tested what x86 hardware conversion (on Skylake) does for an int->float->int round trip, using some printing around this:
float f = a;
long long roundtrip = f;
clang uses the expected cvtsi2ss and cvttss2si.

With input = 64 + (1LL << 30), rounding to nearest float loses the +64
inexact 0x40000040(1073741888) -> 1073741824.000000 -> 0x40000000(1073741824)

With input of 128 + (1LL << 30), that doesn't happen.
exact 0x40000080(1073741952) -> 1073741952.000000 -> 0x40000080(1073741952)

@michaeljclark

michaeljclark mannequin commented May 15, 2017

Peter, when you say exact, do you mean that the inexact flag is not set in MXCSR after the conversion, or that the imprecise floating point value aliases to an exact integer representation (i.e. you can perform a round trip and get the same value)? Subtle difference.

I'll run your test and check the MXCSR flags...

@michaeljclark

michaeljclark mannequin commented May 15, 2017

The imprecise float is "aliasing" with an integer representation. float can't represent more than 24 bits of precision (counting the leading 1 bit). It's arguable as to whether the implementation is doing the right thing for magnitudes above the precision of the floating point type. The hardware is reporting an exact conversion to an integer when it can't know if the conversion is exact (using signed integers in this test as the subtract in the unsigned conversion can cause inexact). I guess "unknown" is not INEXACT.

$ cat roundtrip.c
#include <stdio.h>
#include <math.h>
#include <fenv.h>

typedef float f32;
typedef long long s64;

__attribute__ ((noinline)) s64 fcvt_lu_s(f32 f) { return (s64)f; }
__attribute__ ((noinline)) f32 fcvt_s_lu(s64 u) { return (f32)u; }

void test_fcvt(s64 u)
{
#pragma STDC FENV_ACCESS on

    f32 f;
    s64 v;
    int nx;

    feclearexcept(FE_ALL_EXCEPT);

    nx = fetestexcept(FE_INEXACT);
    printf("%lld %s\n", u, nx ? "inexact" : "exact");

    feclearexcept(FE_ALL_EXCEPT);
    f = fcvt_s_lu(u);
    nx = fetestexcept(FE_INEXACT);
    printf("%f %s\n", f, nx ? "inexact" : "exact");

    feclearexcept(FE_ALL_EXCEPT);
    v = fcvt_lu_s(f);
    nx = fetestexcept(FE_INEXACT);
    printf("%lld %s\n", v, nx ? "inexact" : "exact");
}

int main()
{
    fesetround(FE_TONEAREST);

    test_fcvt((1 << 20) + 64);
    test_fcvt((1 << 20) + 128);
    test_fcvt((1 << 30) + 64);
    test_fcvt((1 << 30) + 128);
}

$ gcc roundtrip.c -o roundtrip
$ ./roundtrip
1048640 exact
1048640.000000 exact
1048640 exact
1048704 exact
1048704.000000 exact
1048704 exact
1073741888 exact
1073741824.000000 inexact
1073741824 exact
1073741952 exact
1073741952.000000 exact
1073741952 exact

@rotateright
Contributor Author

A cmov-to-branch pass for x86 has been proposed here:
https://reviews.llvm.org/D34769

@RKSimon
Collaborator

RKSimon commented Sep 5, 2017

A cmov-to-branch pass for x86 has been proposed here:
https://reviews.llvm.org/D34769

This was committed at rL308142

@llvmbot llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 10, 2021