Skip to content

Commit

Permalink
Change generation of 2^n values for fixed point conversions.
Browse files Browse the repository at this point in the history
As demonstrated by the test code in
simd-everywhere#1260,
the behavior of pow() in rounding modes other than round-to-nearest is
not exact.  This causes results to diverge from ARMv8 hardware when not
using round-to-nearest.  The updated forms match hardware behavior
across a range of values.  The tests are not updated to handle
rounding modes, as doing this in a cross-platform way is not trivial.
However, all existing test vectors still pass, and in more
detailed testing, these changes are closer to hardware behavior.
  • Loading branch information
Syonyk committed Jan 6, 2025
1 parent cf1db25 commit b70a397
Showing 1 changed file with 37 additions and 33 deletions.
70 changes: 37 additions & 33 deletions simde/arm/neon/cvt_n.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
*
* Copyright:
* 2023 Yi-Yen Chung <eric681@andestech.com> (Copyright owned by Andes Technology)
*
* Note: pow(2, n) does not generate proper (exact) results with rounding
* modes other than round-to-nearest.
* See https://github.com/simd-everywhere/simde/issues/1260
*/

#if !defined(SIMDE_ARM_NEON_CVT_N_H)
Expand All @@ -40,7 +44,7 @@ simde_vcvth_n_u16_f16(simde_float16_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
return simde_vcvth_u16_f16(
simde_float16_from_float32(
simde_float16_to_float32(a) * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))));
simde_float16_to_float32(a) * HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n))));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
#define simde_vcvth_n_u16_f16(a, n) vcvth_n_u16_f16(a, n)
Expand All @@ -56,7 +60,7 @@ simde_vcvth_n_f16_s16(int16_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
return simde_float16_from_float32(
HEDLEY_STATIC_CAST(simde_float32_t,
HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n)));
HEDLEY_STATIC_CAST(simde_float64_t, a) / (UINT64_C(1) << n)));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
#define simde_vcvth_n_f16_s16(a, n) vcvth_n_f16_s16(a, n)
Expand All @@ -72,7 +76,7 @@ simde_vcvth_n_f16_u16(uint16_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
return simde_float16_from_float32(
HEDLEY_STATIC_CAST(simde_float32_t,
HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n)));
HEDLEY_STATIC_CAST(simde_float64_t, a) / (UINT64_C(1) << n)));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
#define simde_vcvth_n_f16_u16(a, n) vcvth_n_f16_u16(a, n)
Expand All @@ -86,7 +90,7 @@ SIMDE_FUNCTION_ATTRIBUTES
int32_t
simde_vcvts_n_s32_f32(simde_float32_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
return simde_vcvts_s32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)));
return simde_vcvts_s32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n)));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vcvts_n_s32_f32(a, n) vcvts_n_s32_f32(a, n)
Expand All @@ -100,7 +104,7 @@ SIMDE_FUNCTION_ATTRIBUTES
uint32_t
simde_vcvts_n_u32_f32(simde_float32_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
return simde_vcvts_u32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)));
return simde_vcvts_u32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n)));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vcvts_n_u32_f32(a, n) vcvts_n_u32_f32(a, n)
Expand All @@ -115,7 +119,7 @@ simde_float32_t
simde_vcvts_n_f32_s32(int32_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
return HEDLEY_STATIC_CAST(simde_float32_t,
HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n));
HEDLEY_STATIC_CAST(simde_float64_t, a) / (UINT64_C(1) << n));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vcvts_n_f32_s32(a, n) vcvts_n_f32_s32(a, n)
Expand All @@ -130,7 +134,7 @@ simde_float32_t
simde_vcvts_n_f32_u32(uint32_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
return HEDLEY_STATIC_CAST(simde_float32_t,
HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n));
HEDLEY_STATIC_CAST(simde_float64_t, a) / (UINT64_C(1) << n));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vcvts_n_f32_u32(a, n) vcvts_n_f32_u32(a, n)
Expand All @@ -144,7 +148,7 @@ SIMDE_FUNCTION_ATTRIBUTES
int64_t
simde_vcvtd_n_s64_f64(simde_float64_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
return simde_vcvtd_s64_f64(a * simde_math_pow(2, n));
return simde_vcvtd_s64_f64(a * ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vcvtd_n_s64_f64(a, n) vcvtd_n_s64_f64(a, n)
Expand All @@ -158,7 +162,7 @@ SIMDE_FUNCTION_ATTRIBUTES
uint64_t
simde_vcvtd_n_u64_f64(simde_float64_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
return simde_vcvtd_u64_f64(a * simde_math_pow(2, n));
return simde_vcvtd_u64_f64(a * ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vcvtd_n_u64_f64(a, n) vcvtd_n_u64_f64(a, n)
Expand All @@ -172,7 +176,7 @@ SIMDE_FUNCTION_ATTRIBUTES
simde_float64_t
simde_vcvtd_n_f64_s64(int64_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
return HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n);
return HEDLEY_STATIC_CAST(simde_float64_t, a) / ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n);
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vcvtd_n_f64_s64(a, n) vcvtd_n_f64_s64(a, n)
Expand All @@ -186,7 +190,7 @@ SIMDE_FUNCTION_ATTRIBUTES
simde_float64_t
simde_vcvtd_n_f64_u64(uint64_t a, const int n)
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
return HEDLEY_STATIC_CAST(simde_float64_t, a) / simde_math_pow(2, n);
return HEDLEY_STATIC_CAST(simde_float64_t, a) / ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n);
}
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vcvtd_n_f64_u64(a, n) vcvtd_n_f64_u64(a, n)
Expand All @@ -205,7 +209,7 @@ simde_vcvt_n_s32_f32(simde_float32x2_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)));
r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n)));
}

return simde_int32x2_from_private(r_);
Expand All @@ -227,7 +231,7 @@ simde_vcvt_n_s64_f64(simde_float64x1_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * simde_math_pow(2, n));
r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}

return simde_int64x1_from_private(r_);
Expand All @@ -251,7 +255,7 @@ simde_vcvt_n_u16_f16(simde_float16x4_t a, const int n)
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvth_u16_f16(simde_float16_from_float32(
simde_float16_to_float32(a_.values[i]) *
HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))));
HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n))));
}

return simde_uint16x4_from_private(r_);
Expand All @@ -273,7 +277,7 @@ simde_vcvt_n_u32_f32(simde_float32x2_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)));
r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n)));
}

return simde_uint32x2_from_private(r_);
Expand All @@ -295,7 +299,7 @@ simde_vcvt_n_u64_f64(simde_float64x1_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * simde_math_pow(2, n));
r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}

return simde_uint64x1_from_private(r_);
Expand All @@ -317,7 +321,7 @@ simde_vcvtq_n_s32_f32(simde_float32x4_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)));
r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n)));
}

return simde_int32x4_from_private(r_);
Expand All @@ -339,7 +343,7 @@ simde_vcvtq_n_s64_f64(simde_float64x2_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * simde_math_pow(2, n));
r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}

return simde_int64x2_from_private(r_);
Expand All @@ -363,7 +367,7 @@ simde_vcvtq_n_u16_f16(simde_float16x8_t a, const int n)
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvth_u16_f16(simde_float16_from_float32(
simde_float16_to_float32(a_.values[i]) *
HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n))));
HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n))));
}

return simde_uint16x8_from_private(r_);
Expand All @@ -385,7 +389,7 @@ simde_vcvtq_n_u32_f32(simde_float32x4_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, simde_math_pow(2, n)));
r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, (UINT64_C(1) << n)));
}

return simde_uint32x4_from_private(r_);
Expand All @@ -407,7 +411,7 @@ simde_vcvtq_n_u64_f64(simde_float64x2_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * simde_math_pow(2, n));
r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}

return simde_uint64x2_from_private(r_);
Expand All @@ -429,7 +433,7 @@ simde_vcvt_n_f16_u16(simde_uint16x4_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)));
r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / (UINT64_C(1) << n)));
}

return simde_float16x4_from_private(r_);
Expand All @@ -451,7 +455,7 @@ simde_vcvt_n_f16_s16(simde_int16x4_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)));
r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / (UINT64_C(1) << n)));
}

return simde_float16x4_from_private(r_);
Expand All @@ -473,7 +477,7 @@ simde_vcvtq_n_f16_u16(simde_uint16x8_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n)));
r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / (UINT64_C(1) << n)));
}

return simde_float16x8_from_private(r_);
Expand All @@ -495,7 +499,7 @@ simde_vcvtq_n_f16_s16(simde_int16x8_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, (a_.values[i] / simde_math_pow(2, n))));
r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / (UINT64_C(1) << n)));
}

return simde_float16x8_from_private(r_);
Expand All @@ -517,7 +521,7 @@ simde_vcvt_n_f32_u32(simde_uint32x2_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n));
r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / (UINT64_C(1) << n));
}

return simde_float32x2_from_private(r_);
Expand All @@ -539,7 +543,7 @@ simde_vcvt_n_f32_s32(simde_int32x2_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n));
r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / (UINT64_C(1) << n));
}

return simde_float32x2_from_private(r_);
Expand All @@ -561,7 +565,7 @@ simde_vcvt_n_f64_u64(simde_uint64x1_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n));
r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}

return simde_float64x1_from_private(r_);
Expand All @@ -583,7 +587,7 @@ simde_vcvtq_n_f64_u64(simde_uint64x2_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n));
r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}

return simde_float64x2_from_private(r_);
Expand All @@ -605,7 +609,7 @@ simde_vcvt_n_f64_s64(simde_int64x1_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n));
r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}

return simde_float64x1_from_private(r_);
Expand All @@ -627,7 +631,7 @@ simde_vcvtq_n_f64_s64(simde_int64x2_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n));
r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / ((n == 64) ? simde_math_pow(2, n) : UINT64_C(1) << n));
}

return simde_float64x2_from_private(r_);
Expand All @@ -649,7 +653,7 @@ simde_vcvtq_n_f32_s32(simde_int32x4_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n));
r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / (UINT64_C(1) << n));
}

return simde_float32x4_from_private(r_);
Expand All @@ -671,7 +675,7 @@ simde_vcvtq_n_f32_u32(simde_uint32x4_t a, const int n)

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / simde_math_pow(2, n));
r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / (UINT64_C(1) << n));
}

return simde_float32x4_from_private(r_);
Expand Down

0 comments on commit b70a397

Please sign in to comment.