@@ -92,6 +92,113 @@ get_math_module_state(PyObject *module)
92
92
return (math_module_state * )state ;
93
93
}
94
94
95
+ /*
96
+ Double and triple length extended precision algorithms from:
97
+
98
+ Accurate Sum and Dot Product
99
+ by Takeshi Ogita, Siegfried M. Rump, and Shin'ichi Oishi
100
+ https://doi.org/10.1137/030601818
101
+ https://www.tuhh.de/ti3/paper/rump/OgRuOi05.pdf
102
+
103
+ */
104
+
105
+ typedef struct { double hi ; double lo ; } DoubleLength ;  /* A value held as a pair of doubles.
+    The dl_* helpers return the leading part of their result in .hi
+    and the remaining low-order (correction) part in .lo. */
106
+
107
+ static DoubleLength
108
+ dl_fast_sum (double a , double b )
109
+ {
110
+ /* Algorithm 1.1. Compensated summation of two floating point numbers.
+
+    Returns the rounded sum a + b in .hi and the correction term
+    (a - hi) + b in .lo.  The caller must pass the operand of larger
+    magnitude first; the assert below checks |a| >= |b|.
+    NOTE(review): per the cited paper .lo is presumably the exact
+    rounding error of the addition when that precondition holds --
+    confirm against the reference before relying on it. */
111
+ assert (fabs (a ) >= fabs (b ));
112
+ double x = a + b ;  /* rounded sum, returned as .hi */
113
+ double y = (a - x ) + b ;  /* correction term for x, returned as .lo */
114
+ return (DoubleLength ) {x , y };
115
+ }
116
+
117
+ static DoubleLength
118
+ dl_sum (double a , double b )
119
+ {
120
+ /* Algorithm 3.1. Error-free transformation of the sum.
+
+    Returns the rounded sum a + b in .hi and a correction term in .lo.
+    Unlike dl_fast_sum(), no ordering of |a| and |b| is asserted here. */
121
+ double x = a + b ;  /* rounded sum, returned as .hi */
122
+ double z = x - a ;  /* the share of x attributed to b */
123
+ double y = (a - (x - z )) + (b - z );  /* discrepancy in a's share plus discrepancy in b's share */
124
+ return (DoubleLength ) {x , y };
125
+ }
126
+
127
+ #ifndef UNRELIABLE_FMA
128
+
129
+ static DoubleLength
130
+ dl_mul (double x , double y )
131
+ {
132
+ /* Algorithm 3.5. Error-free transformation of a product.
+
+    Returns the rounded product x * y in .hi and fma(x, y, -hi) in .lo.
+    This is the variant compiled when UNRELIABLE_FMA is not defined; it
+    relies on the C library's fma() being accurate. */
133
+ double z = x * y ;  /* rounded product, returned as .hi */
134
+ double zz = fma (x , y , - z );  /* x*y - z, evaluated by fma(); returned as .lo */
135
+ return (DoubleLength ) {z , zz };
136
+ }
137
+
138
+ #else
139
+
140
+ /*
141
+ The default implementation of dl_mul() depends on the C math library
142
+ having an accurate fma() function as required by § 7.12.13.1 of the
143
+ C99 standard.
144
+
145
+ The UNRELIABLE_FMA option is provided as a slower but accurate
146
+ alternative for builds where the fma() function is found wanting.
147
+ The speed penalty may be modest (17% slower on an Apple M1 Max),
148
+ so don't hesitate to enable this build option.
149
+
150
+ The algorithms are from the T. J. Dekker paper:
151
+ A Floating-Point Technique for Extending the Available Precision
152
+ https://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf
153
+ */
154
+
155
+ static DoubleLength
156
+ dl_split (double x ) {
157
+ // Dekker (5.5) and (5.6): split x into a leading part hi and the remainder lo = x - hi.
158
+ double t = x * 134217729.0 ; // Veltkamp constant = 2.0 ** 27 + 1
159
+ double hi = t - (t - x ); // leading part of x, returned as .hi
160
+ double lo = x - hi ; // what is left of x after removing hi, returned as .lo
161
+ return (DoubleLength ) {hi , lo };
162
+ }
163
+
164
+ static DoubleLength
165
+ dl_mul (double x , double y )
166
+ {
167
+ // Dekker (5.12) and mul12(): product built without fma(); this variant is compiled when UNRELIABLE_FMA is defined.
168
+ DoubleLength xx = dl_split (x ); // x as hi + lo
169
+ DoubleLength yy = dl_split (y ); // y as hi + lo
170
+ double p = xx .hi * yy .hi ; // product of the two leading parts
171
+ double q = xx .hi * yy .lo + xx .lo * yy .hi ; // the two cross terms
172
+ double z = p + q ; // returned as .hi
173
+ double zz = p - z + q + xx .lo * yy .lo ; // returned as .lo; also picks up the lo * lo term
174
+ return (DoubleLength ) {z , zz };
175
+ }
176
+
177
+ #endif
178
+
179
+ typedef struct { double hi ; double lo ; double tiny ; } TripleLength ;  /* A value held as three doubles:
+    tl_fma() keeps the leading part in .hi, a correction in .lo, and
+    the leftover corrections in .tiny. */
180
+
181
+ static const TripleLength tl_zero = {0.0 , 0.0 , 0.0 };  /* all three parts zero; presumably the starting
+    value for a tl_fma() accumulation -- its users are not visible here, confirm */
182
+
183
+ static TripleLength
184
+ tl_fma (double x , double y , TripleLength total )
185
+ {
186
+ /* Algorithm 5.10 with SumKVert for K=3.
+
+    Adds the product x * y to the running TripleLength "total" and
+    returns the updated value.  "total" is passed by value, so the
+    caller's copy is not modified. */
187
+ DoubleLength pr = dl_mul (x , y );  /* product as a hi/lo pair */
188
+ DoubleLength sm = dl_sum (total .hi , pr .hi );  /* new leading part; sm.lo is its correction */
189
+ DoubleLength r1 = dl_sum (total .lo , pr .lo );  /* combine the two existing .lo terms */
190
+ DoubleLength r2 = dl_sum (r1 .hi , sm .lo );  /* fold the correction from sm into them */
191
+ return (TripleLength ) {sm .hi , r2 .hi , total .tiny + r1 .lo + r2 .lo };  /* leftover corrections accumulate in .tiny with ordinary additions */
192
+ }
193
+
194
+ static double
195
+ tl_to_d (TripleLength total )
196
+ {  /* Collapse a TripleLength into a single double: dl_sum() first combines
+    .lo and .hi, then .tiny, that sum's correction and that sum's
+    leading part are added left to right. */
197
+ DoubleLength last = dl_sum (total .lo , total .hi );  /* .lo + .hi as a hi/lo pair */
198
+ return total .tiny + last .lo + last .hi ;  /* small terms are added first, the leading part last */
199
+ }
200
+
201
+
95
202
/*
96
203
sin(pi*x), giving accurate results for all finite x (especially x
97
204
integral or close to an integer). This is here for use in the
@@ -2301,6 +2408,7 @@ that are almost always correctly rounded, four techniques are used:
2301
2408
2302
2409
* lossless scaling using a power-of-two scaling factor
2303
2410
* accurate squaring using Veltkamp-Dekker splitting [1]
2411
+ or an equivalent with an fma() call
2304
2412
* compensated summation using a variant of the Neumaier algorithm [2]
2305
2413
* differential correction of the square root [3]
2306
2414
@@ -2359,14 +2467,21 @@ algorithm, effectively doubling the number of accurate bits.
2359
2467
This technique is used in Dekker's SQRT2 algorithm and again in
2360
2468
Borges' ALGORITHM 4 and 5.
2361
2469
2362
- Without proof for all cases, hypot() cannot claim to be always
2363
- correctly rounded. However for n <= 1000, prior to the final addition
2364
- that rounds the overall result, the internal accuracy of "h" together
2365
- with its correction of "x / (2.0 * h)" is at least 100 bits. [6]
2366
- Also, hypot() was tested against a Decimal implementation with
2367
- prec=300. After 100 million trials, no incorrectly rounded examples
2368
- were found. In addition, perfect commutativity (all permutations are
2369
- exactly equal) was verified for 1 billion random inputs with n=5. [7]
2470
+ The hypot() function is faithfully rounded (less than 1 ulp error)
2471
+ and usually correctly rounded (within 1/2 ulp). The squaring
2472
+ step is exact. The Neumaier summation computes as if in doubled
2473
+ precision (106 bits) and has the advantage that its input squares
2474
+ are non-negative so that the condition number of the sum is one.
2475
+ The square root with a differential correction is likewise computed
2476
+ as if in doubled precision.
2477
+
2478
+ For n <= 1000, prior to the final addition that rounds the overall
2479
+ result, the internal accuracy of "h" together with its correction of
2480
+ "x / (2.0 * h)" is at least 100 bits. [6] Also, hypot() was tested
2481
+ against a Decimal implementation with prec=300. After 100 million
2482
+ trials, no incorrectly rounded examples were found. In addition,
2483
+ perfect commutativity (all permutations are exactly equal) was
2484
+ verified for 1 billion random inputs with n=5. [7]
2370
2485
2371
2486
References:
2372
2487
@@ -2383,9 +2498,8 @@ exactly equal) was verified for 1 billion random inputs with n=5. [7]
2383
2498
static inline double
2384
2499
vector_norm (Py_ssize_t n , double * vec , double max , int found_nan )
2385
2500
{
2386
- const double T27 = 134217729.0 ; /* ldexp(1.0, 27) + 1.0) */
2387
- double x , scale , oldcsum , csum = 1.0 , frac1 = 0.0 , frac2 = 0.0 , frac3 = 0.0 ;
2388
- double t , hi , lo , h ;
2501
+ double x , h , scale , oldcsum , csum = 1.0 , frac1 = 0.0 , frac2 = 0.0 ;
2502
+ DoubleLength pr , sm ;
2389
2503
int max_e ;
2390
2504
Py_ssize_t i ;
2391
2505
@@ -2410,54 +2524,21 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
2410
2524
x *= scale ;
2411
2525
assert (fabs (x ) < 1.0 );
2412
2526
2413
- t = x * T27 ;
2414
- hi = t - (t - x );
2415
- lo = x - hi ;
2416
- assert (hi + lo == x );
2417
-
2418
- x = hi * hi ;
2419
- assert (x <= 1.0 );
2420
- assert (fabs (csum ) >= fabs (x ));
2421
- oldcsum = csum ;
2422
- csum += x ;
2423
- frac1 += (oldcsum - csum ) + x ;
2424
-
2425
- x = 2.0 * hi * lo ;
2426
- assert (fabs (csum ) >= fabs (x ));
2427
- oldcsum = csum ;
2428
- csum += x ;
2429
- frac2 += (oldcsum - csum ) + x ;
2430
-
2431
- assert (csum + lo * lo == csum );
2432
- frac3 += lo * lo ;
2433
- }
2434
- h = sqrt (csum - 1.0 + (frac1 + frac2 + frac3 ));
2435
-
2436
- x = h ;
2437
- t = x * T27 ;
2438
- hi = t - (t - x );
2439
- lo = x - hi ;
2440
- assert (hi + lo == x );
2527
+ pr = dl_mul (x , x );
2528
+ assert (pr .hi <= 1.0 );
2441
2529
2442
- x = - hi * hi ;
2443
- assert (fabs (csum ) >= fabs (x ));
2444
- oldcsum = csum ;
2445
- csum += x ;
2446
- frac1 += (oldcsum - csum ) + x ;
2447
-
2448
- x = -2.0 * hi * lo ;
2449
- assert (fabs (csum ) >= fabs (x ));
2450
- oldcsum = csum ;
2451
- csum += x ;
2452
- frac2 += (oldcsum - csum ) + x ;
2453
-
2454
- x = - lo * lo ;
2455
- assert (fabs (csum ) >= fabs (x ));
2456
- oldcsum = csum ;
2457
- csum += x ;
2458
- frac3 += (oldcsum - csum ) + x ;
2459
-
2460
- x = csum - 1.0 + (frac1 + frac2 + frac3 );
2530
+ sm = dl_fast_sum (csum , pr .hi );
2531
+ csum = sm .hi ;
2532
+ frac1 += pr .lo ;
2533
+ frac2 += sm .lo ;
2534
+ }
2535
+ h = sqrt (csum - 1.0 + (frac1 + frac2 ));
2536
+ pr = dl_mul (- h , h );
2537
+ sm = dl_fast_sum (csum , pr .hi );
2538
+ csum = sm .hi ;
2539
+ frac1 += pr .lo ;
2540
+ frac2 += sm .lo ;
2541
+ x = csum - 1.0 + (frac1 + frac2 );
2461
2542
return (h + x / (2.0 * h )) / scale ;
2462
2543
}
2463
2544
/* When max_e < -1023, ldexp(1.0, -max_e) overflows.
@@ -2646,102 +2727,6 @@ long_add_would_overflow(long a, long b)
2646
2727
return (a > 0 ) ? (b > LONG_MAX - a ) : (b < LONG_MIN - a );
2647
2728
}
2648
2729
2649
- /*
2650
- Double and triple length extended precision algorithms from:
2651
-
2652
- Accurate Sum and Dot Product
2653
- by Takeshi Ogita, Siegfried M. Rump, and Shin’Ichi Oishi
2654
- https://doi.org/10.1137/030601818
2655
- https://www.tuhh.de/ti3/paper/rump/OgRuOi05.pdf
2656
-
2657
- */
2658
-
2659
- typedef struct { double hi ; double lo ; } DoubleLength ;
2660
-
2661
- static DoubleLength
2662
- dl_sum (double a , double b )
2663
- {
2664
- /* Algorithm 3.1 Error-free transformation of the sum */
2665
- double x = a + b ;
2666
- double z = x - a ;
2667
- double y = (a - (x - z )) + (b - z );
2668
- return (DoubleLength ) {x , y };
2669
- }
2670
-
2671
- #ifndef UNRELIABLE_FMA
2672
-
2673
- static DoubleLength
2674
- dl_mul (double x , double y )
2675
- {
2676
- /* Algorithm 3.5. Error-free transformation of a product */
2677
- double z = x * y ;
2678
- double zz = fma (x , y , - z );
2679
- return (DoubleLength ) {z , zz };
2680
- }
2681
-
2682
- #else
2683
-
2684
- /*
2685
- The default implementation of dl_mul() depends on the C math library
2686
- having an accurate fma() function as required by § 7.12.13.1 of the
2687
- C99 standard.
2688
-
2689
- The UNRELIABLE_FMA option is provided as a slower but accurate
2690
- alternative for builds where the fma() function is found wanting.
2691
- The speed penalty may be modest (17% slower on an Apple M1 Max),
2692
- so don't hesitate to enable this build option.
2693
-
2694
- The algorithms are from the T. J. Dekker paper:
2695
- A Floating-Point Technique for Extending the Available Precision
2696
- https://csclub.uwaterloo.ca/~pbarfuss/dekker1971.pdf
2697
- */
2698
-
2699
- static DoubleLength
2700
- dl_split (double x ) {
2701
- // Dekker (5.5) and (5.6).
2702
- double t = x * 134217729.0 ; // Veltkamp constant = 2.0 ** 27 + 1
2703
- double hi = t - (t - x );
2704
- double lo = x - hi ;
2705
- return (DoubleLength ) {hi , lo };
2706
- }
2707
-
2708
- static DoubleLength
2709
- dl_mul (double x , double y )
2710
- {
2711
- // Dekker (5.12) and mul12()
2712
- DoubleLength xx = dl_split (x );
2713
- DoubleLength yy = dl_split (y );
2714
- double p = xx .hi * yy .hi ;
2715
- double q = xx .hi * yy .lo + xx .lo * yy .hi ;
2716
- double z = p + q ;
2717
- double zz = p - z + q + xx .lo * yy .lo ;
2718
- return (DoubleLength ) {z , zz };
2719
- }
2720
-
2721
- #endif
2722
-
2723
- typedef struct { double hi ; double lo ; double tiny ; } TripleLength ;
2724
-
2725
- static const TripleLength tl_zero = {0.0 , 0.0 , 0.0 };
2726
-
2727
- static TripleLength
2728
- tl_fma (double x , double y , TripleLength total )
2729
- {
2730
- /* Algorithm 5.10 with SumKVert for K=3 */
2731
- DoubleLength pr = dl_mul (x , y );
2732
- DoubleLength sm = dl_sum (total .hi , pr .hi );
2733
- DoubleLength r1 = dl_sum (total .lo , pr .lo );
2734
- DoubleLength r2 = dl_sum (r1 .hi , sm .lo );
2735
- return (TripleLength ) {sm .hi , r2 .hi , total .tiny + r1 .lo + r2 .lo };
2736
- }
2737
-
2738
- static double
2739
- tl_to_d (TripleLength total )
2740
- {
2741
- DoubleLength last = dl_sum (total .lo , total .hi );
2742
- return total .tiny + last .lo + last .hi ;
2743
- }
2744
-
2745
2730
/*[clinic input]
2746
2731
math.sumprod
2747
2732
0 commit comments