Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
4e99a3e
Fast vectorizable atan and atan2 functions.
mcourteaux Aug 10, 2024
0f00cb1
Default to not using fast atan versions if on CUDA.
mcourteaux Aug 10, 2024
6af7972
Finished fast atan/atan2 functions and tests.
mcourteaux Aug 10, 2024
7cabaa3
Correct attribution.
mcourteaux Aug 10, 2024
2850da3
Clang-format
mcourteaux Aug 10, 2024
4297809
Weird WebAssembly limits...
mcourteaux Aug 11, 2024
18bc63c
Small improvements to the optimization script.
mcourteaux Aug 11, 2024
d73177c
Polynomial optimization for log, exp, sin, cos with correct ranges.
mcourteaux Aug 11, 2024
a146f57
Improve fast atan performance tests for GPU.
mcourteaux Aug 12, 2024
4814c37
Bugfix fast_atan approximation. Fix correctness test to exceed the ra…
mcourteaux Aug 12, 2024
803a762
Cleanup
mcourteaux Aug 12, 2024
2a9b506
Enum class instead of enum for ApproximationPrecision.
mcourteaux Aug 12, 2024
dfb2264
Weird Metal limits. There should be a better way...
mcourteaux Aug 12, 2024
746337b
Skip test for WebGPU.
mcourteaux Aug 12, 2024
614abc2
Fast atan/atan2 polynomials reoptimized. New optimization strategy: ULP.
mcourteaux Aug 13, 2024
5163d78
Feedback Steven.
mcourteaux Aug 13, 2024
091ab5c
More comments and test mantissa error.
mcourteaux Aug 14, 2024
05ecbb6
Do not error when testing arctan performance on Metal / WebGPU.
mcourteaux Aug 14, 2024
bc2ed82
Rework precision specification. Generalize towards using this for oth…
mcourteaux Nov 11, 2024
5a5cb27
Clang-format.
mcourteaux Nov 11, 2024
58cb828
Fix makefile and clang-tidy.
mcourteaux Nov 11, 2024
735c431
Fix incorrect approximation selection when required precision is not …
mcourteaux Nov 12, 2024
f598106
Feedback from Steven.
mcourteaux Dec 3, 2024
20f9187
Implemented approximation tables for sin, cos, exp, log fast variants…
mcourteaux Feb 4, 2025
05ebf8d
Clang-format.
mcourteaux Feb 4, 2025
cf4ef47
Move Polynomial Optimizer Python script to tools/ directory.
mcourteaux Feb 4, 2025
7156130
Enable performance test for fast_atan and fast_atan2.
mcourteaux Feb 4, 2025
302534b
LLVM upper-limit 99 (CMake needs an upper limit).
mcourteaux Feb 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ option(THREADS_PREFER_PTHREAD_FLAG "When enabled, prefer to use the -pthread fla
find_package(Threads REQUIRED)

## LLVM
find_package(Halide_LLVM 18...20 REQUIRED
find_package(Halide_LLVM 18...99 REQUIRED
COMPONENTS WebAssembly X86
OPTIONAL_COMPONENTS AArch64 ARM Hexagon NVPTX PowerPC RISCV)

Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,7 @@ SOURCE_FILES = \
AlignLoads.cpp \
AllocationBoundsInference.cpp \
ApplySplit.cpp \
ApproximationTables.cpp \
Argument.cpp \
AssociativeOpsTable.cpp \
Associativity.cpp \
Expand Down
351 changes: 351 additions & 0 deletions src/ApproximationTables.cpp

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions src/ApproximationTables.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef HALIDE_APPROXIMATION_TABLES_H
#define HALIDE_APPROXIMATION_TABLES_H

#include <vector>

#include "IROperator.h"

namespace Halide {
namespace Internal {

struct Approximation {
ApproximationPrecision::OptimizationObjective objective;
struct Metrics {
double mse;
double mae;
double mulpe;
} metrics_f32, metrics_f64;
std::vector<double> coefficients;
};

const Approximation *best_atan_approximation(Halide::ApproximationPrecision precision, Type type);
const Approximation *best_sin_approximation(Halide::ApproximationPrecision precision, Type type);
const Approximation *best_cos_approximation(Halide::ApproximationPrecision precision, Type type);
const Approximation *best_log_approximation(Halide::ApproximationPrecision precision, Type type);
const Approximation *best_exp_approximation(Halide::ApproximationPrecision precision, Type type);
const Approximation *best_expm1_approximation(Halide::ApproximationPrecision precision, Type type);

} // namespace Internal
} // namespace Halide

#endif
4 changes: 2 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,7 @@ target_sources(
WrapCalls.h
)

# The sources that go into libHalide. For the sake of IDE support, headers that
# exist in src/ but are not public should be included here.
# The sources that go into libHalide.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you alter the comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because there are no headers in that list. That comments is clearly outdated. Unless I'm wildly misunderstanding something.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mcourteaux -- you understand it fine, the comment is out of date. The list of headers is right above.

target_sources(
Halide
PRIVATE
Expand All @@ -233,6 +232,7 @@ target_sources(
AlignLoads.cpp
AllocationBoundsInference.cpp
ApplySplit.cpp
ApproximationTables.cpp
Argument.cpp
AssociativeOpsTable.cpp
Associativity.cpp
Expand Down
205 changes: 169 additions & 36 deletions src/IROperator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <sstream>
#include <utility>

#include "ApproximationTables.h"
#include "CSE.h"
#include "ConstantBounds.h"
#include "Debug.h"
Expand Down Expand Up @@ -1336,46 +1337,36 @@ Expr rounding_mul_shift_right(Expr a, Expr b, int q) {
return rounding_mul_shift_right(std::move(a), std::move(b), make_const(qt, q));
}

Expr fast_log(const Expr &x) {
user_assert(x.type() == Float(32)) << "fast_log only works for Float(32)";
namespace {

Expr reduced, exponent;
range_reduce_log(x, &reduced, &exponent);
constexpr double PI = 3.14159265358979323846;
constexpr double TWO_OVER_PI = 0.63661977236758134308;
constexpr double PI_OVER_TWO = 1.57079632679489661923;

Expr x1 = reduced - 1.0f;

float coeff[] = {
0.07640318789187280912f,
-0.16252961013874300811f,
0.20625219040645212387f,
-0.25110261010892864775f,
0.33320464908377461777f,
-0.49997513376789826101f,
1.0f,
0.0f};

Expr result = evaluate_polynomial(x1, coeff, sizeof(coeff) / sizeof(coeff[0]));
result = result + cast<float>(exponent) * logf(2);
result = common_subexpression_elimination(result);
return result;
Expr constant(Type t, double value) {
if (t == Float(64)) {
return Expr(value);
}
if (t == Float(32)) {
return Expr(float(value));
}
internal_error << "Constants only for double or float.";
return 0;
}

namespace {

// A vectorizable sine and cosine implementation. Based on syrah fast vector math
// https://github.com/boulos/syrah/blob/master/src/include/syrah/FixedVectorMath.h#L55
[[deprecated("No precision parameter, use fast_sin_cos_v2 instead.")]]
Expr fast_sin_cos(const Expr &x_full, bool is_sin) {
const float two_over_pi = 0.636619746685028076171875f;
const float pi_over_two = 1.57079637050628662109375f;
Expr scaled = x_full * two_over_pi;
Expr scaled = x_full * float(TWO_OVER_PI);
Expr k_real = floor(scaled);
Expr k = cast<int>(k_real);
Expr k_mod4 = k % 4;
Expr sin_usecos = is_sin ? ((k_mod4 == 1) || (k_mod4 == 3)) : ((k_mod4 == 0) || (k_mod4 == 2));
Expr flip_sign = is_sin ? (k_mod4 > 1) : ((k_mod4 == 1) || (k_mod4 == 2));

// Reduce the angle modulo pi/2.
Expr x = x_full - k_real * pi_over_two;
// Reduce the angle modulo pi/2: i.e., to the angle within the quadrant.
Expr x = x_full - k_real * float(PI_OVER_TWO);

const float sin_c2 = -0.16666667163372039794921875f;
const float sin_c4 = 8.333347737789154052734375e-3;
Expand All @@ -1401,24 +1392,119 @@ Expr fast_sin_cos(const Expr &x_full, bool is_sin) {
return select(flip_sign, -tri_func, tri_func);
}

Expr fast_sin_cos_v2(const Expr &x_full, bool is_sin, ApproximationPrecision precision) {
Type type = x_full.type();
// Range reduction to interval [0, pi/2] which corresponds to a quadrant of the circle.
Expr scaled = x_full * constant(type, TWO_OVER_PI);
Expr k_real = floor(scaled);
Expr k = cast<int>(k_real);
Expr k_mod4 = k % 4;
Expr sin_usecos = is_sin ? ((k_mod4 == 1) || (k_mod4 == 3)) : ((k_mod4 == 0) || (k_mod4 == 2));
// sin_usecos = !sin_usecos;
Expr flip_sign = is_sin ? (k_mod4 > 1) : ((k_mod4 == 1) || (k_mod4 == 2));

// Reduce the angle modulo pi/2: i.e., to the angle within the quadrant.
Expr x = x_full - k_real * constant(type, PI_OVER_TWO);
x = select(sin_usecos, constant(type, PI_OVER_TWO) - x, x);

const Internal::Approximation *approx = Internal::best_sin_approximation(precision, type);
// const Internal::Approximation *approx = Internal::best_cos_approximation(precision);
const std::vector<double> &c = approx->coefficients;
Expr x2 = x * x;
Expr result = constant(type, c.back());
for (size_t i = 1; i < c.size(); ++i) {
result = x2 * result + constant(type, c[c.size() - i - 1]);
}
result *= x;
result = select(flip_sign, -result, result);
return common_subexpression_elimination(result, true);
}

} // namespace

Expr fast_sin(const Expr &x_full) {
return fast_sin_cos(x_full, true);
Expr fast_sin(const Expr &x, ApproximationPrecision precision) {
// return fast_sin_cos(x, true);
Expr native_is_fast = target_has_feature(Target::Vulkan);
return select(native_is_fast && precision.allow_native_when_faster,
sin(x), fast_sin_cos_v2(x, true, precision));
}

Expr fast_cos(const Expr &x_full) {
return fast_sin_cos(x_full, false);
Expr fast_cos(const Expr &x, ApproximationPrecision precision) {
// return fast_sin_cos(x, false);
Expr native_is_fast = target_has_feature(Target::Vulkan);
return select(native_is_fast && precision.allow_native_when_faster,
cos(x), fast_sin_cos_v2(x, false, precision));
}

Expr fast_exp(const Expr &x_full) {
// A vectorizable atan and atan2 implementation.
// Based on the ideas presented in https://mazzo.li/posts/vectorized-atan2.html.
Expr fast_atan_approximation(const Expr &x_full, ApproximationPrecision precision, bool between_m1_and_p1) {
Type type = x_full.type();
Expr x;
// if x > 1 -> atan(x) = Pi/2 - atan(1/x)
Expr x_gt_1 = abs(x_full) > 1.0f;
if (between_m1_and_p1) {
x = x_full;
} else {
x = select(x_gt_1, constant(type, 1.0) / x_full, x_full);
}
const Internal::Approximation *approx = Internal::best_atan_approximation(precision, type);
const std::vector<double> &c = approx->coefficients;
Expr x2 = x * x;
Expr result = constant(type, c.back());
for (size_t i = 1; i < c.size(); ++i) {
result = x2 * result + constant(type, c[c.size() - i - 1]);
}
result *= x;

if (!between_m1_and_p1) {
result = select(x_gt_1, select(x_full < 0, constant(type, -PI_OVER_TWO), constant(type, PI_OVER_TWO)) - result, result);
}
return common_subexpression_elimination(result, true);
}

Expr fast_atan(const Expr &x_full, ApproximationPrecision precision) {
return fast_atan_approximation(x_full, precision, false);
}

Expr fast_atan2(const Expr &y, const Expr &x, ApproximationPrecision precision) {
user_assert(y.type() == x.type()) << "fast_atan2 should take two arguments of the same type.";
Type type = y.type();
// Making sure we take the ratio of the biggest number by the smallest number (in absolute value)
// will always give us a number between -1 and +1, which is the range over which the approximation
// works well. We can therefore also skip the inversion logic in the fast_atan_approximation function
// by passing true for "between_m1_and_p1". This increases both speed (1 division instead of 2) and
// numerical precision.
Expr swap = abs(y) > abs(x);
Expr atan_input = select(swap, x, y) / select(swap, y, x);
Expr ati = fast_atan_approximation(atan_input, precision, true);
Expr pi_over_two = constant(type, PI_OVER_TWO);
Expr pi = constant(type, PI);
Expr at = select(swap, select(atan_input >= 0.0f, pi_over_two, -pi_over_two) - ati, ati);
// This select statement is literally taken over from the definition on Wikipedia.
// There might be optimizations to be done here, but I haven't tried that yet. -- Martijn
Expr result = select(
x > 0.0f, at,
x < 0.0f && y >= 0.0f, at + pi,
x < 0.0f && y < 0.0f, at - pi,
x == 0.0f && y > 0.0f, pi_over_two,
x == 0.0f && y < 0.0f, -pi_over_two,
0.0f);
return common_subexpression_elimination(result, true);
}

Expr fast_exp(const Expr &x_full, ApproximationPrecision prec) {
Type type = x_full.type();
user_assert(x_full.type() == Float(32)) << "fast_exp only works for Float(32)";

Expr scaled = x_full / logf(2.0);
Expr log2 = constant(type, std::log(2.0));

Expr scaled = x_full / log2;
Expr k_real = floor(scaled);
Expr k = cast<int>(k_real);
Expr x = x_full - k_real * logf(2.0);
Expr x = x_full - k_real * log2;

#if 0
float coeff[] = {
0.01314350012789660196f,
0.03668965196652099192f,
Expand All @@ -1427,6 +1513,17 @@ Expr fast_exp(const Expr &x_full) {
1.0f,
1.0f};
Expr result = evaluate_polynomial(x, coeff, sizeof(coeff) / sizeof(coeff[0]));
#else
const Internal::Approximation *approx = Internal::best_exp_approximation(prec, type);
const std::vector<double> &c = approx->coefficients;

Expr result = constant(type, c.back());
for (size_t i = 1; i < c.size(); ++i) {
result = x * result + constant(type, c[c.size() - i - 1]);
}
result = result * x + constant(type, 1.0);
result = result * x + constant(type, 1.0);
#endif

// Compute 2^k.
int fpbias = 127;
Expand All @@ -1436,6 +1533,42 @@ Expr fast_exp(const Expr &x_full) {
// thing as float.
Expr two_to_the_n = reinterpret<float>(biased << 23);
result *= two_to_the_n;
result = common_subexpression_elimination(result, true);
return result;
}

Expr fast_log(const Expr &x, ApproximationPrecision prec) {
Type type = x.type();
user_assert(x.type() == Float(32)) << "fast_log only works for Float(32)";

Expr log2 = constant(type, std::log(2.0));
Expr reduced, exponent;
range_reduce_log(x, &reduced, &exponent);

Expr x1 = reduced - 1.0f;
#if 0
float coeff[] = {
0.07640318789187280912f,
-0.16252961013874300811f,
0.20625219040645212387f,
-0.25110261010892864775f,
0.33320464908377461777f,
-0.49997513376789826101f,
1.0f,
0.0f};

Expr result = evaluate_polynomial(x1, coeff, sizeof(coeff) / sizeof(coeff[0]));
#else
const Internal::Approximation *approx = Internal::best_log_approximation(prec, type);
const std::vector<double> &c = approx->coefficients;

Expr result = constant(type, c.back());
for (size_t i = 1; i < c.size(); ++i) {
result = x1 * result + constant(type, c[c.size() - i - 1]);
}
result = result * x1;
#endif
result = result + cast<float>(exponent) * log2;
result = common_subexpression_elimination(result);
return result;
}
Expand Down Expand Up @@ -2272,14 +2405,14 @@ Expr erf(const Expr &x) {
return halide_erf(x);
}

Expr fast_pow(Expr x, Expr y) {
Expr fast_pow(Expr x, Expr y, ApproximationPrecision prec) {
if (auto i = as_const_int(y)) {
return raise_to_integer_power(std::move(x), *i);
}

x = cast<float>(std::move(x));
y = cast<float>(std::move(y));
return select(x == 0.0f, 0.0f, fast_exp(fast_log(x) * std::move(y)));
return select(x == 0.0f, 0.0f, fast_exp(fast_log(x, prec) * std::move(y), prec));
}

Expr fast_inverse(Expr x) {
Expand Down
Loading