From e56f01592cf08a5898dd1db30fe7d992a9145fd2 Mon Sep 17 00:00:00 2001
From: Basile Clement
Date: Tue, 8 Feb 2022 19:33:56 +0100
Subject: [PATCH] Only commutative reductions can be parallelized

Because parallelization changes the order of computation within the
reduction, parallelizing associative but non-commutative reductions can
result in (non-deterministically) incorrect results in the same way
`reorder`ing them can.

For instance Halide currently accepts the following code, but generates
non-deterministic outputs on GPU. On CPU with `.parallel(r.x)`, OpenMP
rejects the generated code (correctly) stating that the
`#pragma omp atomic` is invalid for the same reasons.

```c++
#include <stdio.h>
#include "Halide.h"

using namespace Halide;

int main(int argc, char **argv) {
    Halide::Func A("A"), B("B");
    Halide::Var i("i");

    A(i) = i;
    B() = -1;

    Halide::RDom r(0, 1024);
    B() = A(r.x);

    A.compute_root();
    B.update().atomic().gpu_blocks(r.x);

    B.compile_jit(get_host_target().with_feature(Target::CUDA));

    Halide::Buffer<int> b = B.realize();
    printf("%d\n", b());

    return 0;
}
```
---
 src/Func.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Func.cpp b/src/Func.cpp
index a8e6da42ca41..bd2338685dea 100644
--- a/src/Func.cpp
+++ b/src/Func.cpp
@@ -368,9 +368,9 @@ void Stage::set_dim_type(const VarOrRVar &var, ForType t) {
                         // its identity for each value in the definition if it is a Tuple
                         const auto &prover_result = prove_associativity(func_name, args, values);
 
-                        user_assert(prover_result.associative())
+                        user_assert(prover_result.associative() && prover_result.commutative())
                             << "Failed to call atomic() on " << name()
-                            << " since it can't prove associativity of the operator.\n";
+                            << " since it can't prove associativity or commutativity of the operator.\n";
                         internal_assert(prover_result.size() == values.size());
                     }
                 }