From e56f01592cf08a5898dd1db30fe7d992a9145fd2 Mon Sep 17 00:00:00 2001
From: Basile Clement <Elarnon@users.noreply.github.com>
Date: Tue, 8 Feb 2022 19:33:56 +0100
Subject: [PATCH] Only commutative reductions can be parallelized

Because parallelization changes the order of computation within the reduction, parallelizing associative but non-commutative reductions can result in (non-deterministically) incorrect results in the same way `reorder`ing them can.

For instance Halide currently accepts the following code, but generates non-deterministic outputs on GPU.  On CPU with `.parallel(r.x)`, OpenMP rejects the generated code (correctly) stating that the `#pragma omp atomic` is invalid for the same reasons.

```c++
#include <stdio.h>

#include "Halide.h"

using namespace Halide;

int main(int argc, char **argv) {
        Halide::Func A("A"), B("B");
        Halide::Var i("i");

        A(i) = i;
        B() = -1;
        Halide::RDom r(0, 1024);
        B() = A(r.x);

        A.compute_root();
        B.update().atomic().gpu_blocks(r.x);

        B.compile_jit(get_host_target().with_feature(Target::CUDA));
        Halide::Buffer<int32_t> b = B.realize();
        printf("%d\n", b());

        return 0;
}
```
---
 src/Func.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Func.cpp b/src/Func.cpp
index a8e6da42ca41..bd2338685dea 100644
--- a/src/Func.cpp
+++ b/src/Func.cpp
@@ -368,9 +368,9 @@ void Stage::set_dim_type(const VarOrRVar &var, ForType t) {
                             // its identity for each value in the definition if it is a Tuple
                             const auto &prover_result = prove_associativity(func_name, args, values);
 
-                            user_assert(prover_result.associative())
+                            user_assert(prover_result.associative() && prover_result.commutative())
                                 << "Failed to call atomic() on " << name()
-                                << " since it can't prove associativity of the operator.\n";
+                                << " since it can't prove associativity or commutativity of the operator.\n";
                             internal_assert(prover_result.size() == values.size());
                         }
                     }