halide · abadams · Dec 23, 2021 · Dec 16, 2021 · Dec 19, 2021 · Dec 19, 2021
diff --git a/src/LowerWarpShuffles.cpp b/src/LowerWarpShuffles.cpp
@@ -562,6 +562,22 @@ class LowerWarpShuffles : public IRMutator {
 
         internal_assert(may_use_warp_shuffle) << name << ", " << idx << ", " << lane << "\n";
 
+        // We must add .sync after volta architecture:
+        // https://docs.nvidia.com/cuda/volta-tuning-guide/index.html
+        string sync_suffix = "";
+        Target t = get_jit_target_from_environment();
+        int cap = t.get_cuda_capability_lower_bound();
+        if (cap >= 70) {
+            sync_suffix = ".sync";
+        }
+
+        auto shfl_args = [&](const std::vector<Expr> &args) {
+            if (cap >= 70) {
+                return args;
+            }
+            return std::vector({args[1], args[2], args[3]});
+        };
+
         string intrin_suffix;
         if (shuffle_type.is_float()) {
             intrin_suffix = ".f32";
@@ -578,12 +594,12 @@ class LowerWarpShuffles : public IRMutator {
         lane = solve_expression(lane, this_lane_name).result;
 
         Expr shuffled;
-
+        Expr membermask = (int)0xffffffff;
         if (expr_match(this_lane + wild, lane, result)) {
             // We know that 0 <= lane + wild < warp_size by how we
             // constructed it, so we can just do a shuffle down.
-            Expr down = Call::make(shuffle_type, "llvm.nvvm.shfl.down" + intrin_suffix,
-                                   {base_val, result[0], 31}, Call::PureExtern);
+            Expr down = Call::make(shuffle_type, "llvm.nvvm.shfl" + sync_suffix + ".down" + intrin_suffix,
+                                   shfl_args({membermask, base_val, result[0], 31}), Call::PureExtern);
             shuffled = down;
         } else if (expr_match((this_lane + wild) % wild, lane, result) &&
                    is_const_power_of_two_integer(result[1], &bits) &&
@@ -593,10 +609,10 @@ class LowerWarpShuffles : public IRMutator {
             // intermediate registers than using a general gather for
             // this.
             Expr mask = (1 << bits) - 1;
-            Expr down = Call::make(shuffle_type, "llvm.nvvm.shfl.down" + intrin_suffix,
-                                   {base_val, result[0], mask}, Call::PureExtern);
-            Expr up = Call::make(shuffle_type, "llvm.nvvm.shfl.up" + intrin_suffix,
-                                 {base_val, (1 << bits) - result[0], 0}, Call::PureExtern);
+            Expr down = Call::make(shuffle_type, "llvm.nvvm.shfl" + sync_suffix + ".down" + intrin_suffix,
+                                   shfl_args({membermask, base_val, result[0], mask}), Call::PureExtern);
+            Expr up = Call::make(shuffle_type, "llvm.nvvm.shfl" + sync_suffix + ".up" + intrin_suffix,
+                                 shfl_args({membermask, base_val, (1 << bits) - result[0], 0}), Call::PureExtern);
             Expr cond = (this_lane >= (1 << bits) - result[0]);
             Expr equiv = select(cond, up, down);
             shuffled = simplify(equiv, true, bounds);
@@ -609,8 +625,8 @@ class LowerWarpShuffles : public IRMutator {
             // could hypothetically be used for boundary conditions.
             Expr mask = simplify(((31 & ~(warp_size - 1)) << 8) | 31);
             // The idx variant can do a general gather. Use it for all other cases.
-            shuffled = Call::make(shuffle_type, "llvm.nvvm.shfl.idx" + intrin_suffix,
-                                  {base_val, lane, mask}, Call::PureExtern);
+            shuffled = Call::make(shuffle_type, "llvm.nvvm.shfl" + sync_suffix + ".idx" + intrin_suffix,
+                                  shfl_args({membermask, base_val, lane, mask}), Call::PureExtern);
         }
         // TODO: There are other forms, like butterfly and clamp, that
         // don't need to use the general gather

diff --git a/test/correctness/register_shuffle.cpp b/test/correctness/register_shuffle.cpp
@@ -6,10 +6,8 @@ int main(int argc, char **argv) {
     Target t = get_jit_target_from_environment();
 
     int cap = t.get_cuda_capability_lower_bound();
-    if (cap < 50 || cap >= 80) {
-        printf("[SKIP] CUDA with capability between 5.0 and 7.5 required\n");
-        // TODO: Use the shfl.sync intrinsics for cuda 8.0 and above
-        // See issue #5630
+    if (cap < 50) {
+        printf("[SKIP] CUDA with capability greater than or equal to 5.0 required, cap:%d\n", cap);
         return 0;
     }
 
@@ -495,6 +493,52 @@ int main(int argc, char **argv) {
         }
     }
 
+    {
+        // Use warp shuffle to do the reduction.
+        Func a, b, c;
+        Var x, y, yo, yi, ylane, u;
+        RVar ro, ri;
+
+        a(x, y) = x + y;
+        a.compute_root();
+
+        RDom r(0, 1024);
+        b(y) = 0;
+        b(y) += a(r, y);
+        c(y) = b(y);
+
+        int warp = 8;
+        c
+            .split(y, yo, yi, 1 * warp)
+            .split(yi, yi, ylane, 1)
+            .gpu_blocks(yo)
+            .gpu_threads(yi, ylane);
+        Func intm = b.update()
+            .split(r, ri, ro, warp)
+            .reorder(ri, ro)
+            .rfactor(ro, u);
+        intm
+            .compute_at(c, yi)
+            .update()
+            .gpu_lanes(u);
+        intm
+            .gpu_lanes(u);
+
+        Buffer<int> out = c.realize({256});
+        for (int y = 0; y < out.width(); y++) {
+            int correct = 0;
+            for (int x = 0; x < 1024; x++) {
+                correct += x + y;
+            }
+            int actual = out(y);
+            if (correct != actual) {
+                printf("out(%d) = %d instead of %d\n",
+                       y, actual, correct);
+                return -1;
+            }
+        }
+    }
+
     {
         // Test a case that caused combinatorial explosion
         Var x;