From 71f6c21191ec9e6a44df82ddb9d5850bda8d9908 Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <shoaibkamil@gmail.com>
Date: Wed, 2 Feb 2022 17:06:04 -0500
Subject: [PATCH 1/9] Scalarize predicated Loads

---
 src/CodeGen_GPU_Dev.cpp            | 24 +++++++++++++-----
 src/CodeGen_Metal_Dev.cpp          | 39 ++++++++++++++++++++++++++++++
 test/correctness/gpu_vectorize.cpp | 36 +++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 6 deletions(-)
diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp
index 1f29f01fc1ad..108b9cef73a0 100644
--- a/src/CodeGen_GPU_Dev.cpp
+++ b/src/CodeGen_GPU_Dev.cpp
@@ -127,12 +127,24 @@ class ScalarizePredicatedLoadStore : public IRMutator {
 
     Expr visit(const Load *op) override {
         if (!is_const_one(op->predicate)) {
-            Expr load_expr = Load::make(op->type, op->name, op->index, op->image,
-                                        op->param, const_true(op->type.lanes()), op->alignment);
-            Expr pred_load = Call::make(load_expr.type(),
-                                        Call::if_then_else,
-                                        {op->predicate, load_expr},
-                                        Internal::Call::PureIntrinsic);
+            std::vector<Expr> lane_values;
+            for (int ln = 0; ln < op->type.lanes(); ln++) {
+                Expr load_expr = Load::make(op->type.element_of(),
+                                            op->name,
+                                            extract_lane(op->index, ln),
+                                            op->image,
+                                            op->param,
+                                            const_true(),
+                                            // TODO: alignment needs to be changed
+                                            op->alignment);
+                lane_values.push_back(Call::make(load_expr.type(),
+                                                 Call::if_then_else,
+                                                 {extract_lane(op->predicate, ln),
+                                                  load_expr,
+                                                  make_zero(op->type.element_of())},
+                                                 Internal::Call::PureIntrinsic));
+            }
+            Expr pred_load = Shuffle::make_concat(lane_values);
             return pred_load;
         } else {
             return op;
diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp
index 6089d84b9d3a..b8e0a5b87eed 100644
--- a/src/CodeGen_Metal_Dev.cpp
+++ b/src/CodeGen_Metal_Dev.cpp
@@ -93,6 +93,8 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
         void visit(const Free *op) override;
         void visit(const Cast *op) override;
         void visit(const Atomic *op) override;
+        void visit(const IfThenElse *op) override;
+        void visit(const Shuffle *op) override;
     };
 
     std::ostringstream src_stream;
@@ -544,6 +546,43 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Atomic *op) {
     user_assert(false) << "Atomic updates are not supported inside Metal kernels";
 }
 
+void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) {
+    if (op->type.is_scalar()) {
+        CodeGen_C::visit(op);
+    } else {
+        internal_assert(!op->vectors.empty());
+        for (size_t i = 1; i < op->vectors.size(); i++) {
+            internal_assert(op->vectors[0].type() == op->vectors[i].type());
+        }
+        internal_assert(op->type.lanes() == (int)op->indices.size());
+        const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
+        for (int i : op->indices) {
+            internal_assert(i >= -1 && i < max_index);
+        }
+
+        std::vector<string> vecs;
+        for (const Expr &v : op->vectors) {
+            vecs.push_back(print_expr(v));
+        }
+
+        string src = vecs[0];
+        ostringstream rhs;
+        // This code has always assumed/required that all the vectors
+        // have identical types, so let's verify
+        const Type t0 = op->vectors[0].type();
+        for (const auto &v : op->vectors) {
+            internal_assert(t0 == v.type());
+        }
+        string storage_name = unique_name('_');
+        rhs << "{";
+        for (int i : op->indices) {
+            rhs << vecs[i] << ",";
+        }
+        rhs << "}";
+        print_assignment(op->type, rhs.str());
+    }
+}
+
 void CodeGen_Metal_Dev::add_kernel(Stmt s,
                                    const string &name,
                                    const vector<DeviceArgument> &args) {
diff --git a/test/correctness/gpu_vectorize.cpp b/test/correctness/gpu_vectorize.cpp
index 407342cca1af..2e0ffeebc3b5 100644
--- a/test/correctness/gpu_vectorize.cpp
+++ b/test/correctness/gpu_vectorize.cpp
@@ -70,6 +70,42 @@ int main(int argc, char **argv) {
             }
         }
     }
+    {
+        Var x("x"), y("y"), xi("xi"), yi("yi");
+        Func f("f");
+        ImageParam im(Float(32), 2);  // float input, indexed as im(x, y) below
+
+        printf("Defining function...\n");
+
+        f(x, y) = select(im(x, y) > 32.0f, 1.0f, -1.0f) + im(x, y);
+
+        Target target = get_jit_target_from_environment();
+        if (target.has_gpu_feature()) {  // otherwise f keeps its default schedule
+            f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::GuardWithIf).vectorize(xi, 4, TailStrategy::GuardWithIf);
+        }
+
+        printf("Realizing function...\n");
+        Buffer<float> input_img(32, 32);
+        for (int i = 0; i < 32; i++) {
+            for (int j = 0; j < 32; j++) {
+                input_img(i, j) = i + j;  // ramp 0..62, so both select branches occur
+            }
+        }
+        im.set(input_img);
+
+        Buffer<float> imf = f.realize({32, 32}, target);
+
+        // Compare every pixel with the same expression evaluated on the host
+        for (int i = 0; i < 32; i++) {
+            for (int j = 0; j < 32; j++) {
+                float correct = (i + j > 32 ? 1.0f : -1.0f) + i + j;
+                if (fabs(imf(i, j) - correct) > 0.001f) {
+                    printf("imf[%d, %d] = %f instead of %f\n", i, j, imf(i, j), correct);
+                    return -1;
+                }
+            }
+        }
+    }
 
     printf("Success!\n");
     return 0;

From fc28a0481dcaeafac1015be8f069b6005a6dbf0d Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <shoaibkamil@gmail.com>
Date: Mon, 7 Feb 2022 10:22:48 -0500
Subject: [PATCH 2/9] Cleanup

---
 src/CodeGen_GPU_Dev.cpp   | 6 ++----
 src/CodeGen_Metal_Dev.cpp | 1 -
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp
index 108b9cef73a0..17eac6bba86f 100644
--- a/src/CodeGen_GPU_Dev.cpp
+++ b/src/CodeGen_GPU_Dev.cpp
@@ -116,8 +116,7 @@ class ScalarizePredicatedLoadStore : public IRMutator {
                                 mutate(extract_lane(s->index, ln)),
                                 s->param,
                                 const_true(),
-                                // TODO: alignment needs to be changed
-                                s->alignment)));
+                                s->alignment + ln)));
             }
             return Block::make(scalar_stmts);
         } else {
@@ -135,8 +134,7 @@ class ScalarizePredicatedLoadStore : public IRMutator {
                                             op->image,
                                             op->param,
                                             const_true(),
-                                            // TODO: alignment needs to be changed
-                                            op->alignment);
+                                            op->alignment + ln);
                 lane_values.push_back(Call::make(load_expr.type(),
                                                  Call::if_then_else,
                                                  {extract_lane(op->predicate, ln),
diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp
index b8e0a5b87eed..0efa33b19c0e 100644
--- a/src/CodeGen_Metal_Dev.cpp
+++ b/src/CodeGen_Metal_Dev.cpp
@@ -93,7 +93,6 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
         void visit(const Free *op) override;
         void visit(const Cast *op) override;
         void visit(const Atomic *op) override;
-        void visit(const IfThenElse *op) override;
         void visit(const Shuffle *op) override;
     };
 

From 3e83b8203ee469a5933fda707fc9dbcc321b574b Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <kamil@adobe.com>
Date: Wed, 30 Mar 2022 17:17:44 -0400
Subject: [PATCH 3/9] Fix gpu_vectorize scalarization for D3D12

---
 src/CodeGen_D3D12Compute_Dev.cpp | 39 ++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp
index b1e99626e53b..67d8d6d03c81 100644
--- a/src/CodeGen_D3D12Compute_Dev.cpp
+++ b/src/CodeGen_D3D12Compute_Dev.cpp
@@ -106,6 +106,7 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
         void visit(const Cast *op) override;
         void visit(const Atomic *op) override;
         void visit(const FloatImm *op) override;
+        void visit(const Shuffle *op) override;
 
         Scope<> groupshared_allocations;
     };
@@ -957,6 +958,43 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Atomic *op) {
     user_assert(false) << "Atomics operations are not supported inside D3D12Compute kernel.\n";
 }
 
+void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op) {
+    if (op->type.is_scalar()) {
+        CodeGen_C::visit(op);
+    } else {
+        internal_assert(!op->vectors.empty());
+        for (size_t i = 1; i < op->vectors.size(); i++) {
+            internal_assert(op->vectors[0].type() == op->vectors[i].type());
+        }
+        internal_assert(op->type.lanes() == (int)op->indices.size());
+        const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
+        for (int i : op->indices) {
+            internal_assert(i >= -1 && i < max_index);
+        }
+
+        std::vector<string> vecs;
+        for (const Expr &v : op->vectors) {
+            vecs.push_back(print_expr(v));
+        }
+
+        string src = vecs[0];
+        ostringstream rhs;
+        // This code has always assumed/required that all the vectors
+        // have identical types, so let's verify
+        const Type t0 = op->vectors[0].type();
+        for (const auto &v : op->vectors) {
+            internal_assert(t0 == v.type());
+        }
+        string storage_name = unique_name('_');
+        rhs << "{";
+        for (int i : op->indices) {
+            rhs << vecs[i] << ",";
+        }
+        rhs << "}";
+        print_assignment(op->type, rhs.str());
+    }
+}
+
 void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op) {
     // TODO(marcos): just a pass-through for now, but we might consider doing
     // something different, such as adding the suffic 'u' to the integer that
@@ -1240,6 +1278,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
     }
 
     stream << "\n";
+
 }
 
 void CodeGen_D3D12Compute_Dev::init_module() {

From 28dac275d638ddca009ea16ef6421999af0e84de Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <kamil@adobe.com>
Date: Wed, 30 Mar 2022 18:20:23 -0400
Subject: [PATCH 4/9] Fix OpenCL scalarization

---
 src/CodeGen_OpenCL_Dev.cpp | 51 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp
index 539a0699b909..7f00cdd46b2e 100644
--- a/src/CodeGen_OpenCL_Dev.cpp
+++ b/src/CodeGen_OpenCL_Dev.cpp
@@ -858,8 +858,50 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
             }
             stream << ");\n";
         }
+    } else if (op->is_extract_element()) {
+        // OpenCL requires using .s<n> format for extracting an element
+      ostringstream rhs;
+      rhs << print_expr(op->vectors[0]);
+      rhs << ".s" << op->indices[0];
+      print_assignment(op->type, rhs.str());
+    } else if (op->type.is_scalar()) {
+        CodeGen_C::visit(op);
     } else {
-        internal_error << "Shuffle not implemented.\n";
+        internal_assert(!op->vectors.empty());
+        for (size_t i = 1; i < op->vectors.size(); i++) {
+            internal_assert(op->vectors[0].type() == op->vectors[i].type());
+        }
+        internal_assert(op->type.lanes() == (int)op->indices.size());
+        const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
+        for (int i : op->indices) {
+            internal_assert(i >= -1 && i < max_index);
+        }
+
+        std::vector<string> vecs;
+        for (const Expr &v : op->vectors) {
+            vecs.push_back(print_expr(v));
+        }
+
+        string src = vecs[0];
+        ostringstream rhs;
+        // This code has always assumed/required that all the vectors
+        // have identical types, so let's verify
+        const Type t0 = op->vectors[0].type();
+        for (const auto &v : op->vectors) {
+            internal_assert(t0 == v.type());
+        }
+        string storage_name = unique_name('_');
+        rhs << "(" << print_type(op->type) << ")";
+        rhs << "(";
+        for (int i : op->indices) {
+            rhs << vecs[i];
+            if (i < op->indices.size() - 1) {
+                rhs << ", ";
+            }
+        }
+        rhs << ")";
+        print_assignment(op->type, rhs.str());
+
     }
 }
 
@@ -926,6 +968,13 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s,
     debug(2) << "After eliminating bool vectors:\n"
              << s << "\n";
 
+    // We need to scalarize/de-predicate any loads/stores, since OpenCL does not
+    // support predication.
+    s = scalarize_predicated_loads_stores(s);
+
+    debug(2) << "After removing predication: \n"
+             << s;
+
     // Figure out which arguments should be passed in __constant.
     // Such arguments should be:
     // - not written to,

From 5ffa87ce964f1d7fbbff80ead1bcfe1741c5b337 Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <shoaibkamil@gmail.com>
Date: Wed, 30 Mar 2022 18:39:01 -0400
Subject: [PATCH 5/9] Minor fixes

---
 src/CodeGen_D3D12Compute_Dev.cpp | 1 -
 src/CodeGen_OpenCL_Dev.cpp       | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp
index 67d8d6d03c81..4cf57dd8adb9 100644
--- a/src/CodeGen_D3D12Compute_Dev.cpp
+++ b/src/CodeGen_D3D12Compute_Dev.cpp
@@ -1278,7 +1278,6 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
     }
 
     stream << "\n";
-
 }
 
 void CodeGen_D3D12Compute_Dev::init_module() {
diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp
index 7f00cdd46b2e..4bdc462db650 100644
--- a/src/CodeGen_OpenCL_Dev.cpp
+++ b/src/CodeGen_OpenCL_Dev.cpp
@@ -895,7 +895,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
         rhs << "(";
         for (int i : op->indices) {
             rhs << vecs[i];
-            if (i < op->indices.size() - 1) {
+            if (i < (int)(op->indices.size() - 1)) {
                 rhs << ", ";
             }
         }

From 06017f240cdc90bd63884a9169f4c486bd8003cf Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <shoaibkamil@gmail.com>
Date: Wed, 30 Mar 2022 22:19:04 -0400
Subject: [PATCH 6/9] Formatting

---
 src/CodeGen_OpenCL_Dev.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp
index 4bdc462db650..c99bd1bb1919 100644
--- a/src/CodeGen_OpenCL_Dev.cpp
+++ b/src/CodeGen_OpenCL_Dev.cpp
@@ -860,10 +860,10 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
         }
     } else if (op->is_extract_element()) {
         // OpenCL requires using .s<n> format for extracting an element
-      ostringstream rhs;
-      rhs << print_expr(op->vectors[0]);
-      rhs << ".s" << op->indices[0];
-      print_assignment(op->type, rhs.str());
+        ostringstream rhs;
+        rhs << print_expr(op->vectors[0]);
+        rhs << ".s" << op->indices[0];
+        print_assignment(op->type, rhs.str());
     } else if (op->type.is_scalar()) {
         CodeGen_C::visit(op);
     } else {
@@ -901,7 +901,6 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
         }
         rhs << ")";
         print_assignment(op->type, rhs.str());
-
     }
 }
 

From c01d44bf9da1fff01bcf76769a6978ad321ba51f Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <shoaibkamil@gmail.com>
Date: Thu, 31 Mar 2022 10:50:20 -0400
Subject: [PATCH 7/9] Address review comments

---
 src/CodeGen_D3D12Compute_Dev.cpp | 7 +++++--
 src/CodeGen_Metal_Dev.cpp        | 7 +++++--
 src/CodeGen_OpenCL_Dev.cpp       | 2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp
index 4cf57dd8adb9..19b4ab88080a 100644
--- a/src/CodeGen_D3D12Compute_Dev.cpp
+++ b/src/CodeGen_D3D12Compute_Dev.cpp
@@ -969,7 +969,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op)
         internal_assert(op->type.lanes() == (int)op->indices.size());
         const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
         for (int i : op->indices) {
-            internal_assert(i >= -1 && i < max_index);
+            internal_assert(i >= 0 && i < max_index);
         }
 
         std::vector<string> vecs;
@@ -988,7 +988,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op)
         string storage_name = unique_name('_');
         rhs << "{";
         for (int i : op->indices) {
-            rhs << vecs[i] << ",";
+            rhs << vecs[i];
+            if (i < (int)(op->indices.size() - 1)) {
+                rhs << ", ";
+            }
         }
         rhs << "}";
         print_assignment(op->type, rhs.str());
diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp
index 0efa33b19c0e..4cf8443fb5ae 100644
--- a/src/CodeGen_Metal_Dev.cpp
+++ b/src/CodeGen_Metal_Dev.cpp
@@ -556,7 +556,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) {
         internal_assert(op->type.lanes() == (int)op->indices.size());
         const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
         for (int i : op->indices) {
-            internal_assert(i >= -1 && i < max_index);
+            internal_assert(i >= 0 && i < max_index);
         }
 
         std::vector<string> vecs;
@@ -575,7 +575,10 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) {
         string storage_name = unique_name('_');
         rhs << "{";
         for (int i : op->indices) {
-            rhs << vecs[i] << ",";
+            rhs << vecs[i];
+            if (i < (int)(op->indices.size() - 1)) {
+                rhs << ", ";
+            }
         }
         rhs << "}";
         print_assignment(op->type, rhs.str());
diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp
index c99bd1bb1919..9defbd377a2e 100644
--- a/src/CodeGen_OpenCL_Dev.cpp
+++ b/src/CodeGen_OpenCL_Dev.cpp
@@ -874,7 +874,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
         internal_assert(op->type.lanes() == (int)op->indices.size());
         const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
         for (int i : op->indices) {
-            internal_assert(i >= -1 && i < max_index);
+            internal_assert(i >= 0 && i < max_index);
         }
 
         std::vector<string> vecs;

From 63b38608700cad1bea7e27e51e4966cb7c871659 Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <shoaibkamil@gmail.com>
Date: Thu, 31 Mar 2022 15:58:29 -0400
Subject: [PATCH 8/9] Move Shuffle impl to CodeGen_GPU_C class

---
 src/CodeGen_D3D12Compute_Dev.cpp | 56 ++++--------------------------
 src/CodeGen_GPU_Dev.cpp          | 42 +++++++++++++++++++++++
 src/CodeGen_GPU_Dev.h            | 27 +++++++++++++--
 src/CodeGen_Metal_Dev.cpp        | 52 +++-------------------------
 src/CodeGen_OpenCL_Dev.cpp       | 59 +++++++-------------------------
 5 files changed, 91 insertions(+), 145 deletions(-)

diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp
index 19b4ab88080a..41fa6ecc2660 100644
--- a/src/CodeGen_D3D12Compute_Dev.cpp
+++ b/src/CodeGen_D3D12Compute_Dev.cpp
@@ -3,7 +3,6 @@
 #include <sstream>
 #include <utility>
 
-#include "CodeGen_C.h"
 #include "CodeGen_D3D12Compute_Dev.h"
 #include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
@@ -62,10 +61,10 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
 protected:
     friend struct StoragePackUnpack;
 
-    class CodeGen_D3D12Compute_C : public CodeGen_C {
+    class CodeGen_D3D12Compute_C : public CodeGen_GPU_C {
     public:
         CodeGen_D3D12Compute_C(std::ostream &s, const Target &t)
-            : CodeGen_C(s, t) {
+            : CodeGen_GPU_C(s, t) {
             integer_suffix_style = IntegerSuffixStyle::HLSL;
         }
         void add_kernel(Stmt stmt,
@@ -88,7 +87,7 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
 
         std::string print_assignment(Type t, const std::string &rhs) override;
 
-        using CodeGen_C::visit;
+        using CodeGen_GPU_C::visit;
         void visit(const Evaluate *op) override;
         void visit(const Min *) override;
         void visit(const Max *) override;
@@ -106,7 +105,6 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
         void visit(const Cast *op) override;
         void visit(const Atomic *op) override;
         void visit(const FloatImm *op) override;
-        void visit(const Shuffle *op) override;
 
         Scope<> groupshared_allocations;
     };
@@ -304,7 +302,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const For *loop) {
 
     if (!is_gpu_var(loop->name)) {
         user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside D3D12Compute kernel\n";
-        CodeGen_C::visit(loop);
+        CodeGen_GPU_C::visit(loop);
         return;
     }
 
@@ -381,7 +379,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Call *op) {
         // directly.
         stream << "pow(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")";
     } else {
-        CodeGen_C::visit(op);
+        CodeGen_GPU_C::visit(op);
     }
 }
 
@@ -816,7 +814,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Free *op) {
 
 string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_assignment(Type type, const string &rhs) {
     string rhs_modified = print_reinforced_cast(type, rhs);
-    return CodeGen_C::print_assignment(type, rhs_modified);
+    return CodeGen_GPU_C::print_assignment(type, rhs_modified);
 }
 
 string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_vanilla_cast(Type type, const string &value_expr) {
@@ -958,46 +956,6 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Atomic *op) {
     user_assert(false) << "Atomics operations are not supported inside D3D12Compute kernel.\n";
 }
 
-void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op) {
-    if (op->type.is_scalar()) {
-        CodeGen_C::visit(op);
-    } else {
-        internal_assert(!op->vectors.empty());
-        for (size_t i = 1; i < op->vectors.size(); i++) {
-            internal_assert(op->vectors[0].type() == op->vectors[i].type());
-        }
-        internal_assert(op->type.lanes() == (int)op->indices.size());
-        const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
-        for (int i : op->indices) {
-            internal_assert(i >= 0 && i < max_index);
-        }
-
-        std::vector<string> vecs;
-        for (const Expr &v : op->vectors) {
-            vecs.push_back(print_expr(v));
-        }
-
-        string src = vecs[0];
-        ostringstream rhs;
-        // This code has always assumed/required that all the vectors
-        // have identical types, so let's verify
-        const Type t0 = op->vectors[0].type();
-        for (const auto &v : op->vectors) {
-            internal_assert(t0 == v.type());
-        }
-        string storage_name = unique_name('_');
-        rhs << "{";
-        for (int i : op->indices) {
-            rhs << vecs[i];
-            if (i < (int)(op->indices.size() - 1)) {
-                rhs << ", ";
-            }
-        }
-        rhs << "}";
-        print_assignment(op->type, rhs.str());
-    }
-}
-
 void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op) {
     // TODO(marcos): just a pass-through for now, but we might consider doing
     // something different, such as adding the suffic 'u' to the integer that
@@ -1005,7 +963,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op)
     // have seen division-by-zero shader warnings, and we postulated that it
     // could be indirectly related to compiler assumptions on signed integer
     // overflow when float_from_bits() is called, but we don't know for sure
-    return CodeGen_C::visit(op);
+    return CodeGen_GPU_C::visit(op);
 }
 
 void CodeGen_D3D12Compute_Dev::add_kernel(Stmt s,
diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp
index 17eac6bba86f..3606ae41e3c9 100644
--- a/src/CodeGen_GPU_Dev.cpp
+++ b/src/CodeGen_GPU_Dev.cpp
@@ -157,5 +157,47 @@ Stmt CodeGen_GPU_Dev::scalarize_predicated_loads_stores(Stmt &s) {
     return sps.mutate(s);
 }
 
+/** Emit a vector Shuffle as an immediate vector whose lanes are the printed
+ * operands selected by op->indices; scalar results defer to CodeGen_C. */
+void CodeGen_GPU_C::visit(const Shuffle *op) {
+    if (op->type.is_scalar()) {
+        CodeGen_C::visit(op);
+    } else {
+        internal_assert(!op->vectors.empty());
+        for (size_t i = 1; i < op->vectors.size(); i++) {
+            internal_assert(op->vectors[0].type() == op->vectors[i].type());
+        }
+        internal_assert(op->type.lanes() == (int)op->indices.size());
+        // Each index selects a whole operand from vecs, so operands must be scalar.
+        internal_assert(op->vectors[0].type().is_scalar())
+            << "Shuffle of vector operands is not supported in GPU C-like codegen\n";
+        const int max_index = (int)op->vectors.size();
+        for (int i : op->indices) {
+            internal_assert(i >= 0 && i < max_index);
+        }
+
+        std::vector<std::string> vecs;
+        for (const Expr &v : op->vectors) {
+            vecs.push_back(print_expr(v));
+        }
+
+        const bool opencl_style = vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax;
+        std::ostringstream rhs;
+        if (opencl_style) {
+            rhs << "(" << print_type(op->type) << ")(";
+        } else {
+            rhs << "{";
+        }
+        // Separate lanes by position; the index value says nothing about position.
+        const char *separator = "";
+        for (int i : op->indices) {
+            rhs << separator << vecs[i];
+            separator = ", ";
+        }
+        rhs << (opencl_style ? ")" : "}");
+        print_assignment(op->type, rhs.str());
+    }
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_GPU_Dev.h b/src/CodeGen_GPU_Dev.h
index 2b516fd62d13..dfbd1f58c49b 100644
--- a/src/CodeGen_GPU_Dev.h
+++ b/src/CodeGen_GPU_Dev.h
@@ -7,6 +7,7 @@
 #include <string>
 #include <vector>
 
+#include "CodeGen_C.h"
 #include "DeviceArgument.h"
 #include "Expr.h"
 
@@ -73,8 +74,8 @@ struct CodeGen_GPU_Dev {
     static Stmt scalarize_predicated_loads_stores(Stmt &s);
 
     /** An mask describing which type of memory fence to use for the gpu_thread_barrier()
-    * intrinsic.  Not all GPUs APIs support all types.
-    */
+     * intrinsic.  Not all GPUs APIs support all types.
+     */
     enum MemoryFenceType {
         None = 0,    // No fence required (just a sync)
         Device = 1,  // Device/global memory fence
@@ -82,6 +83,28 @@ struct CodeGen_GPU_Dev {
     };
 };
 
+/** A base class for GPU backends that require C-like shader output. GPU backends
+ * derive from this class and may set vector_declaration_style when constructed. */
+class CodeGen_GPU_C : public CodeGen_C {
+public:
+    /** OpenCL uses a different syntax than C for immediate vectors. This
+     * enum selects which style visit(const Shuffle *) emits for a backend. */
+    enum class VectorDeclarationStyle {
+        CLikeSyntax = 0,  // braced list: {a, b, ...}
+        OpenCLSyntax = 1  // cast-style list: (type)(a, b, ...)
+    };
+
+    CodeGen_GPU_C(std::ostream &s, Target t)
+        : CodeGen_C(s, t) {
+    }
+
+protected:
+    using CodeGen_C::visit;  // keep the remaining CodeGen_C::visit overloads visible
+    void visit(const Shuffle *op) override;  // vector shuffles become immediate vectors
+
+    VectorDeclarationStyle vector_declaration_style = VectorDeclarationStyle::CLikeSyntax;
+};
+
 }  // namespace Internal
 }  // namespace Halide
 
diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp
index 4cf8443fb5ae..d76ebb708ce5 100644
--- a/src/CodeGen_Metal_Dev.cpp
+++ b/src/CodeGen_Metal_Dev.cpp
@@ -2,7 +2,6 @@
 #include <sstream>
 #include <utility>
 
-#include "CodeGen_C.h"
 #include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
 #include "CodeGen_Metal_Dev.h"
@@ -50,17 +49,17 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
     }
 
 protected:
-    class CodeGen_Metal_C : public CodeGen_C {
+    class CodeGen_Metal_C : public CodeGen_GPU_C {
     public:
         CodeGen_Metal_C(std::ostream &s, const Target &t)
-            : CodeGen_C(s, t) {
+            : CodeGen_GPU_C(s, t) {
         }
         void add_kernel(const Stmt &stmt,
                         const std::string &name,
                         const std::vector<DeviceArgument> &args);
 
     protected:
-        using CodeGen_C::visit;
+        using CodeGen_GPU_C::visit;
         std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override;
         // Vectors in Metal come in two varieties, regular and packed.
         // For storage allocations and pointers used in address arithmetic,
@@ -93,7 +92,6 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
         void visit(const Free *op) override;
         void visit(const Cast *op) override;
         void visit(const Atomic *op) override;
-        void visit(const Shuffle *op) override;
     };
 
     std::ostringstream src_stream;
@@ -268,7 +266,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const For *loop) {
 
     } else {
         user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside Metal kernel\n";
-        CodeGen_C::visit(loop);
+        CodeGen_GPU_C::visit(loop);
     }
 }
 
@@ -322,7 +320,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Call *op) {
         stream << ");\n";
         print_assignment(op->type, "0");
     } else {
-        CodeGen_C::visit(op);
+        CodeGen_GPU_C::visit(op);
     }
 }
 
@@ -545,46 +543,6 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Atomic *op) {
     user_assert(false) << "Atomic updates are not supported inside Metal kernels";
 }
 
-void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) {
-    if (op->type.is_scalar()) {
-        CodeGen_C::visit(op);
-    } else {
-        internal_assert(!op->vectors.empty());
-        for (size_t i = 1; i < op->vectors.size(); i++) {
-            internal_assert(op->vectors[0].type() == op->vectors[i].type());
-        }
-        internal_assert(op->type.lanes() == (int)op->indices.size());
-        const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
-        for (int i : op->indices) {
-            internal_assert(i >= 0 && i < max_index);
-        }
-
-        std::vector<string> vecs;
-        for (const Expr &v : op->vectors) {
-            vecs.push_back(print_expr(v));
-        }
-
-        string src = vecs[0];
-        ostringstream rhs;
-        // This code has always assumed/required that all the vectors
-        // have identical types, so let's verify
-        const Type t0 = op->vectors[0].type();
-        for (const auto &v : op->vectors) {
-            internal_assert(t0 == v.type());
-        }
-        string storage_name = unique_name('_');
-        rhs << "{";
-        for (int i : op->indices) {
-            rhs << vecs[i];
-            if (i < (int)(op->indices.size() - 1)) {
-                rhs << ", ";
-            }
-        }
-        rhs << "}";
-        print_assignment(op->type, rhs.str());
-    }
-}
-
 void CodeGen_Metal_Dev::add_kernel(Stmt s,
                                    const string &name,
                                    const vector<DeviceArgument> &args) {
diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp
index 9defbd377a2e..f5a51c88d555 100644
--- a/src/CodeGen_OpenCL_Dev.cpp
+++ b/src/CodeGen_OpenCL_Dev.cpp
@@ -4,7 +4,6 @@
 #include <utility>
 
 #include "CSE.h"
-#include "CodeGen_C.h"
 #include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
 #include "CodeGen_OpenCL_Dev.h"
@@ -55,18 +54,19 @@ class CodeGen_OpenCL_Dev : public CodeGen_GPU_Dev {
     }
 
 protected:
-    class CodeGen_OpenCL_C : public CodeGen_C {
+    class CodeGen_OpenCL_C : public CodeGen_GPU_C {
     public:
         CodeGen_OpenCL_C(std::ostream &s, Target t)
-            : CodeGen_C(s, t) {
+            : CodeGen_GPU_C(s, t) {
             integer_suffix_style = IntegerSuffixStyle::OpenCL;
+            vector_declaration_style = VectorDeclarationStyle::OpenCLSyntax;
         }
         void add_kernel(Stmt stmt,
                         const std::string &name,
                         const std::vector<DeviceArgument> &args);
 
     protected:
-        using CodeGen_C::visit;
+        using CodeGen_GPU_C::visit;
         std::string print_type(Type type, AppendSpaceIfNeeded append_space = DoNotAppendSpace) override;
         std::string print_reinterpret(Type type, const Expr &e) override;
         std::string print_extern_call(const Call *op) override;
@@ -223,7 +223,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) {
 
     } else {
         user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside OpenCL kernel\n";
-        CodeGen_C::visit(loop);
+        CodeGen_GPU_C::visit(loop);
     }
 }
 
@@ -351,7 +351,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) {
                 print_assignment(op->type, a0 + " >> " + a1);
             }
         } else {
-            CodeGen_C::visit(op);
+            CodeGen_GPU_C::visit(op);
         }
     } else if (op->is_intrinsic(Call::image_load)) {
         // image_load(<image name>, <buffer>, <x>, <x-extent>, <y>,
@@ -455,7 +455,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) {
             stream << write_image.str();
         }
     } else {
-        CodeGen_C::visit(op);
+        CodeGen_GPU_C::visit(op);
     }
 }
 
@@ -743,7 +743,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Cast *op) {
     if (op->type.is_vector()) {
         print_assignment(op->type, "convert_" + print_type(op->type) + "(" + print_expr(op->value) + ")");
     } else {
-        CodeGen_C::visit(op);
+        CodeGen_GPU_C::visit(op);
     }
 }
 
@@ -755,7 +755,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Select *op) {
         equiv.accept(this);
         return;
     }
-    CodeGen_C::visit(op);
+    CodeGen_GPU_C::visit(op);
 }
 
 void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Allocate *op) {
@@ -864,43 +864,8 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
         rhs << print_expr(op->vectors[0]);
         rhs << ".s" << op->indices[0];
         print_assignment(op->type, rhs.str());
-    } else if (op->type.is_scalar()) {
-        CodeGen_C::visit(op);
-    } else {
-        internal_assert(!op->vectors.empty());
-        for (size_t i = 1; i < op->vectors.size(); i++) {
-            internal_assert(op->vectors[0].type() == op->vectors[i].type());
-        }
-        internal_assert(op->type.lanes() == (int)op->indices.size());
-        const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
-        for (int i : op->indices) {
-            internal_assert(i >= 0 && i < max_index);
-        }
-
-        std::vector<string> vecs;
-        for (const Expr &v : op->vectors) {
-            vecs.push_back(print_expr(v));
-        }
-
-        string src = vecs[0];
-        ostringstream rhs;
-        // This code has always assumed/required that all the vectors
-        // have identical types, so let's verify
-        const Type t0 = op->vectors[0].type();
-        for (const auto &v : op->vectors) {
-            internal_assert(t0 == v.type());
-        }
-        string storage_name = unique_name('_');
-        rhs << "(" << print_type(op->type) << ")";
-        rhs << "(";
-        for (int i : op->indices) {
-            rhs << vecs[i];
-            if (i < (int)(op->indices.size() - 1)) {
-                rhs << ", ";
-            }
-        }
-        rhs << ")";
-        print_assignment(op->type, rhs.str());
+    } else  {
+        CodeGen_GPU_C::visit(op);
     }
 }
 
@@ -920,7 +885,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Atomic *op) {
 
     // Issue atomic stores.
     ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
-    CodeGen_C::visit(op);
+    CodeGen_GPU_C::visit(op);
 }
 
 void CodeGen_OpenCL_Dev::add_kernel(Stmt s,

From 524dee2357af31b4bb9b09ebba4845516f35e4d4 Mon Sep 17 00:00:00 2001
From: Shoaib Kamil <shoaibkamil@gmail.com>
Date: Thu, 31 Mar 2022 17:39:30 -0400
Subject: [PATCH 9/9] Extra space removal

---
 src/CodeGen_OpenCL_Dev.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp
index f5a51c88d555..bd58806b01e2 100644
--- a/src/CodeGen_OpenCL_Dev.cpp
+++ b/src/CodeGen_OpenCL_Dev.cpp
@@ -864,7 +864,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
         rhs << print_expr(op->vectors[0]);
         rhs << ".s" << op->indices[0];
         print_assignment(op->type, rhs.str());
-    } else  {
+    } else {
         CodeGen_GPU_C::visit(op);
     }
 }