From 71f6c21191ec9e6a44df82ddb9d5850bda8d9908 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 2 Feb 2022 17:06:04 -0500 Subject: [PATCH 1/9] Scalarize predicated Loads --- src/CodeGen_GPU_Dev.cpp | 24 +++++++++++++----- src/CodeGen_Metal_Dev.cpp | 39 ++++++++++++++++++++++++++++++ test/correctness/gpu_vectorize.cpp | 36 +++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 6 deletions(-) diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp index 1f29f01fc1ad..108b9cef73a0 100644 --- a/src/CodeGen_GPU_Dev.cpp +++ b/src/CodeGen_GPU_Dev.cpp @@ -127,12 +127,24 @@ class ScalarizePredicatedLoadStore : public IRMutator { Expr visit(const Load *op) override { if (!is_const_one(op->predicate)) { - Expr load_expr = Load::make(op->type, op->name, op->index, op->image, - op->param, const_true(op->type.lanes()), op->alignment); - Expr pred_load = Call::make(load_expr.type(), - Call::if_then_else, - {op->predicate, load_expr}, - Internal::Call::PureIntrinsic); + std::vector lane_values; + for (int ln = 0; ln < op->type.lanes(); ln++) { + Expr load_expr = Load::make(op->type.element_of(), + op->name, + extract_lane(op->index, ln), + op->image, + op->param, + const_true(), + // TODO: alignment needs to be changed + op->alignment); + lane_values.push_back(Call::make(load_expr.type(), + Call::if_then_else, + {extract_lane(op->predicate, ln), + load_expr, + make_zero(op->type.element_of())}, + Internal::Call::PureIntrinsic)); + } + Expr pred_load = Shuffle::make_concat(lane_values); return pred_load; } else { return op; diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 6089d84b9d3a..b8e0a5b87eed 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -93,6 +93,8 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev { void visit(const Free *op) override; void visit(const Cast *op) override; void visit(const Atomic *op) override; + void visit(const IfThenElse *op) override; + void visit(const Shuffle *op) override; }; std::ostringstream src_stream; @@ -544,6 +546,43 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Atomic *op) { user_assert(false) << "Atomic updates are not supported inside Metal kernels"; } +void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) { + if (op->type.is_scalar()) { + CodeGen_C::visit(op); + } else { + internal_assert(!op->vectors.empty()); + for (size_t i = 1; i < op->vectors.size(); i++) { + internal_assert(op->vectors[0].type() == op->vectors[i].type()); + } + internal_assert(op->type.lanes() == (int)op->indices.size()); + const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); + for (int i : op->indices) { + internal_assert(i >= -1 && i < max_index); + } + + std::vector vecs; + for (const Expr &v : op->vectors) { + vecs.push_back(print_expr(v)); + } + + string src = vecs[0]; + ostringstream rhs; + // This code has always assumed/required that all the vectors + // have identical types, so let's verify + const Type t0 = op->vectors[0].type(); + for (const auto &v : op->vectors) { + internal_assert(t0 == v.type()); + } + string storage_name = unique_name('_'); + rhs << "{"; + for (int i : op->indices) { + rhs << vecs[i] << ","; + } + rhs << "}"; + print_assignment(op->type, rhs.str()); + } +} + void CodeGen_Metal_Dev::add_kernel(Stmt s, const string &name, const vector &args) { diff --git a/test/correctness/gpu_vectorize.cpp b/test/correctness/gpu_vectorize.cpp index 407342cca1af..2e0ffeebc3b5 100644 --- a/test/correctness/gpu_vectorize.cpp +++ b/test/correctness/gpu_vectorize.cpp @@ -70,6 +70,42 @@ int main(int argc, char **argv) { } } } + { + Var x("x"), y("y"), xi("xi"), yi("yi"); + Func f("f"); + ImageParam im(Float(32), 2); + + printf("Defining function...\n"); + + f(x, y) = select(im(x, y) > 32.0f, 1.0f, -1.0f) + im(x, y); + + Target target = get_jit_target_from_environment(); + if (target.has_gpu_feature()) { + f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::GuardWithIf).vectorize(xi, 4, TailStrategy::GuardWithIf); + } + + printf("Realizing function...\n"); + Buffer input_img(32, 32); + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { + input_img(i, j) = i + j; + } + } + im.set(input_img); + + Buffer imf = f.realize({32, 32}, target); + + // Check the result was what we expected + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { + float correct = (i + j > 32 ? 1.0f : -1.0f) + i + j; + if (fabs(imf(i, j) - correct) > 0.001f) { + printf("imf[%d, %d] = %f instead of %f\n", i, j, imf(i, j), correct); + return -1; + } + } + } + } printf("Success!\n"); return 0; From fc28a0481dcaeafac1015be8f069b6005a6dbf0d Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Mon, 7 Feb 2022 10:22:48 -0500 Subject: [PATCH 2/9] Cleanup --- src/CodeGen_GPU_Dev.cpp | 6 ++---- src/CodeGen_Metal_Dev.cpp | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp index 108b9cef73a0..17eac6bba86f 100644 --- a/src/CodeGen_GPU_Dev.cpp +++ b/src/CodeGen_GPU_Dev.cpp @@ -116,8 +116,7 @@ class ScalarizePredicatedLoadStore : public IRMutator { mutate(extract_lane(s->index, ln)), s->param, const_true(), - // TODO: alignment needs to be changed - s->alignment))); + s->alignment + ln))); } return Block::make(scalar_stmts); } else { @@ -135,8 +134,7 @@ class ScalarizePredicatedLoadStore : public IRMutator { op->image, op->param, const_true(), - // TODO: alignment needs to be changed - op->alignment); + op->alignment + ln); lane_values.push_back(Call::make(load_expr.type(), Call::if_then_else, {extract_lane(op->predicate, ln), diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index b8e0a5b87eed..0efa33b19c0e 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -93,7 +93,6 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev { void visit(const Free *op) override; void visit(const Cast *op) override; void visit(const Atomic *op) override; - void visit(const IfThenElse *op) override; void visit(const Shuffle *op) override; }; From 3e83b8203ee469a5933fda707fc9dbcc321b574b Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 30 Mar 2022 17:17:44 -0400 Subject: [PATCH 3/9] Fix gpu_vectorize scalarization for D3D12 --- src/CodeGen_D3D12Compute_Dev.cpp | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index b1e99626e53b..67d8d6d03c81 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -106,6 +106,7 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev { void visit(const Cast *op) override; void visit(const Atomic *op) override; void visit(const FloatImm *op) override; + void visit(const Shuffle *op) override; Scope<> groupshared_allocations; }; @@ -957,6 +958,43 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Atomic *op) { user_assert(false) << "Atomics operations are not supported inside D3D12Compute kernel.\n"; } +void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op) { + if (op->type.is_scalar()) { + CodeGen_C::visit(op); + } else { + internal_assert(!op->vectors.empty()); + for (size_t i = 1; i < op->vectors.size(); i++) { + internal_assert(op->vectors[0].type() == op->vectors[i].type()); + } + internal_assert(op->type.lanes() == (int)op->indices.size()); + const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); + for (int i : op->indices) { + internal_assert(i >= -1 && i < max_index); + } + + std::vector vecs; + for (const Expr &v : op->vectors) { + vecs.push_back(print_expr(v)); + } + + string src = vecs[0]; + ostringstream rhs; + // This code has always assumed/required that all the vectors + // have identical types, so let's verify + const Type t0 = op->vectors[0].type(); + for (const auto &v : op->vectors) { + internal_assert(t0 == v.type()); + } + string storage_name = unique_name('_'); + rhs << "{"; + for (int i : op->indices) { + rhs << vecs[i] << ","; + } + rhs << "}"; + print_assignment(op->type, rhs.str()); + } +} + void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op) { // TODO(marcos): just a pass-through for now, but we might consider doing // something different, such as adding the suffic 'u' to the integer that @@ -1240,6 +1278,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, } stream << "\n"; + } void CodeGen_D3D12Compute_Dev::init_module() { From 28dac275d638ddca009ea16ef6421999af0e84de Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 30 Mar 2022 18:20:23 -0400 Subject: [PATCH 4/9] Fix OpenCL scalarization --- src/CodeGen_OpenCL_Dev.cpp | 51 +++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 539a0699b909..7f00cdd46b2e 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -858,8 +858,50 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) { } stream << ");\n"; } + } else if (op->is_extract_element()) { + // OpenCL requires using .s format for extracting an element + ostringstream rhs; + rhs << print_expr(op->vectors[0]); + rhs << ".s" << op->indices[0]; + print_assignment(op->type, rhs.str()); + } else if (op->type.is_scalar()) { + CodeGen_C::visit(op); } else { - internal_error << "Shuffle not implemented.\n"; + internal_assert(!op->vectors.empty()); + for (size_t i = 1; i < op->vectors.size(); i++) { + internal_assert(op->vectors[0].type() == op->vectors[i].type()); + } + internal_assert(op->type.lanes() == (int)op->indices.size()); + const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); + for (int i : op->indices) { + internal_assert(i >= -1 && i < max_index); + } + + std::vector vecs; + for (const Expr &v : op->vectors) { + vecs.push_back(print_expr(v)); + } + + string src = vecs[0]; + ostringstream rhs; + // This code has always assumed/required that all the vectors + // have identical types, so let's verify + const Type t0 = op->vectors[0].type(); + for (const auto &v : op->vectors) { + internal_assert(t0 == v.type()); + } + string storage_name = unique_name('_'); + rhs << "(" << print_type(op->type) << ")"; + rhs << "("; + for (int i : op->indices) { + rhs << vecs[i]; + if (i < op->indices.size() - 1) { + rhs << ", "; + } + } + rhs << ")"; + print_assignment(op->type, rhs.str()); + } } @@ -926,6 +968,13 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s, debug(2) << "After eliminating bool vectors:\n" << s << "\n"; + // We need to scalarize/de-predicate any loads/stores, since OpenCL does not + // support predication. + s = scalarize_predicated_loads_stores(s); + + debug(2) << "After removing predication: \n" + << s; + // Figure out which arguments should be passed in __constant. // Such arguments should be: // - not written to, From 5ffa87ce964f1d7fbbff80ead1bcfe1741c5b337 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 30 Mar 2022 18:39:01 -0400 Subject: [PATCH 5/9] Minor fixes --- src/CodeGen_D3D12Compute_Dev.cpp | 1 - src/CodeGen_OpenCL_Dev.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index 67d8d6d03c81..4cf57dd8adb9 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -1278,7 +1278,6 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, } stream << "\n"; - } void CodeGen_D3D12Compute_Dev::init_module() { diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 7f00cdd46b2e..4bdc462db650 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -895,7 +895,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) { rhs << "("; for (int i : op->indices) { rhs << vecs[i]; - if (i < op->indices.size() - 1) { + if (i < (int)(op->indices.size() - 1)) { rhs << ", "; } } From 06017f240cdc90bd63884a9169f4c486bd8003cf Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 30 Mar 2022 22:19:04 -0400 Subject: [PATCH 6/9] Formatting --- src/CodeGen_OpenCL_Dev.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 4bdc462db650..c99bd1bb1919 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -860,10 +860,10 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) { } } else if (op->is_extract_element()) { // OpenCL requires using .s format for extracting an element - ostringstream rhs; - rhs << print_expr(op->vectors[0]); - rhs << ".s" << op->indices[0]; - print_assignment(op->type, rhs.str()); + ostringstream rhs; + rhs << print_expr(op->vectors[0]); + rhs << ".s" << op->indices[0]; + print_assignment(op->type, rhs.str()); } else if (op->type.is_scalar()) { CodeGen_C::visit(op); } else { @@ -901,7 +901,6 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) { } rhs << ")"; print_assignment(op->type, rhs.str()); - } } From c01d44bf9da1fff01bcf76769a6978ad321ba51f Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Thu, 31 Mar 2022 10:50:20 -0400 Subject: [PATCH 7/9] Address review comments --- src/CodeGen_D3D12Compute_Dev.cpp | 7 +++++-- src/CodeGen_Metal_Dev.cpp | 7 +++++-- src/CodeGen_OpenCL_Dev.cpp | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index 4cf57dd8adb9..19b4ab88080a 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -969,7 +969,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op) internal_assert(op->type.lanes() == (int)op->indices.size()); const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); for (int i : op->indices) { - internal_assert(i >= -1 && i < max_index); + internal_assert(i >= 0 && i < max_index); } std::vector vecs; @@ -988,7 +988,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op) string storage_name = unique_name('_'); rhs << "{"; for (int i : op->indices) { - rhs << vecs[i] << ","; + rhs << vecs[i]; + if (i < (int)(op->indices.size() - 1)) { + rhs << ", "; + } } rhs << "}"; print_assignment(op->type, rhs.str()); diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 0efa33b19c0e..4cf8443fb5ae 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -556,7 +556,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) { internal_assert(op->type.lanes() == (int)op->indices.size()); const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); for (int i : op->indices) { - internal_assert(i >= -1 && i < max_index); + internal_assert(i >= 0 && i < max_index); } std::vector vecs; @@ -575,7 +575,10 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) { string storage_name = unique_name('_'); rhs << "{"; for (int i : op->indices) { - rhs << vecs[i] << ","; + rhs << vecs[i]; + if (i < (int)(op->indices.size() - 1)) { + rhs << ", "; + } } rhs << "}"; print_assignment(op->type, rhs.str()); diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index c99bd1bb1919..9defbd377a2e 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -874,7 +874,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) { internal_assert(op->type.lanes() == (int)op->indices.size()); const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); for (int i : op->indices) { - internal_assert(i >= -1 && i < max_index); + internal_assert(i >= 0 && i < max_index); } std::vector vecs; From 63b38608700cad1bea7e27e51e4966cb7c871659 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Thu, 31 Mar 2022 15:58:29 -0400 Subject: [PATCH 8/9] Move Shuffle impl to CodeGen_GPU_C class --- src/CodeGen_D3D12Compute_Dev.cpp | 56 ++++-------------------------- src/CodeGen_GPU_Dev.cpp | 42 +++++++++++++++++++++++ src/CodeGen_GPU_Dev.h | 27 +++++++++++++-- src/CodeGen_Metal_Dev.cpp | 52 +++------------------------- src/CodeGen_OpenCL_Dev.cpp | 59 +++++++------------------------- 5 files changed, 91 insertions(+), 145 deletions(-) diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index 19b4ab88080a..41fa6ecc2660 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -3,7 +3,6 @@ #include #include -#include "CodeGen_C.h" #include "CodeGen_D3D12Compute_Dev.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" @@ -62,10 +61,10 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev { protected: friend struct StoragePackUnpack; - class CodeGen_D3D12Compute_C : public CodeGen_C { + class CodeGen_D3D12Compute_C : public CodeGen_GPU_C { public: CodeGen_D3D12Compute_C(std::ostream &s, const Target &t) - : CodeGen_C(s, t) { + : CodeGen_GPU_C(s, t) { integer_suffix_style = IntegerSuffixStyle::HLSL; } void add_kernel(Stmt stmt, @@ -88,7 +87,7 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev { std::string print_assignment(Type t, const std::string &rhs) override; - using CodeGen_C::visit; + using CodeGen_GPU_C::visit; void visit(const Evaluate *op) override; void visit(const Min *) override; void visit(const Max *) override; @@ -106,7 +105,6 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev { void visit(const Cast *op) override; void visit(const Atomic *op) override; void visit(const FloatImm *op) override; - void visit(const Shuffle *op) override; Scope<> groupshared_allocations; }; @@ -304,7 +302,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const For *loop) { if (!is_gpu_var(loop->name)) { user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside D3D12Compute kernel\n"; - CodeGen_C::visit(loop); + CodeGen_GPU_C::visit(loop); return; } @@ -381,7 +379,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Call *op) { // directly. stream << "pow(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")"; } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } @@ -816,7 +814,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Free *op) { string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_assignment(Type type, const string &rhs) { string rhs_modified = print_reinforced_cast(type, rhs); - return CodeGen_C::print_assignment(type, rhs_modified); + return CodeGen_GPU_C::print_assignment(type, rhs_modified); } string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_vanilla_cast(Type type, const string &value_expr) { @@ -958,46 +956,6 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Atomic *op) { user_assert(false) << "Atomics operations are not supported inside D3D12Compute kernel.\n"; } -void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op) { - if (op->type.is_scalar()) { - CodeGen_C::visit(op); - } else { - internal_assert(!op->vectors.empty()); - for (size_t i = 1; i < op->vectors.size(); i++) { - internal_assert(op->vectors[0].type() == op->vectors[i].type()); - } - internal_assert(op->type.lanes() == (int)op->indices.size()); - const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); - for (int i : op->indices) { - internal_assert(i >= 0 && i < max_index); - } - - std::vector vecs; - for (const Expr &v : op->vectors) { - vecs.push_back(print_expr(v)); - } - - string src = vecs[0]; - ostringstream rhs; - // This code has always assumed/required that all the vectors - // have identical types, so let's verify - const Type t0 = op->vectors[0].type(); - for (const auto &v : op->vectors) { - internal_assert(t0 == v.type()); - } - string storage_name = unique_name('_'); - rhs << "{"; - for (int i : op->indices) { - rhs << vecs[i]; - if (i < (int)(op->indices.size() - 1)) { - rhs << ", "; - } - } - rhs << "}"; - print_assignment(op->type, rhs.str()); - } -} - void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op) { // TODO(marcos): just a pass-through for now, but we might consider doing // something different, such as adding the suffic 'u' to the integer that @@ -1005,7 +963,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op) // have seen division-by-zero shader warnings, and we postulated that it // could be indirectly related to compiler assumptions on signed integer // overflow when float_from_bits() is called, but we don't know for sure - return CodeGen_C::visit(op); + return CodeGen_GPU_C::visit(op); } void CodeGen_D3D12Compute_Dev::add_kernel(Stmt s, diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp index 17eac6bba86f..3606ae41e3c9 100644 --- a/src/CodeGen_GPU_Dev.cpp +++ b/src/CodeGen_GPU_Dev.cpp @@ -157,5 +157,47 @@ Stmt CodeGen_GPU_Dev::scalarize_predicated_loads_stores(Stmt &s) { return sps.mutate(s); } +void CodeGen_GPU_C::visit(const Shuffle *op) { + if (op->type.is_scalar()) { + CodeGen_C::visit(op); + } else { + internal_assert(!op->vectors.empty()); + for (size_t i = 1; i < op->vectors.size(); i++) { + internal_assert(op->vectors[0].type() == op->vectors[i].type()); + } + internal_assert(op->type.lanes() == (int)op->indices.size()); + const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); + for (int i : op->indices) { + internal_assert(i >= 0 && i < max_index); + } + + std::vector vecs; + for (const Expr &v : op->vectors) { + vecs.push_back(print_expr(v)); + } + + std::string src = vecs[0]; + std::ostringstream rhs; + std::string storage_name = unique_name('_'); + if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) { + rhs << "(" << print_type(op->type) << ")("; + } else { + rhs << "{"; + } + for (int i : op->indices) { + rhs << vecs[i]; + if (i < (int)(op->indices.size() - 1)) { + rhs << ", "; + } + } + if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) { + rhs << ")"; + } else { + rhs << "}"; + } + print_assignment(op->type, rhs.str()); + } +} + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_GPU_Dev.h b/src/CodeGen_GPU_Dev.h index 2b516fd62d13..dfbd1f58c49b 100644 --- a/src/CodeGen_GPU_Dev.h +++ b/src/CodeGen_GPU_Dev.h @@ -7,6 +7,7 @@ #include #include +#include "CodeGen_C.h" #include "DeviceArgument.h" #include "Expr.h" @@ -73,8 +74,8 @@ struct CodeGen_GPU_Dev { static Stmt scalarize_predicated_loads_stores(Stmt &s); /** An mask describing which type of memory fence to use for the gpu_thread_barrier() - * intrinsic. Not all GPUs APIs support all types. - */ + * intrinsic. Not all GPUs APIs support all types. + */ enum MemoryFenceType { None = 0, // No fence required (just a sync) Device = 1, // Device/global memory fence @@ -82,6 +83,28 @@ struct CodeGen_GPU_Dev { }; }; +/** A base class for GPU backends that require C-like shader output. + * GPU backends derive from and specialize this class. */ +class CodeGen_GPU_C : public CodeGen_C { +public: + /** OpenCL uses a different syntax than C for immediate vectors. This + enum defines which style should be used by the backend. */ + enum class VectorDeclarationStyle { + CLikeSyntax = 0, + OpenCLSyntax = 1 + }; + + CodeGen_GPU_C(std::ostream &s, Target t) + : CodeGen_C(s, t) { + } + +protected: + using CodeGen_C::visit; + void visit(const Shuffle *op) override; + + VectorDeclarationStyle vector_declaration_style = VectorDeclarationStyle::CLikeSyntax; +}; + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 4cf8443fb5ae..d76ebb708ce5 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -2,7 +2,6 @@ #include #include -#include "CodeGen_C.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Metal_Dev.h" @@ -50,17 +49,17 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev { } protected: - class CodeGen_Metal_C : public CodeGen_C { + class CodeGen_Metal_C : public CodeGen_GPU_C { public: CodeGen_Metal_C(std::ostream &s, const Target &t) - : CodeGen_C(s, t) { + : CodeGen_GPU_C(s, t) { } void add_kernel(const Stmt &stmt, const std::string &name, const std::vector &args); protected: - using CodeGen_C::visit; + using CodeGen_GPU_C::visit; std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override; // Vectors in Metal come in two varieties, regular and packed. // For storage allocations and pointers used in address arithmetic, @@ -93,7 +92,6 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev { void visit(const Free *op) override; void visit(const Cast *op) override; void visit(const Atomic *op) override; - void visit(const Shuffle *op) override; }; std::ostringstream src_stream; @@ -268,7 +266,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const For *loop) { } else { user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside Metal kernel\n"; - CodeGen_C::visit(loop); + CodeGen_GPU_C::visit(loop); } } @@ -322,7 +320,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Call *op) { stream << ");\n"; print_assignment(op->type, "0"); } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } @@ -545,46 +543,6 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Atomic *op) { user_assert(false) << "Atomic updates are not supported inside Metal kernels"; } -void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) { - if (op->type.is_scalar()) { - CodeGen_C::visit(op); - } else { - internal_assert(!op->vectors.empty()); - for (size_t i = 1; i < op->vectors.size(); i++) { - internal_assert(op->vectors[0].type() == op->vectors[i].type()); - } - internal_assert(op->type.lanes() == (int)op->indices.size()); - const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); - for (int i : op->indices) { - internal_assert(i >= 0 && i < max_index); - } - - std::vector vecs; - for (const Expr &v : op->vectors) { - vecs.push_back(print_expr(v)); - } - - string src = vecs[0]; - ostringstream rhs; - // This code has always assumed/required that all the vectors - // have identical types, so let's verify - const Type t0 = op->vectors[0].type(); - for (const auto &v : op->vectors) { - internal_assert(t0 == v.type()); - } - string storage_name = unique_name('_'); - rhs << "{"; - for (int i : op->indices) { - rhs << vecs[i]; - if (i < (int)(op->indices.size() - 1)) { - rhs << ", "; - } - } - rhs << "}"; - print_assignment(op->type, rhs.str()); - } -} - void CodeGen_Metal_Dev::add_kernel(Stmt s, const string &name, const vector &args) { diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 9defbd377a2e..f5a51c88d555 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -4,7 +4,6 @@ #include #include "CSE.h" -#include "CodeGen_C.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_OpenCL_Dev.h" @@ -55,18 +54,19 @@ class CodeGen_OpenCL_Dev : public CodeGen_GPU_Dev { } protected: - class CodeGen_OpenCL_C : public CodeGen_C { + class CodeGen_OpenCL_C : public CodeGen_GPU_C { public: CodeGen_OpenCL_C(std::ostream &s, Target t) - : CodeGen_C(s, t) { + : CodeGen_GPU_C(s, t) { integer_suffix_style = IntegerSuffixStyle::OpenCL; + vector_declaration_style = VectorDeclarationStyle::OpenCLSyntax; } void add_kernel(Stmt stmt, const std::string &name, const std::vector &args); protected: - using CodeGen_C::visit; + using CodeGen_GPU_C::visit; std::string print_type(Type type, AppendSpaceIfNeeded append_space = DoNotAppendSpace) override; std::string print_reinterpret(Type type, const Expr &e) override; std::string print_extern_call(const Call *op) override; @@ -223,7 +223,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) { } else { user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside OpenCL kernel\n"; - CodeGen_C::visit(loop); + CodeGen_GPU_C::visit(loop); } } @@ -351,7 +351,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { print_assignment(op->type, a0 + " >> " + a1); } } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } else if (op->is_intrinsic(Call::image_load)) { // image_load(, , , , , @@ -455,7 +455,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { stream << write_image.str(); } } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } @@ -743,7 +743,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Cast *op) { if (op->type.is_vector()) { print_assignment(op->type, "convert_" + print_type(op->type) + "(" + print_expr(op->value) + ")"); } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } @@ -755,7 +755,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Select *op) { equiv.accept(this); return; } - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Allocate *op) { @@ -864,43 +864,8 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) { rhs << print_expr(op->vectors[0]); rhs << ".s" << op->indices[0]; print_assignment(op->type, rhs.str()); - } else if (op->type.is_scalar()) { - CodeGen_C::visit(op); - } else { - internal_assert(!op->vectors.empty()); - for (size_t i = 1; i < op->vectors.size(); i++) { - internal_assert(op->vectors[0].type() == op->vectors[i].type()); - } - internal_assert(op->type.lanes() == (int)op->indices.size()); - const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); - for (int i : op->indices) { - internal_assert(i >= 0 && i < max_index); - } - - std::vector vecs; - for (const Expr &v : op->vectors) { - vecs.push_back(print_expr(v)); - } - - string src = vecs[0]; - ostringstream rhs; - // This code has always assumed/required that all the vectors - // have identical types, so let's verify - const Type t0 = op->vectors[0].type(); - for (const auto &v : op->vectors) { - internal_assert(t0 == v.type()); - } - string storage_name = unique_name('_'); - rhs << "(" << print_type(op->type) << ")"; - rhs << "("; - for (int i : op->indices) { - rhs << vecs[i]; - if (i < (int)(op->indices.size() - 1)) { - rhs << ", "; - } - } - rhs << ")"; - print_assignment(op->type, rhs.str()); + } else { + CodeGen_GPU_C::visit(op); } } @@ -920,7 +885,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Atomic *op) { // Issue atomic stores. ScopedValue old_emit_atomic_stores(emit_atomic_stores, true); - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } void CodeGen_OpenCL_Dev::add_kernel(Stmt s, From 524dee2357af31b4bb9b09ebba4845516f35e4d4 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Thu, 31 Mar 2022 17:39:30 -0400 Subject: [PATCH 9/9] Extra space removal --- src/CodeGen_OpenCL_Dev.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index f5a51c88d555..bd58806b01e2 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -864,7 +864,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) { rhs << print_expr(op->vectors[0]); rhs << ".s" << op->indices[0]; print_assignment(op->type, rhs.str()); - } else { + } else { CodeGen_GPU_C::visit(op); } }