diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index b1e99626e53b..41fa6ecc2660 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -3,7 +3,6 @@ #include #include -#include "CodeGen_C.h" #include "CodeGen_D3D12Compute_Dev.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" @@ -62,10 +61,10 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev { protected: friend struct StoragePackUnpack; - class CodeGen_D3D12Compute_C : public CodeGen_C { + class CodeGen_D3D12Compute_C : public CodeGen_GPU_C { public: CodeGen_D3D12Compute_C(std::ostream &s, const Target &t) - : CodeGen_C(s, t) { + : CodeGen_GPU_C(s, t) { integer_suffix_style = IntegerSuffixStyle::HLSL; } void add_kernel(Stmt stmt, @@ -88,7 +87,7 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev { std::string print_assignment(Type t, const std::string &rhs) override; - using CodeGen_C::visit; + using CodeGen_GPU_C::visit; void visit(const Evaluate *op) override; void visit(const Min *) override; void visit(const Max *) override; @@ -303,7 +302,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const For *loop) { if (!is_gpu_var(loop->name)) { user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside D3D12Compute kernel\n"; - CodeGen_C::visit(loop); + CodeGen_GPU_C::visit(loop); return; } @@ -380,7 +379,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Call *op) { // directly. 
stream << "pow(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")"; } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } @@ -815,7 +814,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Free *op) { string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_assignment(Type type, const string &rhs) { string rhs_modified = print_reinforced_cast(type, rhs); - return CodeGen_C::print_assignment(type, rhs_modified); + return CodeGen_GPU_C::print_assignment(type, rhs_modified); } string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_vanilla_cast(Type type, const string &value_expr) { @@ -964,7 +963,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op) // have seen division-by-zero shader warnings, and we postulated that it // could be indirectly related to compiler assumptions on signed integer // overflow when float_from_bits() is called, but we don't know for sure - return CodeGen_C::visit(op); + return CodeGen_GPU_C::visit(op); } void CodeGen_D3D12Compute_Dev::add_kernel(Stmt s, diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp index 1f29f01fc1ad..3606ae41e3c9 100644 --- a/src/CodeGen_GPU_Dev.cpp +++ b/src/CodeGen_GPU_Dev.cpp @@ -116,8 +116,7 @@ class ScalarizePredicatedLoadStore : public IRMutator { mutate(extract_lane(s->index, ln)), s->param, const_true(), - // TODO: alignment needs to be changed - s->alignment))); + s->alignment + ln))); } return Block::make(scalar_stmts); } else { @@ -127,12 +126,23 @@ class ScalarizePredicatedLoadStore : public IRMutator { Expr visit(const Load *op) override { if (!is_const_one(op->predicate)) { - Expr load_expr = Load::make(op->type, op->name, op->index, op->image, - op->param, const_true(op->type.lanes()), op->alignment); - Expr pred_load = Call::make(load_expr.type(), - Call::if_then_else, - {op->predicate, load_expr}, - Internal::Call::PureIntrinsic); + std::vector lane_values; + for (int ln = 0; ln < 
op->type.lanes(); ln++) { + Expr load_expr = Load::make(op->type.element_of(), + op->name, + extract_lane(op->index, ln), + op->image, + op->param, + const_true(), + op->alignment + ln); + lane_values.push_back(Call::make(load_expr.type(), + Call::if_then_else, + {extract_lane(op->predicate, ln), + load_expr, + make_zero(op->type.element_of())}, + Internal::Call::PureIntrinsic)); + } + Expr pred_load = Shuffle::make_concat(lane_values); return pred_load; } else { return op; @@ -147,5 +157,50 @@ Stmt CodeGen_GPU_Dev::scalarize_predicated_loads_stores(Stmt &s) { return sps.mutate(s); } +// Print a vector-typed Shuffle of scalar sources (e.g. Shuffle::make_concat of +// per-lane values) as a vector literal. Scalar-typed shuffles defer to CodeGen_C. +void CodeGen_GPU_C::visit(const Shuffle *op) { + if (op->type.is_scalar()) { + CodeGen_C::visit(op); + } else { + internal_assert(!op->vectors.empty()); + for (size_t i = 1; i < op->vectors.size(); i++) { + internal_assert(op->vectors[0].type() == op->vectors[i].type()); + } + internal_assert(op->type.lanes() == (int)op->indices.size()); + // Each index selects a whole entry of vecs below, so the sources must be scalars. + internal_assert(op->vectors[0].type().is_scalar()); + const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); + for (int i : op->indices) { + internal_assert(i >= 0 && i < max_index); + } + + std::vector<std::string> vecs; + for (const Expr &v : op->vectors) { + vecs.push_back(print_expr(v)); + } + + std::ostringstream rhs; + if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) { + rhs << "(" << print_type(op->type) << ")("; + } else { + rhs << "{"; + } + // Place separators by position in the index list, not by index value. + for (size_t k = 0; k < op->indices.size(); k++) { + if (k > 0) { + rhs << ", "; + } + rhs << vecs[op->indices[k]]; + } + if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) { + rhs << ")"; + } else { + rhs << "}"; + } + print_assignment(op->type, rhs.str()); + } +} + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_GPU_Dev.h b/src/CodeGen_GPU_Dev.h index 2b516fd62d13..dfbd1f58c49b 100644 --- a/src/CodeGen_GPU_Dev.h +++ b/src/CodeGen_GPU_Dev.h @@ -7,6 +7,7 @@ #include #include +#include "CodeGen_C.h" #include
"DeviceArgument.h" #include "Expr.h" @@ -73,8 +74,8 @@ struct CodeGen_GPU_Dev { static Stmt scalarize_predicated_loads_stores(Stmt &s); /** An mask describing which type of memory fence to use for the gpu_thread_barrier() - * intrinsic. Not all GPUs APIs support all types. - */ + * intrinsic. Not all GPUs APIs support all types. + */ enum MemoryFenceType { None = 0, // No fence required (just a sync) Device = 1, // Device/global memory fence @@ -82,6 +83,28 @@ struct CodeGen_GPU_Dev { }; }; +/** A base class for GPU backends that require C-like shader output. + * GPU backends derive from and specialize this class. */ +class CodeGen_GPU_C : public CodeGen_C { +public: + /** OpenCL uses a different syntax than C for immediate vectors. This + enum defines which style should be used by the backend. */ + enum class VectorDeclarationStyle { + CLikeSyntax = 0, + OpenCLSyntax = 1 + }; + + CodeGen_GPU_C(std::ostream &s, Target t) + : CodeGen_C(s, t) { + } + +protected: + using CodeGen_C::visit; + void visit(const Shuffle *op) override; + + VectorDeclarationStyle vector_declaration_style = VectorDeclarationStyle::CLikeSyntax; +}; + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 6089d84b9d3a..d76ebb708ce5 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -2,7 +2,6 @@ #include #include -#include "CodeGen_C.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Metal_Dev.h" @@ -50,17 +49,17 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev { } protected: - class CodeGen_Metal_C : public CodeGen_C { + class CodeGen_Metal_C : public CodeGen_GPU_C { public: CodeGen_Metal_C(std::ostream &s, const Target &t) - : CodeGen_C(s, t) { + : CodeGen_GPU_C(s, t) { } void add_kernel(const Stmt &stmt, const std::string &name, const std::vector &args); protected: - using CodeGen_C::visit; + using CodeGen_GPU_C::visit; std::string print_type(Type type, AppendSpaceIfNeeded 
space_option = DoNotAppendSpace) override; // Vectors in Metal come in two varieties, regular and packed. // For storage allocations and pointers used in address arithmetic, @@ -267,7 +266,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const For *loop) { } else { user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside Metal kernel\n"; - CodeGen_C::visit(loop); + CodeGen_GPU_C::visit(loop); } } @@ -321,7 +320,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Call *op) { stream << ");\n"; print_assignment(op->type, "0"); } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 539a0699b909..bd58806b01e2 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -4,7 +4,6 @@ #include #include "CSE.h" -#include "CodeGen_C.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_OpenCL_Dev.h" @@ -55,18 +54,19 @@ class CodeGen_OpenCL_Dev : public CodeGen_GPU_Dev { } protected: - class CodeGen_OpenCL_C : public CodeGen_C { + class CodeGen_OpenCL_C : public CodeGen_GPU_C { public: CodeGen_OpenCL_C(std::ostream &s, Target t) - : CodeGen_C(s, t) { + : CodeGen_GPU_C(s, t) { integer_suffix_style = IntegerSuffixStyle::OpenCL; + vector_declaration_style = VectorDeclarationStyle::OpenCLSyntax; } void add_kernel(Stmt stmt, const std::string &name, const std::vector &args); protected: - using CodeGen_C::visit; + using CodeGen_GPU_C::visit; std::string print_type(Type type, AppendSpaceIfNeeded append_space = DoNotAppendSpace) override; std::string print_reinterpret(Type type, const Expr &e) override; std::string print_extern_call(const Call *op) override; @@ -223,7 +223,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) { } else { user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside OpenCL kernel\n"; - CodeGen_C::visit(loop); + CodeGen_GPU_C::visit(loop); } } @@ 
-351,7 +351,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { print_assignment(op->type, a0 + " >> " + a1); } } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } else if (op->is_intrinsic(Call::image_load)) { // image_load(, , , , , @@ -455,7 +455,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { stream << write_image.str(); } } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } @@ -743,7 +743,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Cast *op) { if (op->type.is_vector()) { print_assignment(op->type, "convert_" + print_type(op->type) + "(" + print_expr(op->value) + ")"); } else { - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } } @@ -755,7 +755,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Select *op) { equiv.accept(this); return; } - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Allocate *op) { @@ -858,8 +858,14 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) { } stream << ");\n"; } + } else if (op->is_extract_element()) { + // OpenCL requires using .s format for extracting an element + ostringstream rhs; + rhs << print_expr(op->vectors[0]); + rhs << ".s" << op->indices[0]; + print_assignment(op->type, rhs.str()); } else { - internal_error << "Shuffle not implemented.\n"; + CodeGen_GPU_C::visit(op); } } @@ -879,7 +885,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Atomic *op) { // Issue atomic stores. ScopedValue old_emit_atomic_stores(emit_atomic_stores, true); - CodeGen_C::visit(op); + CodeGen_GPU_C::visit(op); } void CodeGen_OpenCL_Dev::add_kernel(Stmt s, @@ -926,6 +932,13 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s, debug(2) << "After eliminating bool vectors:\n" << s << "\n"; + // We need to scalarize/de-predicate any loads/stores, since OpenCL does not + // support predication. 
+ s = scalarize_predicated_loads_stores(s); + + debug(2) << "After removing predication: \n" + << s; + // Figure out which arguments should be passed in __constant. // Such arguments should be: // - not written to, diff --git a/test/correctness/gpu_vectorize.cpp b/test/correctness/gpu_vectorize.cpp index 407342cca1af..2e0ffeebc3b5 100644 --- a/test/correctness/gpu_vectorize.cpp +++ b/test/correctness/gpu_vectorize.cpp @@ -70,6 +70,42 @@ int main(int argc, char **argv) { } } } + { + Var x("x"), y("y"), xi("xi"), yi("yi"); + Func f("f"); + ImageParam im(Float(32), 2); + + printf("Defining function...\n"); + + f(x, y) = select(im(x, y) > 32.0f, 1.0f, -1.0f) + im(x, y); + + Target target = get_jit_target_from_environment(); + if (target.has_gpu_feature()) { + f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::GuardWithIf).vectorize(xi, 4, TailStrategy::GuardWithIf); + } + + printf("Realizing function...\n"); + Buffer input_img(32, 32); + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { + input_img(i, j) = i + j; + } + } + im.set(input_img); + + Buffer imf = f.realize({32, 32}, target); + + // Check the result was what we expected + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { + float correct = (i + j > 32 ? 1.0f : -1.0f) + i + j; + if (fabs(imf(i, j) - correct) > 0.001f) { + printf("imf[%d, %d] = %f instead of %f\n", i, j, imf(i, j), correct); + return -1; + } + } + } + } printf("Success!\n"); return 0;