
Fix GPU depredication/scalarization #6669

Merged: 11 commits, Apr 1, 2022
38 changes: 38 additions & 0 deletions src/CodeGen_D3D12Compute_Dev.cpp
@@ -106,6 +106,7 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
void visit(const Cast *op) override;
void visit(const Atomic *op) override;
void visit(const FloatImm *op) override;
void visit(const Shuffle *op) override;

Scope<> groupshared_allocations;
};
@@ -957,6 +958,43 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Atomic *op) {
user_assert(false) << "Atomics operations are not supported inside D3D12Compute kernel.\n";
}

void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op) {
if (op->type.is_scalar()) {
CodeGen_C::visit(op);
} else {
internal_assert(!op->vectors.empty());
for (size_t i = 1; i < op->vectors.size(); i++) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
for (int i : op->indices) {
internal_assert(i >= -1 && i < max_index);
}

std::vector<string> vecs;
for (const Expr &v : op->vectors) {
vecs.push_back(print_expr(v));
}

string src = vecs[0];
ostringstream rhs;
// This code has always assumed/required that all the vectors
// have identical types, so let's verify
const Type t0 = op->vectors[0].type();
for (const auto &v : op->vectors) {
internal_assert(t0 == v.type());
}
string storage_name = unique_name('_');
rhs << "{";
for (int i : op->indices) {
rhs << vecs[i] << ",";
}
rhs << "}";
print_assignment(op->type, rhs.str());
}
}

void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op) {
// TODO(marcos): just a pass-through for now, but we might consider doing
// something different, such as adding the suffix 'u' to the integer that
26 changes: 18 additions & 8 deletions src/CodeGen_GPU_Dev.cpp
@@ -116,8 +116,7 @@ class ScalarizePredicatedLoadStore : public IRMutator {
mutate(extract_lane(s->index, ln)),
s->param,
const_true(),
- // TODO: alignment needs to be changed
- s->alignment)));
s->alignment + ln)));
}
return Block::make(scalar_stmts);
} else {
@@ -127,12 +126,23 @@

Expr visit(const Load *op) override {
if (!is_const_one(op->predicate)) {
- Expr load_expr = Load::make(op->type, op->name, op->index, op->image,
- op->param, const_true(op->type.lanes()), op->alignment);
- Expr pred_load = Call::make(load_expr.type(),
- Call::if_then_else,
- {op->predicate, load_expr},
- Internal::Call::PureIntrinsic);
std::vector<Expr> lane_values;
for (int ln = 0; ln < op->type.lanes(); ln++) {
Expr load_expr = Load::make(op->type.element_of(),
op->name,
extract_lane(op->index, ln),
op->image,
op->param,
const_true(),
op->alignment + ln);
lane_values.push_back(Call::make(load_expr.type(),
Call::if_then_else,
{extract_lane(op->predicate, ln),
load_expr,
make_zero(op->type.element_of())},
Internal::Call::PureIntrinsic));
}
Expr pred_load = Shuffle::make_concat(lane_values);
return pred_load;
} else {
return op;
38 changes: 38 additions & 0 deletions src/CodeGen_Metal_Dev.cpp
@@ -93,6 +93,7 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
void visit(const Free *op) override;
void visit(const Cast *op) override;
void visit(const Atomic *op) override;
void visit(const Shuffle *op) override;
};

std::ostringstream src_stream;
@@ -544,6 +545,43 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Atomic *op) {
user_assert(false) << "Atomic updates are not supported inside Metal kernels";
}

void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) {
if (op->type.is_scalar()) {
CodeGen_C::visit(op);
} else {
internal_assert(!op->vectors.empty());
for (size_t i = 1; i < op->vectors.size(); i++) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
for (int i : op->indices) {
internal_assert(i >= -1 && i < max_index);
}

std::vector<string> vecs;
for (const Expr &v : op->vectors) {
vecs.push_back(print_expr(v));
}

string src = vecs[0];
ostringstream rhs;
// This code has always assumed/required that all the vectors
// have identical types, so let's verify
const Type t0 = op->vectors[0].type();
for (const auto &v : op->vectors) {
internal_assert(t0 == v.type());
}
string storage_name = unique_name('_');
rhs << "{";
for (int i : op->indices) {
rhs << vecs[i] << ",";
}
rhs << "}";
print_assignment(op->type, rhs.str());
}
}

void CodeGen_Metal_Dev::add_kernel(Stmt s,
const string &name,
const vector<DeviceArgument> &args) {
51 changes: 50 additions & 1 deletion src/CodeGen_OpenCL_Dev.cpp
@@ -858,8 +858,50 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
}
stream << ");\n";
}
} else if (op->is_extract_element()) {
// OpenCL requires using .s<n> format for extracting an element
ostringstream rhs;
rhs << print_expr(op->vectors[0]);
rhs << ".s" << op->indices[0];
print_assignment(op->type, rhs.str());
} else if (op->type.is_scalar()) {
CodeGen_C::visit(op);
} else {
- internal_error << "Shuffle not implemented.\n";
internal_assert(!op->vectors.empty());
[Review thread on this line]

Member: This logic seems to be repeated three times. Is it really so different across the GPU backends that none of it can usefully be in the base class?

shoaibkamil (Contributor, Author) on Mar 31, 2022: I considered adding a base class for the various CodeGen_<x>_Dev::CodeGen_<x>_C classes. Right now each of them is derived from CodeGen_C. However, I decided not to do that because it felt like a larger refactoring was necessary; we probably could consolidate a lot of the logic into a superclass, since HLSL/OpenCL C/Metal have a lot of overlap. I'm happy to move this specific functionality into a superclass if you think that's a good idea.

Member: I was wondering if it could go into codegen_c, but I see that the shuffle handling there relies on a helper class. I think it's time to add a CodeGen_GPU_C intermediate class and we can slowly move stuff into it over time.
for (size_t i = 1; i < op->vectors.size(); i++) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
for (int i : op->indices) {
internal_assert(i >= -1 && i < max_index);
}

std::vector<string> vecs;
for (const Expr &v : op->vectors) {
vecs.push_back(print_expr(v));
}

string src = vecs[0];
ostringstream rhs;
// This code has always assumed/required that all the vectors
// have identical types, so let's verify
const Type t0 = op->vectors[0].type();
for (const auto &v : op->vectors) {
internal_assert(t0 == v.type());
}
string storage_name = unique_name('_');
rhs << "(" << print_type(op->type) << ")";
rhs << "(";
for (int i : op->indices) {
rhs << vecs[i];
if (i < (int)(op->indices.size() - 1)) {
rhs << ", ";
}
}
rhs << ")";
print_assignment(op->type, rhs.str());

}
}
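Editorial note on the review thread above: the three backend implementations of visit(const Shuffle *) differ mostly in how a vector literal is spelled. Below is a minimal, self-contained C++ sketch of the shape the reviewer suggests for a shared intermediate class, with the common lane-selection logic in a base class and only the literal syntax left to each backend. All class and function names here are invented for illustration and are not part of this PR or of Halide.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical sketch: shared shuffle-emission logic in a common GPU codegen
// base class; each backend only overrides how a vector literal is written.
class GpuShuffleEmitter {
public:
    virtual ~GpuShuffleEmitter() = default;

    // Shared logic: pick the requested lanes, then delegate formatting.
    std::string emit_shuffle(const std::vector<std::string> &lane_exprs,
                             const std::vector<int> &indices,
                             const std::string &type_name) const {
        std::vector<std::string> selected;
        for (int i : indices) {
            selected.push_back(lane_exprs.at(i));  // at() traps bad indices
        }
        return vector_literal(type_name, selected);
    }

protected:
    virtual std::string vector_literal(const std::string &type_name,
                                       const std::vector<std::string> &elems) const = 0;
};

// Metal/HLSL-style: braced initializer list, e.g. {_a, _b, _c, _d}
class BraceInitEmitter : public GpuShuffleEmitter {
protected:
    std::string vector_literal(const std::string &,
                               const std::vector<std::string> &elems) const override {
        std::ostringstream os;
        os << "{";
        for (size_t i = 0; i < elems.size(); i++) {
            os << (i ? ", " : "") << elems[i];
        }
        os << "}";
        return os.str();
    }
};

// OpenCL-style: cast plus parenthesized list, e.g. (float4)(_a, _b, _c, _d)
class CastCtorEmitter : public GpuShuffleEmitter {
protected:
    std::string vector_literal(const std::string &type_name,
                               const std::vector<std::string> &elems) const override {
        std::ostringstream os;
        os << "(" << type_name << ")(";
        for (size_t i = 0; i < elems.size(); i++) {
            os << (i ? ", " : "") << elems[i];
        }
        os << ")";
        return os.str();
    }
};

int main() {
    const std::vector<std::string> lanes = {"_a", "_b", "_c", "_d"};
    const std::vector<int> idx = {0, 2, 1, 3};
    std::cout << BraceInitEmitter().emit_shuffle(lanes, idx, "float4") << "\n";  // {_a, _c, _b, _d}
    std::cout << CastCtorEmitter().emit_shuffle(lanes, idx, "float4") << "\n";   // (float4)(_a, _c, _b, _d)
    return 0;
}

The real refactor would presumably fold this into the existing CodeGen_C hierarchy (the CodeGen_GPU_C class the reviewer mentions) rather than a free-standing type.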

@@ -926,6 +968,13 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s,
debug(2) << "After eliminating bool vectors:\n"
<< s << "\n";

// We need to scalarize/de-predicate any loads/stores, since OpenCL does not
// support predication.
s = scalarize_predicated_loads_stores(s);

debug(2) << "After removing predication: \n"
<< s;

// Figure out which arguments should be passed in __constant.
// Such arguments should be:
// - not written to,
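Editorial note on the scalarization pass called above: because OpenCL C has no predicated vector loads or stores, scalarize_predicated_loads_stores rewrites a predicated vector load into one guarded scalar load per lane and concatenates the results, as in the ScalarizePredicatedLoadStore hunk earlier in this diff. Below is a minimal, standalone C++ sketch of the resulting semantics; the function and variable names are invented for illustration, and the real pass operates on Halide IR, not on plain arrays.

#include <array>
#include <cstdio>

// Hypothetical illustration: a predicated 4-wide load expressed as per-lane
// guarded scalar loads. Masked-off lanes never touch memory and are filled
// with zero, mirroring the per-lane if_then_else built by the pass.
template<int Lanes>
std::array<float, Lanes> predicated_load(const float *buf,
                                         const std::array<int, Lanes> &index,
                                         const std::array<bool, Lanes> &pred) {
    std::array<float, Lanes> out{};
    for (int ln = 0; ln < Lanes; ln++) {
        out[ln] = pred[ln] ? buf[index[ln]] : 0.0f;  // guarded scalar load
    }
    return out;  // corresponds to concatenating the lane values back into a vector
}

int main() {
    const float buf[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    const std::array<int, 4> index = {0, 2, 4, 9999};            // last index out of range...
    const std::array<bool, 4> pred = {true, true, true, false};  // ...but masked off
    const std::array<float, 4> v = predicated_load<4>(buf, index, pred);
    std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);  // prints: 10 12 14 0
    return 0;
}

Predicated stores are handled analogously in the pass: each lane's store is guarded by that lane of the predicate, with the per-lane alignment (s->alignment + ln) visible in the store hunk above.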
36 changes: 36 additions & 0 deletions test/correctness/gpu_vectorize.cpp
@@ -70,6 +70,42 @@ int main(int argc, char **argv) {
}
}
}
{
Var x("x"), y("y"), xi("xi"), yi("yi");
Func f("f");
ImageParam im(Float(32), 2);

printf("Defining function...\n");

f(x, y) = select(im(x, y) > 32.0f, 1.0f, -1.0f) + im(x, y);

Target target = get_jit_target_from_environment();
if (target.has_gpu_feature()) {
f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::GuardWithIf).vectorize(xi, 4, TailStrategy::GuardWithIf);
}

printf("Realizing function...\n");
Buffer<float> input_img(32, 32);
for (int i = 0; i < 32; i++) {
for (int j = 0; j < 32; j++) {
input_img(i, j) = i + j;
}
}
im.set(input_img);

Buffer<float> imf = f.realize({32, 32}, target);

// Check the result was what we expected
for (int i = 0; i < 32; i++) {
for (int j = 0; j < 32; j++) {
float correct = (i + j > 32 ? 1.0f : -1.0f) + i + j;
if (fabs(imf(i, j) - correct) > 0.001f) {
printf("imf[%d, %d] = %f instead of %f\n", i, j, imf(i, j), correct);
return -1;
}
}
}
}

printf("Success!\n");
return 0;