Fix GPU depredication/scalarization #6669

Merged: 11 commits, Apr 1, 2022
38 changes: 38 additions & 0 deletions src/CodeGen_D3D12Compute_Dev.cpp
@@ -106,6 +106,7 @@ class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
void visit(const Cast *op) override;
void visit(const Atomic *op) override;
void visit(const FloatImm *op) override;
void visit(const Shuffle *op) override;

Scope<> groupshared_allocations;
};
@@ -957,6 +958,43 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Atomic *op) {
user_assert(false) << "Atomics operations are not supported inside D3D12Compute kernel.\n";
}

void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Shuffle *op) {
if (op->type.is_scalar()) {
CodeGen_C::visit(op);
} else {
internal_assert(!op->vectors.empty());
for (size_t i = 1; i < op->vectors.size(); i++) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
for (int i : op->indices) {
internal_assert(i >= -1 && i < max_index);
}

std::vector<string> vecs;
for (const Expr &v : op->vectors) {
vecs.push_back(print_expr(v));
}

string src = vecs[0];
ostringstream rhs;
// This code has always assumed/required that all the vectors
// have identical types, so let's verify
const Type t0 = op->vectors[0].type();
for (const auto &v : op->vectors) {
internal_assert(t0 == v.type());
}
string storage_name = unique_name('_');
rhs << "{";
for (int i : op->indices) {
rhs << vecs[i] << ",";
}
rhs << "}";
print_assignment(op->type, rhs.str());
}
}
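
As context for the index check in the visitor above: a Shuffle's indices address the lanes of the concatenation of all its input vectors, and the check also tolerates -1 (by analogy with LLVM's shufflevector, where -1 marks a don't-care lane). A minimal sketch using Halide's IR API, with hypothetical variable names, just to make the numbering concrete:

#include "Halide.h"

using namespace Halide;
using namespace Halide::Internal;

void shuffle_index_example() {
    // Two 4-lane vectors: lanes of 'a' occupy concatenated indices 0..3,
    // lanes of 'b' occupy indices 4..7.
    Expr a = Variable::make(Float(32, 4), "a");
    Expr b = Variable::make(Float(32, 4), "b");

    // A 2-lane shuffle selecting lane 2 of 'a' and lane 1 of 'b' (index 5):
    Expr s = Shuffle::make({a, b}, {2, 5});

    // In the visitor above, max_index would be 4 * 2 = 8, so both indices
    // satisfy "i >= -1 && i < max_index".
}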

void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const FloatImm *op) {
// TODO(marcos): just a pass-through for now, but we might consider doing
// something different, such as adding the suffic 'u' to the integer that
26 changes: 18 additions & 8 deletions src/CodeGen_GPU_Dev.cpp
@@ -116,8 +116,7 @@ class ScalarizePredicatedLoadStore : public IRMutator {
mutate(extract_lane(s->index, ln)),
s->param,
const_true(),
// TODO: alignment needs to be changed
s->alignment)));
s->alignment + ln)));
}
return Block::make(scalar_stmts);
} else {
@@ -127,12 +126,23 @@

Expr visit(const Load *op) override {
if (!is_const_one(op->predicate)) {
Expr load_expr = Load::make(op->type, op->name, op->index, op->image,
op->param, const_true(op->type.lanes()), op->alignment);
Expr pred_load = Call::make(load_expr.type(),
Call::if_then_else,
{op->predicate, load_expr},
Internal::Call::PureIntrinsic);
std::vector<Expr> lane_values;
for (int ln = 0; ln < op->type.lanes(); ln++) {
Expr load_expr = Load::make(op->type.element_of(),
op->name,
extract_lane(op->index, ln),
op->image,
op->param,
const_true(),
op->alignment + ln);
lane_values.push_back(Call::make(load_expr.type(),
Call::if_then_else,
{extract_lane(op->predicate, ln),
load_expr,
make_zero(op->type.element_of())},
Internal::Call::PureIntrinsic));
}
Expr pred_load = Shuffle::make_concat(lane_values);
return pred_load;
} else {
return op;
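
A note on the alignment change above (which replaces the old "TODO: alignment needs to be changed"): a load or store's alignment field is a ModulusRemainder describing what the index is known to be congruent to, so lane ln of a vector access is simply offset by ln from that. A small sketch of the arithmetic, assuming the integer overload of operator+ that the diff itself relies on:

#include "Halide.h"

using Halide::Internal::ModulusRemainder;

void per_lane_alignment_example() {
    // Suppose the vector access's index is known to be a multiple of 4.
    ModulusRemainder align(4, 0);

    // Lane 2 accesses index + 2, so its alignment shifts to "2 modulo 4" --
    // this is what "op->alignment + ln" computes in the mutator above.
    ModulusRemainder lane2 = align + 2;
    // Expected: lane2.modulus == 4, lane2.remainder == 2.
}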
38 changes: 38 additions & 0 deletions src/CodeGen_Metal_Dev.cpp
@@ -93,6 +93,7 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
void visit(const Free *op) override;
void visit(const Cast *op) override;
void visit(const Atomic *op) override;
void visit(const Shuffle *op) override;
};

std::ostringstream src_stream;
@@ -544,6 +545,43 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Atomic *op) {
user_assert(false) << "Atomic updates are not supported inside Metal kernels";
}

void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Shuffle *op) {
if (op->type.is_scalar()) {
CodeGen_C::visit(op);
} else {
internal_assert(!op->vectors.empty());
for (size_t i = 1; i < op->vectors.size(); i++) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
for (int i : op->indices) {
internal_assert(i >= -1 && i < max_index);
}

std::vector<string> vecs;
for (const Expr &v : op->vectors) {
vecs.push_back(print_expr(v));
}

string src = vecs[0];
ostringstream rhs;
// This code has always assumed/required that all the vectors
// have identical types, so let's verify
const Type t0 = op->vectors[0].type();
for (const auto &v : op->vectors) {
internal_assert(t0 == v.type());
}
string storage_name = unique_name('_');
rhs << "{";
for (int i : op->indices) {
rhs << vecs[i] << ",";
}
rhs << "}";
print_assignment(op->type, rhs.str());
}
}

void CodeGen_Metal_Dev::add_kernel(Stmt s,
const string &name,
const vector<DeviceArgument> &args) {
50 changes: 49 additions & 1 deletion src/CodeGen_OpenCL_Dev.cpp
@@ -858,8 +858,49 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Shuffle *op) {
}
stream << ");\n";
}
} else if (op->is_extract_element()) {
// OpenCL requires using .s<n> format for extracting an element
ostringstream rhs;
rhs << print_expr(op->vectors[0]);
rhs << ".s" << op->indices[0];
print_assignment(op->type, rhs.str());
} else if (op->type.is_scalar()) {
CodeGen_C::visit(op);
} else {
internal_error << "Shuffle not implemented.\n";
internal_assert(!op->vectors.empty());
[Review thread on this hunk]
Member: This logic seems to be repeated three times. Is it really so different across the GPU backends that none of it can usefully be in the base class?

Author (@shoaibkamil, Mar 31, 2022): I considered adding a base class for the various CodeGen_<x>_Dev::CodeGen_<x>_C classes. Right now each of them is derived from CodeGen_C. However, I decided not to do that because it felt like a larger refactoring was necessary -- we probably could consolidate a lot of the logic into a superclass, since HLSL/OpenCL C/Metal have a lot of overlap. I'm happy to move this specific functionality into a superclass if you think that's a good idea.

Member: I was wondering if it could go into codegen_c, but I see that the shuffle handling there relies on a helper class. I think it's time to add a CodeGen_GPU_C intermediate class and we can slowly move stuff into it over time.
[End review thread]

for (size_t i = 1; i < op->vectors.size(); i++) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
for (int i : op->indices) {
internal_assert(i >= -1 && i < max_index);
}

std::vector<string> vecs;
for (const Expr &v : op->vectors) {
vecs.push_back(print_expr(v));
}

string src = vecs[0];
ostringstream rhs;
// This code has always assumed/required that all the vectors
// have identical types, so let's verify
const Type t0 = op->vectors[0].type();
for (const auto &v : op->vectors) {
internal_assert(t0 == v.type());
}
string storage_name = unique_name('_');
rhs << "(" << print_type(op->type) << ")";
rhs << "(";
for (int i : op->indices) {
rhs << vecs[i];
if (i < (int)(op->indices.size() - 1)) {
rhs << ", ";
}
}
rhs << ")";
print_assignment(op->type, rhs.str());
}
}
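
For reference, both new OpenCL branches map onto standard OpenCL C syntax: element extraction uses the .s<n> component selector (e.g. float _t = _v.s3;), and the general case emits a vector literal such as (float4)(...). A small sketch of the string the general branch assembles, with made-up lane expressions, and with the separating comma keyed off the loop position:

#include <sstream>
#include <string>
#include <vector>

// Illustrative only: the shape of the RHS the general shuffle branch builds,
// e.g. "(float4)(_a, _b, _c, _d)" for a concat of four scalar lanes.
std::string opencl_vector_literal(const std::string &type_str,
                                  const std::vector<std::string> &lanes) {
    std::ostringstream rhs;
    rhs << "(" << type_str << ")(";
    for (size_t i = 0; i < lanes.size(); i++) {
        rhs << lanes[i];
        if (i + 1 < lanes.size()) {
            rhs << ", ";
        }
    }
    rhs << ")";
    return rhs.str();
}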

@@ -926,6 +967,13 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s,
debug(2) << "After eliminating bool vectors:\n"
<< s << "\n";

// We need to scalarize/de-predicate any loads/stores, since OpenCL does not
// support predication.
s = scalarize_predicated_loads_stores(s);

debug(2) << "After removing predication: \n"
<< s;

// Figure out which arguments should be passed in __constant.
// Such arguments should be:
// - not written to,
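
Since OpenCL C has no predication for vector loads and stores, the new scalarize_predicated_loads_stores call rewrites any predicated accesses before codegen. A hedged reproduction sketch -- it assumes an OpenCL-capable host and uses HL_DEBUG_CODEGEN=2, Halide's usual switch for the debug(2) dumps added above -- showing the kind of schedule that exercises this path:

#include "Halide.h"

using namespace Halide;

int main() {
    // GuardWithIf on the vectorized split leaves a vector predicate on the
    // tail's loads/stores inside the GPU thread loop.
    ImageParam in(Float(32), 1);
    Func g("g");
    Var x("x"), xo("xo"), xi("xi");
    g(x) = in(x) * 2.0f;

    Target t = get_host_target().with_feature(Target::OpenCL);
    g.gpu_tile(x, xo, xi, 10, TailStrategy::GuardWithIf)
        .vectorize(xi, 4, TailStrategy::GuardWithIf);

    Buffer<float> data(30);
    data.fill(1.0f);
    in.set(data);

    // Run with HL_DEBUG_CODEGEN=2 to see the "After removing predication:"
    // output from the pass added above.
    g.realize({30}, t);
    return 0;
}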
36 changes: 36 additions & 0 deletions test/correctness/gpu_vectorize.cpp
@@ -70,6 +70,42 @@ int main(int argc, char **argv) {
}
}
}
{
Var x("x"), y("y"), xi("xi"), yi("yi");
Func f("f");
ImageParam im(Float(32), 2);

printf("Defining function...\n");

f(x, y) = select(im(x, y) > 32.0f, 1.0f, -1.0f) + im(x, y);

Target target = get_jit_target_from_environment();
if (target.has_gpu_feature()) {
f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::GuardWithIf).vectorize(xi, 4, TailStrategy::GuardWithIf);
}

printf("Realizing function...\n");
Buffer<float> input_img(32, 32);
for (int i = 0; i < 32; i++) {
for (int j = 0; j < 32; j++) {
input_img(i, j) = i + j;
}
}
im.set(input_img);

Buffer<float> imf = f.realize({32, 32}, target);

// Check the result was what we expected
for (int i = 0; i < 32; i++) {
for (int j = 0; j < 32; j++) {
float correct = (i + j > 32 ? 1.0f : -1.0f) + i + j;
if (fabs(imf(i, j) - correct) > 0.001f) {
printf("imf[%d, %d] = %f instead of %f\n", i, j, imf(i, j), correct);
return -1;
}
}
}
}

printf("Success!\n");
return 0;