Fix CUDA int8x4 vectorize (apache#3928)
* Fix int8x4 vectorize

* Fix gpu shared/local memory accumulate

* Add test_shared_memory for int8x4

* Adjust test format

* Fix cpplint
llehtahw authored and wweic committed Sep 16, 2019
1 parent 99235fd commit 983ac9c
Showing 4 changed files with 53 additions and 37 deletions.
13 changes: 11 additions & 2 deletions src/codegen/codegen_cuda.cc
@@ -207,15 +207,24 @@ void CodeGenCUDA::PrintVecElemLoad(
     const std::string& vec, Type t, int i, std::ostream& os) {  // NOLINT(*)
   static const char access[] = {'x', 'y', 'z', 'w'};
   CHECK(i >= 0 && i < 4);
-  os << vec << "." << access[i];
+  if (t.is_int() && t.bits() == 8) {
+    os << "(0x000000ff & (" << vec << " >> " << i * 8 << "))";
+  } else {
+    os << vec << "." << access[i];
+  }
 }
 
 void CodeGenCUDA::PrintVecElemStore(
     const std::string& vec, Type t, int i, const std::string& value) {
   this->PrintIndent();
   static const char access[] = {'x', 'y', 'z', 'w'};
   CHECK(i >= 0 && i < 4);
-  stream << vec << "." << access[i] << " = " << value << ";\n";
+  if (t.is_int() && t.bits() == 8) {
+    stream << vec << "=" << vec << " & ~(0x000000ff << " << i * 8 << ") | ("
+           << value << " << " << i * 8 << ");\n";
+  } else {
+    stream << vec << "." << access[i] << " = " << value << ";\n";
+  }
 }
 
 void CodeGenCUDA::PrintStorageSync(const Call* op) {
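For an int8x4 value the CUDA backend keeps all four lanes packed in a single 32-bit word, so per-lane access becomes shift-and-mask arithmetic instead of the `.x/.y/.z/.w` member access used for other vector types. The snippet below is not part of the patch: it replays the byte-lane arithmetic of the emitted expressions in plain Python (the function names only mirror the codegen methods), assuming, as the generated code does, that the stored value already fits in one byte. Note that in C `&` binds tighter than `|`, so the emitted store parses as `(vec & ~mask) | shifted_value`.

```python
# Not part of the patch: a plain-Python replay of the byte-lane arithmetic that
# the emitted CUDA expressions perform on an int8x4 packed into a 32-bit word.

def vec_elem_load(vec, i):
    # mirrors the emitted load: (0x000000ff & (vec >> i*8))
    return 0x000000ff & (vec >> (i * 8))

def vec_elem_store(vec, i, value):
    # mirrors the emitted store: vec = vec & ~(0x000000ff << i*8) | (value << i*8)
    # i.e. clear the target byte, then OR in the new value shifted into place
    return (vec & ~(0x000000ff << (i * 8)) | (value << (i * 8))) & 0xffffffff

vec = 0
for lane, byte in enumerate([0x11, 0x22, 0x33, 0x44]):
    vec = vec_elem_store(vec, lane, byte)
assert vec == 0x44332211              # lane 0 lives in the low byte
assert vec_elem_load(vec, 2) == 0x33  # lane 2 comes from bits 16..23
```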
4 changes: 2 additions & 2 deletions src/pass/verify_gpu_code.cc
@@ -83,10 +83,10 @@ class GPUCodeVerifier : public IRVisitor {
     // visit an allocation of a buffer in shared memory, record its size
     if (visited_local_buffers_.count(op->buffer_var.get()) != 0) {
       size_t size = static_cast<size_t>(op->constant_allocation_size());
-      local_memory_per_block_ += size * op->type.bytes();
+      local_memory_per_block_ += size * op->type.bytes() * op->type.lanes();
     } else if (visited_shared_buffers_.count(op->buffer_var.get()) != 0) {
       size_t size = static_cast<size_t>(op->constant_allocation_size());
-      shared_memory_per_block_ += size * op->type.bytes();
+      shared_memory_per_block_ += size * op->type.bytes() * op->type.lanes();
     }
   }

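`op->type.bytes()` is the width of a single lane, so for a vectorized allocation the old formula undercounted the buffer by a factor of `lanes()`. A rough restatement of the corrected accounting in plain Python (ad-hoc names, not TVM API), using the int8x4 case exercised by the new test below:

```python
# Ad-hoc illustration (not TVM API): corrected per-buffer accounting is
# element count * per-lane byte width * lane count.
def buffer_bytes(constant_allocation_size, bits, lanes):
    return constant_allocation_size * (bits // 8) * lanes

# A 128-element int8x4 buffer really occupies 512 bytes; counting only the
# per-lane width (1 byte) reported 128 bytes and let kernels slip under a
# max_shared_memory_per_block limit they actually exceed.
assert buffer_bytes(128, 8, 4) == 512
assert buffer_bytes(128, 8, 1) == 128   # scalar int8 buffers are unaffected
```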
1 change: 1 addition & 0 deletions tests/python/unittest/test_codegen_cuda.py
@@ -52,6 +52,7 @@ def check_cuda(dtype, n, lanes):
 
     check_cuda("float32", 64, 2)
     check_cuda("float16", 64, 2)
+    check_cuda("int8", 64, 4)
 
 
 def test_cuda_multiply_add():
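For context, a rough sketch of the kind of vectorized schedule the new `check_cuda("int8", 64, 4)` case pushes through the codegen fix above. The body of `check_cuda` is not shown in this diff, so the helper below is an assumption built from the same pre-0.7 tvm API used elsewhere in this commit, not the test's actual code:

```python
import tvm

def build_int8x4_copy(n=64, lanes=4):
    # An int8 placeholder with 4 lanes lowers to the packed 32-bit handling
    # added in codegen_cuda.cc above.
    A = tvm.placeholder((n,), name='A', dtype='int8x%d' % lanes)
    B = tvm.compute((n,), lambda i: A[i], name='B')
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
    return tvm.build(s, [A, B], "cuda")
```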
72 changes: 39 additions & 33 deletions tests/python/unittest/test_pass_verify_gpu_code.py
@@ -24,39 +24,45 @@ def verify_pass(stmt):
     return verify_pass
 
 def test_shared_memory():
-    N = 1024
-    M = 128
-
-    A = tvm.placeholder((N,), name='A', dtype='float32')
-    B = tvm.compute((N, ), lambda i: A[i], name='B')
-
-    s = tvm.create_schedule([B.op])
-    AA = s.cache_read(A, "shared", [B])
-    o, i = s[B].split(s[B].op.axis[0], M)
-    s[AA].compute_at(s[B], o)
-    s[B].bind(o, tvm.thread_axis("blockIdx.x"))
-    s[B].bind(i, tvm.thread_axis("threadIdx.x"))
-
-    # shared memory usage: M * 4B
-    # thread usage: M
-
-    for target in ['opencl', 'cuda']:
-        if not tvm.context(target).exist:
-            continue
-        valid = [None]
-        with tvm.build_config(**{"add_lower_pass": [
-            (2, get_verify_pass(valid,
-                                max_shared_memory_per_block=4 * M - 1,
-                                max_threads_per_block=M))]}):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-        with tvm.build_config(**{"add_lower_pass": [
-            (2, get_verify_pass(valid,
-                                max_shared_memory_per_block=4 * M,
-                                max_threads_per_block=M))]}):
-            tvm.build(s, [A, B], target)
-        assert valid[0]
+    def check_shared_memory(dtype):
+        N = 1024
+        M = 128
+
+        tvm_type = tvm.datatype._TVMType(dtype)
+        type_size = tvm_type.bits // 8 * tvm_type.lanes
+
+        A = tvm.placeholder((N,), name='A', dtype=dtype)
+        B = tvm.compute((N, ), lambda i: A[i], name='B')
+
+        s = tvm.create_schedule([B.op])
+        AA = s.cache_read(A, "shared", [B])
+        o, i = s[B].split(s[B].op.axis[0], M)
+        s[AA].compute_at(s[B], o)
+        s[B].bind(o, tvm.thread_axis("blockIdx.x"))
+        s[B].bind(i, tvm.thread_axis("threadIdx.x"))
+
+        # shared memory usage: M * sizeof(dtype) Bytes
+        # thread usage: M
+
+        for target in ['opencl', 'cuda']:
+            if not tvm.context(target).exist:
+                continue
+            valid = [None]
+            with tvm.build_config(**{"add_lower_pass": [
+                (2, get_verify_pass(valid,
+                                    max_shared_memory_per_block=type_size * M - 1,
+                                    max_threads_per_block=M))]}):
+                tvm.build(s, [A, B], target)
+            assert not valid[0]
+
+            with tvm.build_config(**{"add_lower_pass": [
+                (2, get_verify_pass(valid,
+                                    max_shared_memory_per_block=type_size * M,
+                                    max_threads_per_block=M))]}):
+                tvm.build(s, [A, B], target)
+            assert valid[0]
+    check_shared_memory('float32')
+    check_shared_memory('int8x4')
 
 def test_local_memory():
     N = 1024
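Both dtypes the test now runs work out to `type_size = 4`, so the assertions pin the same 512-byte boundary; before the verify_gpu_code.cc fix, the int8x4 buffer was counted as only 128 bytes, so the `type_size * M - 1` budget was wrongly accepted. A quick plain-Python restatement of that boundary (arithmetic only, not the test itself):

```python
# Plain-arithmetic restatement of the budgets check_shared_memory asserts on.
M = 128
for bits, lanes in [(32, 1), (8, 4)]:   # float32 and int8x4
    type_size = bits // 8 * lanes       # 4 bytes per element in both cases
    assert type_size * M - 1 == 511     # budget the verifier must reject
    assert type_size * M == 512         # budget the verifier must accept
```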
