

Revert "Added tensorization for avx2 based gemm. (apache#3982)" (apache#4007)

This reverts commit 23727eb.
tqchen authored and wweic committed Sep 30, 2019
1 parent 533d66e commit c6d4fea
Showing 2 changed files with 0 additions and 188 deletions.
94 changes: 0 additions & 94 deletions tests/python/contrib/test_gemm_avx2_acc32.py

This file was deleted.

94 changes: 0 additions & 94 deletions topi/python/topi/x86/tensor_intrin.py
@@ -275,97 +275,3 @@ def _instr(index):

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data: a_buffer, kernel: b_buffer})


def dot_1x4x16_int8_int8_int32_avx2():
    """
    Int8 dot product by every 4 elements using x86 AVX2 instructions.

    This function takes two arrays of int8 datatype -- data[4] and
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernel, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.

    .. code-block:: c

        void dot_1x4x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
                                        int32 output[16]){
            for (int i = 0; i < 16; i++){
                output[i] = 0;
                for (int k = 0; k < 4; k++){
                    output[i] += data[k] * kernel[i][k];
                }
            }
        }

    Physically, the kernel array sits in two AVX2 vector registers and
    data[4] is broadcast to another AVX2 vector register. This function
    returns a TensorIntrin that can be used to tensorize a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The AVX2 int8 TensorIntrin that can be used in a tensorized schedule.
    """

    int32_lanes = 16  # 16 int32 output lanes (two 256-bit AVX2 registers)
    num_int8_elements = 4  # 4 int8 elements in an int32
    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
    kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel')
    k = tvm.reduce_axis((0, num_int8_elements), name='k')
    C = tvm.compute((int32_lanes,),
                    lambda i: tvm.sum(data[k].astype('int32') *
                                      kernel[i, k].astype('int32'),
                                      axis=k),
                    name="C")

    a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer",
                               offset_factor=1,
                               strides=[1])
    b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer",
                               offset_factor=1,
                               strides=[tvm.var('ldw'), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.ir_builder.create()
            if index == 1:  # reset: zero all 16 int32 accumulator lanes
                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
                return ib.get()

            # Load the 4 uint8 data values, reinterpret them as a single
            # int32, and broadcast it across all 8 lanes of a 256-bit
            # register (32 int8 lanes after reinterpretation).
            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
            vec_ai32 = re_int32.astype('int32x8')
            vec_a = tvm.call_pure_intrin('int8x32', 'reinterpret', vec_ai32)
            vec_b_0 = ins[1].vload([0, 0], "int8x32")
            vec_b_1 = ins[1].vload([8, 0], "int8x32")
            vec_one = tvm.const(1, "int16x16")
            # pmaddubsw multiplies uint8 by int8 and adds adjacent pairs
            # into int16; pmaddwd with a vector of ones then adds adjacent
            # int16 pairs into int32, completing the 4-element reduction.
            pair_reduction_0 = tvm.call_llvm_intrin('int16x16',
                                                    'llvm.x86.avx2.pmadd.ub.sw',
                                                    tvm.const(0, 'uint32'),
                                                    vec_a, vec_b_0)
            quad_reduction_0 = tvm.call_llvm_intrin('int32x8',
                                                    'llvm.x86.avx2.pmadd.wd',
                                                    tvm.const(0, 'uint32'),
                                                    pair_reduction_0, vec_one)
            pair_reduction_1 = tvm.call_llvm_intrin('int16x16',
                                                    'llvm.x86.avx2.pmadd.ub.sw',
                                                    tvm.const(0, 'uint32'),
                                                    vec_a, vec_b_1)
            quad_reduction_1 = tvm.call_llvm_intrin('int32x8',
                                                    'llvm.x86.avx2.pmadd.wd',
                                                    tvm.const(0, 'uint32'),
                                                    pair_reduction_1, vec_one)
            if index == 0:  # body: overwrite the output
                ib.emit(outs[0].vstore([0], quad_reduction_0))
                ib.emit(outs[0].vstore([8], quad_reduction_1))
            else:  # update: accumulate into the existing output
                ib.emit(outs[0].vstore([0], quad_reduction_0 +
                                       outs[0].vload([0], 'int32x8')))
                ib.emit(outs[0].vstore([8], quad_reduction_1 +
                                       outs[0].vload([8], 'int32x8')))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op, _intrin_func,
                                      binds={data: a_buffer, kernel: b_buffer})
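
For context, below is a minimal sketch of how an intrinsic like this is consumed, following the pattern of TVM's tensorize tutorial. It is presumably close to what the deleted test file exercised, but the GEMM shapes, the names X and W, and the small operand ranges are illustrative assumptions, not part of this diff; it also assumes the pre-0.6 TVM API used above and an AVX2-capable LLVM target.

import numpy as np
import tvm
from topi.x86.tensor_intrin import dot_1x4x16_int8_int8_int32_avx2

# uint8 x int8 GEMM with a 4-element reduction axis; one intrinsic call
# covers the 16 output columns of a fixed row i. (Shapes are illustrative.)
m, n, k = 4, 16, 4
X = tvm.placeholder((m, k), dtype='uint8', name='X')
W = tvm.placeholder((n, k), dtype='int8', name='W')
ak = tvm.reduce_axis((0, k), name='k')
C = tvm.compute((m, n),
                lambda i, j: tvm.sum(X[i, ak].astype('int32') *
                                     W[j, ak].astype('int32'), axis=ak),
                name='C')

s = tvm.create_schedule(C.op)
# Tensorize the inner (j, k) tile: 16 int32 lanes x 4-element reduction.
s[C].tensorize(C.op.axis[1], dot_1x4x16_int8_int8_int32_avx2())

func = tvm.build(s, [X, W, C], target='llvm -mcpu=core-avx2')
ctx = tvm.cpu(0)
# Small operand ranges keep the intermediate int16 pair sums far from
# pmaddubsw's signed saturation bound.
x = tvm.nd.array(np.random.randint(0, 64, size=(m, k)).astype('uint8'), ctx)
w = tvm.nd.array(np.random.randint(-64, 64, size=(n, k)).astype('int8'), ctx)
c = tvm.nd.array(np.zeros((m, n), dtype='int32'), ctx)
func(x, w, c)
np.testing.assert_equal(c.asnumpy(),
                        np.dot(x.asnumpy().astype('int32'),
                               w.asnumpy().T.astype('int32')))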
