Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancements] Several bugfixes and refactoring of dynamic generic reduction #1156

Merged
merged 65 commits into from
Sep 29, 2021
Merged
Show file tree
Hide file tree
Changes from 63 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
78c293c
Merge commit 'c840438b62e3071b8e658de7343c8e461387de97' as 'src/compo…
Jul 30, 2021
c840438
Squashed 'src/composable_kernel/' content from commit f6edda611
Jul 30, 2021
6204be8
add solver ConvIgemmFwdV6r1DlopsNchwKcyxNkhw; rename static ck source…
Jul 30, 2021
3c0614b
Merge commit '437cc595c6e206dfebb118985b5171bbc1e29eab' into composab…
Aug 6, 2021
437cc59
Squashed 'src/composable_kernel/' changes from f6edda611..5781adf5c
Aug 6, 2021
d822708
fix
Aug 6, 2021
0bf90d4
refactor
Aug 6, 2021
c67b040
remove online compilation from CK
Aug 7, 2021
fff94ec
refactor
Aug 7, 2021
44da477
fix
Aug 7, 2021
1208667
add ctest
Aug 7, 2021
9f89938
tidy
Aug 7, 2021
4f825a5
add tidy
Aug 8, 2021
2ad51a5
tidy
Aug 9, 2021
add55bb
tidy
Aug 9, 2021
5cae6d0
tidy
Aug 9, 2021
63204c0
tidy
Aug 9, 2021
64b8ab8
tidy
Aug 9, 2021
685ff52
tidy
Aug 9, 2021
aeddc20
tidy
Aug 9, 2021
7b9a9ea
tidy
Aug 9, 2021
b5d1fa3
tidy
Aug 9, 2021
bf9c7a7
add c-style pointer cast
Aug 10, 2021
f258bf4
vector/scalar pointer cast use c-style pointer cast instead of reinte…
Aug 10, 2021
15467d5
fix clang warning suppression
Aug 10, 2021
485800f
tidy
Aug 10, 2021
6f1ea68
suppress cppcheck
Aug 10, 2021
9547d24
fix enum issue
Aug 10, 2021
d921965
revert chagnes to hip build
Aug 10, 2021
f5680a9
Merge remote-tracking branch 'origin/develop' into composable_kernel_…
Aug 10, 2021
2bfe093
fix kernel filename
Aug 10, 2021
b31e661
update CK build script
Aug 10, 2021
f2e1a1c
rename
Aug 10, 2021
f03af0c
rename
Aug 11, 2021
4619a4e
make innner product compatiable on gfx900
Aug 11, 2021
d5e7530
Update src/include/miopen/solver/ck_utility_common.hpp
Aug 12, 2021
04999e3
compiler parameter use stream
Aug 13, 2021
b460246
use int instead of index_t in kernel wrapper
Aug 13, 2021
6445cae
DynamicBuffer, StaticBuffer, amd_buffer_load support customized value…
Aug 13, 2021
40860ed
Merge remote-tracking branch 'origin/develop' into composable_kernel_…
Aug 13, 2021
09f375e
Merge remote-tracking branch 'origin/develop' into composable_kernel_…
Aug 16, 2021
18a7570
refactor
Aug 16, 2021
3241ab4
refactor
Aug 16, 2021
d50dc04
change cmakelist
Aug 17, 2021
8db34a5
change ck common utility
Aug 17, 2021
77b4c43
fix
Aug 17, 2021
3e81fca
Merge remote-tracking branch 'origin/develop' into CK_upstream
Aug 21, 2021
3952bef
Squashed 'src/composable_kernel/' changes from 5781adf5c..31b403526
Aug 21, 2021
f64c376
Merge commit '3952bef74baa9cebb447035ccfdc67ed17c2c0e6' into CK_upstream
Aug 21, 2021
2f4fe70
Tiny fix in using data type template parameters in blockwise and dire…
qianfengz Sep 15, 2021
f098bfb
Fix with regard to implementing GetZeroVal() in both kernel and host
qianfengz Sep 15, 2021
ba91b99
Avoid convert to compType from dstDataType before writting the output…
qianfengz Sep 15, 2021
a26986e
Add half_t support to NumericLimits and make constexpr GetZeroVal() o…
qianfengz Sep 17, 2021
a7a0c6b
Add CONSTANT decorator for descriptor read buffer
qianfengz Sep 5, 2021
9e8229c
Use get_thread_local_1d_id() for thread local Id
qianfengz Sep 2, 2021
b483839
Rename GetZeroVal() to GetReductionZeroVal() in the kernels
qianfengz Sep 17, 2021
cbc4696
Remove constexpr from initialized zeroVal and tiny fix in reduction_o…
qianfengz Sep 17, 2021
3adc483
Occasional tiny simplification and update in the kernel files
qianfengz Sep 18, 2021
44e5537
Update in src/reducetensor.cpp for consistent IDs passing to the kernel
qianfengz Sep 18, 2021
dbb77c5
Update to re-order tensor dimensions on the host, split second_call k…
qianfengz Sep 18, 2021
41f57af
Update to remove OpenCL tidy checking failures
qianfengz Sep 18, 2021
a78d0ea
Small updates in src/reducetensor.cpp
qianfengz Sep 22, 2021
3afabc0
Merge remote-tracking branch 'upstream/CK_upstream' into reduction_fi…
qianfengz Sep 22, 2021
6ca9b6f
Update for better readability
qianfengz Sep 24, 2021
997d97e
Remove unused codes and not-needed template parameters in the kernel …
qianfengz Sep 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ struct GridwiseReduction_xy_to_x_blockwise
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];

auto zeroVal = opReduce::GetZeroVal();
const auto zeroVal = opReduce::GetReductionZeroVal();

const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
Expand Down Expand Up @@ -180,6 +180,10 @@ struct GridwiseReduction_xy_to_x_blockwise
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);

StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

dstValue_buf(I0) = type_convert<dstDataType>{}(accuValue_buf[I0]);

if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
Expand All @@ -200,11 +204,11 @@ struct GridwiseReduction_xy_to_x_blockwise
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}

auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Expand All @@ -218,7 +222,7 @@ struct GridwiseReduction_xy_to_x_blockwise
make_multi_index(block_global_1d_id));

threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
}
};

Expand All @@ -239,7 +243,7 @@ struct GridwiseReduction_xy_to_x_blockwise
__shared__ compType p_in_block_buffer[BlockBufferSize];
__shared__ int block_indices_buffer[BlockBufferSize];

auto zeroVal = opReduce::GetZeroVal();
const auto zeroVal = opReduce::GetReductionZeroVal();

const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
Expand Down Expand Up @@ -281,7 +285,7 @@ struct GridwiseReduction_xy_to_x_blockwise
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
dstDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Expand Down Expand Up @@ -345,6 +349,10 @@ struct GridwiseReduction_xy_to_x_blockwise
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);

StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

dstValue_buf(I0) = type_convert<dstDataType>{}(accuValue_buf[I0]);

if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
Expand All @@ -368,11 +376,11 @@ struct GridwiseReduction_xy_to_x_blockwise
make_tuple(I0),
priorDstValue_buf);

accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}

auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Expand Down Expand Up @@ -400,7 +408,7 @@ struct GridwiseReduction_xy_to_x_blockwise
make_multi_index(block_global_1d_id));

threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
Expand All @@ -423,7 +431,7 @@ struct GridwiseReduction_xy_to_x_blockwise
__shared__ compType p_in_block_buffer[BlockBufferSize];
__shared__ int block_indices_buffer[BlockBufferSize];

auto zeroVal = opReduce::GetZeroVal();
const auto zeroVal = opReduce::GetReductionZeroVal();

const auto src_global_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
Expand Down Expand Up @@ -547,6 +555,10 @@ struct GridwiseReduction_xy_to_x_blockwise
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);

StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

dstValue_buf(I0) = type_convert<dstDataType>{}(accuValue_buf[I0]);

if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
Expand All @@ -570,11 +582,11 @@ struct GridwiseReduction_xy_to_x_blockwise
make_tuple(I0),
priorDstValue_buf);

accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}

auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Expand Down Expand Up @@ -602,7 +614,7 @@ struct GridwiseReduction_xy_to_x_blockwise
make_multi_index(block_global_1d_id));

threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
(void)ws_indices_global;
(void)indices_global;

const auto zeroVal = opReduce::GetZeroVal();
const auto zeroVal = opReduce::GetReductionZeroVal();

const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
Expand Down Expand Up @@ -147,6 +147,10 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);

StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

dstValue_buf(I0) = type_convert<dstDataType>{}(accuValue_buf[I0]);

if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
Expand All @@ -166,11 +170,11 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}

auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Expand All @@ -184,7 +188,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
make_multi_index(thread_global_1d_id));

threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_buf);
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
};

template <>
Expand All @@ -200,7 +204,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
{
(void)ws_indices_global;

const auto zeroVal = opReduce::GetZeroVal();
const auto zeroVal = opReduce::GetReductionZeroVal();

const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>{}(zeroVal));
Expand Down Expand Up @@ -232,7 +236,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
dstDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Expand Down Expand Up @@ -271,6 +275,10 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);

StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

dstValue_buf(I0) = type_convert<dstDataType>{}(accuValue_buf[I0]);

if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
Expand All @@ -290,11 +298,11 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
threadwise_dst_load.Run(
dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}

auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Expand Down Expand Up @@ -322,7 +330,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
make_multi_index(thread_global_1d_id));

threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
};
Expand All @@ -340,7 +348,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
{
(void)origReduceLen;

const auto zeroVal = opReduce::GetZeroVal();
const auto zeroVal = opReduce::GetReductionZeroVal();

const auto src_global_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Global>(ws_values_global,
Expand Down Expand Up @@ -377,7 +385,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
dstDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Expand Down Expand Up @@ -430,6 +438,10 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>{}(alpha);

StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

dstValue_buf(I0) = type_convert<dstDataType>{}(accuValue_buf[I0]);

if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
Expand All @@ -449,11 +461,11 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
threadwise_dst_load.Run(
dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

accuValue_buf(I0) += type_convert<compType>{}(priorDstValue_buf[I0] * beta);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}

auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Expand Down Expand Up @@ -481,7 +493,7 @@ struct GridwiseReduction_xy_to_x_direct_threadwise
make_multi_index(thread_global_1d_id));

threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), accuValue_buf, dst1dDesc, dst_global_val_buf);
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
};
Expand Down
Loading